"""Scraper for article and author metadata from the EJQTDE journal site
(https://www.math.u-szeged.hu/ejqtde/), driven by a headless Edge browser.

NOTE(review): the original source reached review with several spans garbled
(an HTML stripper appears to have eaten text between '<' and '>').  The DOI
extraction, the author-name capture group and the Publisher/Journal/Volume
values below are reconstructions — confirm them against the live site.
"""
import re
import time
import urllib.parse
import uuid

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.options import Options

# Base URL that relative author links are resolved against.
BASE_URL = 'https://www.math.u-szeged.hu/ejqtde/'

# NOTE(review): the original referenced Publisher/Journal/Volume with no
# visible definition — presumably journal-wide constants; verify the values.
PUBLISHER = 'Bolyai Institute, University of Szeged'
JOURNAL = 'Electronic Journal of Qualitative Theory of Differential Equations'

# Compiled once: the per-author detail link embedded in an article page.
_AUTHOR_HREF_RE = re.compile(
    r'periodica\.html\?periodica=1&paramtipus_ertek=person_data&param_ertek=\d+'
)


# Get the information in the webpage through selenium
def source(driver, num):
    """Return a BeautifulSoup of the current page once it has loaded.

    The page counts as loaded when an element with id 'columnRight' exists.
    Polls every 3 seconds, incrementing ``num``; gives up once ``num``
    reaches 5, quitting the driver and returning None.  (Callers pass 3,
    so up to two extra polls happen — original behaviour, kept as-is.)
    """
    while True:
        if driver.find_elements(by='id', value='columnRight'):
            return BeautifulSoup(driver.page_source, 'html.parser')
        if num >= 5:
            print('Out of times!')
            driver.quit()
            return None
        num += 1
        time.sleep(3)


# Get the links of the authors' information
def author_links(Data):
    """Extract every author-detail href in ``Data`` as an absolute URL."""
    hrefs = re.findall(_AUTHOR_HREF_RE, str(Data))
    return [urllib.parse.urljoin(BASE_URL, href) for href in hrefs]


# Get the information of the authors
def author_detail(Data, Year, article_id, Author_list):
    """Parse one author page and append a record dict to ``Author_list``.

    Data       -- BeautifulSoup of the author detail page
    Year       -- publication year stored with the affiliation entry
    article_id -- UUID of the article this author was reached from
    Returns Author_list (also mutated in place).
    """
    # Name: the heading reads "First[, Middle...], Last".  Spaces are
    # stripped from each component (original behaviour — note this also
    # removes internal spaces in multi-word name parts).
    parts = Data.find('p', class_='publication_head').get_text().split(',')
    parts = [part.replace(' ', '') for part in parts]
    firstname = parts[0]
    lastname = parts[-1]
    middlename = ''.join(parts[1:-1]) if parts[1:-1] else None

    # Affiliation and email live in the first two cells of the bordered table.
    table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'})
    cells = table.find_all('td')
    affiliation = cells[1].get_text()
    email_anchor = cells[0].find('a')
    # Guard: some author pages carry no mailto anchor.
    email = email_anchor.get('href') if email_anchor is not None else None

    Author_list.append({
        "author_id": str(uuid.uuid4()),
        "from_article": article_id,
        "firstname": firstname,
        "lastname": lastname,
        "middlename": middlename,
        "affiliation": [
            {
                "year": Year,
                "affiliation": affiliation,
                "email": email,
            }
        ],
    })
    return Author_list


# Get the article's information
def article_detail(Data, URL, article_id, Aricle_list):
    """Parse an article page and append a record dict to ``Aricle_list``."""
    # Title
    Title = Data.find('font', attrs={'size': '+1'}).find('b').get_text()

    # Authors: link text of each author-detail anchor.  NOTE(review): the
    # original pattern ended in a dangling '(.*?)' (matched only empty
    # strings); the '</a>' terminator is reconstructed here.
    author_pattern = re.compile(
        r'periodica\.html\?periodica=1&paramtipus_ertek=person_data'
        r'&param_ertek=\d+">(.*?)</a>'
    )
    authors = re.findall(author_pattern, str(Data))
    # The last listed author is the corresponding author; split it off.
    corresponding_author = authors[-1] if authors else None
    authors = authors[:-1]

    # Submit / publish dates (yyyy-mm-dd) from the right-aligned cell.
    # Length-guarded: the original indexed [0]/[1] unconditionally and
    # raised IndexError when a date was absent.
    date_cell = Data.find('td', attrs={'align': 'right', 'width': '50%'})
    dates = re.findall(r'\d+-\d+-\d+', str(date_cell))
    submit_date = dates[0] if len(dates) > 0 else None
    publish_date = dates[1] if len(dates) > 1 else None

    # Keywords and MSC codes: comma-separated text in custom tags.
    kw_tag = Data.find('keywords')
    keywords = kw_tag.get_text().split(', ') if kw_tag is not None else None
    msc_tag = Data.find('subjectcodes')
    msc = msc_tag.get_text().split(', ') if msc_tag is not None else None

    # DOI -- NOTE(review): this extraction was garbled in the original;
    # reconstructed as the path component of the first doi.org link.
    doi_matches = re.findall(r'doi\.org/([^\s"<>]+)', str(Data))
    doi = doi_matches[0] if doi_matches else None

    # Issue and page range come from the citation line, e.g. "... 12, 1-18".
    result = Data.select_one(
        'body > div:nth-of-type(3) > div:nth-of-type(2)').get_text()
    issue = re.findall(r'(\d+), \d+-\d+', result)[0]
    page = re.findall(r'\d+, (\d+-\d+)', result)[0]
    # NOTE(review): Volume was undefined in the original; taken from the
    # first "Vol. N" occurrence if present — confirm against the site.
    vol_matches = re.findall(r'Vol\.\s*(\d+)', result)
    volume = vol_matches[0] if vol_matches else None

    Aricle_list.append({
        "article_id": article_id,
        "title": Title,
        "authors": authors,
        "corresponding_authors": corresponding_author,
        "submit_datetime": submit_date,
        "publish_datetime": publish_date,
        "keywords": keywords,
        "MSC": msc,
        "URL": URL,
        "DOI": doi,
        "publisher": PUBLISHER,
        "journal": JOURNAL,
        "volume": volume,
        "issue": issue,
        "page": page,
    })
    return Aricle_list


# Main code of scrawler
def scrawler(URL, lock, Article_list, Author_list):
    """Scrape one article URL: the article record plus one record per author.

    ``lock`` is accepted for interface compatibility (presumably a
    multiprocessing lock supplied by the caller) but is unused here.
    """
    print('Start: ', URL)
    driver = webdriver.Edge(options=options)
    driver.get(URL)
    # Enter the detail page
    Max_retryTimes = 3
    essay_data = source(driver, Max_retryTimes)
    if essay_data is None:
        # source() already quit the driver on failure.
        print('Wrong: Some error occurred: ', URL)
        return
    article_id = str(uuid.uuid4())
    Article_list = article_detail(essay_data, URL, article_id, Article_list)
    # Year: first number found anywhere in the page — presumably the
    # publication year; TODO confirm against the page layout.
    year_matches = re.findall(r'(\d+)', str(essay_data))
    Year = year_matches[0] if year_matches else None
    for author_link in author_links(essay_data):
        driver.get(author_link)
        author_page = source(driver, Max_retryTimes)
        # Guard: on timeout source() returns None and has quit the driver;
        # the original would have crashed inside author_detail here.
        if author_page is None:
            print('Wrong: Some error occurred: ', author_link)
            return
        Author_list = author_detail(author_page, Year, article_id, Author_list)
    print('Complete: ', URL)
    driver.quit()


# Options setting: headless Edge configuration shared by every scrawler() call.
options = Options()
options.add_argument('--headless')       # Run Edge in headless mode
options.add_argument('--disable-gpu')    # Disable GPU acceleration
# Don't wait for the full page load; source() polls for the content instead.
# (The original passed 'pageLoadStrategy=none' as a command-line argument,
# which Edge ignores — the Selenium 4 attribute is the supported way.)
options.page_load_strategy = 'none'