ScholarDataMining/EJQTDE_spider/ejqtde_scrawler.py

188 lines
5.5 KiB
Python

import time
import uuid
import re
import urllib
from selenium.webdriver.edge.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def source(driver, num):
    """Return a BeautifulSoup of the current page once it has loaded.

    Polls for the element id 'columnRight' (present on fully loaded EJQTDE
    pages), sleeping 3 s between attempts.  `num` is the attempt counter the
    caller starts at; polling stops once it reaches 5.

    Returns the parsed soup, or None (after quitting the driver) when the
    page never became ready.
    """
    while True:
        if driver.find_elements(by='id', value='columnRight'):
            return BeautifulSoup(driver.page_source, 'html.parser')
        # Original compared `num == 5`, which loops forever when the caller
        # passes a value above 5; `>=` is the safe bound.
        if num >= 5:
            print('Out of times!')
            driver.quit()
            return None
        num += 1
        time.sleep(3)
def author_links(Data):
    """Return absolute URLs of every author-detail link found in *Data*.

    *Data* may be any object; it is stringified before matching, so both a
    BeautifulSoup tree and raw HTML text work.
    """
    href_pattern = re.compile(
        r'periodica\.html\?periodica=1&'
        r'paramtipus_ertek=person_data&param_ertek=\d+')
    base_url = 'https://www.math.u-szeged.hu/ejqtde/'
    return [urllib.parse.urljoin(base_url, relative_href)
            for relative_href in href_pattern.findall(str(Data))]
def author_detail(Data, Year, article_id, Author_list):
    """Parse one author-detail page and append a record to Author_list.

    Data       -- BeautifulSoup of the author page.
    Year       -- publication year stored with the affiliation entry.
    article_id -- id of the article this author was found on.
    Returns the (mutated) Author_list.
    """
    # Name: the heading is comma-separated.  Strip surrounding whitespace
    # per part; the old `replace(' ', '')` also deleted *internal* spaces,
    # mangling multi-word name parts (e.g. "van der Berg" -> "vanderBerg").
    name_parts = [part.strip() for part in
                  Data.find('p', class_='publication_head').get_text().split(',')]
    Firstname = name_parts[0]
    Lastname = name_parts[-1]
    Middlename = ''.join(name_parts[1:-1]) if name_parts[1:-1] else None
    # Info table: on these pages cell 0 carries the mailto link and cell 1
    # the affiliation text.
    table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'})
    cells = table.find_all('td')
    Affiliation = cells[1].get_text()
    # Guard the missing-<a> case, which previously raised AttributeError.
    anchor = cells[0].find('a')
    Email = anchor.get('href') if anchor is not None else None
    Author_list.append({
        "author_id": str(uuid.uuid4()),
        "from_article": article_id,
        "firstname": Firstname,
        "lastname": Lastname,
        "middlename": Middlename,
        "affiliation": [
            {
                "year": Year,
                "affiliation": Affiliation,
                "email": Email
            }
        ]
    })
    return Author_list
def article_detail(Data, URL, article_id, Aricle_list):
    """Parse an article page and append an article record to Aricle_list.

    Data       -- BeautifulSoup of the article page.
    URL        -- page URL, stored verbatim in the record.
    article_id -- pre-generated uuid for this article.
    Aricle_list -- accumulator list (parameter name typo kept so existing
                   callers are unaffected); returned after the append.

    Fields that cannot be found are stored as None instead of raising
    IndexError as the original code did.
    """
    html = str(Data)  # hoisted: was re-stringified for every regex below
    # Title
    Title = Data.find('font', attrs={'size': '+1'}).find('b').get_text()
    # Authors; the last linked name on the page is the corresponding author.
    author_pattern = re.compile(r'periodica\.html\?periodica=1&'
                                r'paramtipus_ertek=person_data&amp;param_ertek=\d+"><b>(.*?)</b>')
    Author = author_pattern.findall(html)
    Corresponding_author = Author[-1] if Author else None
    if Author:
        del Author[-1]
    # Submit/publish dates.  Renamed from `time`, which shadowed the module;
    # guard both indices (pages may carry fewer than two dates).
    date_cell = Data.find('td', attrs={'align': 'right', 'width': '50%'})
    dates = re.findall(r'\d+-\d+-\d+', str(date_cell))
    Submit_date = dates[0] if len(dates) > 0 else None
    Publish_date = dates[1] if len(dates) > 1 else None
    # Keywords
    keyword_tag = Data.find('keywords')
    Keyword = keyword_tag.get_text().split(', ') if keyword_tag is not None else None
    # MSC subject codes
    msc_tag = Data.find('subjectcodes')
    MSC = msc_tag.get_text().split(', ') if msc_tag is not None else None
    # DOI (single findall instead of the original duplicated scan)
    doi_matches = re.findall(r'<a href="(https://doi\.org/.*?)"', html)
    DOI = doi_matches[0] if doi_matches else None
    # Fixed journal metadata
    Publisher = 'www.math.u-szeged.hu/ejqtde'
    Journal = 'Electronic Journal of Qualitative Theory of Differential Equations'
    # Volume: first bold number on the page — assumed to be the volume;
    # TODO(review): confirm against a live page.
    volume_matches = re.findall(r'<b>(\d+)</b>', html)
    Volume = volume_matches[0] if volume_matches else None
    # Issue and page range, e.g. "12, 1-20"
    result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text()
    issue_match = re.search(r'(\d+), \d+-\d+', result)
    Issue = issue_match.group(1) if issue_match else None
    page_match = re.search(r'\d+, (\d+-\d+)', result)
    Page = page_match.group(1) if page_match else None
    Aricle_list.append({
        "article_id": article_id,
        "title": Title,
        "authors": Author,
        "corresponding_authors": Corresponding_author,
        "submit_datetime": Submit_date,
        "publish_datetime": Publish_date,
        "keywords": Keyword,
        "MSC": MSC,
        "URL": URL,
        "DOI": DOI,
        "publisher": Publisher,
        "journal": Journal,
        "volume": Volume,
        "issue": Issue,
        "page": Page,
    })
    return Aricle_list
def scrawler(URL, lock, Article_list, Author_list):
    """Crawl one article URL: extract the article record and each author's
    details, appending to the shared Article_list / Author_list.

    `lock` is accepted for caller compatibility but is not used here —
    TODO(review): list appends may need it under multiprocessing.
    """
    print('Start: ', URL)
    driver = webdriver.Edge(options=options)
    try:
        driver.get(URL)
        Max_retryTimes = 3
        Essay_data = source(driver, Max_retryTimes)
        if Essay_data is None:
            print('Wrong: Some error occurred: ', URL)
            return
        article_id = str(uuid.uuid4())
        article_detail(Essay_data, URL, article_id, Article_list)
        # Publication year: first bold number on the article page.
        Year = re.findall(r'<b>(\d+)</b>', str(Essay_data))[0]
        for author_link in author_links(Essay_data):
            driver.get(author_link)
            Author_detail = source(driver, Max_retryTimes)
            # Skip authors whose page never loaded; the original passed
            # None straight into author_detail and crashed.
            if Author_detail is not None:
                author_detail(Author_detail, Year, article_id, Author_list)
        print('Complete: ', URL)
    finally:
        # Always release the browser, even if parsing raises.  quit() is
        # safe to call again after source() already quit on failure.
        driver.quit()
# Edge driver options shared by every scrawler() call.
options = Options()
options.add_argument('--headless')      # Run Edge without a visible window
options.add_argument('--disable-gpu')   # was 'disable-gpu': switches need the '--' prefix
# Page load strategy is a capability, not a command-line argument;
# 'none' returns control immediately after the initial page request.
options.page_load_strategy = 'none'