194 lines
5.7 KiB
Python
194 lines
5.7 KiB
Python
import time
|
|
import uuid
|
|
import re
|
|
import urllib
|
|
|
|
from selenium.webdriver.edge.options import Options
|
|
from selenium import webdriver
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
|
|
|
|
# Get the information in the webpage through selenium
|
|
def source(driver, num):
    """Wait for the detail page to load and return it as parsed soup.

    Polls for the '#columnRight' element; on success returns a
    BeautifulSoup of the current page source.  Each failed poll sleeps
    3 seconds and bumps *num*; once *num* reaches 5 the driver is quit
    and None is returned.
    """
    while True:
        # Page is considered loaded once the right-hand column exists.
        if driver.find_elements(by='id', value='columnRight'):
            return BeautifulSoup(driver.page_source, 'html.parser')

        if num == 5:
            print('Out of times!')
            driver.quit()  # give up: release the browser
            return None

        num += 1
        time.sleep(3)  # back off before the next poll
|
|
|
|
|
|
# Get the links of the authors' information
|
|
def author_links(Data):
    """Return absolute URLs of every author-detail link found in *Data*.

    *Data* is stringified (it is normally a BeautifulSoup document) and
    scanned for 'person_data' query links, which are then resolved
    against the journal's base URL.

    Returns a list of absolute URL strings (possibly empty).
    """
    # NOTE: both fragments are raw strings — the original second fragment
    # was not raw, making '\d' an invalid escape sequence (SyntaxWarning
    # on Python >= 3.12).
    author_hrefs_pattern = re.compile(
        r'periodica\.html\?periodica=1&'
        r'paramtipus_ertek=person_data&param_ertek=\d+')

    return [
        urljoin('https://www.math.u-szeged.hu/ejqtde/', href)
        for href in author_hrefs_pattern.findall(str(Data))
    ]
|
|
|
|
|
|
# Get the information of the authors
|
|
def author_detail(Data, Year, article_id, Author_list):
    """Parse one author-detail page and append a record to *Author_list*.

    Parameters
    ----------
    Data : BeautifulSoup
        Parsed author-detail page.
    Year : str
        Publication year attached to the affiliation entry.
    article_id : str
        UUID of the article this author belongs to.
    Author_list : list
        Accumulator; mutated in place and also returned.

    Returns the (mutated) *Author_list*.
    """
    # Name: heading text is "First, Middle..., Last"; each comma-separated
    # token has ALL spaces removed (matches the site's formatting).
    names = Data.find('p', class_='publication_head').get_text().split(',')
    names = [part.replace(' ', '') for part in names]

    firstname = names[0]
    lastname = names[-1]
    middlename = ''.join(names[1:-1]) if names[1:-1] else None

    # Info table: cell 0 holds the e-mail anchor, cell 1 the affiliation.
    # (The original built a redundant copy of the td list; dropped.)
    table = Data.find('table', attrs={'border': '1', 'cellpadding': '2px'})
    cells = table.find_all('td')

    affiliation = cells[1].get_text()

    # Some author pages have no e-mail link; guard against the
    # AttributeError the original raised in that case.
    email_anchor = cells[0].find('a')
    email = email_anchor.get('href') if email_anchor is not None else None

    author_data = {
        "author_id": str(uuid.uuid4()),
        "from_article": article_id,
        "firstname": firstname,
        "lastname": lastname,
        "middlename": middlename,
        "affiliation": [
            {
                "year": Year,
                "affiliation": affiliation,
                "email": email
            }
        ]
    }

    Author_list.append(author_data)
    return Author_list
|
|
|
|
|
|
# Get the article's information
|
|
def article_detail(Data, URL, article_id, Aricle_list):
    """Parse an article page and append its metadata to *Aricle_list*.

    Parameters
    ----------
    Data : BeautifulSoup
        Parsed article page.
    URL : str
        Source URL, stored verbatim in the record.
    article_id : str
        Pre-generated UUID for this article.
    Aricle_list : list
        Accumulator; mutated in place and also returned.

    Returns the (mutated) *Aricle_list*.
    """
    data_str = str(Data)  # stringify once; several regexes scan it

    # Title
    font = Data.find('font', attrs={'size': '+1'})
    title = font.find('b').get_text()

    # Authors — the last listed author is the corresponding author.
    author_pattern = re.compile(
        r'periodica\.html\?periodica=1&'
        r'paramtipus_ertek=person_data&param_ertek=\d+"><b>(.*?)</b>')
    authors = author_pattern.findall(data_str)
    corresponding_author = authors[-1]
    authors = authors[:-1]

    # Submit / publish dates ('YYYY-MM-DD' with a zero-padded month).
    def _normalize_date(raw):
        """Drop the leading zero of the month part only.

        The original used strip('0'), which also removed TRAILING zeros
        ('10' became '1'); lstrip keeps '10' intact.
        """
        year, month, day = raw.split('-')
        return year + '-' + month.lstrip('0') + '-' + day

    date_cell = Data.find('td', attrs={'align': 'right', 'width': '50%'})
    dates = re.findall(r'\d+-\d+-\d+', str(date_cell))
    # Guard the indexing: the original crashed with IndexError before its
    # truthiness check whenever fewer than two dates were present.
    submit_date = _normalize_date(dates[0]) if len(dates) > 0 else None
    publish_date = _normalize_date(dates[1]) if len(dates) > 1 else None

    # Keywords
    keyword_tag = Data.find('keywords')
    keyword = keyword_tag.get_text().split(', ') if keyword_tag is not None else None

    # MSC subject codes
    msc_tag = Data.find('subjectcodes')
    msc = msc_tag.get_text().split(', ') if msc_tag is not None else None

    # DOI — run the regex once instead of twice.
    doi_matches = re.findall(r'<a href="(https://doi\.org/.*?)"', data_str)
    doi = doi_matches[0] if doi_matches else None

    publisher = 'www.math.u-szeged.hu/ejqtde'
    journal = 'Electronic Journal of Qualitative Theory of Differential Equations'

    # Volume — first bold number on the page.
    volume = re.findall(r'<b>(\d+)</b>', data_str)[0]

    # Issue and page range, e.g. "12, 1-20".
    result = Data.select_one('body > div:nth-of-type(3) > div:nth-of-type(2)').get_text()
    issue = re.findall(r'(\d+), \d+-\d+', result)[0]
    page = re.findall(r'\d+, (\d+-\d+)', result)[0]

    article_data = {
        "article_id": article_id,
        "title": title,
        "authors": authors,
        "corresponding_authors": corresponding_author,
        "submit_datetime": submit_date,
        "publish_datetime": publish_date,
        "keywords": keyword,
        "MSC": msc,
        "URL": URL,
        "DOI": doi,
        "publisher": publisher,
        "journal": journal,
        "volume": volume,
        "issue": issue,
        "page": page,
    }

    Aricle_list.append(article_data)
    return Aricle_list
|
|
|
|
|
|
# Main code of scrawler
|
|
def scrawler(URL, lock, Article_list, Author_list):
    """Crawl one article URL: collect article metadata, then each author.

    Parameters
    ----------
    URL : str
        Article detail-page URL.
    lock : object
        Unused here; kept for the caller's multiprocessing interface.
    Article_list, Author_list : list
        Shared accumulators, mutated in place.
    """
    print('Start: ', URL)
    driver = webdriver.Edge(options=options)
    driver.get(URL)

    Max_retryTimes = 3
    essay_data = source(driver, Max_retryTimes)

    # Guard clause: source() already quit the driver after exhausting
    # its retries, so only report and bail out.
    if essay_data is None:
        print('Wrong: Some error occurred: ', URL)
        return

    article_id = str(uuid.uuid4())
    Article_list = article_detail(essay_data, URL, article_id, Article_list)

    # Publication year is the first bold number on the article page.
    Year = re.findall(r'<b>(\d+)</b>', str(essay_data))[0]

    for author_link in author_links(essay_data):
        driver.get(author_link)
        author_page = source(driver, Max_retryTimes)
        if author_page is None:
            # Driver was quit inside source(); the original crashed here
            # with AttributeError on author_detail(None, ...).
            print('Wrong: Some error occurred: ', author_link)
            return
        Author_list = author_detail(author_page, Year, article_id, Author_list)

    print('Complete: ', URL)
    driver.quit()
|
|
|
|
|
|
# Options setting
|
|
# Edge options shared by every scrawler() call.
options = Options()
options.add_argument('--headless')     # run Edge without a visible window
options.add_argument('--disable-gpu')  # Chromium flags need the '--' prefix
# Page load strategy is a WebDriver capability, not a browser CLI flag;
# passing it via add_argument() (as the original did) silently does
# nothing.  Selenium 4 exposes it as an options attribute instead.
options.page_load_strategy = 'none'
|
|
|
|
|