1. Apply unidecode in the EJDE parser. 2. Get paper citations via the AMiner API. Known bug: the function "scholarly_get_citation" does not work properly.
import time
import uuid
import requests
import re
import ejde_save

from retrying import retry
from datetime import datetime
from bs4 import BeautifulSoup
from unidecode import unidecode
from concurrent.futures import ThreadPoolExecutor, as_completed

'''
Target website: 'ejde.math.txstate.edu'

Total number of papers: 2023/08/08 - 4761
Total time via VPN w/100ms delay: 306.73s

========== Run order ==========
1. ejde_main: collect the journal links for each year -> scrape the metadata and author info of every paper -> call ejde_save -> buffer the results as small JSON files
2. ejde_save: scan the buffered files locally, filter them, and merge them into large per-year files
*3. ejde_save.delete() (optional): delete every file in the temporary buffer (back it up first)
'''


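# Normalize the hand-typed dates found on EJDE pages to ISO format, fixing
# common month-name typos first, e.g. datetime_transform("Janaury 5, 2021")
# returns "2021-01-05"; strings that still fail to parse are returned as-is.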
def datetime_transform(date):
    month_typo = {
        "Janaury": "January",
        "Febrary": "February",
        "Februay": "February",
        "Mar": "March",
        "Mach": "March",
        "Match": "March",
        "Maay": "May",
        "Jun": "June",
        "Juy": "July",
        "Aapril": "April",
        "Spetember": "September",
        "Septembere": "September",
        "Ocotber": "October",
        "Nobember": "November",
    }
    try:
        input_date = datetime.strptime(date, "%B %d, %Y")
        return input_date.strftime("%Y-%m-%d")
    except ValueError:
        for typo, correction in month_typo.items():
            date = date.replace(typo, correction)

        try:
            input_date = datetime.strptime(date, "%B %d, %Y")
            return input_date.strftime("%Y-%m-%d")
        except ValueError as val_err:
            print("TYPO:", str(val_err))
            return date


# Article and author detail
def process_volume(url):
    articles = []
    baseWeb = None

    retries = 5
    for attempt in range(retries):
        try:
            volume_response = requests.get(url)
            volume_response.raise_for_status()

            baseWeb = url[:url.rfind('/')] + "/"
            html = volume_response.text
            volume_soup = BeautifulSoup(html, "html.parser")
            li_elements = volume_soup.find_all('ol')
            if not li_elements:
                li_elements = volume_soup.find_all('ul')

            for li in li_elements:
                em_elements = li.find_all('em')
                if em_elements:
                    articles.extend(em for em in em_elements)
                # Another html style
                else:
                    i_elements = li.find_all('i')
                    if i_elements:
                        articles.extend(i for i in i_elements)
                    else:
                        print("HTML FORMAT FAILURE:", url)
                        fail = {
                            "website": url
                        }
                        failedFormatData.append(fail)
                        return
            break
        except Exception as fetch_err:
            if attempt < retries - 1:
                print("RETRYING TO FETCH HTML:", str(fetch_err))
                time.sleep(1)
                continue
            else:
                print("HTML FETCHING FAILURE:", url)
                fail = {
                    "website": url
                }
                failedVolData.append(fail)
                return

    # Process each article using multithreading (>20 threads would cause more errors)
    volume_executor = ThreadPoolExecutor(max_workers=15)
    volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]

    # Wait for all tasks to complete
    for volume_future in as_completed(volume_futures):
        try:
            volume_future.result()
        except Exception as html_err:
            print("HTML PROCESSING ERROR:", str(html_err))


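# Resolve one listing entry (an <em>/<i> element) to a cleaned title and an
# absolute article URL, then hand it to process_article; failures are recorded
# in failedFormatData / failedData so they can be reported or retried later.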
def process_html_article(baseweb, article):
    # Get article title & url
    try:
        title = article.text.strip()
        title = re.sub(r'\s+', ' ', title).strip()
        article_url = baseweb + article.find_next("a")["href"]
        if "../../index.html" in article_url:
            print("Redundant URL:", article_url)
            return
    except Exception as html_format_err:
        print("HTML FORMAT FAILURE:", str(html_format_err))
        fail = {
            "article": str(article)
        }
        failedFormatData.append(fail)
        return

    # Crawl article data
    try:
        process_article(title, article_url)
    except Exception as article_err:
        print("ARTICLE PROCESSING FAILURE:", str(article_err))
        fail = {
            "title": title,
            "URL": article_url
        }
        failedData.append(fail)
        return


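# Fetch a single abstract page and extract the article metadata plus one author
# record per author block; the retry decorator re-runs the whole function on
# any exception (up to 5 attempts, 5 seconds apart).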
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
    global articleNum, authorNum
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    article_response = requests.get(article_url, headers=headers)
    article_response.raise_for_status()

    html = article_response.text
    article_soup = BeautifulSoup(html, 'html.parser')
    article_text = article_soup.get_text()

    # Extract title from the raw HTML if title == None
    if not title:
        title_match = re.search(r"<h3>(.*?)<p>", html, re.DOTALL)
        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None

    # Extract issue
    issue_match = re.search(r'No\. (\d+)', article_text)
    issue = issue_match.group(1) if issue_match else None

    # Extract volume
    volume_match = re.search(r'Vol\. (\d+)', article_text)
    volume = str(volume_match.group(1)) if volume_match else None
    if not volume:
        volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
        if volume_match:
            issue_number, volume = volume_match.groups()
            volume = str(volume)
            issue = "Special Issue " + str(issue_number)
        else:
            volume_match = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
            if volume_match:
                issue = "Conference " + str(volume_match.group(1))
                volume = str(volume_match.group(2))
            else:
                volume_match = re.search(r'Conference (\d+) \((\d+)\)', article_text)
                if volume_match:
                    issue_number, volume = volume_match.groups()
                    volume = str(volume)
                    issue = "Conference " + str(issue_number)
                else:
                    volume = None

    # Extract pp
    pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
    pp = pp_match.group(1) if pp_match else None

    # Extract submission date
    match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
    submitted_date = match.group(1) if match else None
    if submitted_date:
        submitted_date = datetime_transform(submitted_date)

    # Extract publication date
    match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
    publish_date = match.group(1) if match else None
    if publish_date:
        publish_date = datetime_transform(publish_date)

    # Extract MSC
    msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
    if not msc_match:
        msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
    if msc_match:
        msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
        msc = msc.strip('.').strip()
        msc = re.split(r', |;', msc)
    else:
        msc = []

    # Extract KeyWords
    keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
    if not keywords_match:
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
    if keywords_match:
        keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
        keywords = re.split(r', |;', keywords)
        keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()) for keyword in
                    keywords if len(keyword.strip())]
    else:
        keywords = []

    # Extract DOI
    doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
    if not doi_match:
        doi_match = re.search(r'DOI: (.+)', html)
    doi = doi_match.group(1) if doi_match else None
    if doi:
        doi = doi.replace('https://doi.org/', '')  # strip doi website header

    # Article_id
    article_id = str(uuid.uuid4())

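    # Author information appears either as a <table> of author blocks (handled
    # below) or as free text between <hr> separators (handled in the else branch
    # further down); each block lists the author name first, then affiliation
    # lines, then an optional "email:" / "e-mail:" line.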
    # Author info
    authors = []
    table = article_soup.find('table')
    if table:
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
                if "email:" in cell:
                    cell = cell.split("email:")
                    email_list = str(cell[1]).split(',')
                    cell = cell[0]
                elif "e-mail:" in cell:
                    cell = cell.split("e-mail:")
                    email_list = str(cell[1]).split(',')
                    cell = cell[0]
                else:
                    email_list = None

                cell = re.split(r'[\r\n]+', cell)
                cell = [c.replace('\\newline', '') for c in cell]
                cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]

                # Data processing
                if cell[0]:
                    authors.append(unidecode(cell[0]))
                    name = re.split(r'[ .]', cell[0])
                    affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                    affiliation = affiliation.lstrip(",").rstrip(",").strip()
                    emails = []
                    if email_list:
                        for email in email_list:
                            email_match = re.search(r'[\w.-]+@[\w.-]+', email)
                            if email_match:
                                emails.append(unidecode(email_match.group()))

                    author_data = {
                        "author_id": str(uuid.uuid4()),
                        "from_article": [article_id],
                        "first_name": unidecode(name[0]),
                        "last_name": unidecode(name[-1]),
                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
                        "affiliation": [{
                            "year": volume,
                            "affiliation": unidecode(affiliation),
                            "email": emails
                        }]
                    }
                    authorData.append(author_data)
                    authorNum += 1
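    # Fallback when the page has no author table: pages with fewer than three
    # <hr> separators keep the author block between the first pair of <hr> tags,
    # longer pages between the last two; if that yields essentially nothing, the
    # last <p> block is used instead (match_type flips the line-split convention).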
    # If no author table
    else:
        match_type = 0
        hr_count = len(article_soup.find_all('hr'))
        if hr_count < 3:
            pattern = r'<hr>(.*?)<hr>'
        else:
            pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
        matches = str(re.findall(pattern, html, re.DOTALL))
        if len(matches) < 5:
            match_type = 1
            last_p_tag = str(article_soup.find_all('p')[-1])
            pattern = r'<p>(.*?)<hr/>'
            matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()

        if matches:
            matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
            matches = matches.split("<p>")
            for match in matches:
                if "email:" in match:
                    match = match.split("email:")
                    email_list = str(match[1]).split(',')
                    match = match[0]
                elif "e-mail:" in match:
                    match = match.split("e-mail:")
                    email_list = str(match[1]).split(',')
                    match = match[0]
                else:
                    email_list = None

                match = re.sub(r'<[^>]+>', '', match)
                match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
                if match_type == 0:
                    match = match.split("\\n")
                else:
                    match = match.split("\n")
                match = [m.replace('\\newline', '') for m in match]
                match = [re.sub(r'\s+', ' ', m).strip() for m in match]

                # Data processing
                if match[0]:
                    authors.append(unidecode(match[0]))
                    name = re.split(r'[ .]', match[0])
                    affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                    affiliation = affiliation.lstrip(",").rstrip(",").strip()
                    emails = []
                    if email_list:
                        for email in email_list:
                            email_match = re.search(r'[\w.-]+@[\w.-]+', email)
                            if email_match:
                                emails.append(unidecode(email_match.group()))

                    author_data = {
                        "author_id": str(uuid.uuid4()),
                        "from_article": [article_id],
                        "first_name": unidecode(name[0]),
                        "last_name": unidecode(name[-1]),
                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
                        "affiliation": [{
                            "year": volume,
                            "affiliation": unidecode(affiliation),
                            "email": emails
                        }]
                    }
                    authorData.append(author_data)
                    authorNum += 1
        else:
            print("AUTHOR SEARCHING ERROR:", article_url)
            fail = {
                "title": title,
                "URL": article_url
            }
            failedFormatData.append(fail)

    # Article info
    article_data = {
        "article_id": article_id,
        "title": unidecode(title),
        "authors": authors,
        "corresponding_authors": None,
        "submit_datetime": submitted_date,
        "publish_datetime": publish_date,
        "keywords": keywords,
        "MSC": msc,
        "URL": article_url,
        "DOI": doi,
        "publisher": "Texas State University",
        "journal": "Electronic Journal of Differential Equations",
        "volume": volume,
        "issue": issue,
        "page": pp
    }
    articleData.append(article_data)
    articleNum += 1

    # Save the data periodically based on batch size
    if len(articleData) % batch_size == 0:
        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleData.clear()

    if len(authorData) % batch_size == 0:
        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
        authorData.clear()


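# Top-level driver: collect the volume, special-issue, and conference TOC URLs,
# crawl them in parallel, retry papers that failed, then flush the remaining
# buffers and error records via ejde_save.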
start_time = time.time()
url_list = []

# Get all general volumes url
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
volume_links = soup.select('font > a[href]')
url_list.extend(["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1])

# Get all special issues url
index = "https://ejde.math.txstate.edu/special-toc.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
special_links = soup.find_all("a", href=True)
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])

# Get all conference special issues url
index = "https://ejde.math.txstate.edu/conf-toc.html#latest"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
special_links = soup.find_all("a", href=True)
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-2]])

authorData = []
articleData = []
failedData = []
totallyFailedData = []
failedVolData = []
failedFormatData = []

authorNum = 0
articleNum = 0

batch_size = 100  # Number of articles to process before saving
executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))  # Set the number of worker threads

# Process each URL using multithreading
futures = [executor.submit(process_volume, url) for url in url_list]

# Wait for all tasks to complete
for future in as_completed(futures):
    try:
        future.result()
    except Exception as vol_err:
        print("VOLUME PROCESSING ERROR:", str(vol_err))

# Retry papers that failed processing
print("START RETRYING:", len(failedData))
while failedData:
    data = failedData.pop(0)
    articleTitle = data["title"]
    articleUrl = data["URL"]
    try:
        process_article(articleTitle, articleUrl)
    except Exception as retry_err:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totally_fail = {
            "title": articleTitle,
            "URL": articleUrl
        }
        totallyFailedData.append(totally_fail)

# Save remaining data
if len(articleData) > 0:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")

if len(authorData) > 0:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")

# Save error record
if len(totallyFailedData) > 0:
    ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
    print("Total failed processing paper:", len(totallyFailedData))

if len(failedVolData) > 0:
    ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json")
    print("Total failed fetching volume:", len(failedVolData))

if len(failedFormatData) > 0:
    ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
    print("Total failed searching article:", len(failedFormatData))

# Statistics
print("Total fetched paper:", articleNum)
print("Total fetched author:", authorNum)
print("time elapsed: {:.2f}s".format(time.time() - start_time))

# Transfer to large file and delete the temporary storage files
ejde_save.Transf()
ejde_save.delete()