1. Brought the previously unworkable retrying function back online. New functions: 1. reworked the datetime_transform function to handle more month typos 2. split the process_article function into 3 functions so that multithreading gives a better running time 3. renewed the article URL search technique to handle volume pages with different layouts 4. more exception handling 5. improved the keyword and affiliation stripping method 6. added methods for processing author data when no author table exists 7. added code for retrying papers whose processing failed 8. more detailed error message storage
384 lines
14 KiB
Python
384 lines
14 KiB
Python
import re
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from retrying import retry

import ejde_save
|
||
|
||
'''
|
||
爬取网站:'ejde.math.txstate.edu'
|
||
|
||
Total number of papers:
|
||
2023/08/08 - 4300
|
||
|
||
==========运行顺序==========
|
||
1、ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件(json)暂存
|
||
2、ejde_save 从本地浏览暂存的小文件筛选后存入不同年份的大文件
|
||
*3、ejde_save.delete()(可选) 删除暂存区内部所有文件(注意备份)
|
||
'''
|
||
|
||
|
||
def datetime_transform(date):
    """Convert a date written as "Month D, YYYY" into ISO form "YYYY-MM-DD".

    If the text does not parse as-is, known misspellings and abbreviations of
    month names are corrected and parsing is attempted once more.

    Args:
        date: Date text such as "March 5, 2020".

    Returns:
        The date formatted as "YYYY-MM-DD", or - when it still cannot be parsed
        after typo correction - the (typo-corrected) input text.
    """
    month_typo = {
        "Janaury": "January",
        "Febrary": "February",
        "Februay": "February",
        "Mar": "March",
        "Mach": "March",
        "Match": "March",
        "Maay": "May",
        "Jun": "June",
        "Juy": "July",
        "Aapril": "April",
        "Spetember": "September",
        "Septembere": "September",
        "Ocotber": "October",
    }

    try:
        return datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")
    except ValueError:
        pass

    # Correct whole words only. A plain substring replace would turn a valid
    # "March"/"June" into "Marchch"/"Junee" whenever the first parse failed for
    # an unrelated reason (e.g. an impossible day number).
    corrected = re.sub(
        r"[A-Za-z]+",
        lambda word: month_typo.get(word.group(), word.group()),
        date,
    )

    try:
        return datetime.strptime(corrected, "%B %d, %Y").strftime("%Y-%m-%d")
    except ValueError as val_err:
        print("TYPO:", str(val_err))
        return corrected
|
||
|
||
|
||
# Article and author detail
|
||
def process_volume(url):
    """Fetch one volume index page and process every article it lists.

    The page is requested up to five times, one second apart. Article title
    elements are the <em> elements of each <ol>, or its <i> elements when the
    list has no <em>. Each title element is handed to `process_html_article`
    on a pool of worker threads.

    Args:
        url: Address of the volume index page.

    Returns:
        None. A volume that cannot be fetched is recorded in the module-level
        `failedVolData` list; a volume containing an <ol> with neither <em>
        nor <i> elements is reported and skipped entirely.
    """
    articles = []
    baseWeb = None

    retries = 5
    for attempt in range(retries):
        try:
            # A timeout keeps a stalled connection from blocking this worker
            # forever; raise_for_status() turns any HTTP error status into an
            # exception so it goes through the retry path below.
            volume_response = requests.get(url, timeout=30)
            volume_response.raise_for_status()

            baseWeb = url[:url.rfind('/')] + "/"
            volume_soup = BeautifulSoup(volume_response.text, "html.parser")

            # Collect into a fresh list per attempt so a failed attempt cannot
            # leave duplicate entries behind.
            found = []
            for ol in volume_soup.find_all('ol'):
                # <em> is the usual title markup; <i> is the other HTML style.
                title_elements = ol.find_all('em') or ol.find_all('i')
                if not title_elements:
                    print("HTML FORMAT FAILURE:", url)
                    return
                found.extend(title_elements)
            articles = found
            break
        except Exception as fetch_err:
            if attempt < retries - 1:
                print("RETRYING TO FETCH HTML:", str(fetch_err))
                time.sleep(1)
                continue
            print("HTML FETCHING FAILURE:", url)
            failedVolData.append({"website": url})
            return

    # Process each article using multithreading; the context manager shuts the
    # pool down once every task has finished.
    with ThreadPoolExecutor(max_workers=15) as volume_executor:
        volume_futures = [
            volume_executor.submit(process_html_article, baseWeb, article)
            for article in articles
        ]

        # Wait for all tasks to complete
        for volume_future in as_completed(volume_futures):
            try:
                volume_future.result()
            except Exception as html_err:
                print("HTML PROCESSING ERROR:", str(html_err))
|
||
|
||
|
||
# Serialises batch flushing: this function runs on many worker threads at once.
_flush_lock = threading.Lock()


def _flush_if_full(buffer, folder):
    """Save `buffer` through ejde_save once it holds at least `batch_size` records.

    Must be called with `_flush_lock` held. Returns the number of records that
    were written and removed from `buffer` (0 when the buffer is not full yet).
    """
    if len(buffer) < batch_size:
        return 0
    batch = buffer[:]
    ejde_save.save_data(batch, folder, str(uuid.uuid4()) + ".json")
    # Other threads only append at the tail, so dropping the first len(batch)
    # items removes exactly the records that were just written. Deleting after
    # the save keeps the records in memory if the save raises.
    del buffer[:len(batch)]
    return len(batch)


def process_html_article(baseweb, article):
    """Resolve one title element to its article page and crawl it.

    Args:
        baseweb: Base address that the article's relative link is appended to.
        article: Parsed HTML element holding the article title; the next <a>
            element after it supplies the article link.

    Returns:
        None. An element whose title or link cannot be extracted is recorded in
        `failedFormatData`; an article whose crawl raises is recorded in
        `failedData`. After a successful crawl, the shared article and author
        buffers are written out whenever they have reached `batch_size`.
    """
    global articleNum, authorNum

    # Get article title & url
    try:
        title = re.sub(r'\s+', ' ', article.text.strip()).strip()
        article_url = baseweb + article.find_next("a")["href"]
    except Exception as html_format_err:
        print("HTML FORMAT FAILURE:", str(html_format_err))
        failedFormatData.append({"article": str(article)})
        return

    # Crawl article data
    try:
        process_article(title, article_url)
    except Exception as article_err:
        print("ARTICLE PROCESSING FAILURE:", str(article_err))
        failedData.append({"title": title, "URL": article_url})
        return

    # Save the data periodically based on batch size. ">=" (rather than an
    # exact multiple) never writes an empty batch and still fires when several
    # threads have pushed the buffer past the threshold at the same time.
    with _flush_lock:
        articleNum += _flush_if_full(articleData, "Article_TS")
        authorNum += _flush_if_full(authorData, "Author_TS")
|
||
|
||
|
||
def _entry_lines(text):
    """Split one author entry into stripped lines with the 'email: ' label removed."""
    return [line.replace('email: ', '').strip() for line in text.split("\n")]


def _build_author_record(lines, affiliation, article_id, year):
    """Build one author record: first line is the name, last line is searched for an e-mail."""
    name = lines[0].split(" ")
    # Collapse repeated commas and whitespace, then drop leading non-letters.
    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
    email_match = re.search(r'[\w.-]+@[\w.-]+', lines[-1])
    return {
        "author_id": str(uuid.uuid4()),
        "from_article": article_id,
        "firstname": name[0],
        "lastname": name[-1],
        "middlename": name[1:len(name) - 1] if len(name) > 2 else None,
        "affiliation": [{
            "year": year,
            "affiliation": affiliation,
            "email": email_match.group() if email_match else None,
        }]
    }


@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
    """Download one article page and extract its article and author records.

    Volume, issue, pages, submission/publication dates, MSC codes, keywords
    and DOI are pulled from the page with regular expressions. Authors are
    read from the first <table> on the page, or - when the page has no table -
    from the "<p>"-separated entries between pairs of <hr> tags.

    Args:
        title: Article title, stored unchanged in the article record.
        article_url: Address of the article page.

    Returns:
        None. One record is appended to the module-level `articleData` list and
        one record per author to `authorData`. Nothing is appended until the
        whole page has been parsed, so an attempt that fails part-way leaves no
        partial data behind for the retry to duplicate.

    Raises:
        Whatever `requests` raises for a failed or non-successful request; the
        `retry` decorator re-runs the function before letting it propagate.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    # Raise on any HTTP error status so the retry decorator sees the failure
    # instead of an error page being parsed as if it were an article.
    article_response = requests.get(article_url, headers=headers, timeout=30)
    article_response.raise_for_status()

    html = article_response.text
    article_soup = BeautifulSoup(html, 'html.parser')
    article_text = article_soup.get_text()

    # Extract volume
    volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
    volume = str(volume_match.group(1)) if volume_match else None

    # Extract pp
    pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
    pp = pp_match.group(1) if pp_match else None

    # Extract issue
    issue_match = re.search(r'No\. (\d+)', article_text)
    issue = issue_match.group(1) if issue_match else None

    # Extract submission date
    match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
    submitted_date = datetime_transform(match.group(1)) if match else None

    # Extract publication date
    match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
    publish_date = datetime_transform(match.group(1)) if match else None

    # Extract MSC
    msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
    if not msc_match:
        msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
    if msc_match:
        msc = msc_match.group(1).strip().strip('.').strip()
        # Strip each code (splitting on ';' leaves a leading space) and drop empties.
        msc = [code.strip() for code in re.split(r', |;', msc) if code.strip()]
    else:
        msc = None

    # Extract KeyWords
    keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
    if not keywords_match:
        keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
    if keywords_match:
        keywords = keywords_match.group(1).strip().replace('\n', '')
        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()
                    for keyword in re.split(r', |;', keywords)]
        keywords = [keyword for keyword in keywords if keyword]
    else:
        keywords = None

    # Extract DOI
    doi_match = re.search(r'DOI: (.+)(?=<)', html)
    if not doi_match:
        doi_match = re.search(r'DOI: (.+)', html)
    doi = doi_match.group(1).strip() if doi_match else None

    # Article_id
    article_id = str(uuid.uuid4())

    # Author info, collected locally and published only once parsing succeeded
    authors = []
    author_records = []
    table = article_soup.find('table')
    if table:
        for row in table.find_all('tr'):
            for cell in row.find_all('td'):
                lines = _entry_lines(cell.text.strip())
                if not lines[0]:
                    # Blank cell: there is no author name to record.
                    continue
                authors.append(lines[0])
                author_records.append(
                    _build_author_record(lines, ', '.join(lines[1:-1]), article_id, volume))
    # If no author table
    else:
        sections = re.findall(r'<hr>(.*?)<hr>', html, re.DOTALL)
        if not sections:
            # The article itself is still stored, with an empty author list.
            print("AUTHOR SEARCHING ERROR:", article_url)
        for section in sections:
            for entry in section.split("<p>"):
                # Work on the real text (not the repr of the match list), so
                # genuine newlines are split on and no letters are stripped.
                lines = _entry_lines(re.sub(r'<[^>]+>', '', entry).strip())
                if not lines[0]:
                    continue
                authors.append(lines[0])
                raw_affiliation = ''.join(lines[1:-1]).lstrip(",").rstrip(",").strip()
                author_records.append(
                    _build_author_record(lines, raw_affiliation, article_id, volume))

    # Article info
    article_data = {
        "article_id": article_id,
        "title": title,
        "authors": authors,
        "corresponding_authors": None,
        "submit_datetime": submitted_date,
        "publish_datetime": publish_date,
        "keywords": keywords,
        "MSC": msc,
        "URL": article_url,
        "DOI": doi,
        "publisher": "Texas State University",
        "journal": "Electronic Journal of Differential Equations",
        "volume": volume,
        "issue": issue,
        "page": pp,
    }
    authorData.extend(author_records)
    articleData.append(article_data)
|
||
|
||
|
||
# ---- Script body: crawl every volume, retry failures, then persist results ----

index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index, timeout=30)
# Stop early on an HTTP error instead of parsing an error page into an empty
# volume list and reporting a "successful" run.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the URL links under the first Volume section
volume_links = soup.select('font > a[href]')
# Build absolute volume URLs, skipping the first link and reversing the order
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]

# Initialize lists (shared with the worker functions above)
authorData = []
articleData = []
failedData = []
totallyFailedData = []
failedVolData = []
failedFormatData = []

# Initialize variables for counting
authorNum = 0
articleNum = 0

batch_size = 100  # Number of articles to process before saving

# Process each URL using multithreading; the context manager shuts the pool
# down once all volumes are done.
with ThreadPoolExecutor(max_workers=25) as executor:  # Set the number of worker threads
    futures = [executor.submit(process_volume, url) for url in url_list]

    # Wait for all tasks to complete
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as vol_err:
            print("VOLUME PROCESSING ERROR:", str(vol_err))

# Retry failed processing paper
for data in failedData:
    articleTitle = data["title"]
    articleUrl = data["URL"]
    try:
        process_article(articleTitle, articleUrl)
    except Exception as retry_err:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totallyFailedData.append({"title": articleTitle, "URL": articleUrl})

# Save remaining data; totals are reported even when no partial batch is left
if articleData:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
print("Total fetched paper:", len(articleData) + articleNum)

if authorData:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
print("Total fetched author:", len(authorData) + authorNum)

# Save error record
if totallyFailedData:
    # Record only the papers that still failed after the retry pass, so the
    # file matches the count printed below.
    ejde_save.save_data(totallyFailedData, "", "Failed_article_record.json")
    print("Total failed processing paper:", len(totallyFailedData))

if failedVolData:
    ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
    print("Total failed fetching volume:", len(failedVolData))

if failedFormatData:
    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
    print("Total failed searching article:", len(failedFormatData))

# Transfer to large file and delete the temporary storage files
ejde_save.Transf()
# ejde_save.delete()
|