import time
import uuid
import requests
import re
import ejde_save
from retrying import retry
from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
'''
Site crawled: 'ejde.math.txstate.edu'
Total number of papers (as of 2023/08/08): 4300
Total run time via VPN with 119 ms latency: 441.80 s
========== Run order ==========
1. ejde_main: collect the volume links for each year -> scrape each paper's metadata and author info -> call ejde_save -> stage the records in small JSON files
2. ejde_save: read the staged small files locally, filter them, and merge them into one large file per year
*3. ejde_save.delete() (optional): delete every file in the staging area (back up first)
'''
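# Convert a date string such as "August 8, 2023" to "2023-08-08". If the first
# parse fails, the common month-name typos listed below are corrected and the
# parse is retried; if it still fails, the original string is returned unchanged.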
def datetime_transform(date):
    month_typo = {
        "Janaury": "January",
        "Febrary": "February",
        "Februay": "February",
        "Mar": "March",
        "Mach": "March",
        "Match": "March",
        "Maay": "May",
        "Jun": "June",
        "Juy": "July",
        "Aapril": "April",
        "Spetember": "September",
        "Septembere": "September",
        "Ocotber": "October",
    }
    try:
        input_date = datetime.strptime(date, "%B %d, %Y")
        return input_date.strftime("%Y-%m-%d")
    except ValueError:
        for typo, correction in month_typo.items():
            date = date.replace(typo, correction)
        try:
            input_date = datetime.strptime(date, "%B %d, %Y")
            return input_date.strftime("%Y-%m-%d")
        except ValueError as val_err:
            print("TYPO:", str(val_err))
            return date
# Fetch a volume index page, collect its article entries, and crawl each article's detail page concurrently
def process_volume(url):
    articles = []
    baseWeb = None
    retries = 5
    for attempt in range(retries):
        try:
            volume_response = requests.get(url)
            if volume_response.status_code == 200:
                volume_response.raise_for_status()
                baseWeb = url[:url.rfind('/')] + "/"
                html = volume_response.text
                volume_soup = BeautifulSoup(html, "html.parser")
                ol_elements = volume_soup.find_all('ol')
                for ol in ol_elements:
                    em_elements = ol.find_all('em')
                    if em_elements:
                        articles.extend(em for em in em_elements)
                    # Another html style
                    else:
                        i_elements = ol.find_all('i')
                        if i_elements:
                            articles.extend(i for i in i_elements)
                        else:
                            print("HTML FORMAT FAILURE:", url)
                            fail = {
                                "website": url
                            }
                            failedFormatData.append(fail)
                            return
            break
        except Exception as fetch_err:
            if attempt < retries - 1:
                print("RETRYING TO FETCH HTML:", str(fetch_err))
                time.sleep(1)
                continue
            else:
                print("HTML FETCHING FAILURE:", url)
                fail = {
                    "website": url
                }
                failedVolData.append(fail)
                return
    # Process each article using multithreading (more than 20 threads caused extra errors)
    volume_executor = ThreadPoolExecutor(max_workers=15)
    volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
    # Wait for all tasks to complete
    for volume_future in as_completed(volume_futures):
        try:
            volume_future.result()
        except Exception as html_err:
            print("HTML PROCESSING ERROR:", str(html_err))
def process_html_article(baseweb, article):
    # Get article title & url
    try:
        title = article.text.strip()
        title = re.sub(r'\s+', ' ', title).strip()
        article_url = baseweb + article.find_next("a")["href"]
    except Exception as html_format_err:
        print("HTML FORMAT FAILURE:", str(html_format_err))
        fail = {
            "article": str(article)
        }
        failedFormatData.append(fail)
        return
    # Crawl article data
    try:
        process_article(title, article_url)
    except Exception as article_err:
        print("ARTICLE PROCESSING FAILURE:", str(article_err))
        fail = {
            "title": title,
            "URL": article_url
        }
        failedData.append(fail)
        return
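# Fetch one article page and extract its bibliographic metadata (volume, issue,
# pages, dates, MSC codes, keywords, DOI) plus author details. The retrying
# decorator repeats the whole request up to 5 times, waiting 5 s between attempts.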
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
    global articleNum, authorNum
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    article_response = requests.get(article_url, headers=headers)
    if article_response.status_code == 200:
        article_response.raise_for_status()
        html = article_response.text
        article_soup = BeautifulSoup(html, 'html.parser')
        article_text = article_soup.get_text()
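        # The regexes below assume a page header roughly of the form
        # "Vol. 2023 (2023), No. 55, pp. 1-18." (illustrative values).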
        # Extract volume
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = str(volume_match.group(1)) if volume_match else None
        # Extract pp
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None
        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None
        # Extract submission date
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        submitted_date = match.group(1) if match else None
        if submitted_date:
            submitted_date = datetime_transform(submitted_date)
        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        publish_date = match.group(1) if match else None
        if publish_date:
            publish_date = datetime_transform(publish_date)
        # Extract MSC
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.').strip()
            msc = re.split(r', |;', msc)
        else:
            msc = []
        # Extract KeyWords
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)\n', html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
        else:
            keywords = []
        # Extract DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', html)
        doi = doi_match.group(1) if doi_match else None
        # Article_id
        article_id = str(uuid.uuid4())
        # Author info
        authors = []
        table = article_soup.find('table')
        if table:
            for row in table.find_all('tr'):
                cells = [cell.text.strip() for cell in row.find_all('td')]
                for cell in cells:
                    cell = cell.split("\n")
                    cell = [element.replace('email: ', '') for element in cell]
                    cell = [c.strip() for c in cell]
                    # Data processing
                    authors.append(cell[0])
                    name = cell[0].split(" ")
                    middle_name = ''.join(name[1:-1]) if name[1:-1] else None
                    affiliation = ', '.join(cell[1:-1])
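                    # Collapse repeated commas and whitespace, then drop any leading
                    # non-letter characters left over from the join above.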
                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
                    email = email_match.group() if email_match else None
                    author_data = {
                        "author_id": str(uuid.uuid4()),
                        "from_article": [article_id],
                        "first_name": name[0],
                        "last_name": name[-1],
                        "middle_name": middle_name,
                        "affiliation": [{
                            "year": volume,
                            "affiliation": affiliation,
                            "email": email
                        }]
                    }
                    authorData.append(author_data)
        # If no author table
        else:
            # NOTE: the original regex literal is garbled in the source and could
            # not be recovered, so it is left empty here.
            pattern = r''
            matches = re.findall(pattern, html)
            for match in matches:
                match = re.sub(r'<[^>]+>', '', match)
                match = match.lstrip("\\n ").rstrip("\\n ").strip()
                match = match.split("\\n")
                match = [element.replace('email: ', '') for element in match]
                match = [m.strip() for m in match]
                # Data processing
                authors.append(match[0])
                name = match[0].split(" ")
                middle_name = ''.join(name[1:-1]) if name[1:-1] else None
                affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
                affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
                email = email_match.group() if email_match else None
                author_data = {
                    "author_id": str(uuid.uuid4()),
                    "from_article": [article_id],
                    "first_name": name[0],
                    "last_name": name[-1],
                    "middle_name": middle_name,
                    "affiliation": [{
                        "year": volume,
                        "affiliation": affiliation,
                        "email": email
                    }]
                }
                authorData.append(author_data)
    else:
        print("AUTHOR SEARCHING ERROR:", article_url)
        return
    # Article info
    article_data = {
        "article_id": article_id,
        "title": title,
        "authors": authors,
        "corresponding_authors": None,
        "submit_datetime": submitted_date,
        "publish_datetime": publish_date,
        "keywords": keywords,
        "MSC": msc,
        "URL": article_url,
        "DOI": doi,
        "publisher": "Texas State University",
        "journal": "Electronic Journal of Differential Equations",
        "volume": volume,
        "issue": issue,
        "page": pp
    }
    articleData.append(article_data)
    # Save the data periodically based on batch size
    if len(articleData) % batch_size == 0:
        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleNum += len(articleData)
        articleData.clear()
    if len(authorData) % batch_size == 0:
        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
        authorNum += len(authorData)
        authorData.clear()

start_time = time.time()
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
# Find all the URL links under the first Volume section
volume_links = soup.select('font > a[href]')
# Extract and store the URLs in a list using a list comprehension
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
# Initialize lists
authorData = []
articleData = []
failedData = []
totallyFailedData = []
failedVolData = []
failedFormatData = []
# Initialize variables for counting
authorNum = 0
articleNum = 0
batch_size = 100  # Number of articles to process before saving
executor = ThreadPoolExecutor(max_workers=25)  # Set the number of worker threads
# Process each URL using multithreading
futures = [executor.submit(process_volume, url) for url in url_list]
# Wait for all tasks to complete
for future in as_completed(futures):
    try:
        future.result()
    except Exception as vol_err:
        print("VOLUME PROCESSING ERROR:", str(vol_err))
wait(futures)
# Retry the papers that failed processing
print("START RETRYING:", len(failedData))
while failedData:
    data = failedData.pop(0)
    articleTitle = data["title"]
    articleUrl = data["URL"]
    try:
        process_article(articleTitle, articleUrl)
    except Exception as retry_err:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totally_fail = {
            "title": articleTitle,
            "URL": articleUrl
        }
        totallyFailedData.append(totally_fail)
# Save remaining data
if len(articleData) > 0:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
print("Total fetched paper:", len(articleData) + articleNum)
if len(authorData) > 0:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
print("Total fetched author:", len(authorData) + authorNum)
# Save error records
if len(totallyFailedData) > 0:
    ejde_save.save_data(totallyFailedData, "", "Failed_article_record.json")
    print("Total failed processing paper:", len(totallyFailedData))
if len(failedVolData) > 0:
    ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
    print("Total failed fetching volume:", len(failedVolData))
if len(failedFormatData) > 0:
    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
    print("Total failed searching article:", len(failedFormatData))
# Total running time
print("time elapsed: {:.2f}s".format(time.time() - start_time))
# Merge into large files and delete the temporary storage files
ejde_save.Transf()
ejde_save.delete()