import re
import time
import uuid
import requests
import threading
import ejde_save
from retrying import retry
from datetime import datetime
from bs4 import BeautifulSoup
from unidecode import unidecode
from concurrent.futures import ThreadPoolExecutor, as_completed
'''
Target site: 'ejde.math.txstate.edu'
Total number of papers (as of 2023/08/08): 4785
Total time via VPN with a 100 ms delay: 96.30 s
========== Run order ==========
1. ejde_main: collect the volume links for each year -> crawl each paper's metadata and author information -> call ejde_save -> buffer the results as small JSON files
2. ejde_save: read the buffered small files locally, filter them, and write them into one large file per year
*3. ejde_save.delete_data() (optional): delete every file in the buffer directory (back up first)
'''
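# The local ejde_save module is assumed to provide roughly the following interface
# (a sketch inferred from how it is called in this file, not its actual definition):
#   ejde_save.save_data(data, sub_dir, file_name)  - dump `data` as JSON under ./ejde_buffer/<sub_dir>/<file_name>
#   ejde_save.transform_data()                     - merge the buffered small files into the per-year files
#   ejde_save.delete_data()                        - delete the temporary buffer files
# Thread-safe helpers for the shared article/author buffers: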
def append_data_thread_safe(from_list, to_list, data_lock):
with data_lock:
to_list.append(from_list)
def save_data_thread_safe(data, data_lock, data_type):
global articleNum, authorNum
with data_lock:
ejde_save.save_data(data, f"{data_type}_TS", str(uuid.uuid4()) + ".json")
if data_type == "Article":
articleNum += len(data)
else:
authorNum += len(data)
data.clear()
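# Normalize the hand-typed dates on the EJDE pages to ISO format, correcting common month-name
# typos first; e.g. "Janaury 5, 2020" becomes "2020-01-05", while strings that still cannot be
# parsed are returned unconverted.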
def datetime_transform(date):
month_typo = {
"Janaury": "January",
"Febrary": "February",
"Februay": "February",
"Mar": "March",
"Mach": "March",
"Match": "March",
"Maay": "May",
"Jun": "June",
"Juy": "July",
"Aapril": "April",
"Spetember": "September",
"Septembere": "September",
"Ocotber": "October",
"Nobember": "November",
}
try:
input_date = datetime.strptime(date, "%B %d, %Y")
return input_date.strftime("%Y-%m-%d")
except ValueError:
for typo, correction in month_typo.items():
date = date.replace(typo, correction)
try:
input_date = datetime.strptime(date, "%B %d, %Y")
return input_date.strftime("%Y-%m-%d")
except ValueError as val_err:
print("TYPO:", str(val_err))
return date
# Article and author detail
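# Fetch one volume's table-of-contents page, collect the <em>/<i> title elements,
# and crawl each article with a small thread pool.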
def process_volume(url):
articles = []
baseWeb = None
retries = 5
for attempt in range(retries):
try:
volume_response = requests.get(url)
                # Raise for 4xx/5xx responses so this attempt is retried by the loop below
                volume_response.raise_for_status()
                if volume_response.status_code == 200:
baseWeb = url[:url.rfind('/')] + "/"
html = volume_response.text
volume_soup = BeautifulSoup(html, "html.parser")
li_elements = volume_soup.find_all('ol')
if not li_elements:
li_elements = volume_soup.find_all('ul')
for li in li_elements:
em_elements = li.find_all('em')
if em_elements:
articles.extend(em for em in em_elements)
# Another html style
else:
i_elements = li.find_all('i')
if i_elements:
articles.extend(i for i in i_elements)
else:
print("HTML FORMAT FAILURE:", url)
fail = {
"website": url
}
failedFormatData.append(fail)
return
break
except Exception as fetch_err:
if attempt < retries - 1:
print("RETRYING TO FETCH HTML:", str(fetch_err))
time.sleep(1)
continue
else:
print("HTML FETCHING FAILURE:", url)
fail = {
"website": url
}
failedVolData.append(fail)
return
    # Process each article using multithreading (more than 20 workers tends to cause extra request errors)
volume_executor = ThreadPoolExecutor(max_workers=15)
volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
# Wait for all tasks to complete
    for volume_future in as_completed(volume_futures):
        try:
            volume_future.result()
except Exception as html_err:
print("HTML PROCESSING ERROR:", str(html_err))
def process_html_article(baseweb, article):
# Get article title & url
try:
title = article.text.strip()
title = str(re.sub(r'\s+', ' ', title).strip())
article_url = baseweb + article.find_next("a")["href"]
if "../../index.html" in article_url:
print("Redundant URL:", article_url)
return
except Exception as html_format_err:
print("HTML FORMAT FAILURE:", str(html_format_err))
fail = {
"article": str(article)
}
failedFormatData.append(fail)
return
# Crawl article data
try:
process_article(title, article_url)
except Exception as article_err:
print("ARTICLE PROCESSING FAILURE:", str(article_err))
fail = {
"title": title,
"URL": article_url
}
failedData.append(fail)
return
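# Crawl a single article page and extract its metadata and author records;
# the @retry decorator below allows up to 5 attempts, 5 seconds apart.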
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
article_response = requests.get(article_url, headers=headers)
    # Raise for 4xx/5xx responses so the @retry decorator retries the request
    article_response.raise_for_status()
    if article_response.status_code == 200:
html = article_response.text
article_soup = BeautifulSoup(html, 'html.parser')
article_text = article_soup.get_text()
# Extract title if title == None
if not title:
            title_match = re.search(r"<h3>(.*?)<p>", html)
title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""
# Extract issue
issue_match = re.search(r'No\. (\d+)', article_text)
issue = issue_match.group(1) if issue_match else ""
# Extract volume
volume_match = re.search(r'Vol\. (\d+)', article_text)
volume = str(volume_match.group(1)) if volume_match else None
if not volume:
volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
if volume_match:
issue_number, volume = volume_match.groups()
volume = str(volume)
issue = "Special Issue " + str(issue_number)
else:
volume_match = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
if volume_match:
issue = "Conference " + str(volume_match.group(1))
volume = str(volume_match.group(2))
else:
volume_match = re.search(r'Conference (\d+) \((\d+)\)', article_text)
if volume_match:
issue_number, volume = volume_match.groups()
volume = str(volume)
issue = "Conference " + str(issue_number)
else:
volume = ""
# Extract pp
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
pp = pp_match.group(1) if pp_match else ""
# Extract submission date
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
if not match:
match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
submitted_date = match.group(1) if match else ""
if submitted_date:
submitted_date = datetime_transform(submitted_date)
# Extract publication date
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
publish_date = match.group(1) if match else ""
if publish_date:
publish_date = datetime_transform(publish_date)
# Extract MSC
msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
if not msc_match:
msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
if msc_match:
msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
msc = msc.strip('.').strip()
msc = re.split(r', |;', msc)
else:
msc = []
# Extract KeyWords
keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
if not keywords_match:
keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
if keywords_match:
keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
keywords = re.split(r', |;', keywords)
keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()) for keyword in
keywords if len(keyword.strip())]
else:
keywords = []
# Extract DOI
doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
if not doi_match:
doi_match = re.search(r'DOI: (.+)', html)
doi = doi_match.group(1) if doi_match else ""
# Article_id
article_id = str(uuid.uuid4())
# Author info
authors = []
author_names = []
table = article_soup.find('table')
if table:
for row in table.find_all('tr'):
cells = [cell.text.strip() for cell in row.find_all('td')]
for cell in cells:
if "email" in cell:
cell = cell.split("email")
email_list = str(cell[1]).split(',')
cell = cell[0]
elif "e-mail" in cell:
cell = cell.split("e-mail")
email_list = str(cell[1]).split(',')
cell = cell[0]
else:
email_list = None
cell = re.split(r'[\r\n]+', cell)
cell = [c.replace('\\newline', '') for c in cell]
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
# Data processing
if cell[0]:
author_id = str(uuid.uuid4())
authors.append(author_id)
author_names.append(unidecode(cell[0]))
name = re.split(r'\s+', cell[0])
name = [item for item in name if item != '']
affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
affiliation = affiliation.lstrip(",").rstrip(",").strip()
emails = []
if email_list:
for email in email_list:
email_match = re.search(r'[\w.-]+@[\w.-]+', email)
emails.append(unidecode(email_match.group())) if email_match else None
author_data = {
"author_id": author_id,
"from_article": article_id,
"first_name": unidecode(name[0]),
"last_name": unidecode(name[-1]),
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
"raw_name": unidecode(cell[0]),
"affiliation": [
{
"year": volume,
"affiliation": unidecode(affiliation),
"email": ", ".join(emails)
}
]
}
append_data_thread_safe(author_data, authorData, authorDataLock)
# If no author table
else:
match_type = 0
hr_count = len(article_soup.find_all('hr'))
if hr_count < 3:
pattern = r'<hr>(.*?)<hr>'
else:
pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
matches = str(re.findall(pattern, html, re.DOTALL))
if len(matches) < 5:
match_type = 1
last_p_tag = str(article_soup.find_all('p')[-1])
pattern = r'<p>(.*?)<hr/>'
matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()
if matches:
matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
matches = matches.split("<p>")
for match in matches:
if "email" in match:
match = match.split("email")
email_list = str(match[1]).split(',')
match = match[0]
elif "e-mail" in match:
match = match.split("e-mail")
email_list = str(match[1]).split(',')
match = match[0]
else:
email_list = None
match = re.sub(r'<[^>]+>', '', match)
match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
if match_type == 0:
match = match.split("\\n")
else:
match = match.split("\n")
match = [m.replace('\\newline', '') for m in match]
match = [re.sub(r'\s+', ' ', m).strip() for m in match]
# Data processing
if match[0]:
author_id = str(uuid.uuid4())
authors.append(author_id)
                        author_names.append(unidecode(match[0]))
name = re.split(r'\s+', match[0])
name = [item for item in name if item != '']
affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
affiliation = affiliation.lstrip(",").rstrip(",").strip()
emails = []
if email_list:
for email in email_list:
email_match = re.search(r'[\w.-]+@[\w.-]+', email)
emails.append(unidecode(email_match.group())) if email_match else None
author_data = {
"author_id": author_id,
"from_article": article_id,
"first_name": unidecode(name[0]),
"last_name": unidecode(name[-1]),
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
"raw_name": unidecode(match[0]),
"affiliation": [
{
"year": volume,
"affiliation": unidecode(affiliation),
"email": ", ".join(emails)
}
]
}
append_data_thread_safe(author_data, authorData, authorDataLock)
else:
print("AUTHOR SEARCHING ERROR:", article_url)
fail = {
"title": title,
"URL": article_url
}
failedFormatData.append(fail)
# Article info
article_data = {
"article_id": article_id,
"title": unidecode(title),
"authors": authors,
"author_names": author_names,
"submit_datetime": submitted_date,
"publish_datetime": publish_date,
"keywords": keywords,
"MSC": msc,
"URL": article_url,
"DOI": doi,
"publisher": "Texas State University",
"journal": "Electronic Journal of Differential Equations",
"volume": volume,
"issue": issue,
"page": pp
}
append_data_thread_safe(article_data, articleData, articleDataLock)
# Save the data periodically based on batch size
if len(articleData) % batch_size == 0:
save_data_thread_safe(articleData, articleDataLock, "Article")
if len(authorData) % batch_size == 0:
save_data_thread_safe(authorData, authorDataLock, "Author")
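# ========== Main crawl ==========
# Collect the URLs of every regular volume, special issue, and conference issue,
# crawl them in parallel, retry failed articles once, then flush the buffers and report statistics.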
start_time = time.time()
url_list = []
# Get the URLs of all regular volumes
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
volume_links = soup.select('font > a[href]')
url_list.extend(["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1])
# Get the URLs of all special issues
index = "https://ejde.math.txstate.edu/special-toc.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
special_links = soup.find_all("a", href=True)
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
# Get the URLs of all conference special issues
index = "https://ejde.math.txstate.edu/conf-toc.html#latest"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
special_links = soup.find_all("a", href=True)
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-2]])
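# Shared state: in-memory record buffers, failure lists for the retry pass and error reports,
# counters for the final statistics, and the locks guarding the buffers.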
authorData = []
articleData = []
failedData = []
totallyFailedData = []
failedVolData = []
failedFormatData = []
authorNum = 0
articleNum = 0
batch_size = 100 # Number of articles to process before saving
authorDataLock = threading.Lock()
articleDataLock = threading.Lock()
executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2)) # Set the number of worker threads
# Process each URL using multithreading
futures = [executor.submit(process_volume, url) for url in url_list]
# Wait for all tasks to complete
for future in as_completed(futures):
try:
future.result()
except Exception as vol_err:
print("VOLUME PROCESSING ERROR:", str(vol_err))
# Retry papers whose processing failed
if len(failedData):
print("START RETRYING:", len(failedData))
while failedData:
fail_data = failedData.pop(0)
articleTitle = fail_data["title"]
articleUrl = fail_data["URL"]
try:
process_article(articleTitle, articleUrl)
except Exception as retry_err:
print("ARTICLE RETRYING FAILURE:", str(retry_err))
totally_fail = {
"title": articleTitle,
"URL": articleUrl
}
totallyFailedData.append(totally_fail)
# Save remaining data
if len(articleData) > 0:
save_data_thread_safe(articleData, articleDataLock, "Article")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
if len(authorData) > 0:
save_data_thread_safe(authorData, authorDataLock, "Author")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
# Save error record
if len(totallyFailedData) > 0:
ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
print("Total failed processing paper:", len(totallyFailedData))
if len(failedVolData) > 0:
ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json")
print("Total failed fetching volume:", len(failedVolData))
if len(failedFormatData) > 0:
ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
print("Total failed searching article:", len(failedFormatData))
# Statistics
print("Total fetched paper:", articleNum)
print("Total fetched author:", authorNum)
print("time elapsed: {:.2f}s".format(time.time() - start_time))
# Merge the buffered data into the large per-year files and delete the temporary storage files
ejde_save.transform_data()
ejde_save.delete_data()