import uuid
import requests
import re
import threading
import ejde_save
from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

'''
Target site: 'ejde.math.txstate.edu'
========== Run order ==========
1. ejde_main: collect the journal links for each year -> scrape each article's
   metadata and author info -> call ejde_save -> buffer the results as small JSON files
2. ejde_save: review the buffered small files locally, filter them, and merge them
   into one large file per year
*3. ejde_save.delete() (optional): delete every file in the buffer (back it up first)
'''

def datetime_transform(date):
    input_date = datetime.strptime(date, "%B %d, %Y")
    return input_date.strftime("%Y-%m-%d")
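
# Example: datetime_transform("July 5, 2023") -> "2023-07-05"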

# Article and author detail
def process_article(url):
    response = requests.get(url)
    response.raise_for_status()
    baseWeb = url[:url.rfind('/')] + "/"
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("li")
    for article in articles:
        strong = article.find("strong")
        em = article.find("em")
        link = article.find("a")
        # Skip <li> items that are not article entries (guard against AttributeError)
        if strong is None or em is None or link is None:
            continue
        authors = strong.text.strip().split(", ")
        title = em.text.strip()
        article_url = baseWeb + link["href"]
        # Access article detail page
        response = requests.get(article_url)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        article_text = soup.get_text()
        # Extract volume
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = volume_match.group(1) if volume_match else None
        # Extract pp
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None
        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None
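        # The three patterns above target a citation line of the (assumed) form:
        #   "Electron. J. Differential Equations, Vol. 2023 (2023), No. 45, pp. 1-18."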
        # Extract submission date (check the match before calling .group(),
        # so a page without this line does not raise AttributeError)
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        submitted_date = datetime_transform(match.group(1)) if match else None
        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        publish_date = datetime_transform(match.group(1)) if match else None
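        # e.g. an (assumed) footer line "Submitted March 1, 2023. Published July 5, 2023."
        # yields submitted_date "2023-03-01" and publish_date "2023-07-05"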
        # Extract MSC
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.')
            msc = re.split(r', |;', msc)
        else:
            msc = None
        # Extract KeyWords
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [keyword.strip().strip('.') for keyword in keywords]
        else:
            keywords = None
        # Extract DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', html)
        doi = doi_match.group(1) if doi_match else None
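        # e.g. a (hypothetical) line "DOI: 10.58997/ejde.2023.45" yields "10.58997/ejde.2023.45"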
        # Article_id
        article_id = str(uuid.uuid4())
        article_data = {
            "article_id": article_id,
            "title": title,
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp,
        }
        with data_lock:  # the buffers are shared across worker threads
            articleData.append(article_data)
        # Author info
        table = soup.find('table')
        if table is None:
            continue  # no author table on this detail page
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
                cell = cell.split("\n")
                cell = [element.replace('email: ', '') for element in cell]
                cell = [c.strip() for c in cell]
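                # Each <td> is assumed to hold one author block, e.g.:
                #   "Jane Q. Public\nDepartment of Mathematics\nSome University\nemail: jane@example.edu"
                # i.e. name first, affiliation lines in the middle, email last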
                # Data processing
                name = cell[0].split(" ")
                affiliation = ', '.join(cell[1:-1])
                email = cell[-1]
                author_data = {
                    "author_id": str(uuid.uuid4()),
                    "from_article": article_id,
                    "firstname": name[0],
                    "lastname": name[-1],
                    "middlename": name[1:len(name) - 1] if len(name) > 2 else None,
                    "affiliation": [{
                        "year": volume,
                        "affiliation": affiliation,
                        "email": email,
                    }]
                }
                with data_lock:
                    authorData.append(author_data)
        # Save the data periodically based on batch size; hold the lock so two
        # threads cannot flush (or append to) the same buffer at once, and use
        # >= rather than a modulo test, which concurrent appends can skip past
        with data_lock:
            if len(articleData) >= batch_size:
                ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
                articleData.clear()
            if len(authorData) >= batch_size:
                ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
                authorData.clear()

index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
# Find all the URL links under the first (Volumes) section
volume_links = soup.select('font > a[href]')
# Extract and store the URLs in a list using list comprehension
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
authorData = []
articleData = []
data_lock = threading.Lock()  # guards the shared buffers above
batch_size = 100  # Number of articles to process before saving
# Process each URL using multithreading; the context manager joins the pool on exit
with ThreadPoolExecutor(max_workers=25) as executor:  # Set the number of worker threads
    futures = [executor.submit(process_article, url) for url in url_list]
    # Wait for all tasks to complete
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print("An error occurred:", str(e))
# Save remaining data
if articleData:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
if authorData:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
# Transfer to large file and delete the temporary storage files
ejde_save.Transf()
ejde_save.delete()
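
# Typical run (assumed layout, per the messages above): `python ejde_main.py`
# buffers article/author JSON under ./ejde_buffer/, then ejde_save.Transf()
# merges the buffered files into per-year files and ejde_save.delete() clears the buffer.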