import uuid
import requests
import re
import ejde_save

from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

'''
Target site: 'ejde.math.txstate.edu'

========== Running order ==========
1. ejde_main: fetch the volume links for each year -> scrape each article's
   metadata and author information -> call ejde_save -> stash the results in
   small temporary JSON files
2. ejde_save: read the temporary files from the local buffer, filter them, and
   merge them into one large file per year
*3. ejde_save.delete() (optional): delete every file in the temporary buffer
   (back them up first)
'''

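# ejde_save.save_data(data, subfolder, filename) is assumed to dump `data` as JSON into
# a temporary buffer directory (./ejde_buffer/<subfolder>/, judging from the completion
# messages at the end of this script); ejde_save.Transf() then merges the buffer into the
# per-year files and ejde_save.delete() empties it.


# Helper: convert EJDE's "Month D, YYYY" dates to ISO format,
# e.g. datetime_transform("January 5, 2023") -> "2023-01-05".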
def datetime_transform(date):
    input_date = datetime.strptime(date, "%B %d, %Y")
    return input_date.strftime("%Y-%m-%d")

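# process_article(url) takes the URL of one yearly volume page, scrapes every article
# listed on it together with its author table, and appends the results to the shared
# articleData / authorData lists defined at module level below.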
# Article and author detail
def process_article(url):
    response = requests.get(url)
    response.raise_for_status()

    baseWeb = url[:url.rfind('/')] + "/"
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    articles = soup.find_all("li")

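    # Each <li> on the volume page is expected to describe one article: <strong> holds
    # the comma-separated author list, <em> the title, and the <a> link points to the
    # article's abstract page.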
    for article in articles:
        authors = article.find("strong").text.strip().split(", ")
        title = article.find("em").text.strip()
        article_url = baseWeb + article.find("a")["href"]

        # Access article detail page
        response = requests.get(article_url)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')

        article_text = soup.get_text()

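        # The bibliographic header on the abstract page is expected to look roughly like
        # "Vol. 2023 (2023), No. 45, pp. 1-18." (format inferred from the regexes below).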
        # Extract volume
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = str(volume_match.group(1)) if volume_match else None

        # Extract pp
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None

        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None

        # Extract submission date
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        if match:
            submitted_date = datetime_transform(match.group(1))
        else:
            submitted_date = None

        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        if match:
            publish_date = datetime_transform(match.group(1))
        else:
            publish_date = None

        # Extract MSC
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.')
            msc = re.split(r', |;', msc)
        else:
            msc = None

        # Extract keywords
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [keyword.strip().strip('.') for keyword in keywords]
        else:
            keywords = None

        # Extract DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', html)
        doi = doi_match.group(1) if doi_match else None

        # Article_id
        article_id = str(uuid.uuid4())

        article_data = {
            "article_id": article_id,
            "title": title,
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp,
        }
        articleData.append(article_data)

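        # The author table at the bottom of the abstract page is assumed to hold one
        # author per cell: first line the name, middle lines the affiliation, and the
        # last line the e-mail address (prefixed with "email: ").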
        # Author info
        table = soup.find('table')
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
                cell = cell.split("\n")
                cell = [element.replace('email: ', '') for element in cell]
                cell = [c.strip() for c in cell]

                # Data processing
                name = cell[0].split(" ")
                affiliation = ', '.join(cell[1:-1])
                email = cell[-1]

                author_data = {
                    "author_id": str(uuid.uuid4()),
                    "from_article": article_id,
                    "firstname": name[0],
                    "lastname": name[-1],
                    "middlename": name[1:len(name) - 1] if len(name) > 2 else None,
                    "affiliation": [{
                        "year": volume,
                        "affiliation": affiliation,
                        "email": email,
                    }]
                }
                authorData.append(author_data)

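        # Each flush below goes to a fresh file named with a random UUID, presumably so
        # that concurrent worker threads do not overwrite each other's batches.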
        # Save the data periodically based on batch size
        if len(articleData) % batch_size == 0:
            ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
            articleData.clear()

        if len(authorData) % batch_size == 0:
            ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
            authorData.clear()


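# indexleft.html is the site's navigation page; its links (after the first one) point
# to the per-year volume pages that process_article() consumes.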
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the URL links under the first (Volumes) section
volume_links = soup.select('font > a[href]')

# Collect the volume URLs (skipping the first link) and reverse their order
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]

authorData = []
articleData = []

batch_size = 100  # Number of articles to process before saving
executor = ThreadPoolExecutor(max_workers=25)  # Set the number of worker threads

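# One task per yearly volume page; each task scrapes every article listed on that page.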
# Process each URL using multithreading
futures = [executor.submit(process_article, url) for url in url_list]

# Wait for all tasks to complete
for future in as_completed(futures):
    try:
        future.result()
    except Exception as e:
        print("An error occurred:", str(e))

# Save remaining data
if len(articleData) > 0:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")

if len(authorData) > 0:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")

# Transfer to large files and delete the temporary storage files
ejde_save.Transf()
ejde_save.delete()