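"""Scraper for the Electronic Journal of Differential Equations (EJDE).

Walks the volume index at https://ejde.math.txstate.edu/, visits every article
detail page, extracts citation metadata (title, authors, volume, issue, pages,
dates, keywords, MSC codes, DOI) plus author affiliations, and writes the
results as batched JSON files under ./ejde_buffer/Article/ and
./ejde_buffer/Author/.
"""
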
import os
import uuid
import requests
from bs4 import BeautifulSoup
import re
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from retrying import retry


def save_data(dataset, filetype, filename):
    """Write a batch of records to ./ejde_buffer/<filetype>/<filename> as JSON."""
    if dataset:
        directory = "./ejde_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data has been saved to", filepath)


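# retry (from the `retrying` package) re-runs process_article up to 5 times,
# waiting 5000 ms between attempts, so a transient network error on one volume
# page does not kill its worker thread outright.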
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(url):
    """Scrape every article listed on one volume index page at `url`."""
    response = requests.get(url)
    response.raise_for_status()

    baseWeb = url[:url.rfind('/')] + "/"
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    articles = soup.find_all("li")

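    # Each <li> on a volume page is expected to hold one article entry, with
    # the author list in <strong>, the title in <em>, and a relative link to
    # the article's detail page.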
    for article in articles:
        authors = article.find("strong").text.strip().split(", ")
        title = article.find("em").text.strip()
        article_url = baseWeb + article.find("a")["href"]

        # Access article detail page
        response = requests.get(article_url)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')

        article_text = soup.get_text()

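        # The detail page embeds its citation data as plain text of the form
        # "Vol. <volume> (<year>), No. <issue>, pp. <first>-<last>", plus
        # "Submitted ..."/"Published ..." dates, MSC codes, keywords, and a
        # DOI. Each regex below pulls out one field and falls back to None
        # when the pattern is absent.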
        # Extract volume
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = volume_match.group(1) if volume_match else None
        # year = volume_match.group(2) if volume_match else None

        # Extract pp
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None

        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None

        # Extract submission date
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        submitted_date = match.group(1) if match else None

        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        publish_date = match.group(1) if match else None

        # Extract MSC
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.')
            msc = re.split(r', |;', msc)
        else:
            msc = None

        # Extract keywords
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [keyword.strip().strip('.') for keyword in keywords]
        else:
            keywords = None

        # Extract DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', html)
        doi = doi_match.group(1) if doi_match else None

        # Article_id
        article_id = str(uuid.uuid4())

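        # The UUID generated above is the article's primary key; the author
        # records created below reference it through their "from_article" field.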
        article_data = {
            "article_id": article_id,
            "title": title,
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp,
        }
        articleData.append(article_data)

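        # The parsing below assumes each <td> in the detail page's author table
        # holds one author as newline-separated lines: name first, affiliation
        # lines in the middle, and an "email: ..." line last.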
        # Author info
        table = soup.find('table')
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
                cell = cell.split("\n")
                cell = [element.replace('email: ', '') for element in cell]
                cell = [c.strip() for c in cell]

                # Data processing
                name = cell[0].split(" ")
                affiliation = ', '.join(cell[1:-1])
                email = cell[-1]

                author_data = {
                    "author_id": str(uuid.uuid4()),
                    "from_article": article_id,
                    "first_name": name[0],
                    "last_name": name[-1],
                    "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
                    "affiliation": [{
                        "year": volume,
                        "affiliation": affiliation,
                        "email": email,
                    }]
                }
                authorData.append(author_data)

        # Save the data periodically based on batch size
        if len(articleData) % batch_size == 0:
            save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
            articleData.clear()

        if len(authorData) % batch_size == 0:
            save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
            authorData.clear()


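# Build the work list: fetch the site index (indexleft.html) and collect the
# per-volume links, then scrape every volume page in a thread pool, flushing
# article/author records to JSON in batches of batch_size.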
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the URL links under the first (Volumes) section
volume_links = soup.select('font > a[href]')

# Extract and store the URLs in a list using list comprehension
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]

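# Note: the 'font > a[href]' selector and the [1:] slice (which drops the first
# matched link) are tied to the current markup of indexleft.html; if the site
# layout changes, the selector will need updating. The [::-1] reversal simply
# processes the links in the opposite order from how they appear on the page.
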
authorData = []
articleData = []

batch_size = 500  # Number of articles to process before saving
executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads

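# articleData and authorData are appended to by many worker threads at once.
# list.append itself is safe under CPython's GIL, but the check-save-clear
# sequence in process_article is not atomic, so batches can be skipped or
# flushed twice under contention. A minimal sketch of one way to tighten it,
# using a hypothetical module-level lock (not part of the original script):
#
#   import threading
#   save_lock = threading.Lock()
#
#   with save_lock:
#       if len(articleData) >= batch_size:
#           save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
#           articleData.clear()
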
# Process each URL using multithreading
futures = [executor.submit(process_article, url) for url in url_list]

# Wait for all tasks to complete
for future in as_completed(futures):
    try:
        future.result()
    except Exception as e:
        print("An error occurred:", str(e))

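# All futures have resolved at this point, so executor.shutdown(wait=True)
# could be called here for tidiness; the remaining partial batches (fewer than
# batch_size records) still need to be written out below.
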
# Save remaining data
if articleData:
    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")

if authorData:
    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")