# ScholarDataMining/EJDE_spider/ejde_scrawler.py

import os
import re
import json
import uuid
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup
from retrying import retry

def save_data(dataset, filetype, filename):
    """Dump a non-empty batch of records as JSON under ./ejde_buffer/<filetype>/."""
    if dataset:
        directory = "./ejde_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data has been saved to", filepath)
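
# Illustrative call (hypothetical batch, not data from the crawl itself):
#   save_data([{"title": "An example"}], "Article", "batch-0001.json")
# would write ./ejde_buffer/Article/batch-0001.json as pretty-printed JSON.
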
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(url):
    """Scrape one EJDE volume index page and every article it links to."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    baseWeb = url[:url.rfind('/')] + "/"
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("li")
    for article in articles:
        # Skip list items that lack the expected author/title/link markup
        if not (article.find("strong") and article.find("em") and article.find("a")):
            continue
        authors = article.find("strong").text.strip().split(", ")
        title = article.find("em").text.strip()
        article_url = baseWeb + article.find("a")["href"]
        # Access the article detail page; separate names keep the volume
        # page's soup from being clobbered inside the loop
        detail_response = requests.get(article_url, timeout=30)
        detail_response.raise_for_status()
        detail_html = detail_response.text
        detail_soup = BeautifulSoup(detail_html, 'html.parser')
        article_text = detail_soup.get_text()
        # Extract volume (and, if ever needed, year) from "Vol. N (YYYY)"
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = volume_match.group(1) if volume_match else None
        # year = volume_match.group(2) if volume_match else None
        # Extract page range
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None
        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None
        # Extract submission date
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", detail_html)
        submitted_date = match.group(1) if match else None
        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", detail_html)
        publish_date = match.group(1) if match else None
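        # The date regexes above assume headers of the form
        # "Submitted April 1, 2023." / "Published June 15, 2023."
        # (month name, day, four-digit year); anything else yields None.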
        # Extract MSC; the label may end with a newline or a period
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', detail_html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', detail_html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.')
            msc = re.split(r', |;', msc)
        else:
            msc = None
        # Extract key words
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', detail_html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)<br>', detail_html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [keyword.strip().strip('.') for keyword in keywords]
        else:
            keywords = None
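        # Illustrative parses (hypothetical page text, not real scraped data):
        #   "Math Subject Classifications: 35J20, 35J60." -> ["35J20", "35J60"]
        #   "Key Words: Elliptic equation; variational method." ->
        #   ["Elliptic equation", "variational method"]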
        # Extract DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', detail_html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', detail_html)
        doi = doi_match.group(1).strip() if doi_match else None
        # Assign a fresh UUID so author records can reference this article
        article_id = str(uuid.uuid4())
        article_data = {
            "article_id": article_id,
            "title": title,
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp,
        }
        with data_lock:
            articleData.append(article_data)
        # Author info: each table cell holds name / affiliation lines / email line
        table = detail_soup.find('table')
        if table:
            for row in table.find_all('tr'):
                cells = [cell.text.strip() for cell in row.find_all('td')]
                for cell in cells:
                    cell = cell.split("\n")
                    cell = [element.replace('email: ', '') for element in cell]
                    cell = [c.strip() for c in cell]
                    if len(cell) < 2:
                        continue  # Not a name/affiliation/email cell
                    # Data processing
                    name = cell[0].split(" ")
                    affiliation = ', '.join(cell[1:-1])
                    email = cell[-1]
                    author_data = {
                        "author_id": str(uuid.uuid4()),
                        "from_article": article_id,
                        "first_name": name[0],
                        "last_name": name[-1],
                        "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
                        "affiliation": [{
                            "year": volume,
                            "affiliation": affiliation,
                            "email": email,
                        }]
                    }
                    with data_lock:
                        authorData.append(author_data)
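        # Illustrative author cell (hypothetical person, not real scraped data):
        #   "Jane Q. Public\nDepartment of Mathematics\nemail: jane@example.edu"
        #   -> first_name "Jane", middle_name ["Q."], last_name "Public",
        #      affiliation "Department of Mathematics", email "jane@example.edu"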
        # Save the data periodically once a full batch has accumulated; the
        # lock keeps concurrent workers from double-saving or losing records
        with data_lock:
            if len(articleData) >= batch_size:
                save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
                articleData.clear()
            if len(authorData) >= batch_size:
                save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
                authorData.clear()

index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')
# Find all the URL links under the first (Volumes) section
volume_links = soup.select('font > a[href]')
# Extract and store the URLs in a list using a list comprehension
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
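# The [1:] slice skips the first matched link (assumed not to be a volume
# index) and [::-1] reverses the list so the oldest volume is crawled first.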
authorData = []
articleData = []
batch_size = 500  # Number of articles to accumulate before saving a batch
data_lock = threading.Lock()  # Guards the shared buffers across worker threads
executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
# Process each URL using multithreading
futures = [executor.submit(process_article, url) for url in url_list]
# Wait for all tasks to complete, surfacing any per-volume failures
for future in as_completed(futures):
    try:
        future.result()
    except Exception as e:
        print("An error occurred:", str(e))
executor.shutdown()
# Save whatever is left in the buffers after the last partial batch
if articleData:
    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
if authorData:
    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")