# ScholarDataMining/EJDE_spider/ejde_scrawler.py

import os
import re
import json
import uuid
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup
from retrying import retry

def save_data(dataset, filetype, filename):
    """Dump a non-empty batch of records as JSON under ./ejde_buffer/<filetype>/."""
    if dataset:
        directory = "./ejde_buffer/" + filetype + "/"
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, filename)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(dataset, json_file, indent=4)
        print(filetype + " data has been saved to", filepath)
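
# Illustrative call (hypothetical batch, not data from the crawl itself):
#   save_data([{"title": "An example"}], "Article", "batch-0001.json")
# would write ./ejde_buffer/Article/batch-0001.json as pretty-printed JSON.
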
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(url):
    """Scrape one EJDE volume index page and every article it links to."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    baseWeb = url[:url.rfind('/')] + "/"
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("li")
    for article in articles:
        # Skip list items that lack the expected author/title/link markup
        if not (article.find("strong") and article.find("em") and article.find("a")):
            continue
        authors = article.find("strong").text.strip().split(", ")
        title = article.find("em").text.strip()
        article_url = baseWeb + article.find("a")["href"]
        # Access the article detail page; separate names keep the volume
        # page's soup from being clobbered inside the loop
        detail_response = requests.get(article_url, timeout=30)
        detail_response.raise_for_status()
        detail_html = detail_response.text
        detail_soup = BeautifulSoup(detail_html, 'html.parser')
        article_text = detail_soup.get_text()
        # Extract volume (and, if ever needed, year) from "Vol. N (YYYY)"
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = volume_match.group(1) if volume_match else None
        # year = volume_match.group(2) if volume_match else None
        # Extract page range
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None
        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None
        # Extract submission date
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", detail_html)
        submitted_date = match.group(1) if match else None
        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", detail_html)
        publish_date = match.group(1) if match else None
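        # The date regexes above assume headers of the form
        # "Submitted April 1, 2023." / "Published June 15, 2023."
        # (month name, day, four-digit year); anything else yields None.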
        # Extract MSC; the label may end with a newline or a period
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', detail_html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', detail_html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.')
            msc = re.split(r', |;', msc)
        else:
            msc = None
        # Extract key words
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', detail_html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)<br>', detail_html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [keyword.strip().strip('.') for keyword in keywords]
        else:
            keywords = None
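        # Illustrative parses (hypothetical page text, not real scraped data):
        #   "Math Subject Classifications: 35J20, 35J60." -> ["35J20", "35J60"]
        #   "Key Words: Elliptic equation; variational method." ->
        #   ["Elliptic equation", "variational method"]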
        # Extract DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', detail_html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', detail_html)
        doi = doi_match.group(1).strip() if doi_match else None
        # Assign a fresh UUID so author records can reference this article
        article_id = str(uuid.uuid4())
        article_data = {
            "article_id": article_id,
            "title": title,
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp,
        }
        with data_lock:
            articleData.append(article_data)
        # Author info: each table cell holds name / affiliation lines / email line
        table = detail_soup.find('table')
        if table:
            for row in table.find_all('tr'):
                cells = [cell.text.strip() for cell in row.find_all('td')]
                for cell in cells:
                    cell = cell.split("\n")
                    cell = [element.replace('email: ', '') for element in cell]
                    cell = [c.strip() for c in cell]
                    if len(cell) < 2:
                        continue  # Not a name/affiliation/email cell
                    # Data processing
                    name = cell[0].split(" ")
                    affiliation = ', '.join(cell[1:-1])
                    email = cell[-1]
                    author_data = {
                        "author_id": str(uuid.uuid4()),
                        "from_article": article_id,
                        "first_name": name[0],
                        "last_name": name[-1],
                        "middle_name": name[1:len(name) - 1] if len(name) > 2 else None,
                        "affiliation": [{
                            "year": volume,
                            "affiliation": affiliation,
                            "email": email,
                        }]
                    }
                    with data_lock:
                        authorData.append(author_data)
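        # Illustrative author cell (hypothetical person, not real scraped data):
        #   "Jane Q. Public\nDepartment of Mathematics\nemail: jane@example.edu"
        #   -> first_name "Jane", middle_name ["Q."], last_name "Public",
        #      affiliation "Department of Mathematics", email "jane@example.edu"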
        # Save the data periodically once a full batch has accumulated; the
        # lock keeps concurrent workers from double-saving or losing records
        with data_lock:
            if len(articleData) >= batch_size:
                save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
                articleData.clear()
            if len(authorData) >= batch_size:
                save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
                authorData.clear()

index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')
# Find all the URL links under the first (Volumes) section
volume_links = soup.select('font > a[href]')
# Extract and store the URLs in a list using a list comprehension
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
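# The [1:] slice skips the first matched link (assumed not to be a volume
# index) and [::-1] reverses the list so the oldest volume is crawled first.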
authorData = []
articleData = []
batch_size = 500  # Number of articles to accumulate before saving a batch
data_lock = threading.Lock()  # Guards the shared buffers across worker threads
executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
# Process each URL using multithreading
futures = [executor.submit(process_article, url) for url in url_list]
# Wait for all tasks to complete, surfacing any per-volume failures
for future in as_completed(futures):
    try:
        future.result()
    except Exception as e:
        print("An error occurred:", str(e))
executor.shutdown()
# Save whatever is left in the buffers after the last partial batch
if articleData:
    save_data(articleData, "Article", str(uuid.uuid4()) + ".json")
if authorData:
    save_data(authorData, "Author", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article/")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author/")