import uuid
import requests
import re
import threading
import ejde_save
from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

'''
Target site: 'ejde.math.txstate.edu'
========== Run order ==========
1. ejde_main: collect the journal links for each year -> scrape each article's
   metadata and author info -> call ejde_save -> buffer the results as small JSON files
2. ejde_save: review the buffered small files locally, filter them, and merge them
   into one large file per year
*3. ejde_save.delete() (optional): delete every file in the buffer (back it up first)
'''

def datetime_transform(date):
    input_date = datetime.strptime(date, "%B %d, %Y")
    return input_date.strftime("%Y-%m-%d")
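
# Example: datetime_transform("July 5, 2023") -> "2023-07-05"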

# Article and author detail
def process_article(url):
    response = requests.get(url)
    response.raise_for_status()
    baseWeb = url[:url.rfind('/')] + "/"
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("li")
    for article in articles:
        strong = article.find("strong")
        em = article.find("em")
        link = article.find("a")
        # Skip <li> items that are not article entries (guard against AttributeError)
        if strong is None or em is None or link is None:
            continue
        authors = strong.text.strip().split(", ")
        title = em.text.strip()
        article_url = baseWeb + link["href"]
        # Access article detail page
        response = requests.get(article_url)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        article_text = soup.get_text()
        # Extract volume
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = volume_match.group(1) if volume_match else None
        # Extract pp
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
        pp = pp_match.group(1) if pp_match else None
        # Extract issue
        issue_match = re.search(r'No\. (\d+)', article_text)
        issue = issue_match.group(1) if issue_match else None
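        # The three patterns above target a citation line of the (assumed) form:
        #   "Electron. J. Differential Equations, Vol. 2023 (2023), No. 45, pp. 1-18."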
        # Extract submission date (check the match before calling .group(),
        # so a page without this line does not raise AttributeError)
        match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        submitted_date = datetime_transform(match.group(1)) if match else None
        # Extract publication date
        match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
        publish_date = datetime_transform(match.group(1)) if match else None
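        # e.g. an (assumed) footer line "Submitted March 1, 2023. Published July 5, 2023."
        # yields submitted_date "2023-03-01" and publish_date "2023-07-05"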
        # Extract MSC
        msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
            msc = msc_match.group(1).strip().strip('.')
            msc = re.split(r', |;', msc)
        else:
            msc = None
        # Extract KeyWords
        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if not keywords_match:
            keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
        if keywords_match:
            keywords = keywords_match.group(1).strip().replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [keyword.strip().strip('.') for keyword in keywords]
        else:
            keywords = None
        # Extract DOI
        doi_match = re.search(r'DOI: (.+)(?=<)', html)
        if not doi_match:
            doi_match = re.search(r'DOI: (.+)', html)
        doi = doi_match.group(1) if doi_match else None
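        # e.g. a (hypothetical) line "DOI: 10.58997/ejde.2023.45" yields "10.58997/ejde.2023.45"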
        # Article_id
        article_id = str(uuid.uuid4())
        article_data = {
            "article_id": article_id,
            "title": title,
            "authors": authors,
            "corresponding_authors": None,
            "submit_datetime": submitted_date,
            "publish_datetime": publish_date,
            "keywords": keywords,
            "MSC": msc,
            "URL": article_url,
            "DOI": doi,
            "publisher": "Texas State University",
            "journal": "Electronic Journal of Differential Equations",
            "volume": volume,
            "issue": issue,
            "page": pp,
        }
        with data_lock:  # the buffers are shared across worker threads
            articleData.append(article_data)
        # Author info
        table = soup.find('table')
        if table is None:
            continue  # no author table on this detail page
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
                cell = cell.split("\n")
                cell = [element.replace('email: ', '') for element in cell]
                cell = [c.strip() for c in cell]
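                # Each <td> is assumed to hold one author block, e.g.:
                #   "Jane Q. Public\nDepartment of Mathematics\nSome University\nemail: jane@example.edu"
                # i.e. name first, affiliation lines in the middle, email last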
                # Data processing
                name = cell[0].split(" ")
                affiliation = ', '.join(cell[1:-1])
                email = cell[-1]
                author_data = {
                    "author_id": str(uuid.uuid4()),
                    "from_article": article_id,
                    "firstname": name[0],
                    "lastname": name[-1],
                    "middlename": name[1:len(name) - 1] if len(name) > 2 else None,
                    "affiliation": [{
                        "year": volume,
                        "affiliation": affiliation,
                        "email": email,
                    }]
                }
                with data_lock:
                    authorData.append(author_data)
        # Save the data periodically based on batch size; hold the lock so two
        # threads cannot flush (or append to) the same buffer at once, and use
        # >= rather than a modulo test, which concurrent appends can skip past
        with data_lock:
            if len(articleData) >= batch_size:
                ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
                articleData.clear()
            if len(authorData) >= batch_size:
                ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
                authorData.clear()

index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
# Find all the URL links under the first (Volumes) section
volume_links = soup.select('font > a[href]')
# Extract and store the URLs in a list using list comprehension
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]
authorData = []
articleData = []
data_lock = threading.Lock()  # guards the shared buffers above
batch_size = 100  # Number of articles to process before saving
# Process each URL using multithreading; the context manager joins the pool on exit
with ThreadPoolExecutor(max_workers=25) as executor:  # Set the number of worker threads
    futures = [executor.submit(process_article, url) for url in url_list]
    # Wait for all tasks to complete
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print("An error occurred:", str(e))
# Save remaining data
if articleData:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
if authorData:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
# Transfer to large file and delete the temporary storage files
ejde_save.Transf()
ejde_save.delete()
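
# Typical run (assumed layout, per the messages above): `python ejde_main.py`
# buffers article/author JSON under ./ejde_buffer/, then ejde_save.Transf()
# merges the buffered files into per-year files and ejde_save.delete() clears the buffer.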