ScholarDataMining/01_EJDE_spider/ejde_main.py

import re
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from retrying import retry

import ejde_save

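'''
    Target website: 'ejde.math.txstate.edu'

    Total number of papers: 2023/08/08 - 4300
    Total Time via VPN w/119ms-delay: 441.80s

    ========== Execution order ==========
    1. ejde_main                    Collect the journal links for each year -> scrape the article and author information of every paper -> call ejde_save -> stage the results in small JSON files
    2. ejde_save                    Read the staged small files from local disk, filter them, and store them in large per-year files
    *3. ejde_save.delete() (optional)  Delete every file in the staging area (back up first)
'''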


def datetime_transform(date):
    """Convert a date such as "March 5, 2020" to ISO form "2020-03-05".

    Full ("March") and abbreviated ("Mar") month names are accepted. If the
    string does not parse, its first word is looked up in a table of known
    month misspellings and the corrected string is tried as well.

    Only an exact match of the month word is corrected. Substring replacement
    would corrupt correctly spelled months when the date is invalid for some
    other reason (e.g. "June 31, 2020" must not become "Junee 31, 2020").

    :param date: date text in "<month> <day>, <year>" form.
    :return: "YYYY-MM-DD" on success; otherwise the input string unchanged
        (after printing a "TYPO:" diagnostic).
    """
    month_typo = {
        "Janaury": "January",
        "Febrary": "February",
        "Februay": "February",
        "Mar": "March",
        "Mach": "March",
        "Match": "March",
        "Maay": "May",
        "Jun": "June",
        "Juy": "July",
        "Aapril": "April",
        "Spetember": "September",
        "Septembere": "September",
        "Ocotber": "October",
    }
    formats = ("%B %d, %Y", "%b %d, %Y")

    # Try the text as given first, then with a corrected month word (if any).
    candidates = [date]
    parts = date.split(None, 1)
    if len(parts) == 2 and parts[0] in month_typo:
        candidates.append(month_typo[parts[0]] + " " + parts[1])

    last_err = None
    for candidate in candidates:
        for fmt in formats:
            try:
                return datetime.strptime(candidate, fmt).strftime("%Y-%m-%d")
            except ValueError as val_err:
                last_err = val_err

    print("TYPO:", str(last_err))
    return date


# Article and author detail
def _collect_article_tags(html):
    """Return the title tags (<em>, or <i> as a fallback) found in every <ol>.

    Returns None when some <ol> contains neither kind of tag, which signals an
    unrecognized page layout.
    """
    volume_soup = BeautifulSoup(html, "html.parser")
    found = []
    for ol in volume_soup.find_all('ol'):
        # Two page styles exist: titles wrapped in <em>, or in <i>.
        tags = ol.find_all('em') or ol.find_all('i')
        if not tags:
            return None
        found.extend(tags)
    return found


def process_volume(url):
    """Fetch one volume index page and process every article listed on it.

    The page is requested up to five times, one second apart. Any HTTP error
    status counts as a failed attempt. If all attempts fail, the URL is
    appended to the module-level ``failedVolData``. If the page layout is not
    recognized, the URL is appended to ``failedFormatData``. Otherwise each
    title tag is handed to ``process_html_article`` on a worker thread.

    :param url: absolute URL of the volume index page.
    :return: None.
    """
    retries = 5
    articles = None
    for attempt in range(retries):
        try:
            volume_response = requests.get(url, timeout=30)
            # Raise on 4xx/5xx so a bad status is retried and recorded instead
            # of being silently skipped.
            volume_response.raise_for_status()
            # Assigned in one step, so a failed attempt never leaves partial
            # results behind for the next one.
            articles = _collect_article_tags(volume_response.text)
            break
        except Exception as fetch_err:
            if attempt < retries - 1:
                print("RETRYING TO FETCH HTML:", str(fetch_err))
                time.sleep(1)
                continue
            print("HTML FETCHING FAILURE:", url)
            failedVolData.append({"website": url})
            return

    if articles is None:
        print("HTML FORMAT FAILURE:", url)
        failedFormatData.append({"website": url})
        return
    if not articles:
        return

    # Article links on the page are resolved against the page's directory.
    baseWeb = url[:url.rfind('/')] + "/"

    # Process each article using multithreading (>20 threads would cause more error).
    # The with-block shuts the pool down once every task has finished.
    with ThreadPoolExecutor(max_workers=15) as volume_executor:
        pending = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
        for finished in as_completed(pending):
            try:
                finished.result()
            except Exception as html_err:
                print("HTML PROCESSING ERROR:", str(html_err))


def process_html_article(baseweb, article):
    """Resolve one title tag to (title, URL) and crawl that article.

    :param baseweb: URL prefix that the article's href is appended to.
    :param article: parsed title tag; the next <a> after it supplies the href.
    :return: None. A tag that cannot be resolved is recorded in
        ``failedFormatData``; an article whose crawl raises is recorded in
        ``failedData``.
    """
    # Step 1: title text (whitespace collapsed) and article URL
    try:
        title = re.sub(r'\s+', ' ', article.text.strip()).strip()
        link = article.find_next("a")
        article_url = baseweb + link["href"]
    except Exception as html_format_err:
        print("HTML FORMAT FAILURE:", str(html_format_err))
        failedFormatData.append({"article": str(article)})
        return

    # Step 2: crawl the article page itself
    try:
        process_article(title, article_url)
    except Exception as article_err:
        print("ARTICLE PROCESSING FAILURE:", str(article_err))
        failedData.append({"title": title, "URL": article_url})


# process_article runs on many worker threads that share the module-level
# articleData / authorData buffers. This lock makes "save a batch, then remove
# it from the buffer" one step, so two threads never save or drop the same batch.
_flush_lock = threading.Lock()


def _build_author(lines, raw_affiliation, article_id, volume):
    """Build one author record from the stripped text lines of an author block.

    ``lines[0]`` is the full name and ``lines[-1]`` is searched for an email
    address; ``raw_affiliation`` is the already-joined text of the lines between.
    """
    name = lines[0].split(" ")
    middle_name = ''.join(name[1:-1]) if name[1:-1] else None
    # Collapse comma runs and whitespace, then drop any leading non-letters.
    affiliation = re.sub(r',+', ',', raw_affiliation)
    affiliation = re.sub(r'\s+', ' ', affiliation)
    affiliation = re.sub(r'^[^a-zA-Z]*', '', affiliation).strip()
    email_match = re.search(r'[\w.-]+@[\w.-]+', lines[-1])
    return {
        "author_id": str(uuid.uuid4()),
        "from_article": [article_id],
        "first_name": name[0],
        "last_name": name[-1],
        "middle_name": middle_name,
        "affiliation": [{
            "year": volume,
            "affiliation": affiliation,
            "email": email_match.group() if email_match else None
        }]
    }


def _table_author_blocks(table):
    """Yield (lines, raw_affiliation) for every <td> cell of the author table."""
    for row in table.find_all('tr'):
        for cell in row.find_all('td'):
            lines = [line.replace('email: ', '').strip() for line in cell.text.strip().split("\n")]
            yield lines, ', '.join(lines[1:-1])


def _rule_author_blocks(segments):
    """Yield (lines, raw_affiliation) for every <p>-separated chunk of the given HTML segments."""
    for segment in segments:
        for chunk in segment.split("<p>"):
            text = re.sub(r'<[^>]+>', '', chunk).strip()
            lines = [line.replace('email: ', '').strip() for line in text.split("\n")]
            yield lines, ''.join(lines[1:-1]).strip(",").strip()


def _flush_full_batches():
    """Save and remove buffered records once a buffer holds at least batch_size items."""
    global articleNum, authorNum
    with _flush_lock:
        if len(articleData) >= batch_size:
            batch = list(articleData)
            ejde_save.save_data(batch, "Article_TS", str(uuid.uuid4()) + ".json")
            # Remove only what was saved; records appended meanwhile stay queued.
            del articleData[:len(batch)]
            articleNum += len(batch)

        if len(authorData) >= batch_size:
            batch = list(authorData)
            ejde_save.save_data(batch, "Author_TS", str(uuid.uuid4()) + ".json")
            del authorData[:len(batch)]
            authorNum += len(batch)


@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
    """Download one article page and buffer its article and author records.

    Volume, issue, pages, submission/publication dates, MSC codes, keywords
    and DOI are extracted with regular expressions. Authors come from the
    first <table> on the page, or, when there is none, from the
    <p>-separated blocks between <hr> rules. Records are appended to the
    module-level ``articleData`` / ``authorData`` buffers, which are written
    out through ``ejde_save.save_data`` once they reach ``batch_size``.

    An HTTP error status or a timeout raises, so the ``@retry`` policy above
    (up to 5 attempts, 5 seconds apart) applies to it.

    :param title: article title as shown on the volume page.
    :param article_url: absolute URL of the article page.
    :return: None.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    article_response = requests.get(article_url, headers=headers, timeout=30)
    # Raise on 4xx/5xx so the failure is retried and reported, not silently dropped.
    article_response.raise_for_status()

    html = article_response.text
    article_soup = BeautifulSoup(html, 'html.parser')
    article_text = article_soup.get_text()

    # Extract volume
    volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
    volume = str(volume_match.group(1)) if volume_match else None

    # Extract pp
    pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
    pp = pp_match.group(1) if pp_match else None

    # Extract issue
    issue_match = re.search(r'No\. (\d+)', article_text)
    issue = issue_match.group(1) if issue_match else None

    # Extract submission date
    submitted_match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
    submitted_date = datetime_transform(submitted_match.group(1)) if submitted_match else None

    # Extract publication date
    published_match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
    publish_date = datetime_transform(published_match.group(1)) if published_match else None

    # Extract MSC
    msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
    if not msc_match:
        msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
    if msc_match:
        msc = msc_match.group(1).strip().strip('.').strip()
        msc = re.split(r', |;', msc)
    else:
        msc = []

    # Extract KeyWords
    keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
    if not keywords_match:
        keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
    if keywords_match:
        keywords = keywords_match.group(1).strip().replace('\n', '')
        keywords = re.split(r', |;', keywords)
        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
    else:
        keywords = []

    # Extract DOI
    doi_match = re.search(r'DOI: (.+)(?=<)', html)
    if not doi_match:
        doi_match = re.search(r'DOI: (.+)', html)
    doi = doi_match.group(1) if doi_match else None

    # Article_id
    article_id = str(uuid.uuid4())

    # Author info: prefer the author table, else the blocks between <hr> rules.
    table = article_soup.find('table')
    if table is not None:
        blocks = _table_author_blocks(table)
    else:
        # Work on the matched text itself (not the repr of the match list) so
        # names and emails keep their real characters.
        segments = re.findall(r'<hr>(.*?)<hr>', html, re.DOTALL)
        if not segments:
            # The article record is still saved below, with an empty author list.
            print("AUTHOR SEARCHING ERROR:", article_url)
        blocks = _rule_author_blocks(segments)

    # Collect locally first: if parsing raises midway, nothing half-finished
    # has been added to the shared buffers before the retry.
    authors = []
    author_records = []
    for lines, raw_affiliation in blocks:
        if not lines[0]:
            continue  # an empty cell/chunk is not an author
        authors.append(lines[0])
        author_records.append(_build_author(lines, raw_affiliation, article_id, volume))

    # Article info
    article_data = {
        "article_id": article_id,
        "title": title,
        "authors": authors,
        "corresponding_authors": None,
        "submit_datetime": submitted_date,
        "publish_datetime": publish_date,
        "keywords": keywords,
        "MSC": msc,
        "URL": article_url,
        "DOI": doi,
        "publisher": "Texas State University",
        "journal": "Electronic Journal of Differential Equations",
        "volume": volume,
        "issue": issue,
        "page": pp
    }
    authorData.extend(author_records)
    articleData.append(article_data)

    # Save the data periodically based on batch size
    _flush_full_batches()


# ---- Script body: crawl every volume, retry failures, save, then merge ----
start_time = time.time()
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index, timeout=30)
# Stop here if the index is unavailable, rather than running the save/merge/
# delete steps below on an empty crawl.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the URL links under the first Volume section
volume_links = soup.select('font > a[href]')
# Build absolute URLs, skipping the first link and reversing the page order
url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1]

# Initialize lists (shared with the worker functions above)
authorData = []
articleData = []
failedData = []
totallyFailedData = []
failedVolData = []
failedFormatData = []

# Initialize variables for counting
authorNum = 0
articleNum = 0

batch_size = 100  # Number of articles to process before saving

# Process each URL using multithreading; leaving the with-block waits for all
# workers and shuts the pool down.
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(process_volume, url) for url in url_list]

    for future in as_completed(futures):
        try:
            future.result()
        except Exception as vol_err:
            print("VOLUME PROCESSING ERROR:", str(vol_err))

# Retry failed processing paper
print("START RETRYING:", len(failedData))
for data in failedData:
    articleTitle = data["title"]
    articleUrl = data["URL"]
    try:
        process_article(articleTitle, articleUrl)
    except Exception as retry_err:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totally_fail = {
            "title": articleTitle,
            "URL": articleUrl
        }
        totallyFailedData.append(totally_fail)
failedData.clear()

# Save remaining data
if len(articleData) > 0:
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
# Reported even when the last batch was already flushed and nothing remains.
print("Total fetched paper:", len(articleData) + articleNum)

if len(authorData) > 0:
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
print("Total fetched author:", len(authorData) + authorNum)

# Save error record
if len(totallyFailedData) > 0:
    # failedData has been drained by the retry pass; the articles that still
    # failed are in totallyFailedData.
    ejde_save.save_data(totallyFailedData, "", "Failed_article_record.json")
    print("Total failed processing paper:", len(totallyFailedData))

if len(failedVolData) > 0:
    ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
    print("Total failed fetching volume:", len(failedVolData))

if len(failedFormatData) > 0:
    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
    print("Total failed searching article:", len(failedFormatData))

# Total running time
print("time elapsed: {:.2f}s".format(time.time() - start_time))

# Transfer to large file and delete the temporary storage files
ejde_save.Transf()
ejde_save.delete()