import re
import time
import uuid
import requests
import threading
import ejde_save
from retrying import retry
from datetime import datetime
from bs4 import BeautifulSoup
from unidecode import unidecode
from concurrent.futures import ThreadPoolExecutor, as_completed
'''
Target site: 'ejde.math.txstate.edu'
Total number of papers (as of 2023/08/08): 4785
Total time via VPN with a 100 ms delay: 96.30 s
========== Run order ==========
1. ejde_main: collect the volume links for each year -> crawl each paper's metadata and author information -> call ejde_save -> buffer the results as small JSON files
2. ejde_save: read the buffered small files locally, filter them, and write them into one large file per year
*3. ejde_save.delete_data() (optional): delete every file in the buffer directory (back up first)
'''
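# The local ejde_save module is assumed to provide roughly the following interface
# (a sketch inferred from how it is called in this file, not its actual definition):
#   ejde_save.save_data(data, sub_dir, file_name)  - dump `data` as JSON under ./ejde_buffer/<sub_dir>/<file_name>
#   ejde_save.transform_data()                     - merge the buffered small files into the per-year files
#   ejde_save.delete_data()                        - delete the temporary buffer files
# Thread-safe helpers for the shared article/author buffers: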
def append_data_thread_safe(from_list, to_list, data_lock):
with data_lock:
to_list.append(from_list)
def save_data_thread_safe(data, data_lock, data_type):
global articleNum, authorNum
with data_lock:
ejde_save.save_data(data, f"{data_type}_TS", str(uuid.uuid4()) + ".json")
if data_type == "Article":
articleNum += len(data)
else:
authorNum += len(data)
data.clear()
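# Normalize the hand-typed dates on the EJDE pages to ISO format, correcting common month-name
# typos first; e.g. "Janaury 5, 2020" becomes "2020-01-05", while strings that still cannot be
# parsed are returned unconverted.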
def datetime_transform(date):
month_typo = {
"Janaury": "January",
"Febrary": "February",
"Februay": "February",
"Mar": "March",
"Mach": "March",
"Match": "March",
"Maay": "May",
"Jun": "June",
"Juy": "July",
"Aapril": "April",
"Spetember": "September",
"Septembere": "September",
"Ocotber": "October",
"Nobember": "November",
}
try:
input_date = datetime.strptime(date, "%B %d, %Y")
return input_date.strftime("%Y-%m-%d")
except ValueError:
for typo, correction in month_typo.items():
date = date.replace(typo, correction)
try:
input_date = datetime.strptime(date, "%B %d, %Y")
return input_date.strftime("%Y-%m-%d")
except ValueError as val_err:
print("TYPO:", str(val_err))
return date
# Article and author detail
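# Fetch one volume's table-of-contents page, collect the <em>/<i> title elements,
# and crawl each article with a small thread pool.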
def process_volume(url):
articles = []
baseWeb = None
retries = 5
for attempt in range(retries):
try:
volume_response = requests.get(url)
                # Raise for 4xx/5xx responses so this attempt is retried by the loop below
                volume_response.raise_for_status()
                if volume_response.status_code == 200:
baseWeb = url[:url.rfind('/')] + "/"
html = volume_response.text
volume_soup = BeautifulSoup(html, "html.parser")
li_elements = volume_soup.find_all('ol')
if not li_elements:
li_elements = volume_soup.find_all('ul')
for li in li_elements:
em_elements = li.find_all('em')
if em_elements:
articles.extend(em for em in em_elements)
# Another html style
else:
i_elements = li.find_all('i')
if i_elements:
articles.extend(i for i in i_elements)
else:
print("HTML FORMAT FAILURE:", url)
fail = {
"website": url
}
failedFormatData.append(fail)
return
break
except Exception as fetch_err:
if attempt < retries - 1:
print("RETRYING TO FETCH HTML:", str(fetch_err))
time.sleep(1)
continue
else:
print("HTML FETCHING FAILURE:", url)
fail = {
"website": url
}
failedVolData.append(fail)
return
    # Process each article using multithreading (more than 20 workers tends to cause extra request errors)
volume_executor = ThreadPoolExecutor(max_workers=15)
volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
# Wait for all tasks to complete
    for volume_future in as_completed(volume_futures):
        try:
            volume_future.result()
except Exception as html_err:
print("HTML PROCESSING ERROR:", str(html_err))
def process_html_article(baseweb, article):
# Get article title & url
try:
title = article.text.strip()
title = str(re.sub(r'\s+', ' ', title).strip())
article_url = baseweb + article.find_next("a")["href"]
if "../../index.html" in article_url:
print("Redundant URL:", article_url)
return
except Exception as html_format_err:
print("HTML FORMAT FAILURE:", str(html_format_err))
fail = {
"article": str(article)
}
failedFormatData.append(fail)
return
# Crawl article data
try:
process_article(title, article_url)
except Exception as article_err:
print("ARTICLE PROCESSING FAILURE:", str(article_err))
fail = {
"title": title,
"URL": article_url
}
failedData.append(fail)
return
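# Crawl a single article page and extract its metadata and author records;
# the @retry decorator below allows up to 5 attempts, 5 seconds apart.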
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
article_response = requests.get(article_url, headers=headers)
    # Raise for 4xx/5xx responses so the @retry decorator retries the request
    article_response.raise_for_status()
    if article_response.status_code == 200:
html = article_response.text
article_soup = BeautifulSoup(html, 'html.parser')
article_text = article_soup.get_text()
# Extract title if title == None
if not title:
            title_match = re.search(r"<h3>(.*?)<p>", html)
title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""
# Extract issue
issue_match = re.search(r'No\. (\d+)', article_text)
issue = issue_match.group(1) if issue_match else ""
# Extract volume
volume_match = re.search(r'Vol\. (\d+)', article_text)
volume = str(volume_match.group(1)) if volume_match else None
if not volume:
volume_match = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
if volume_match:
issue_number, volume = volume_match.groups()
volume = str(volume)
issue = "Special Issue " + str(issue_number)
else:
volume_match = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
if volume_match:
issue = "Conference " + str(volume_match.group(1))
volume = str(volume_match.group(2))
else:
volume_match = re.search(r'Conference (\d+) \((\d+)\)', article_text)
if volume_match:
issue_number, volume = volume_match.groups()
volume = str(volume)
issue = "Conference " + str(issue_number)
else:
volume = ""
# Extract pp
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
pp = pp_match.group(1) if pp_match else ""
# Extract submission date
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
if not match:
match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
submitted_date = match.group(1) if match else ""
if submitted_date:
submitted_date = datetime_transform(submitted_date)
# Extract publication date
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
publish_date = match.group(1) if match else ""
if publish_date:
publish_date = datetime_transform(publish_date)
# Extract MSC
msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
if not msc_match:
msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
if msc_match:
msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
msc = msc.strip('.').strip()
msc = re.split(r', |;', msc)
else:
msc = []
# Extract KeyWords
keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
if not keywords_match:
keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
if keywords_match:
keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
keywords = re.split(r', |;', keywords)
keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()) for keyword in
keywords if len(keyword.strip())]
else:
keywords = []
# Extract DOI
doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
if not doi_match:
doi_match = re.search(r'DOI: (.+)', html)
doi = doi_match.group(1) if doi_match else ""
# Article_id
article_id = str(uuid.uuid4())
# Author info
authors = []
author_names = []
table = article_soup.find('table')
if table:
for row in table.find_all('tr'):
cells = [cell.text.strip() for cell in row.find_all('td')]
for cell in cells:
if "email" in cell:
cell = cell.split("email")
email_list = str(cell[1]).split(',')
cell = cell[0]
elif "e-mail" in cell:
cell = cell.split("e-mail")
email_list = str(cell[1]).split(',')
cell = cell[0]
else:
email_list = None
cell = re.split(r'[\r\n]+', cell)
cell = [c.replace('\\newline', '') for c in cell]
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
# Data processing
if cell[0]:
author_id = str(uuid.uuid4())
authors.append(author_id)
author_names.append(unidecode(cell[0]))
name = re.split(r'\s+', cell[0])
name = [item for item in name if item != '']
affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
affiliation = affiliation.lstrip(",").rstrip(",").strip()
emails = []
if email_list:
for email in email_list:
email_match = re.search(r'[\w.-]+@[\w.-]+', email)
emails.append(unidecode(email_match.group())) if email_match else None
author_data = {
"author_id": author_id,
"from_article": article_id,
"first_name": unidecode(name[0]),
"last_name": unidecode(name[-1]),
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
"raw_name": unidecode(cell[0]),
"affiliation": [
{
"year": volume,
"affiliation": unidecode(affiliation),
"email": ", ".join(emails)
}
]
}
append_data_thread_safe(author_data, authorData, authorDataLock)
# If no author table
else:
match_type = 0
hr_count = len(article_soup.find_all('hr'))
if hr_count < 3:
pattern = r'<hr>(.*?)<hr>'
else:
pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
matches = str(re.findall(pattern, html, re.DOTALL))
if len(matches) < 5:
match_type = 1
last_p_tag = str(article_soup.find_all('p')[-1])
pattern = r'<p>(.*?)<hr/>'
matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()
if matches:
matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
matches = matches.split("<p>")
for match in matches:
if "email" in match:
match = match.split("email")
email_list = str(match[1]).split(',')
match = match[0]
elif "e-mail" in match:
match = match.split("e-mail")
email_list = str(match[1]).split(',')
match = match[0]
else:
email_list = None
match = re.sub(r'<[^>]+>', '', match)
match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
if match_type == 0:
match = match.split("\\n")
else:
match = match.split("\n")
match = [m.replace('\\newline', '') for m in match]
match = [re.sub(r'\s+', ' ', m).strip() for m in match]
# Data processing
if match[0]:
author_id = str(uuid.uuid4())
authors.append(author_id)
                        author_names.append(unidecode(match[0]))
name = re.split(r'\s+', match[0])
name = [item for item in name if item != '']
affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
affiliation = affiliation.lstrip(",").rstrip(",").strip()
emails = []
if email_list:
for email in email_list:
email_match = re.search(r'[\w.-]+@[\w.-]+', email)
emails.append(unidecode(email_match.group())) if email_match else None
author_data = {
"author_id": author_id,
"from_article": article_id,
"first_name": unidecode(name[0]),
"last_name": unidecode(name[-1]),
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
"raw_name": unidecode(match[0]),
"affiliation": [
{
"year": volume,
"affiliation": unidecode(affiliation),
"email": ", ".join(emails)
}
]
}
append_data_thread_safe(author_data, authorData, authorDataLock)
else:
print("AUTHOR SEARCHING ERROR:", article_url)
fail = {
"title": title,
"URL": article_url
}
failedFormatData.append(fail)
# Article info
article_data = {
"article_id": article_id,
"title": unidecode(title),
"authors": authors,
"author_names": author_names,
"submit_datetime": submitted_date,
"publish_datetime": publish_date,
"keywords": keywords,
"MSC": msc,
"URL": article_url,
"DOI": doi,
"publisher": "Texas State University",
"journal": "Electronic Journal of Differential Equations",
"volume": volume,
"issue": issue,
"page": pp
}
append_data_thread_safe(article_data, articleData, articleDataLock)
# Save the data periodically based on batch size
if len(articleData) % batch_size == 0:
save_data_thread_safe(articleData, articleDataLock, "Article")
if len(authorData) % batch_size == 0:
save_data_thread_safe(authorData, authorDataLock, "Author")
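# ========== Main crawl ==========
# Collect the URLs of every regular volume, special issue, and conference issue,
# crawl them in parallel, retry failed articles once, then flush the buffers and report statistics.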
start_time = time.time()
url_list = []
# Get the URLs of all regular volumes
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
volume_links = soup.select('font > a[href]')
url_list.extend(["https://ejde.math.txstate.edu/" + link['href'] for link in volume_links[1:]][::-1])
# Get the URLs of all special issues
index = "https://ejde.math.txstate.edu/special-toc.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
special_links = soup.find_all("a", href=True)
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])
# Get the URLs of all conference special issues
index = "https://ejde.math.txstate.edu/conf-toc.html#latest"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
special_links = soup.find_all("a", href=True)
url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-2]])
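# Shared state: in-memory record buffers, failure lists for the retry pass and error reports,
# counters for the final statistics, and the locks guarding the buffers.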
authorData = []
articleData = []
failedData = []
totallyFailedData = []
failedVolData = []
failedFormatData = []
authorNum = 0
articleNum = 0
batch_size = 100 # Number of articles to process before saving
authorDataLock = threading.Lock()
articleDataLock = threading.Lock()
executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2)) # Set the number of worker threads
# Process each URL using multithreading
futures = [executor.submit(process_volume, url) for url in url_list]
# Wait for all tasks to complete
for future in as_completed(futures):
try:
future.result()
except Exception as vol_err:
print("VOLUME PROCESSING ERROR:", str(vol_err))
# Retry papers whose processing failed
if len(failedData):
print("START RETRYING:", len(failedData))
while failedData:
fail_data = failedData.pop(0)
articleTitle = fail_data["title"]
articleUrl = fail_data["URL"]
try:
process_article(articleTitle, articleUrl)
except Exception as retry_err:
print("ARTICLE RETRYING FAILURE:", str(retry_err))
totally_fail = {
"title": articleTitle,
"URL": articleUrl
}
totallyFailedData.append(totally_fail)
# Save remaining data
if len(articleData) > 0:
save_data_thread_safe(articleData, articleDataLock, "Article")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
if len(authorData) > 0:
save_data_thread_safe(authorData, authorDataLock, "Author")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
# Save error record
if len(totallyFailedData) > 0:
ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
print("Total failed processing paper:", len(totallyFailedData))
if len(failedVolData) > 0:
ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json")
print("Total failed fetching volume:", len(failedVolData))
if len(failedFormatData) > 0:
ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
print("Total failed searching article:", len(failedFormatData))
# Statistics
print("Total fetched paper:", articleNum)
print("Total fetched author:", authorNum)
print("time elapsed: {:.2f}s".format(time.time() - start_time))
# Merge the buffered data into the large per-year files and delete the temporary storage files
ejde_save.transform_data()
ejde_save.delete_data()