Update EJDE_spider/ejde_main.py

adjust output datetime format
ldy 2023-08-02 11:21:37 +08:00
parent 2fc3b85bab
commit 2d1f2c504d


@@ -3,6 +3,7 @@ import requests
 import re
 import ejde_save
+from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from retrying import retry
 from bs4 import BeautifulSoup
@@ -16,6 +17,12 @@ from bs4 import BeautifulSoup
 *3 ejde_save.delete() (optional): deletes every file in the staging area; back them up first
 '''
+
+
+def datetime_transform(date):
+    input_date = datetime.strptime(date, "%B %d, %Y")
+    return input_date.strftime("%Y-%m-%d")
+
 # Article and author detail
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(url):
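
For reference, a standalone check of what the new helper produces (a hypothetical usage example, not part of the commit): the regexes below capture dates in "Month D, YYYY" form, which %B %d, %Y parses, and the spider now stores them as ISO dates.

    from datetime import datetime

    def datetime_transform(date):
        input_date = datetime.strptime(date, "%B %d, %Y")
        return input_date.strftime("%Y-%m-%d")

    print(datetime_transform("August 2, 2023"))  # -> 2023-08-02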
@@ -56,10 +63,12 @@ def process_article(url):
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     submitted_date = match.group(1) if match else None
+    submitted_date = datetime_transform(submitted_date)
 
     # Extract publication date
     match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     publish_date = match.group(1) if match else None
+    publish_date = datetime_transform(publish_date)
 
     # Extract MSC
     msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
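
One caveat with the two calls added above: when a page has no "Submitted"/"Published" line, the regex leaves the date as None, and datetime.strptime(None, ...) raises a TypeError inside datetime_transform. A guarded variant would look like this (a sketch of a possible hardening, not what this commit does):

    def datetime_transform(date):
        # Pass None through so a missing date line skips conversion
        # instead of raising (sketch only).
        if date is None:
            return None
        return datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")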
@@ -161,7 +170,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
 authorData = []
 articleData = []
-batch_size = 5  # Number of articles to process before saving
+batch_size = 100  # Number of articles to process before saving
 executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
 
 # Process each URL using multithreading
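
For context, a minimal sketch of the batched multithreaded loop these settings drive; process_article's return shape and ejde_save.save's signature are assumptions here, since the diff does not show them:

    # Submit every article URL to the pool, then flush results in batches.
    futures = [executor.submit(process_article, url) for url in url_list]
    for i, future in enumerate(as_completed(futures), start=1):
        article, authors = future.result()           # assumed return shape
        articleData.append(article)
        authorData.extend(authors)
        if i % batch_size == 0:                      # every 100 articles
            ejde_save.save(articleData, authorData)  # assumed save signature
            articleData.clear()
            authorData.clear()

Raising batch_size from 5 to 100 only changes how often results are written out, trading a larger in-memory buffer for fewer save calls.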