diff --git a/EJDE_spider/ejde_main.py b/EJDE_spider/ejde_main.py
index ec679e8..d66bbdf 100644
--- a/EJDE_spider/ejde_main.py
+++ b/EJDE_spider/ejde_main.py
@@ -3,6 +3,7 @@
 import requests
 import re
 import ejde_save
+from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from retrying import retry
 from bs4 import BeautifulSoup
@@ -16,6 +17,12 @@ from bs4 import BeautifulSoup
 *3、ejde_save.delete()(可选) 删除暂存区内部所有文件(注意备份)
 '''
 
+
+def datetime_transform(date):
+    input_date = datetime.strptime(date, "%B %d, %Y") if date else None  # tolerate missing dates: pages without a match pass None through
+    return input_date.strftime("%Y-%m-%d") if input_date else None  # normalize "Month D, YYYY" to ISO 8601
+
+
 # Article and author detail
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(url):
@@ -56,10 +63,12 @@ def process_article(url):
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     submitted_date = match.group(1) if match else None
+    submitted_date = datetime_transform(submitted_date)
 
     # Extract publication date
     match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     publish_date = match.group(1) if match else None
+    publish_date = datetime_transform(publish_date)
 
     # Extract MSC
     msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
@@ -161,7 +170,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
 
 authorData = []
 articleData = []
-batch_size = 5 # Number of articles to process before saving
+batch_size = 100 # Number of articles to process before saving
 executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads
 
 # Process each URL using multithreading