Update EJDE_spider/ejde_main.py

adjust output datetime format
ldy 2023-08-02 11:21:37 +08:00
parent 2fc3b85bab
commit 2d1f2c504d


@@ -3,6 +3,7 @@ import requests
 import re
 import ejde_save
+from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from retrying import retry
 from bs4 import BeautifulSoup
@@ -16,6 +17,12 @@ from bs4 import BeautifulSoup
 *3 ejde_save.delete() (optional): deletes every file in the staging area; back them up first
 '''
+
+
+def datetime_transform(date):
+    input_date = datetime.strptime(date, "%B %d, %Y")
+    return input_date.strftime("%Y-%m-%d")
+
 # Article and author detail
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(url):
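
For reference, a standalone check of what the new helper produces (a hypothetical usage example, not part of the commit): the regexes below capture dates in "Month D, YYYY" form, which %B %d, %Y parses, and the spider now stores them as ISO dates.

    from datetime import datetime

    def datetime_transform(date):
        input_date = datetime.strptime(date, "%B %d, %Y")
        return input_date.strftime("%Y-%m-%d")

    print(datetime_transform("August 2, 2023"))  # -> 2023-08-02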
@@ -56,10 +63,12 @@ def process_article(url):
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     submitted_date = match.group(1) if match else None
+    submitted_date = datetime_transform(submitted_date)
 
     # Extract publication date
     match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
     publish_date = match.group(1) if match else None
+    publish_date = datetime_transform(publish_date)
 
     # Extract MSC
     msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
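
One caveat with the two calls added above: when a page has no "Submitted"/"Published" line, the regex leaves the date as None, and datetime.strptime(None, ...) raises a TypeError inside datetime_transform. A guarded variant would look like this (a sketch of a possible hardening, not what this commit does):

    def datetime_transform(date):
        # Pass None through so a missing date line skips conversion
        # instead of raising (sketch only).
        if date is None:
            return None
        return datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")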
@@ -161,7 +170,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
 authorData = []
 articleData = []
-batch_size = 5  # Number of articles to process before saving
+batch_size = 100  # Number of articles to process before saving
 executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
 
 # Process each URL using multithreading
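
For context, a minimal sketch of the batched multithreaded loop these settings drive; process_article's return shape and ejde_save.save's signature are assumptions here, since the diff does not show them:

    # Submit every article URL to the pool, then flush results in batches.
    futures = [executor.submit(process_article, url) for url in url_list]
    for i, future in enumerate(as_completed(futures), start=1):
        article, authors = future.result()           # assumed return shape
        articleData.append(article)
        authorData.extend(authors)
        if i % batch_size == 0:                      # every 100 articles
            ejde_save.save(articleData, authorData)  # assumed save signature
            articleData.clear()
            authorData.clear()

Raising batch_size from 5 to 100 only changes how often results are written out, trading a larger in-memory buffer for fewer save calls.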