更新 EJDE_spider/ejde_main.py
adjust output datetime format
This commit is contained in:
parent
2fc3b85bab
commit
2d1f2c504d
@ -3,6 +3,7 @@ import requests
|
||||
import re
|
||||
import ejde_save
|
||||
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from retrying import retry
|
||||
from bs4 import BeautifulSoup
|
||||
@ -16,6 +17,12 @@ from bs4 import BeautifulSoup
|
||||
*3、ejde_save.delete()(可选) 删除暂存区内部所有文件(注意备份)
|
||||
'''
|
||||
|
||||
|
||||
def datetime_transform(date):
    """Convert an English "Month D, YYYY" date string to ISO "YYYY-MM-DD".

    Args:
        date: A date string such as "January 5, 2020" (an abbreviated month
            name such as "Jan 5, 2020" is accepted as well), or None / an
            empty string when no date was found by the caller.

    Returns:
        The date formatted as "YYYY-MM-DD", or None when ``date`` is None or
        empty.

    Raises:
        ValueError: If ``date`` is a non-empty string that matches none of
            the supported formats.
    """
    # Callers pass None when their regex search finds no date; propagate the
    # missing value instead of letting strptime raise TypeError.
    if not date:
        return None

    # Month names are matched by callers with [A-Za-z]+, so both the full and
    # the abbreviated spelling can reach this function.
    for fmt in ("%B %d, %Y", "%b %d, %Y"):
        try:
            return datetime.strptime(date, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    raise ValueError("unrecognized date format: {!r}".format(date))
|
||||
|
||||
|
||||
# Article and author detail
|
||||
@retry(wait_fixed=5000, stop_max_attempt_number=5)
|
||||
def process_article(url):
|
||||
@ -56,10 +63,12 @@ def process_article(url):
|
||||
# Extract submission date
|
||||
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
||||
submitted_date = match.group(1) if match else None
|
||||
submitted_date = datetime_transform(submitted_date)
|
||||
|
||||
# Extract publication date
|
||||
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
||||
publish_date = match.group(1) if match else None
|
||||
publish_date = datetime_transform(publish_date)
|
||||
|
||||
# Extract MSC
|
||||
msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
|
||||
@ -161,7 +170,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
|
||||
authorData = []
|
||||
articleData = []
|
||||
|
||||
batch_size = 5 # Number of articles to process before saving
|
||||
batch_size = 100 # Number of articles to process before saving
|
||||
executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads
|
||||
|
||||
# Process each URL using multithreading
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user