更新 EJDE_spider/ejde_main.py
Adjust the output datetime format to YYYY-MM-DD
This commit is contained in:
parent
2fc3b85bab
commit
2d1f2c504d
@ -3,6 +3,7 @@ import requests
|
|||||||
import re
|
import re
|
||||||
import ejde_save
|
import ejde_save
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from retrying import retry
|
from retrying import retry
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@ -16,6 +17,12 @@ from bs4 import BeautifulSoup
|
|||||||
*3、ejde_save.delete()(可选) 删除暂存区内部所有文件(注意备份)
|
*3、ejde_save.delete()(可选) 删除暂存区内部所有文件(注意备份)
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
def datetime_transform(date):
    """Convert a date like "March 5, 2021" into ISO "2021-03-05" form.

    Args:
        date: Date text in "%B %d, %Y" form (full month name, day,
            four-digit year), or None when no date was extracted.

    Returns:
        The date formatted as "%Y-%m-%d", or None if ``date`` is None.

    Raises:
        ValueError: If ``date`` is a string that does not match the
            expected "%B %d, %Y" layout.
    """
    # Callers hand over None when their regex search finds no date;
    # strptime would raise TypeError on it, so pass the missing value through.
    if date is None:
        return None
    # NOTE: %B matches month names of the active locale; the scraped pages
    # presumably use English month names -- confirm if the locale is changed.
    input_date = datetime.strptime(date, "%B %d, %Y")
    return input_date.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
|
||||||
# Article and author detail
|
# Article and author detail
|
||||||
@retry(wait_fixed=5000, stop_max_attempt_number=5)
|
@retry(wait_fixed=5000, stop_max_attempt_number=5)
|
||||||
def process_article(url):
|
def process_article(url):
|
||||||
@ -56,10 +63,12 @@ def process_article(url):
|
|||||||
# Extract submission date
|
# Extract submission date
|
||||||
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
||||||
submitted_date = match.group(1) if match else None
|
submitted_date = match.group(1) if match else None
|
||||||
|
submitted_date = datetime_transform(submitted_date)
|
||||||
|
|
||||||
# Extract publication date
|
# Extract publication date
|
||||||
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
|
||||||
publish_date = match.group(1) if match else None
|
publish_date = match.group(1) if match else None
|
||||||
|
publish_date = datetime_transform(publish_date)
|
||||||
|
|
||||||
# Extract MSC
|
# Extract MSC
|
||||||
msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
|
msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html)
|
||||||
@ -161,7 +170,7 @@ url_list = ["https://ejde.math.txstate.edu/" + link['href'] for link in volume_l
|
|||||||
authorData = []
|
authorData = []
|
||||||
articleData = []
|
articleData = []
|
||||||
|
|
||||||
batch_size = 5 # Number of articles to process before saving
|
batch_size = 100 # Number of articles to process before saving
|
||||||
executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads
|
executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads
|
||||||
|
|
||||||
# Process each URL using multithreading
|
# Process each URL using multithreading
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user