diff --git a/EJDE_spider/ejde_main.py b/EJDE_spider/ejde_main.py index a6dcc28..d7d6450 100644 --- a/EJDE_spider/ejde_main.py +++ b/EJDE_spider/ejde_main.py @@ -4,9 +4,8 @@ import re import ejde_save from datetime import datetime -from concurrent.futures import ThreadPoolExecutor, as_completed, wait -from retrying import retry from bs4 import BeautifulSoup +from concurrent.futures import ThreadPoolExecutor, as_completed ''' 爬取网站:'ejde.math.txstate.edu' @@ -24,7 +23,6 @@ def datetime_transform(date): # Article and author detail -@retry(wait_fixed=5000, stop_max_attempt_number=5) def process_article(url): response = requests.get(url) response.raise_for_status() @@ -61,13 +59,19 @@ def process_article(url): # Extract submission date match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html) - submitted_date = match.group(1) if match else None - submitted_date = datetime_transform(submitted_date) + submitted_date = match.group(1) + if match: + submitted_date = datetime_transform(submitted_date) + else: + submitted_date = None # Extract publication date match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html) - publish_date = match.group(1) if match else None - publish_date = datetime_transform(publish_date) + publish_date = match.group(1) + if match: + publish_date = datetime_transform(publish_date) + else: + publish_date = None # Extract MSC msc_match = re.search(r'Math Subject Classifications: (.*?)\n', html) @@ -182,8 +186,6 @@ for future in as_completed(futures): except Exception as e: print("An error occurred:", str(e)) -wait(futures) - # Save remaining data if len(articleData) > 0: ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json") diff --git a/EJDE_spider/ejde_save.py b/EJDE_spider/ejde_save.py index d00a89c..0091c1e 100644 --- a/EJDE_spider/ejde_save.py +++ b/EJDE_spider/ejde_save.py @@ -93,5 +93,6 @@ def delete(): file_path = os.path.join(folder_path, file_name) if os.path.isfile(file_path): os.remove(file_path) + os.rmdir(folder_path) print('\nAttention: The temporary storage files have been deleted!')