diff --git a/EJDE_spider/ejde_main.py b/EJDE_spider/ejde_main.py
index d66bbdf..a6dcc28 100644
--- a/EJDE_spider/ejde_main.py
+++ b/EJDE_spider/ejde_main.py
@@ -4,7 +4,7 @@
 import re
 import ejde_save
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 from retrying import retry
 from bs4 import BeautifulSoup
 
@@ -50,7 +50,6 @@ def process_article(url):
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
     volume = str(volume_match.group(1)) if volume_match else None
-    # year = volume_match.group(2) if volume_match else None
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
@@ -171,7 +170,7 @@
 authorData = []
 articleData = []
 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=20)  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=25)  # Set the number of worker threads
 
 # Process each URL using multithreading
 futures = [executor.submit(process_article, url) for url in url_list]
@@ -183,6 +182,8 @@ for future in as_completed(futures):
     except Exception as e:
         print("An error occurred:", str(e))
 
+wait(futures)
+
 # Save remaining data
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")