Fix the saving problem: wait for all futures to finish before saving the remaining data
This commit is contained in:
parent
2d1f2c504d
commit
e49e829682
@ -4,7 +4,7 @@ import re
|
||||
import ejde_save
|
||||
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
|
||||
from retrying import retry
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@ -50,7 +50,6 @@ def process_article(url):
|
||||
# Extract volume
|
||||
volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
|
||||
volume = str(volume_match.group(1)) if volume_match else None
|
||||
# year = volume_match.group(2) if volume_match else None
|
||||
|
||||
# Extract pp
|
||||
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
|
||||
@ -171,7 +170,7 @@ authorData = []
|
||||
articleData = []
|
||||
|
||||
batch_size = 100 # Number of articles to process before saving
|
||||
executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads
|
||||
executor = ThreadPoolExecutor(max_workers=25) # Set the number of worker threads
|
||||
|
||||
# Process each URL using multithreading
|
||||
futures = [executor.submit(process_article, url) for url in url_list]
|
||||
@ -183,6 +182,8 @@ for future in as_completed(futures):
|
||||
except Exception as e:
|
||||
print("An error occurred:", str(e))
|
||||
|
||||
wait(futures)
|
||||
|
||||
# Save remaining data
|
||||
if len(articleData) > 0:
|
||||
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user