Fix the saving problem

This commit is contained in:
XCX 2023-08-03 12:01:51 +08:00
parent 2d1f2c504d
commit e49e829682

View File

@@ -4,7 +4,7 @@ import re
import ejde_save
from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from retrying import retry
from bs4 import BeautifulSoup
@@ -50,7 +50,6 @@ def process_article(url):
# Extract volume
volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
volume = str(volume_match.group(1)) if volume_match else None
-# year = volume_match.group(2) if volume_match else None
# Extract pp
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
@@ -171,7 +170,7 @@ authorData = []
articleData = []
batch_size = 100 # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=25) # Set the number of worker threads
# Process each URL using multithreading
futures = [executor.submit(process_article, url) for url in url_list]
@@ -183,6 +182,8 @@ for future in as_completed(futures):
except Exception as e:
print("An error occurred:", str(e))
+wait(futures)
# Save remaining data
if len(articleData) > 0:
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")