Fix the saving problem
This commit is contained in:
parent
2d1f2c504d
commit
e49e829682
@@ -4,7 +4,7 @@ import re
|
|||||||
import ejde_save
|
import ejde_save
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
|
||||||
from retrying import retry
|
from retrying import retry
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
@@ -50,7 +50,6 @@ def process_article(url):
|
|||||||
# Extract volume
|
# Extract volume
|
||||||
volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
|
volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
|
||||||
volume = str(volume_match.group(1)) if volume_match else None
|
volume = str(volume_match.group(1)) if volume_match else None
|
||||||
# year = volume_match.group(2) if volume_match else None
|
|
||||||
|
|
||||||
# Extract pp
|
# Extract pp
|
||||||
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
|
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
|
||||||
@@ -171,7 +170,7 @@ authorData = []
|
|||||||
articleData = []
|
articleData = []
|
||||||
|
|
||||||
batch_size = 100 # Number of articles to process before saving
|
batch_size = 100 # Number of articles to process before saving
|
||||||
executor = ThreadPoolExecutor(max_workers=20) # Set the number of worker threads
|
executor = ThreadPoolExecutor(max_workers=25) # Set the number of worker threads
|
||||||
|
|
||||||
# Process each URL using multithreading
|
# Process each URL using multithreading
|
||||||
futures = [executor.submit(process_article, url) for url in url_list]
|
futures = [executor.submit(process_article, url) for url in url_list]
|
||||||
@@ -183,6 +182,8 @@ for future in as_completed(futures):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("An error occurred:", str(e))
|
print("An error occurred:", str(e))
|
||||||
|
|
||||||
|
wait(futures)
|
||||||
|
|
||||||
# Save remaining data
|
# Save remaining data
|
||||||
if len(articleData) > 0:
|
if len(articleData) > 0:
|
||||||
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
|
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user