Fixed: missed data appends while multithreading
Pushed: newly parsed zip
parent 61ef0081d8
commit ad427c24dc
Binary file not shown.
Binary file not shown.
@@ -15,7 +15,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 Target website: 'ejde.math.txstate.edu'

 Total number of papers: 2023/08/08 - 4785

-Total Time via VPN w/100ms-delay: 48.04s
+Total Time via VPN w/100ms-delay: 96.30s

 ========== Execution order ==========
 1. ejde_main fetches each year's journal links -> scrapes each paper's and its authors' information -> calls ejde_save -> stores the results temporarily in small json files
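The execution order described in the docstring above is driven by a thread pool: the hunk headers in this diff show `from concurrent.futures import ThreadPoolExecutor, as_completed` and a `for future in as_completed(futures):` loop that catches per-task errors. A minimal sketch of that submission pattern follows; the `process_volume` worker, the empty `volume_urls` list, and the pool size are illustrative assumptions, not code from this repo.

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_volume(vol_url):
    # Hypothetical stand-in for the per-volume worker; in the real script the
    # workers end up calling process_article() for each paper they find.
    print("processing", vol_url)

volume_urls = []  # in the real script, ejde_main collects one link per year/volume

with ThreadPoolExecutor(max_workers=16) as executor:  # pool size chosen arbitrarily
    futures = [executor.submit(process_volume, url) for url in volume_urls]
    for future in as_completed(futures):
        try:
            future.result()  # re-raises any exception from the worker
        except Exception as vol_err:
            print("VOLUME PROCESSING ERROR:", str(vol_err))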
@@ -24,9 +24,19 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''


-def save_data_thread_safe(data, data_lock, data_type):
+def append_data_thread_safe(from_list, to_list, data_lock):
     with data_lock:
-        ejde_save.save_data(data, f"{data_type}", str(uuid.uuid4()) + ".json")
+        to_list.append(from_list)
+
+
+def save_data_thread_safe(data, data_lock, data_type):
+    global articleNum, authorNum
+    with data_lock:
+        ejde_save.save_data(data, f"{data_type}_TS", str(uuid.uuid4()) + ".json")
+        if data_type == "Article":
+            articleNum += len(data)
+        else:
+            authorNum += len(data)
+        data.clear()
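Context for the two helpers above: a bare list.append is effectively atomic under CPython's GIL, but the flush sequence around it is not, so without the shared lock a worker could append to articleData/authorData while another worker is saving and clearing the same list, and that record would be wiped without ever being written; presumably these are the missed appends the commit message refers to. Below is a minimal, self-contained sketch of how the helpers are meant to be used. The worker body, record shape, batch size, and pool size are illustrative, not taken from the repo, and the real file flush is replaced by a print.

import threading
import uuid
from concurrent.futures import ThreadPoolExecutor

articleData = []                    # shared buffer, mirroring the script's global list
articleDataLock = threading.Lock()  # one lock per shared buffer
batch_size = 100                    # illustrative value

def append_data_thread_safe(from_list, to_list, data_lock):
    with data_lock:                 # a concurrent flush cannot clear the list mid-append
        to_list.append(from_list)

def save_data_thread_safe(data, data_lock, data_type):
    with data_lock:                 # flush and clear happen under the same lock
        print(f"flushing {len(data)} {data_type} records to {uuid.uuid4()}.json")
        data.clear()                # anything appended after this flush waits for the next batch

def worker(i):
    # Illustrative stand-in for process_article(): buffer one record, then
    # flush once a full batch has accumulated.
    record = {"title": f"paper-{i}"}
    append_data_thread_safe(record, articleData, articleDataLock)
    if len(articleData) % batch_size == 0:
        save_data_thread_safe(articleData, articleDataLock, "Article")

with ThreadPoolExecutor(max_workers=8) as pool:
    list(pool.map(worker, range(1000)))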
@@ -155,7 +165,6 @@ def process_html_article(baseweb, article):

-@retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(title, article_url):
     global articleNum, authorNum
     headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
     article_response = requests.get(article_url, headers=headers)
@@ -302,8 +311,7 @@ def process_article(title, article_url):
                 }
             ]
         }
-        authorData.append(author_data)
-        authorNum += 1
+        append_data_thread_safe(author_data, authorData, authorDataLock)
     # If no author table
     else:
         match_type = 0
@@ -374,8 +382,7 @@ def process_article(title, article_url):
                 }
             ]
         }
-        authorData.append(author_data)
-        authorNum += 1
+        append_data_thread_safe(author_data, authorData, authorDataLock)
     else:
         print("AUTHOR SEARCHING ERROR:", article_url)
         fail = {
@@ -402,15 +409,14 @@ def process_article(title, article_url):
         "issue": issue,
         "page": pp
     }
-    articleData.append(article_data)
-    articleNum += 1
+    append_data_thread_safe(article_data, articleData, articleDataLock)

     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        save_data_thread_safe(articleData, articleDataLock, "Article_TS")
+        save_data_thread_safe(articleData, articleDataLock, "Article")

     if len(authorData) % batch_size == 0:
-        save_data_thread_safe(authorData, authorDataLock, "Author_TS")
+        save_data_thread_safe(authorData, authorDataLock, "Author")


 start_time = time.time()
@@ -463,6 +469,7 @@ for future in as_completed(futures):
         print("VOLUME PROCESSING ERROR:", str(vol_err))

+# Retry failed processing paper
 if len(failedData):
     print("START RETRYING:", len(failedData))
 while failedData:
     fail_data = failedData.pop(0)
@@ -480,11 +487,11 @@ while failedData:

 # Save remaining data
 if len(articleData) > 0:
-    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+    save_data_thread_safe(articleData, articleDataLock, "Article")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")

 if len(authorData) > 0:
-    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+    save_data_thread_safe(authorData, authorDataLock, "Author")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")

 # Save error record
@@ -507,4 +514,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))

 # Transfer to large file and delete the temporary storage files
 ejde_save.transform_data()
-# ejde_save.delete_data()
+ejde_save.delete_data()