diff --git a/Data/Origin/ejde_buffer.zip b/Data/Origin/ejde_buffer.zip
index 3b1b079..bddb513 100644
Binary files a/Data/Origin/ejde_buffer.zip and b/Data/Origin/ejde_buffer.zip differ
diff --git a/Data/Transform/EJDE_buffer_transform.zip b/Data/Transform/EJDE_buffer_transform.zip
index 203cf74..2e2cdf8 100644
Binary files a/Data/Transform/EJDE_buffer_transform.zip and b/Data/Transform/EJDE_buffer_transform.zip differ
diff --git a/Parsers/01_EJDE_spider/ejde_main.py b/Parsers/01_EJDE_spider/ejde_main.py
index 0245370..7143e5a 100644
--- a/Parsers/01_EJDE_spider/ejde_main.py
+++ b/Parsers/01_EJDE_spider/ejde_main.py
@@ -15,7 +15,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
     Target website: 'ejde.math.txstate.edu'
     Total number of papers: 2023/08/08 - 4785
-    Total Time via VPN w/100ms-delay: 48.04s
+    Total Time via VPN w/100ms-delay: 96.30s
 
     ==========Execution order==========
     1. ejde_main: fetch the journal links for each year -> scrape each paper's and author's info -> call ejde_save -> store temporarily in small (json) files
@@ -24,9 +24,19 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 
 
-def save_data_thread_safe(data, data_lock, data_type):
+def append_data_thread_safe(from_list, to_list, data_lock):
     with data_lock:
-        ejde_save.save_data(data, f"{data_type}", str(uuid.uuid4()) + ".json")
+        to_list.append(from_list)
+
+
+def save_data_thread_safe(data, data_lock, data_type):
+    global articleNum, authorNum
+    with data_lock:
+        ejde_save.save_data(data, f"{data_type}_TS", str(uuid.uuid4()) + ".json")
+        if data_type == "Article":
+            articleNum += len(data)
+        else:
+            authorNum += len(data)
         data.clear()
 
 
@@ -155,7 +165,6 @@ def process_html_article(baseweb, article):
 
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(title, article_url):
-    global articleNum, authorNum
     headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
     article_response = requests.get(article_url, headers=headers)
@@ -302,8 +311,7 @@ def process_article(title, article_url):
                     }
                 ]
             }
-            authorData.append(author_data)
-            authorNum += 1
+            append_data_thread_safe(author_data, authorData, authorDataLock)
         # If no author table
         else:
             match_type = 0
@@ -374,8 +382,7 @@ def process_article(title, article_url):
                     }
                 ]
             }
-            authorData.append(author_data)
-            authorNum += 1
+            append_data_thread_safe(author_data, authorData, authorDataLock)
         else:
             print("AUTHOR SEARCHING ERROR:", article_url)
             fail = {
@@ -402,15 +409,14 @@ def process_article(title, article_url):
         "issue": issue,
         "page": pp
     }
-    articleData.append(article_data)
-    articleNum += 1
+    append_data_thread_safe(article_data, articleData, articleDataLock)
 
     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        save_data_thread_safe(articleData, articleDataLock, "Article_TS")
+        save_data_thread_safe(articleData, articleDataLock, "Article")
 
     if len(authorData) % batch_size == 0:
-        save_data_thread_safe(authorData, authorDataLock, "Author_TS")
+        save_data_thread_safe(authorData, authorDataLock, "Author")
 
 
 start_time = time.time()
@@ -463,28 +469,29 @@ for future in as_completed(futures):
         print("VOLUME PROCESSING ERROR:", str(vol_err))
 
 # Retry failed processing paper
-print("START RETRYING:", len(failedData))
-while failedData:
-    fail_data = failedData.pop(0)
-    articleTitle = fail_data["title"]
-    articleUrl = fail_data["URL"]
-    try:
-        process_article(articleTitle, articleUrl)
-    except Exception as retry_err:
-        print("ARTICLE RETRYING FAILURE:", str(retry_err))
-        totally_fail = {
-            "title": articleTitle,
-            "URL": articleUrl
-        }
-        totallyFailedData.append(totally_fail)
+if len(failedData):
+    print("START RETRYING:", len(failedData))
+    while failedData:
+        fail_data = failedData.pop(0)
+        articleTitle = fail_data["title"]
+        articleUrl = fail_data["URL"]
+        try:
+            process_article(articleTitle, articleUrl)
+        except Exception as retry_err:
+            print("ARTICLE RETRYING FAILURE:", str(retry_err))
+            totally_fail = {
+                "title": articleTitle,
+                "URL": articleUrl
+            }
+            totallyFailedData.append(totally_fail)
 
 # Save remaining data
 if len(articleData) > 0:
-    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+    save_data_thread_safe(articleData, articleDataLock, "Article")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
 
 if len(authorData) > 0:
-    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+    save_data_thread_safe(authorData, authorDataLock, "Author")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
 
 # Save error record
@@ -507,4 +514,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
 ejde_save.transform_data()
-# ejde_save.delete_data()
+ejde_save.delete_data()
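Note on the concurrency change: every mutation of the shared `articleData`/`authorData` lists now goes through `append_data_thread_safe` or `save_data_thread_safe`, so the append, the write to disk, the `articleNum`/`authorNum` counter update, and the `clear()` all happen while holding the matching lock. Below is a minimal, self-contained sketch of that pattern, not the repository's code: `records`, `flush_records_thread_safe`, `BATCH_SIZE`, and the `buffer/` output directory are illustrative stand-ins (the real module delegates the write to `ejde_save.save_data`).

```python
import json
import os
import threading
import uuid
from concurrent.futures import ThreadPoolExecutor

records = []                     # stand-in for articleData / authorData
records_lock = threading.Lock()  # stand-in for articleDataLock / authorDataLock
BATCH_SIZE = 100                 # stand-in for batch_size


def append_record_thread_safe(record, to_list, lock):
    # Workers only touch the shared list while holding its lock, so a
    # concurrent flush cannot clear() the list in the middle of an append.
    with lock:
        to_list.append(record)


def flush_records_thread_safe(to_list, lock, out_dir="buffer"):
    # Write-then-clear happens under the same lock, so no record can be
    # saved twice or dropped between the dump and the clear. (Unlike the
    # diff's save_data_thread_safe, this sketch also skips empty batches.)
    with lock:
        if not to_list:
            return
        os.makedirs(out_dir, exist_ok=True)
        path = os.path.join(out_dir, str(uuid.uuid4()) + ".json")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(to_list, f, ensure_ascii=False)
        to_list.clear()


def worker(record):
    append_record_thread_safe(record, records, records_lock)
    # The batch check is deliberately cheap and unlocked, as in the diff;
    # the flush itself is still serialized by the lock.
    if len(records) % BATCH_SIZE == 0:
        flush_records_thread_safe(records, records_lock)


if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(worker, ({"id": i} for i in range(250)))
    # Mirror the spider's "Save remaining data" tail for the last partial batch.
    flush_records_thread_safe(records, records_lock)
```

Keeping a single lock per list for both operations is what makes the counter bump inside `save_data_thread_safe` safe after the diff removed `global articleNum, authorNum` from `process_article`.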
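The retry pass is now wrapped in `if len(failedData):`, so the "START RETRYING" banner is skipped on clean runs. Its shape is drain-and-requeue: pop each failure, push it back through `process_article` (whose `@retry(wait_fixed=5000, stop_max_attempt_number=5)` decorator from the `retrying` package already spaces up to five attempts 5 s apart), and demote whatever still fails to `totallyFailedData`. A compact sketch under those assumptions; `fetch_article` and the sample record are hypothetical stand-ins for `process_article` and the real queue entries:

```python
from retrying import retry

failed = [{"title": "Sample paper", "URL": "http://example.invalid/abs/01"}]  # stand-in for failedData
beyond_repair = []                                                            # stand-in for totallyFailedData


@retry(wait_fixed=5000, stop_max_attempt_number=5)
def fetch_article(title, url):
    # Hypothetical stand-in for process_article: raising makes @retry wait
    # 5 s and try again, up to 5 attempts before the exception escapes.
    raise IOError(f"cannot reach {url}")


if failed:
    print("START RETRYING:", len(failed))
    while failed:
        item = failed.pop(0)
        try:
            fetch_article(item["title"], item["URL"])
        except Exception as retry_err:
            print("ARTICLE RETRYING FAILURE:", retry_err)
            beyond_repair.append(item)
```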