diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 6f91237..9fd79ce 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
     Target site: 'ejde.math.txstate.edu'
-    Total number of papers:
-    2023/08/08 - 4300
+    Total number of papers: 2023/08/08 - 4300
+    Total time via VPN (119 ms delay): 441.80 s

     ==========Execution order==========
     1. ejde_main fetches the journal links for each year -> scrapes each paper's and each author's info -> calls ejde_save -> stores the results in small temporary files (json)

@@ -81,6 +81,10 @@ def process_volume(url):
                 articles.extend(i for i in i_elements)
             else:
                 print("HTML FORMAT FAILURE:", url)
+                fail = {
+                    "website": url
+                }
+                failedFormatData.append(fail)
                 return
             break
         except Exception as fetch_err:
@@ -91,12 +95,12 @@ def process_volume(url):
     else:
         print("HTML FETCHING FAILURE:", url)
         fail = {
-            "website": url,
+            "website": url
         }
         failedVolData.append(fail)
         return

-    # Process each article using multithreading
+    # Process each article using multithreading (>20 threads causes more errors)
     volume_executor = ThreadPoolExecutor(max_workers=15)
     volume_futures = [volume_executor.submit(process_html_article, baseWeb, article)
                       for article in articles]
@@ -109,7 +113,6 @@ def process_volume(url):


 def process_html_article(baseweb, article):
-    global articleNum, authorNum
     # Get article title & url
     try:
         title = article.text.strip()
@@ -118,7 +121,7 @@ def process_html_article(baseweb, article):
     except Exception as html_format_err:
         print("HTML FORMAT FAILURE:", str(html_format_err))
         fail = {
-            "article": str(article),
+            "article": str(article)
         }
         failedFormatData.append(fail)
         return
@@ -130,25 +133,15 @@ def process_html_article(baseweb, article):
         print("ARTICLE PROCESSING FAILURE:", str(article_err))
         fail = {
             "title": title,
-            "URL": article_url,
+            "URL": article_url
         }
         failedData.append(fail)
         return

-    # Save the data periodically based on batch size
-    if len(articleData) % batch_size == 0:
-        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleNum += len(articleData)
-        articleData.clear()
-
-    if len(authorData) % batch_size == 0:
-        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorNum += len(authorData)
-        authorData.clear()
-

 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(title, article_url):
+    global articleNum, authorNum
     headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
     article_response = requests.get(article_url, headers=headers)
@@ -241,7 +234,7 @@ def process_article(title, article_url):
                 "affiliation": [{
                     "year": volume,
                     "affiliation": affiliation,
-                    "email": email,
+                    "email": email
                 }]
             }
             authorData.append(author_data)
@@ -277,7 +270,7 @@ def process_article(title, article_url):
                 "affiliation": [{
                     "year": volume,
                     "affiliation": affiliation,
-                    "email": email,
+                    "email": email
                 }]
             }
             authorData.append(author_data)
@@ -301,11 +294,23 @@ def process_article(title, article_url):
         "journal": "Electronic Journal of Differential Equations",
         "volume": volume,
         "issue": issue,
-        "page": pp,
+        "page": pp
     }
     articleData.append(article_data)

+    # Save the data periodically based on batch size
+    if len(articleData) % batch_size == 0:
+        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+        articleNum += len(articleData)
+        articleData.clear()
+    if len(authorData) % batch_size == 0:
+        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+        authorNum += len(authorData)
+        authorData.clear()
+
+
+start_time = time.time()

 index = "https://ejde.math.txstate.edu/indexleft.html"
 response = requests.get(index)
 soup = BeautifulSoup(response.content, 'html.parser')
@@ -341,7 +346,9 @@ for future in as_completed(futures):
         print("VOLUME PROCESSING ERROR:", str(vol_err))

 # Retry failed processing paper
-for data in failedData:
+print("START RETRYING:", len(failedData))
+while failedData:
+    data = failedData.pop(0)
     articleTitle = data["title"]
     articleUrl = data["URL"]
     try:
@@ -350,7 +357,7 @@ for data in failedData:
         print("ARTICLE RETRYING FAILURE:", str(retry_err))
         totally_fail = {
             "title": articleTitle,
-            "URL": articleUrl,
+            "URL": articleUrl
         }
         totallyFailedData.append(totally_fail)

@@ -378,6 +385,9 @@ if len(failedFormatData) > 0:
     ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))

+# Total running time
+print("time elapsed: {:.2f}s".format(time.time() - start_time))
+
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
 # ejde_save.delete()
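A note on the relocated batch-save: process_article now runs on up to 15 ThreadPoolExecutor workers, so the unsynchronized "len(articleData) % batch_size == 0" check can be skipped whenever two workers append between checks, and the "global articleNum, authorNum" updates can race. Below is a minimal thread-safe sketch of the same periodic-flush idea, with stand-in definitions for the shared state; save_lock and flush_articles are illustrative names, not part of this patch, and ejde_save.save_data is the repo's own helper as used above.

    import threading
    import uuid

    import ejde_save  # the repo's own save helper

    articleData = []              # shared across worker threads, as in ejde_main
    articleNum = 0
    batch_size = 100              # illustrative value
    save_lock = threading.Lock()

    def flush_articles():
        global articleNum
        # Serialize the check-and-flush so concurrent workers can neither
        # skip past the threshold nor write the same batch twice; ">=" is
        # used instead of "% == 0" because several appends may land between
        # two consecutive checks.
        with save_lock:
            if len(articleData) >= batch_size:
                ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
                articleNum += len(articleData)
                articleData.clear()

Each worker would call flush_articles() right after appending its record; the same pattern applies to authorData/authorNum.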