Optimization:

Reduced memory usage
Added data collection for volume HTML format errors
Added an elapsed-time monitor
ldy 2023-08-10 12:57:28 +08:00
parent 2c25682f81
commit 71e613d994


@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
'''
Scrapes the website 'ejde.math.txstate.edu'
Total number of papers:
2023/08/08 - 4300
Total number of papers: 2023/08/08 - 4300
Total Time via VPN w/119ms-delay: 441.80s
========== Execution order ==========
1. ejde_main: fetch the journal links for each year -> scrape each paper's metadata and author info -> call ejde_save -> stage the results in small temporary JSON files
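For context, ejde_save.save_data is what stages each scraped batch as a small JSON file; below is a minimal sketch of such a helper, assuming a simple directory-per-category layout (the layout and file contents are assumptions inferred from the calls visible in this diff):

import json
import os
import uuid

def save_data(data, subdir, filename):
    # Stage one batch of records as a small JSON file; ejde_save.Transf()
    # later merges these temporary files into one large file (assumption).
    target_dir = subdir if subdir else "."
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, filename), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

# Usage mirroring the calls in this commit:
save_data([{"title": "example"}], "Article_TS", str(uuid.uuid4()) + ".json")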
@@ -81,6 +81,10 @@ def process_volume(url):
                articles.extend(i for i in i_elements)
            else:
                print("HTML FORMAT FAILURE:", url)
                fail = {
                    "website": url
                }
                failedFormatData.append(fail)
                return
            break
        except Exception as fetch_err:
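This is the error-data collection the commit message refers to: instead of only printing, each volume whose HTML does not match the expected format is recorded as a dict, so the full list can be dumped to JSON at the end of the run. A standalone sketch of the pattern (names mirror this diff; the surrounding retry loop is omitted):

failedFormatData = []  # shared across workers; appended to on each format failure

def record_format_failure(url):
    # Keep a machine-readable record rather than just a log line, so the
    # failures can be saved (Failed_format_record.json) and audited later.
    print("HTML FORMAT FAILURE:", url)
    failedFormatData.append({"website": url})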
@@ -91,12 +95,12 @@ def process_volume(url):
    else:
        print("HTML FETCHING FAILURE:", url)
        fail = {
            "website": url,
            "website": url
        }
        failedVolData.append(fail)
        return
    # Process each article using multithreading
    # Process each article using multithreading (more than 20 threads caused additional errors)
    volume_executor = ThreadPoolExecutor(max_workers=15)
    volume_futures = [volume_executor.submit(process_html_article, baseWeb, article) for article in articles]
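The fan-out above submits one process_html_article task per article and caps the pool at 15 workers, below the roughly 20-thread point where the author observed extra errors (likely server-side throttling). A self-contained sketch of the same bounded fan-out, with a placeholder worker:

from concurrent.futures import ThreadPoolExecutor, as_completed

def handle_article(url):
    # Placeholder for per-article work such as process_html_article.
    return url

urls = ["https://ejde.math.txstate.edu/Volumes/2023/index.html"]  # illustrative input
with ThreadPoolExecutor(max_workers=15) as pool:  # stay under the ~20-thread error threshold
    futures = [pool.submit(handle_article, u) for u in urls]
    for future in as_completed(futures):
        try:
            future.result()  # re-raises any exception from the worker thread
        except Exception as err:
            print("WORKER FAILURE:", err)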
@@ -109,7 +113,6 @@ def process_volume(url):
def process_html_article(baseweb, article):
    global articleNum, authorNum
    # Get article title & url
    try:
        title = article.text.strip()
@@ -118,7 +121,7 @@ def process_html_article(baseweb, article):
    except Exception as html_format_err:
        print("HTML FORMAT FAILURE:", str(html_format_err))
        fail = {
            "article": str(article),
            "article": str(article)
        }
        failedFormatData.append(fail)
        return
@@ -130,25 +133,15 @@ def process_html_article(baseweb, article):
        print("ARTICLE PROCESSING FAILURE:", str(article_err))
        fail = {
            "title": title,
            "URL": article_url,
            "URL": article_url
        }
        failedData.append(fail)
        return
    # Save the data periodically based on batch size
    if len(articleData) % batch_size == 0:
        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleNum += len(articleData)
        articleData.clear()
    if len(authorData) % batch_size == 0:
        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
        authorNum += len(authorData)
        authorData.clear()
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
    global articleNum, authorNum
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    article_response = requests.get(article_url, headers=headers)
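The @retry decorator comes from the retrying package: wait_fixed=5000 waits 5 s between attempts and stop_max_attempt_number=5 gives up after five tries, re-raising the last exception. A minimal sketch of the same configuration on a bare HTTP fetch:

import requests
from retrying import retry

@retry(wait_fixed=5000, stop_max_attempt_number=5)
def fetch(url):
    # Any exception triggers a retry: up to 5 attempts, 5 s apart;
    # after the fifth failure the exception propagates to the caller.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response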
@@ -241,7 +234,7 @@ def process_article(title, article_url):
            "affiliation": [{
                "year": volume,
                "affiliation": affiliation,
                "email": email,
                "email": email
            }]
        }
        authorData.append(author_data)
@@ -277,7 +270,7 @@ def process_article(title, article_url):
            "affiliation": [{
                "year": volume,
                "affiliation": affiliation,
                "email": email,
                "email": email
            }]
        }
        authorData.append(author_data)
@@ -301,11 +294,23 @@ def process_article(title, article_url):
        "journal": "Electronic Journal of Differential Equations",
        "volume": volume,
        "issue": issue,
        "page": pp,
        "page": pp
    }
    articleData.append(article_data)
    # Save the data periodically based on batch size
    if len(articleData) % batch_size == 0:
        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleNum += len(articleData)
        articleData.clear()
    if len(authorData) % batch_size == 0:
        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
        authorNum += len(authorData)
        authorData.clear()
start_time = time.time()
index = "https://ejde.math.txstate.edu/indexleft.html"
response = requests.get(index)
soup = BeautifulSoup(response.content, 'html.parser')
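Moving the batch flush into process_article is the memory optimization: articleData and authorData are written out and cleared whenever they reach a multiple of batch_size, so records no longer accumulate for the whole run. One caveat: with 15 threads appending concurrently, two appends can land between the modulo check and the flush, so the boundary can be missed. A hedged sketch of a lock-protected variant (batch_size, save_batch, and the lock are illustrative additions, not part of this commit):

import threading

batch_size = 100          # illustrative value; the real one is defined elsewhere
articleData = []
save_lock = threading.Lock()

def save_batch(records):
    # Stand-in for ejde_save.save_data(records, "Article_TS", "<uuid>.json").
    print("flushing", len(records), "records")

def append_article(record):
    # Guard the append + size check together so concurrent threads cannot
    # race past the batch boundary; >= instead of % tolerates a missed check.
    with save_lock:
        articleData.append(record)
        if len(articleData) >= batch_size:
            save_batch(list(articleData))
            articleData.clear()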
@@ -341,7 +346,9 @@ for future in as_completed(futures):
        print("VOLUME PROCESSING ERROR:", str(vol_err))
# Retry failed processing paper
for data in failedData:
print("START RETRYING:", len(failedData))
while failedData:
    data = failedData.pop(0)
    articleTitle = data["title"]
    articleUrl = data["URL"]
    try:
@@ -350,7 +357,7 @@ for data in failedData:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totally_fail = {
            "title": articleTitle,
            "URL": articleUrl,
            "URL": articleUrl
        }
        totallyFailedData.append(totally_fail)
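The switch from iterating failedData to draining it with pop(0) avoids mutating a list while looping over it and naturally picks up any entries appended during the retry pass. A compact sketch of the drain pattern (process_article is stubbed here):

failedData = [{"title": "t", "URL": "https://example.org/p"}]  # illustrative
totallyFailedData = []

def process_article(title, article_url):
    pass  # stand-in for the real retried fetch

print("START RETRYING:", len(failedData))
while failedData:                # safe even if retries append new failures
    data = failedData.pop(0)     # consume from the front; never iterate in place
    try:
        process_article(data["title"], data["URL"])
    except Exception as retry_err:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totallyFailedData.append(data)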
@@ -378,6 +385,9 @@ if len(failedFormatData) > 0:
    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
    print("Total failed searching article:", len(failedFormatData))
# Total running time
print("time elapsed: {:.2f}s".format(time.time() - start_time))
# Transfer to large file and delete the temporary storage files
ejde_save.Transf()
# ejde_save.delete()
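The new elapsed-time monitor brackets the run with time.time(). For measuring durations, time.perf_counter() is generally preferable because it is monotonic and unaffected by system clock adjustments; a one-line alternative:

import time

start_time = time.perf_counter()   # monotonic; immune to clock changes
# ... scraping run ...
print("time elapsed: {:.2f}s".format(time.perf_counter() - start_time))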