Optimization:
1. Add conference special issue papers 2. Optimize the counting process 3. Make saving more robust
parent e217342ce2
commit b5ce290ea5
@@ -12,8 +12,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Crawled site: 'ejde.math.txstate.edu'

-Total number of papers: 2023/08/08 - 4339
-Total Time via VPN w/52ms-delay: 430.38s
+Total number of papers: 2023/08/08 - 4761
+Total Time via VPN w/100ms-delay: 306.73s

 ========== Execution order ==========
 1. ejde_main gets the journal links for each year -> scrapes each paper's information and author information -> calls ejde_save -> stores the results temporarily in small JSON files
@@ -37,6 +37,7 @@ def datetime_transform(date):
         "Spetember": "September",
+        "Septembere": "September",
         "Ocotber": "October",
         "Nobember": "November",
     }
     try:
         input_date = datetime.strptime(date, "%B %d, %Y")
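For reference, a minimal standalone sketch of the month-typo normalization this hunk extends; the mapping mirrors the diff, while the function name and the pre-parse replacement loop are illustrative rather than the scraper's actual code:

from datetime import datetime

MONTH_TYPOS = {  # hypothetical module-level name; mirrors the mapping in the diff
    "Spetember": "September",
    "Septembere": "September",
    "Ocotber": "October",
    "Nobember": "November",
}

def normalize_date(date_str):
    # Repair known month misspellings before parsing with strptime.
    for typo, fixed in MONTH_TYPOS.items():
        date_str = date_str.replace(typo, fixed)
    return datetime.strptime(date_str, "%B %d, %Y")

print(normalize_date("Septembere 3, 2021"))  # -> 2021-09-03 00:00:00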
@@ -120,6 +121,9 @@ def process_html_article(baseweb, article):
         title = article.text.strip()
         title = re.sub(r'\s+', ' ', title).strip()
         article_url = baseweb + article.find_next("a")["href"]
+        if "../../index.html" in article_url:
+            print("Redundant URL:", article_url)
+            return
     except Exception as html_format_err:
         print("HTML FORMAT FAILURE:", str(html_format_err))
         fail = {
@@ -172,6 +176,17 @@ def process_article(title, article_url):
         issue_number, volume = volume_match.groups()
         volume = str(volume)
         issue = "Special Issue " + str(issue_number)
     else:
-        volume = None
+        volume_match = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
+        if volume_match:
+            issue = "Conference " + str(volume_match.group(1))
+            volume = str(volume_match.group(2))
+        else:
+            volume_match = re.search(r'Conference (\d+) \((\d+)\)', article_text)
+            if volume_match:
+                issue_number, volume = volume_match.groups()
+                volume = str(volume)
+                issue = "Conference " + str(issue_number)
+            else:
+                volume = None

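The hunk above adds two conference patterns as fallbacks behind the special-issue match. A sketch of the overall fallback chain, assuming a 'Special Issue (\d+) \((\d+)\)' first pattern that the hunk does not show (sample inputs invented):

import re

def parse_issue_volume(article_text):
    # First pattern assumed; only its branch body appears in the hunk.
    m = re.search(r'Special Issue (\d+) \((\d+)\)', article_text)
    if m:
        return "Special Issue " + m.group(1), m.group(2)
    m = re.search(r'Conf\. (\d{2}), (\d{4})', article_text)
    if m:
        return "Conference " + m.group(1), m.group(2)
    m = re.search(r'Conference (\d+) \((\d+)\)', article_text)
    if m:
        return "Conference " + m.group(1), m.group(2)
    return None, None  # no volume information found

print(parse_issue_volume("Conf. 26, 2022"))        # ('Conference 26', '2022')
print(parse_issue_volume("Conference 10 (2003)"))  # ('Conference 10', '2003')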
@@ -209,7 +224,7 @@ def process_article(title, article_url):
     if keywords_match:
         keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
-        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
+        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())]
     else:
         keywords = []

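The changed line filters out empty keywords produced by the split. A standalone illustration with an invented sample string:

import re

raw = "Blow-up; a priori   estimates, , weak solutions."
keywords = re.split(r', |;', raw)
keywords = [re.sub(r'\s+', ' ', k.strip().strip('.')).strip() for k in keywords if len(k.strip())]
print(keywords)  # ['Blow-up', 'a priori estimates', 'weak solutions']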
@@ -270,6 +285,7 @@ def process_article(title, article_url):
             }]
         }
         authorData.append(author_data)
+        authorNum += 1
     # If no author table
     else:
         match_type = 0
@@ -335,6 +351,7 @@ def process_article(title, article_url):
             }]
         }
         authorData.append(author_data)
+        authorNum += 1
     else:
         print("AUTHOR SEARCHING ERROR:", article_url)
         fail = {
@@ -362,16 +379,15 @@ def process_article(title, article_url):
         "page": pp
     }
     articleData.append(article_data)
+    articleNum += 1

     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
         ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleNum += len(articleData)
         articleData.clear()

     if len(authorData) % batch_size == 0:
         ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorNum += len(authorData)
         authorData.clear()


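The counting now happens per item instead of at batch-save time. A minimal sketch of this batch-flush pattern, with ejde_save.save_data assumed to dump a list to a JSON file in the buffer directory (records and paths invented):

import json
import os
import uuid

def save_data(data, subdir, filename, root="./ejde_buffer"):
    # Assumed behavior of ejde_save.save_data: dump one batch to a small JSON file.
    os.makedirs(os.path.join(root, subdir), exist_ok=True)
    with open(os.path.join(root, subdir, filename), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)

articleData, articleNum, batch_size = [], 0, 100
for record in ({"title": "paper %d" % i} for i in range(250)):  # stand-in records
    articleData.append(record)
    articleNum += 1  # count per item, as the new code does
    if len(articleData) % batch_size == 0:
        save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleData.clear()  # the flushed batch leaves memory; articleNum keeps the total

print(articleNum)  # 250, including the 50 records still unflushed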
@@ -392,7 +408,13 @@ soup = BeautifulSoup(response.content, 'html.parser')
 special_links = soup.find_all("a", href=True)
 url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-1]])

-# Initialize lists
+# Get all conference special issues url
+index = "https://ejde.math.txstate.edu/conf-toc.html#latest"
+response = requests.get(index)
+soup = BeautifulSoup(response.content, 'html.parser')
+special_links = soup.find_all("a", href=True)
+url_list.extend(["https://ejde.math.txstate.edu/" + tag["href"] for tag in special_links[:-2]])
+
 authorData = []
 articleData = []
 failedData = []
@@ -400,12 +422,11 @@ totallyFailedData = []
 failedVolData = []
 failedFormatData = []

 # Initialize variables for counting
 authorNum = 0
 articleNum = 0

 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=len(url_list))  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2))  # Set the number of worker threads

 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
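The worker count drops from one thread per URL to half that. A sketch of the submit/as_completed pattern with a stand-in worker; note that int(len(url_list)/2) is 0 for a single-URL list, so this sketch adds a max(1, ...) guard:

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_volume(url):  # stand-in for the real scraping worker
    return url.lower()

url_list = ["https://ejde.math.txstate.edu/Volumes/2023", "https://ejde.math.txstate.edu/conf-toc.html"]
executor = ThreadPoolExecutor(max_workers=max(1, int(len(url_list) / 2)))
futures = [executor.submit(process_volume, url) for url in url_list]
for future in as_completed(futures):
    print(future.result())  # results arrive as workers finish, not in submit order
executor.shutdown()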
@@ -437,12 +458,10 @@ while failedData:
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
-    print("Total fetched paper:", len(articleData) + articleNum)

 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
-    print("Total fetched author:", len(authorData) + authorNum)

 # Save error record
 if len(totallyFailedData) > 0:
@@ -457,7 +476,9 @@ if len(failedFormatData) > 0:
     ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))

-# Total running time
+# Statistics
+print("Total fetched paper:", articleNum)
+print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))

 # Transfer to large file and delete the temporary storage files
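The final comment points at a merge step that is not shown in this diff. A hypothetical sketch of what "transfer to large file and delete the temporary storage files" could look like, assuming each buffer file holds a JSON list as saved above (the output filename is invented):

import glob
import json
import os

def merge_buffer(subdir, out_file, root="./ejde_buffer"):
    merged = []
    for path in glob.glob(os.path.join(root, subdir, "*.json")):
        with open(path, encoding="utf-8") as f:
            merged.extend(json.load(f))  # each small file holds a list of records
        os.remove(path)  # delete the temporary file once absorbed
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False)

merge_buffer("Article_TS", "ejde_articles.json")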