Optimization:
1. Added a new regular-expression format for volume. 2. Added a new strip method for MSC. 3. Deleted blank-space authors. 4. Optimized the middle-name strip method. 5. Added a new matching pattern for papers with no author-list table. 6. Added exception storing for AUTHOR SEARCHING ERROR. Bug fix: 1. Fixed error-record saving.
This commit is contained in:
parent
69b10a9f72
commit
3e78e9f48e
@ -13,7 +13,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
爬取网站:'ejde.math.txstate.edu'
|
||||
|
||||
Total number of papers: 2023/08/08 - 4300
|
||||
Total Time via VPN w/119ms-delay: 441.80s
|
||||
Total Time via VPN w/100ms-delay: 254.04s
|
||||
|
||||
==========运行顺序==========
|
||||
1、ejde_main 获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件(json)暂存
|
||||
@ -156,7 +156,8 @@ def process_article(title, article_url):
|
||||
volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
|
||||
volume = str(volume_match.group(1)) if volume_match else None
|
||||
if not volume:
|
||||
volume = str(volume_match.group(2)) if volume_match else None
|
||||
volume_match = re.search(r'Vol\. (\d+)', article_text)
|
||||
volume = str(volume_match.group(1)) if volume_match else None
|
||||
|
||||
# Extract pp
|
||||
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
|
||||
@ -183,7 +184,8 @@ def process_article(title, article_url):
|
||||
if not msc_match:
|
||||
msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
|
||||
if msc_match:
|
||||
msc = msc_match.group(1).strip().strip('.').strip()
|
||||
msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
|
||||
msc = msc.strip('.').strip()
|
||||
msc = re.split(r', |;', msc)
|
||||
else:
|
||||
msc = None
|
||||
@ -217,9 +219,10 @@ def process_article(title, article_url):
|
||||
for cell in cells:
|
||||
cell = cell.split("\n")
|
||||
cell = [element.replace('email: ', '') for element in cell]
|
||||
cell = [c.strip() for c in cell]
|
||||
cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
|
||||
|
||||
# Data processing
|
||||
if cell[0]:
|
||||
authors.append(cell[0])
|
||||
name = cell[0].split(" ")
|
||||
affiliation = ', '.join(cell[1:-1])
|
||||
@ -232,7 +235,8 @@ def process_article(title, article_url):
|
||||
"from_article": article_id,
|
||||
"firstname": name[0],
|
||||
"lastname": name[-1],
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None,
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
|
||||
name) > 2 else None,
|
||||
"affiliation": [{
|
||||
"year": volume,
|
||||
"affiliation": affiliation,
|
||||
@ -242,20 +246,30 @@ def process_article(title, article_url):
|
||||
authorData.append(author_data)
|
||||
# If no author table
|
||||
else:
|
||||
match_type = 0
|
||||
pattern = r'<hr>(.*?)<hr>'
|
||||
matches = str(re.findall(pattern, html, re.DOTALL))
|
||||
if len(matches) < 5:
|
||||
match_type = 1
|
||||
last_p_tag = str(article_soup.find_all('p')[-1])
|
||||
pattern = r'<p>(.*?)<hr/>'
|
||||
matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()
|
||||
|
||||
if matches:
|
||||
matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
|
||||
matches = matches.split("<p>")
|
||||
|
||||
for match in matches:
|
||||
match = re.sub(r'<[^>]+>', '', match)
|
||||
match = match.lstrip("\\n ").rstrip("\\n ").strip()
|
||||
if match_type == 0:
|
||||
match = match.split("\\n")
|
||||
else:
|
||||
match = match.split("\n")
|
||||
match = [element.replace('email: ', '') for element in match]
|
||||
match = [m.strip() for m in match]
|
||||
match = [re.sub(r'\s+', ' ', m).strip() for m in match]
|
||||
|
||||
# Data processing
|
||||
if match[0]:
|
||||
authors.append(match[0])
|
||||
name = match[0].split(" ")
|
||||
affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
|
||||
@ -268,7 +282,8 @@ def process_article(title, article_url):
|
||||
"from_article": article_id,
|
||||
"firstname": name[0],
|
||||
"lastname": name[-1],
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None,
|
||||
"middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
|
||||
name) > 2 else None,
|
||||
"affiliation": [{
|
||||
"year": volume,
|
||||
"affiliation": affiliation,
|
||||
@ -278,7 +293,11 @@ def process_article(title, article_url):
|
||||
authorData.append(author_data)
|
||||
else:
|
||||
print("AUTHOR SEARCHING ERROR:", article_url)
|
||||
return
|
||||
fail = {
|
||||
"title": title,
|
||||
"URL": article_url
|
||||
}
|
||||
failedFormatData.append(fail)
|
||||
|
||||
# Article info
|
||||
article_data = {
|
||||
@ -376,7 +395,7 @@ if len(authorData) > 0:
|
||||
|
||||
# Save error record
|
||||
if len(totallyFailedData) > 0:
|
||||
ejde_save.save_data(failedData, "Error", "Failed_article_record.json")
|
||||
ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
|
||||
print("Total failed processing paper:", len(totallyFailedData))
|
||||
|
||||
if len(failedVolData) > 0:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user