Change the data structure

2023-08-13 21:36:22 +08:00 · 2023-08-13 21:36:22 +08:00 · 1602d03e9d
commit 1602d03e9d
parent 27707a058c 083e6c87eb
1 changed file with 86 additions and 62 deletions
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -7,13 +7,13 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor, as_completed, wait
+from concurrent.futures import ThreadPoolExecutor, as_completed

 '''
    爬取网站：'ejde.math.txstate.edu'

    Total number of papers: 2023/08/08 - 4300
-    Total Time via VPN w/119ms-delay: 441.80s
+    Total Time via VPN w/100ms-delay: 254.04s

    ==========运行顺序==========
    1、ejde_main                    获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件（json）暂存
@@ -152,9 +152,17 @@ def process_article(title, article_url):
        article_soup = BeautifulSoup(html, 'html.parser')
        article_text = article_soup.get_text()

+        # Extract title if title == None
+        if not title:
+            title_match = re.search(r"<h3>(.*?)<p>", article_text)
+            title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+
        # Extract volume
        volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
        volume = str(volume_match.group(1)) if volume_match else None
+        if not volume:
+            volume_match = re.search(r'Vol\. (\d+)', article_text)
+            volume = str(volume_match.group(1)) if volume_match else None

        # Extract pp
        pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
@@ -181,17 +189,18 @@ def process_article(title, article_url):
        if not msc_match:
            msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
        if msc_match:
-            msc = msc_match.group(1).strip().strip('.').strip()
+            msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
+            msc = msc.strip('.').strip()
            msc = re.split(r', |;', msc)
        else:
            msc = []

        # Extract KeyWords
-        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+        keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
        if not keywords_match:
-            keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
+            keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
        if keywords_match:
-            keywords = keywords_match.group(1).strip().replace('\n', '')
+            keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
            keywords = re.split(r', |;', keywords)
            keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
        else:
@@ -213,72 +222,89 @@ def process_article(title, article_url):
            for row in table.find_all('tr'):
                cells = [cell.text.strip() for cell in row.find_all('td')]
                for cell in cells:
-                    cell = cell.split("\n")
-                    cell = [element.replace('email: ', '') for element in cell]
-                    cell = [c.strip() for c in cell]
+                    cell = re.split(r'[\r\n]+', cell)
+                    cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
+                    cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]

                    # Data processing
-                    authors.append(cell[0])
-                    name = cell[0].split(" ")
-                    middle_name = ''.join(name[1:-1]) if name[1:-1] else None
-                    affiliation = ', '.join(cell[1:-1])
-                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
-                    email = email_match.group() if email_match else None
+                    if cell[0]:
+                        authors.append(cell[0])
+                        name = re.split(r'[ .]', cell[0])
+                        affiliation = ', '.join(cell[1:-1])
+                        affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
+                        email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
+                        email = email_match.group() if email_match else None

-                    author_data = {
-                        "author_id": str(uuid.uuid4()),
-                        "from_article": [article_id],
-                        "first_name": name[0],
-                        "last_name": name[-1],
-                        "middle_name": middle_name,
-                        "affiliation": [{
-                            "year": volume,
-                            "affiliation": affiliation,
-                            "email": email
-                        }]
-                    }
-                    authorData.append(author_data)
+                        author_data = {
+                            "author_id": str(uuid.uuid4()),
+                            "from_article": article_id,
+                            "first_name": name[0],
+                            "last_name": name[-1],
+                            "middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
+                            "affiliation": [{
+                                "year": volume,
+                                "affiliation": affiliation,
+                                "email": email
+                            }]
+                        }
+                        authorData.append(author_data)
        # If no author table
        else:
-            pattern = r'<hr>(.*?)<hr>'
+            match_type = 0
+            hr_count = len(article_soup.find_all('hr'))
+            if hr_count < 3:
+                pattern = r'<hr>(.*?)<hr>'
+            else:
+                pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
            matches = str(re.findall(pattern, html, re.DOTALL))
+            if len(matches) < 5:
+                match_type = 1
+                last_p_tag = str(article_soup.find_all('p')[-1])
+                pattern = r'<p>(.*?)<hr/>'
+                matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()
+
            if matches:
                matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
                matches = matches.split("<p>")
-
                for match in matches:
                    match = re.sub(r'<[^>]+>', '', match)
-                    match = match.lstrip("\\n ").rstrip("\\n ").strip()
-                    match = match.split("\\n")
-                    match = [element.replace('email: ', '') for element in match]
-                    match = [m.strip() for m in match]
+                    match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
+                    if match_type == 0:
+                        match = match.split("\\n")
+                    else:
+                        match = match.split("\n")
+                    match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
+                    match = [re.sub(r'\s+', ' ', m).strip() for m in match]

                    # Data processing
-                    authors.append(match[0])
-                    name = match[0].split(" ")
-                    middle_name = ''.join(name[1:-1]) if name[1:-1] else None
-                    affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
-                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                    email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
-                    email = email_match.group() if email_match else None
+                    if match[0]:
+                        authors.append(match[0])
+                        name = re.split(r'[ .]', match[0])
+                        affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
+                        affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
+                        email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
+                        email = email_match.group() if email_match else None

-                    author_data = {
-                        "author_id": str(uuid.uuid4()),
-                        "from_article": [article_id],
-                        "first_name": name[0],
-                        "last_name": name[-1],
-                        "middle_name": middle_name,
-                        "affiliation": [{
-                            "year": volume,
-                            "affiliation": affiliation,
-                            "email": email
-                        }]
-                    }
-                    authorData.append(author_data)
+                        author_data = {
+                            "author_id": str(uuid.uuid4()),
+                            "from_article": article_id,
+                            "first_name": name[0],
+                            "last_name": name[-1],
+                            "middle_name": ''.join(name[1:-1]) if name[1:-1] else None,
+                            "affiliation": [{
+                                "year": volume,
+                                "affiliation": affiliation,
+                                "email": email
+                            }]
+                        }
+                        authorData.append(author_data)
            else:
                print("AUTHOR SEARCHING ERROR:", article_url)
-                return
+                fail = {
+                    "title": title,
+                    "URL": article_url
+                }
+                failedFormatData.append(fail)

        # Article info
        article_data = {
@@ -347,8 +373,6 @@ for future in as_completed(futures):
    except Exception as vol_err:
        print("VOLUME PROCESSING ERROR:", str(vol_err))

-wait(futures)
-
 # Retry failed processing paper
 print("START RETRYING:", len(failedData))
 while failedData:
@@ -378,15 +402,15 @@ if len(authorData) > 0:

 # Save error record
 if len(totallyFailedData) > 0:
-    ejde_save.save_data(failedData, "", "Failed_article_record.json")
+    ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
    print("Total failed processing paper:", len(totallyFailedData))

 if len(failedVolData) > 0:
-    ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
+    ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json")
    print("Total failed fetching volume:", len(failedVolData))

 if len(failedFormatData) > 0:
-    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
+    ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
    print("Total failed searching article:", len(failedFormatData))

 # Total running time