Bug Fix:

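1. Split author data on any run of line breaks ("\r" or "\n") instead of only on "\n". 2. Split author names on "." as well as on spaces. 3. Added a separate extraction pattern for author info when the page contains three or more <hr> tags.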
2023-08-11 19:13:33 +08:00 · 2023-08-11 19:13:33 +08:00 · 68a755a633
commit 68a755a633
parent f97195c94d
1 changed file with 8 additions and 4 deletions
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -222,14 +222,14 @@ def process_article(title, article_url):
            for row in table.find_all('tr'):
                cells = [cell.text.strip() for cell in row.find_all('td')]
                for cell in cells:
-                    cell = cell.split("\n")
+                    cell = re.split(r'[\r\n]+', cell)
                    cell = [element.replace('email: ', '') for element in cell]
                    cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]

                    # Data processing
                    if cell[0]:
                        authors.append(cell[0])
-                        name = cell[0].split(" ")
+                        name = re.split(r'[ .]', cell[0])
                        affiliation = ', '.join(cell[1:-1])
                        affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                        email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
@@ -252,7 +252,11 @@ def process_article(title, article_url):
        # If no author table
        else:
            match_type = 0
-            pattern = r'<hr>(.*?)<hr>'
+            hr_count = len(soup.find_all('hr'))
+            if hr_count < 3:
+                pattern = r'<hr>(.*?)<hr>'
+            else:
+                pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
            matches = str(re.findall(pattern, html, re.DOTALL))
            if len(matches) < 5:
                match_type = 1
@@ -276,7 +280,7 @@ def process_article(title, article_url):
                    # Data processing
                    if match[0]:
                        authors.append(match[0])
-                        name = match[0].split(" ")
+                        name = re.split(r'[ .]', match[0])
                        affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
                        affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                        email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])