From 68a755a633b155aa10cb8c048cc7358b12b24304 Mon Sep 17 00:00:00 2001
From: ldy <1913292237@qq.com>
Date: Fri, 11 Aug 2023 19:13:33 +0800
Subject: [PATCH] Bug fix: 1. split author data on line breaks ("\r", "\n") 2.
 split author names on "." as well as spaces 3. extract author info when the
 page has 3 or more <hr> tags

---
 01_EJDE_spider/ejde_main.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 53228d4..2ea14c9 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -222,14 +222,14 @@ def process_article(title, article_url):
             for row in table.find_all('tr'):
                 cells = [cell.text.strip() for cell in row.find_all('td')]
                 for cell in cells:
-                    cell = cell.split("\n")
+                    cell = re.split(r'[\r\n]+', cell)
                     cell = [element.replace('email: ', '') for element in cell]
                     cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
                     # Data processing
                     if cell[0]:
                         authors.append(cell[0])
-                        name = cell[0].split(" ")
+                        name = re.split(r'[ .]', cell[0])
                         affiliation = ', '.join(cell[1:-1])
                         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                         email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
@@ -252,7 +252,11 @@ def process_article(title, article_url):
         # If no author table
         else:
             match_type = 0
-            pattern = r'<hr>(.*?)<hr>'
+            hr_count = len(soup.find_all('hr'))
+            if hr_count < 3:
+                pattern = r'<hr>(.*?)<hr>'
+            else:
+                pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
             matches = str(re.findall(pattern, html, re.DOTALL))
             if len(matches) < 5:
                 match_type = 1
@@ -276,7 +280,7 @@ def process_article(title, article_url):
                     # Data processing
                     if match[0]:
                         authors.append(match[0])
-                        name = match[0].split(" ")
+                        name = re.split(r'[ .]', match[0])
                         affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
                         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                         email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])