From 35f5f2ac5e39a53061369a86993cf62682da1356 Mon Sep 17 00:00:00 2001
From: ldy <1913292237@qq.com>
Date: Fri, 11 Aug 2023 11:42:02 +0800
Subject: [PATCH 1/7] Optimization: clustered error files into a folder

---
 01_EJDE_spider/ejde_main.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 1876b22..86b8b68 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -155,6 +155,8 @@ def process_article(title, article_url):
         # Extract volume
         volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
         volume = str(volume_match.group(1)) if volume_match else None
+        if not volume:
+            volume = str(volume_match.group(2)) if volume_match else None
 
         # Extract pp
         pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
@@ -374,15 +376,15 @@ if len(authorData) > 0:
 
 # Save error record
 if len(totallyFailedData) > 0:
-    ejde_save.save_data(failedData, "", "Failed_article_record.json")
+    ejde_save.save_data(failedData, "Error", "Failed_article_record.json")
     print("Total failed processing paper:", len(totallyFailedData))
 
 if len(failedVolData) > 0:
-    ejde_save.save_data(failedVolData, "", "Failed_volume_record.json")
+    ejde_save.save_data(failedVolData, "Error", "Failed_volume_record.json")
     print("Total failed fetching volume:", len(failedVolData))
 
 if len(failedFormatData) > 0:
-    ejde_save.save_data(failedFormatData, "", "Failed_format_record.json")
+    ejde_save.save_data(failedFormatData, "Error", "Failed_format_record.json")
     print("Total failed searching article:", len(failedFormatData))
 
 # Total running time

From 3e78e9f48e6c1be92bb861504bfc4484cec3012e Mon Sep 17 00:00:00 2001
From: ldy <1913292237@qq.com>
Date: Fri, 11 Aug 2023 14:26:59 +0800
Subject: [PATCH 2/7] Optimization: 1. added a fallback regular expression for
 volume 2. added HTML tag stripping for msc 3. skipped authors with a blank
 name 4. optimized middle name stripping 5. added a fallback matching pattern
 for author lists without a table 6. recorded AUTHOR SEARCHING ERROR articles
 as format failures Bug fix: 1. error record saving used the wrong list

---
 01_EJDE_spider/ejde_main.py | 113 +++++++++++++++++++++---------------
 1 file changed, 66 insertions(+), 47 deletions(-)

diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 86b8b68..3c9c71a 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -13,7 +13,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
     爬取网站：'ejde.math.txstate.edu'
 
     Total number of papers: 2023/08/08 - 4300
-    Total Time via VPN w/119ms-delay: 441.80s
+    Total Time via VPN w/100ms-delay: 254.04s
 
     ==========运行顺序==========
     1、ejde_main                    获取各年份的期刊链接 -> 抓取各篇论文的信息和作者信息 -> 调用ejde_save -> 存入小文件（json）暂存
@@ -156,7 +156,8 @@ def process_article(title, article_url):
         volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
         volume = str(volume_match.group(1)) if volume_match else None
         if not volume:
-            volume = str(volume_match.group(2)) if volume_match else None
+            volume_match = re.search(r'Vol\. (\d+)', article_text)
+            volume = str(volume_match.group(1)) if volume_match else None
 
         # Extract pp
         pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
@@ -183,7 +184,8 @@ def process_article(title, article_url):
         if not msc_match:
             msc_match = re.search(r'Math Subject Classifications:(.*?)\.', html)
         if msc_match:
-            msc = msc_match.group(1).strip().strip('.').strip()
+            msc = re.sub(r'<[^>]+>', '', msc_match.group(1).strip())
+            msc = msc.strip('.').strip()
             msc = re.split(r', |;', msc)
         else:
             msc = None
@@ -217,68 +219,85 @@ def process_article(title, article_url):
                 for cell in cells:
                     cell = cell.split("\n")
                     cell = [element.replace('email: ', '') for element in cell]
-                    cell = [c.strip() for c in cell]
+                    cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
                     # Data processing
-                    authors.append(cell[0])
-                    name = cell[0].split(" ")
-                    affiliation = ', '.join(cell[1:-1])
-                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                    email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
-                    email = email_match.group() if email_match else None
+                    if cell[0]:
+                        authors.append(cell[0])
+                        name = cell[0].split(" ")
+                        affiliation = ', '.join(cell[1:-1])
+                        affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
+                        email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
+                        email = email_match.group() if email_match else None
 
-                    author_data = {
-                        "author_id": str(uuid.uuid4()),
-                        "from_article": article_id,
-                        "firstname": name[0],
-                        "lastname": name[-1],
-                        "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None,
-                        "affiliation": [{
-                            "year": volume,
-                            "affiliation": affiliation,
-                            "email": email
-                        }]
-                    }
-                    authorData.append(author_data)
+                        author_data = {
+                            "author_id": str(uuid.uuid4()),
+                            "from_article": article_id,
+                            "firstname": name[0],
+                            "lastname": name[-1],
+                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
+                                name) > 2 else None,
+                            "affiliation": [{
+                                "year": volume,
+                                "affiliation": affiliation,
+                                "email": email
+                            }]
+                        }
+                        authorData.append(author_data)
         # If no author table
         else:
+            match_type = 0
             pattern = r'<hr>(.*?)<hr>'
             matches = str(re.findall(pattern, html, re.DOTALL))
+            if len(matches) < 5:
+                match_type = 1
+                last_p_tag = str(article_soup.find_all('p')[-1])
+                pattern = r'<p>(.*?)<hr/>'
+                matches = re.search(pattern, str(last_p_tag), re.DOTALL).group(1).strip()
+
             if matches:
                 matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
                 matches = matches.split("<p>")
-
                 for match in matches:
                     match = re.sub(r'<[^>]+>', '', match)
                     match = match.lstrip("\\n ").rstrip("\\n ").strip()
-                    match = match.split("\\n")
+                    if match_type == 0:
+                        match = match.split("\\n")
+                    else:
+                        match = match.split("\n")
                     match = [element.replace('email: ', '') for element in match]
-                    match = [m.strip() for m in match]
+                    match = [re.sub(r'\s+', ' ', m).strip() for m in match]
 
                     # Data processing
-                    authors.append(match[0])
-                    name = match[0].split(" ")
-                    affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
-                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
-                    email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
-                    email = email_match.group() if email_match else None
+                    if match[0]:
+                        authors.append(match[0])
+                        name = match[0].split(" ")
+                        affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
+                        affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
+                        email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])
+                        email = email_match.group() if email_match else None
 
-                    author_data = {
-                        "author_id": str(uuid.uuid4()),
-                        "from_article": article_id,
-                        "firstname": name[0],
-                        "lastname": name[-1],
-                        "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None,
-                        "affiliation": [{
-                            "year": volume,
-                            "affiliation": affiliation,
-                            "email": email
-                        }]
-                    }
-                    authorData.append(author_data)
+                        author_data = {
+                            "author_id": str(uuid.uuid4()),
+                            "from_article": article_id,
+                            "firstname": name[0],
+                            "lastname": name[-1],
+                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
+                                name) > 2 else None,
+                            "affiliation": [{
+                                "year": volume,
+                                "affiliation": affiliation,
+                                "email": email
+                            }]
+                        }
+                        authorData.append(author_data)
             else:
                 print("AUTHOR SEARCHING ERROR:", article_url)
-                return
+                fail = {
+                    "title": title,
+                    "URL": article_url
+                }
+                failedFormatData.append(fail)
 
         # Article info
         article_data = {
@@ -376,7 +395,7 @@ if len(authorData) > 0:
 
 # Save error record
 if len(totallyFailedData) > 0:
-    ejde_save.save_data(failedData, "Error", "Failed_article_record.json")
+    ejde_save.save_data(totallyFailedData, "Error", "Failed_article_record.json")
     print("Total failed processing paper:", len(totallyFailedData))
 
 if len(failedVolData) > 0:

From f97195c94dab9ace48e72f7c7f2d30ef9bea6ecc Mon Sep 17 00:00:00 2001
From: ldy <1913292237@qq.com>
Date: Fri, 11 Aug 2023 18:05:15 +0800
Subject: [PATCH 3/7] Bug Fix: extract the title from the article page when the
 volume page has no title

---
 01_EJDE_spider/ejde_main.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 3c9c71a..53228d4 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -152,6 +152,11 @@ def process_article(title, article_url):
         article_soup = BeautifulSoup(html, 'html.parser')
         article_text = article_soup.get_text()
 
+        # Extract title if title == None
+        if not title:
+            title_match = re.search(r"<h3>(.*?)<p>", article_text)
+            title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+
         # Extract volume
         volume_match = re.search(r'Vol\. (\d+) \((\d+)\)', article_text)
         volume = str(volume_match.group(1)) if volume_match else None

From 68a755a633b155aa10cb8c048cc7358b12b24304 Mon Sep 17 00:00:00 2001
From: ldy <1913292237@qq.com>
Date: Fri, 11 Aug 2023 19:13:33 +0800
Subject: [PATCH 4/7] Bug Fix: 1. split author data on "\r" as well as "\n" 2.
 split names on "." as well as " " 3. added author info extraction for pages
 with 3 or more hr tags

---
 01_EJDE_spider/ejde_main.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 53228d4..2ea14c9 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -222,14 +222,14 @@ def process_article(title, article_url):
             for row in table.find_all('tr'):
                 cells = [cell.text.strip() for cell in row.find_all('td')]
                 for cell in cells:
-                    cell = cell.split("\n")
+                    cell = re.split(r'[\r\n]+', cell)
                     cell = [element.replace('email: ', '') for element in cell]
                     cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
                     # Data processing
                     if cell[0]:
                         authors.append(cell[0])
-                        name = cell[0].split(" ")
+                        name = re.split(r'[ .]', cell[0])
                         affiliation = ', '.join(cell[1:-1])
                         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                         email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1])
@@ -252,7 +252,11 @@ def process_article(title, article_url):
         # If no author table
         else:
             match_type = 0
-            pattern = r'<hr>(.*?)<hr>'
+            hr_count = len(soup.find_all('hr'))
+            if hr_count < 3:
+                pattern = r'<hr>(.*?)<hr>'
+            else:
+                pattern = r'<hr>(?:.*<hr>)(.*)(?=<hr>)'
             matches = str(re.findall(pattern, html, re.DOTALL))
             if len(matches) < 5:
                 match_type = 1
@@ -276,7 +280,7 @@ def process_article(title, article_url):
                     # Data processing
                     if match[0]:
                         authors.append(match[0])
-                        name = match[0].split(" ")
+                        name = re.split(r'[ .]', match[0])
                         affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip()
                         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip()
                         email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1])

From b1eba69085d8b8c8ecdd31eaf1fb68bdf47b9322 Mon Sep 17 00:00:00 2001
From: ldy <1913292237@qq.com>
Date: Fri, 11 Aug 2023 19:16:03 +0800
Subject: [PATCH 5/7] Bug Fix: 1. hr_count soup should be article_soup

---
 01_EJDE_spider/ejde_main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index 2ea14c9..d13393f 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -252,7 +252,7 @@ def process_article(title, article_url):
         # If no author table
         else:
             match_type = 0
-            hr_count = len(soup.find_all('hr'))
+            hr_count = len(article_soup.find_all('hr'))
             if hr_count < 3:
                 pattern = r'<hr>(.*?)<hr>'
             else:

From ed469ee362cef9708f37a54526815a2075276875 Mon Sep 17 00:00:00 2001
From: ldy <1913292237@qq.com>
Date: Fri, 11 Aug 2023 19:52:40 +0800
Subject: [PATCH 6/7] Bug Fix: 1. reformat regular expressions for keyword
 matching

---
 01_EJDE_spider/ejde_main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index d13393f..bd8429c 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -196,11 +196,11 @@ def process_article(title, article_url):
             msc = None
 
         # Extract KeyWords
-        keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
+        keywords_match = re.search(r'Key Words: (.*?)(?:<br>|<p>|$)', html, re.DOTALL)
         if not keywords_match:
-            keywords_match = re.search(r'Key Words: (.*?)<br>', html, re.DOTALL)
+            keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL)
         if keywords_match:
-            keywords = keywords_match.group(1).strip().replace('\n', '')
+            keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
             keywords = re.split(r', |;', keywords)
             keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords]
         else:

From 083e6c87eb4da86d996adc791f3b618ddebe076e Mon Sep 17 00:00:00 2001
From: ldy <1913292237@qq.com>
Date: Fri, 11 Aug 2023 20:45:04 +0800
Subject: [PATCH 7/7] Optimization: strip "\newline" in author name

---
 01_EJDE_spider/ejde_main.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py
index bd8429c..17577cf 100644
--- a/01_EJDE_spider/ejde_main.py
+++ b/01_EJDE_spider/ejde_main.py
@@ -223,7 +223,7 @@ def process_article(title, article_url):
                 cells = [cell.text.strip() for cell in row.find_all('td')]
                 for cell in cells:
                     cell = re.split(r'[\r\n]+', cell)
-                    cell = [element.replace('email: ', '') for element in cell]
+                    cell = [c.replace('email: ', '').replace('\\newline', '') for c in cell]
                     cell = [re.sub(r'\s+', ' ', c).strip() for c in cell]
 
                     # Data processing
@@ -240,8 +240,8 @@ def process_article(title, article_url):
                             "from_article": article_id,
                             "firstname": name[0],
                             "lastname": name[-1],
-                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
-                                name) > 2 else None,
+                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
+                                           ] if len(name) > 2 else None,
                             "affiliation": [{
                                 "year": volume,
                                 "affiliation": affiliation,
@@ -269,12 +269,12 @@ def process_article(title, article_url):
                 matches = matches.split("<p>")
                 for match in matches:
                     match = re.sub(r'<[^>]+>', '', match)
-                    match = match.lstrip("\\n ").rstrip("\\n ").strip()
+                    match = match.lstrip("\\n ").lstrip("\n ").rstrip("\\n ").rstrip("\n ").strip()
                     if match_type == 0:
                         match = match.split("\\n")
                     else:
                         match = match.split("\n")
-                    match = [element.replace('email: ', '') for element in match]
+                    match = [m.replace('email: ', '').replace('\\newline', '') for m in match]
                     match = [re.sub(r'\s+', ' ', m).strip() for m in match]
 
                     # Data processing
@@ -291,8 +291,8 @@ def process_article(title, article_url):
                             "from_article": article_id,
                             "firstname": name[0],
                             "lastname": name[-1],
-                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(
-                                name) > 2 else None,
+                            "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0
+                                           ] if len(name) > 2 else None,
                             "affiliation": [{
                                 "year": volume,
                                 "affiliation": affiliation,
@@ -420,4 +420,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
 ejde_save.Transf()
-# ejde_save.delete()
+ejde_save.delete()