From 27707a058ccf63a8bbe094fda6bb5275fac833c1 Mon Sep 17 00:00:00 2001 From: XCX <1361986662@qq.com> Date: Sun, 13 Aug 2023 21:29:10 +0800 Subject: [PATCH] Change the data structure --- 01_EJDE_spider/ejde_main.py | 28 ++++++++++++++++------------ 02_EJQTDE_spider/ejqtde_scrawler.py | 22 ++++++++-------------- 04_SpringerOpen_spider/SD_detail.py | 15 +++++++-------- 3 files changed, 31 insertions(+), 34 deletions(-) diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 1876b22..4ae686f 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -7,7 +7,7 @@ import ejde_save from retrying import retry from datetime import datetime from bs4 import BeautifulSoup -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import ThreadPoolExecutor, as_completed, wait ''' 爬取网站:'ejde.math.txstate.edu' @@ -184,7 +184,7 @@ def process_article(title, article_url): msc = msc_match.group(1).strip().strip('.').strip() msc = re.split(r', |;', msc) else: - msc = None + msc = [] # Extract KeyWords keywords_match = re.search(r'Key Words: (.*?)(?=<)', html, re.DOTALL) @@ -195,7 +195,7 @@ def process_article(title, article_url): keywords = re.split(r', |;', keywords) keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords] else: - keywords = None + keywords = [] # Extract DOI doi_match = re.search(r'DOI: (.+)(?=<)', html) @@ -220,6 +220,7 @@ def process_article(title, article_url): # Data processing authors.append(cell[0]) name = cell[0].split(" ") + middle_name = ''.join(name[1:-1]) if name[1:-1] else None affiliation = ', '.join(cell[1:-1]) affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', cell[-1]) @@ -227,10 +228,10 @@ def process_article(title, article_url): author_data = { "author_id": str(uuid.uuid4()), - "from_article": article_id, - "firstname": name[0], - "lastname": 
name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None, + "from_article": [article_id], + "first_name": name[0], + "last_name": name[-1], + "middle_name": middle_name, "affiliation": [{ "year": volume, "affiliation": affiliation, @@ -256,6 +257,7 @@ def process_article(title, article_url): # Data processing authors.append(match[0]) name = match[0].split(" ") + middle_name = ''.join(name[1:-1]) if name[1:-1] else None affiliation = ''.join(match[1:-1]).lstrip(",").rstrip(",").strip() affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))).strip() email_match = re.search(r'[\w.-]+@[\w.-]+', match[-1]) @@ -263,10 +265,10 @@ def process_article(title, article_url): author_data = { "author_id": str(uuid.uuid4()), - "from_article": article_id, - "firstname": name[0], - "lastname": name[-1], - "middlename": [elem.strip() for elem in name[1:len(name) - 1] if len(elem.strip()) > 0] if len(name) > 2 else None, + "from_article": [article_id], + "first_name": name[0], + "last_name": name[-1], + "middle_name": middle_name, "affiliation": [{ "year": volume, "affiliation": affiliation, @@ -345,6 +347,8 @@ for future in as_completed(futures): except Exception as vol_err: print("VOLUME PROCESSING ERROR:", str(vol_err)) +wait(futures) + # Retry failed processing paper print("START RETRYING:", len(failedData)) while failedData: @@ -390,4 +394,4 @@ print("time elapsed: {:.2f}s".format(time.time() - start_time)) # Transfer to large file and delete the temporary storage files ejde_save.Transf() -# ejde_save.delete() +ejde_save.delete() diff --git a/02_EJQTDE_spider/ejqtde_scrawler.py b/02_EJQTDE_spider/ejqtde_scrawler.py index 9601f6f..9c1bd5d 100644 --- a/02_EJQTDE_spider/ejqtde_scrawler.py +++ b/02_EJQTDE_spider/ejqtde_scrawler.py @@ -63,10 +63,10 @@ def author_detail(Data, Year, article_id, Author_list): author_data = { "author_id": str(uuid.uuid4()), - "from_article": 
article_id, - "firstname": Firstname, - "lastname": Lastname, - "middlename": Middlename, + "from_article": [article_id], + "first_name": Firstname, + "last_name": Lastname, + "middle_name": Middlename, "affiliation": [ { "year": Year, @@ -94,24 +94,18 @@ def article_detail(Data, URL, article_id, Aricle_list): del Author[-1] # Submit_datetime and publish_datetime - def timeSet(time): - time = time.split('-') - time[1] = time[1].strip('0') - time = time[0] + '-' + time[1] + '-' + time[2] - return time - time = Data.find('td', attrs={'align': 'right', 'width': '50%'}) time = re.findall(r'\d+-\d+-\d+', str(time)) - Submit_date = timeSet(time[0]) if time[0] else None - Publish_date = timeSet(time[1]) if time[1] else None + Submit_date = time[0] if time[0] else None + Publish_date = time[1] if time[1] else None # Keyword Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None - Keyword = Keyword.split(', ') if Keyword is not None else None + Keyword = Keyword.split(', ') if Keyword is not None else [] # MSC MSC = Data.find('subjectcodes').get_text() if Data.find('subjectcodes') is not None else None - MSC = MSC.split(', ') if MSC is not None else None + MSC = MSC.split(', ') if MSC is not None else [] # DOI if len(re.findall(r' 0: diff --git a/04_SpringerOpen_spider/SD_detail.py b/04_SpringerOpen_spider/SD_detail.py index 2aae1f9..98c78b4 100644 --- a/04_SpringerOpen_spider/SD_detail.py +++ b/04_SpringerOpen_spider/SD_detail.py @@ -1,5 +1,6 @@ import uuid import calendar +from datetime import datetime # ==========获取细节========== @@ -36,9 +37,9 @@ def Author_dict(soup, article_id, Author_list): author_data = { "author_id": str(uuid.uuid4()), "from_article": article_id, - "firstname": Firstname, - "lastname": Lastname, - "middlename": Middlename, + "first_name": Firstname, + "last_name": Lastname, + "middle_name": Middlename, "affiliation": [ { "year": Year, @@ -87,10 +88,8 @@ def Article_dict(soup, url, article_id): Time = [] def 
timeSet(time): - time = time.split(' ') - time[1] = str(list(calendar.month_name).index(time[1])) - time = time[2] + '-' + time[1] + '-' + time[0] - return time + input_date = datetime.strptime(time, "%d %B %Y") + return input_date.strftime("%Y-%m-%d") time_list = info.find('ul', class_='c-bibliographic-information__list') times = time_list.find_all('time') @@ -112,7 +111,7 @@ def Article_dict(soup, url, article_id): Keyword.append(keyword) # MSC - MSC = None # SpringerOpen.com does not have MSC + MSC = [] # SpringerOpen.com does not have MSC # DOI DOI = info.find('li', class_='c-bibliographic-information__list-item c-bibliographic-information__list-item--doi')