From 2fc3b85babaadbccac6908d1a07601afa9e60a75 Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Tue, 1 Aug 2023 19:11:24 +0800
Subject: [PATCH] Correct the loops so the same data is no longer added
 repeatedly

---
 EJDE_spider/ejde_save.py          | 36 +++++++++++++++-------------
 EJQTDE_spider/ejqtde_main.py      | 11 +++++----
 EJQTDE_spider/ejqtde_save.py      | 34 ++++++++++++++------------
 EJQTDE_spider/ejqtde_scrawler.py  | 10 ++++++--
 SpringerOpen_spider/SD_main.py    |  6 +----
 SpringerOpen_spider/SD_save.py    | 40 +++++++++++++++++--------------
 SpringerOpen_spider/SD_scrawl.py  |  8 ++++++-
 SpringerOpen_spider/SD_threads.py | 15 ++++--------
 8 files changed, 88 insertions(+), 72 deletions(-)

diff --git a/EJDE_spider/ejde_save.py b/EJDE_spider/ejde_save.py
index 5b67447..d00a89c 100644
--- a/EJDE_spider/ejde_save.py
+++ b/EJDE_spider/ejde_save.py
@@ -31,27 +31,31 @@ def Transf():
         with open(file_path, 'r', encoding='utf-8') as file:
             data = json.load(file)

-        for Dict in data:
-            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
-                # Select data
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+        for Dict in data:
+            if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year') is not None:
+                # Select data
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                    data_oldest.append(Dict)

-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                if (isinstance(Dict, dict) and 2010 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                    data_2010_2014.append(Dict)

-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                if (isinstance(Dict, dict) and 2015 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                    data_2015_2020.append(Dict)

-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                    data_newest.append(Dict)

-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+    # Transfer
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

-    # Transfer
-    for index in range(0, 4):
-        with open(output_files[index], 'w', encoding='utf-8') as file:
-            json.dump(Data[index], file, indent=4)
+    for index in range(0, 4):
+        with open(output_files[index], 'w', encoding='utf-8') as file:
+            json.dump(Data[index], file, indent=4)

 # The path of reading
 author_folder_path = './ejde_buffer/Author_TS'
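Note on this fix: the old code ran a full list comprehension over `data` inside `for Dict in data`, so every matching record was re-appended once per record in the file; the new code classifies each record exactly once. A minimal, self-contained sketch of the corrected single-pass bucketing (the `year_of` helper and the sample records are illustrative only, not part of this patch):

    def year_of(record):
        # Same fallback as the patch: prefer 'volume', else the first
        # affiliation's 'year' (0 when both are missing).
        return int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))

    def bucket_by_year(data):
        data_oldest, data_2010_2014, data_2015_2020, data_newest = [], [], [], []
        for record in data:  # single pass: each record is classified exactly once
            if not isinstance(record, dict):
                continue
            year = year_of(record)
            if year <= 2009:
                data_oldest.append(record)
            elif year <= 2014:
                data_2010_2014.append(record)
            elif year <= 2020:
                data_2015_2020.append(record)
            else:
                data_newest.append(record)
        return [data_oldest, data_2010_2014, data_2015_2020, data_newest]

    # Illustrative sample records
    sample = [{'volume': '2008'}, {'volume': '2012'}, {'affiliation': [{'year': 2022}]}]
    print([len(bucket) for bucket in bucket_by_year(sample)])  # -> [1, 1, 0, 1]

On this sample each record lands in exactly one bucket, regardless of how many records the file contains.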
diff --git a/EJQTDE_spider/ejqtde_main.py b/EJQTDE_spider/ejqtde_main.py
index 4917e6c..8d05b6f 100644
--- a/EJQTDE_spider/ejqtde_main.py
+++ b/EJQTDE_spider/ejqtde_main.py
@@ -15,10 +15,10 @@ from urllib.parse import urljoin
 Target site: 'https://www.math.u-szeged.hu/ejqtde'

 ========== Run order ==========
-    1. ejqtde_main: get the journal links for each year
-    2. ejqtde_scrawler: scrape each article's info and author info -> call ejqtde_save -> stage in small JSON files
-    3. ejqtde_save: read the staged small files locally, filter them, and store them in large per-year files
-    *4. ejqtde_save.delete() (optional): delete all files in the staging area (back up first)
+    1. ejqtde_main: get the journal links for each year
+    2. ejqtde_scrawler: scrape each article's info and author info -> call ejqtde_save -> stage in small JSON files
+    3. ejqtde_save: read the staged small files locally, filter them, and store them in large per-year files
+    *4. ejqtde_save.delete() (optional): delete all files in the staging area (back up first)
 '''


@@ -49,9 +49,10 @@ Article_list = []
 hrefs = []

 # Base web urls
-baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 current_year = datetime.datetime.now().year
 years = range(2009, 2011)
 # years = range(2010, current_year + 1)
+
+baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 url_list = ['https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek=' +
             f'{year}' for year in years][::-1]

diff --git a/EJQTDE_spider/ejqtde_save.py b/EJQTDE_spider/ejqtde_save.py
index 5ae8e54..62c990e 100644
--- a/EJQTDE_spider/ejqtde_save.py
+++ b/EJQTDE_spider/ejqtde_save.py
@@ -34,26 +34,30 @@ def Transf():
             data = json.load(file)

         for Dict in data:
-            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
-                # Select data
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+            if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year') is not None:
+                # Filter articles
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                    data_oldest.append(Dict)

-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                if (isinstance(Dict, dict) and 2010 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                    data_2010_2014.append(Dict)

-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                if (isinstance(Dict, dict) and 2015 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                    data_2015_2020.append(Dict)

-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                    data_newest.append(Dict)

-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+    # Transfer
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

-    # Transfer
-    for index in range(0, 4):
-        with open(output_files[index], 'w', encoding='utf-8') as file:
-            json.dump(Data[index], file, indent=4)
+    for index in range(0, 4):
+        with open(output_files[index], 'w', encoding='utf-8') as file:
+            json.dump(Data[index], file, indent=4)

 # The path of reading
 author_folder_path = './EJQTDE_buffer/Author_TS'

diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/EJQTDE_spider/ejqtde_scrawler.py
index 1f3975d..9601f6f 100644
--- a/EJQTDE_spider/ejqtde_scrawler.py
+++ b/EJQTDE_spider/ejqtde_scrawler.py
@@ -94,10 +94,16 @@ def article_detail(Data, URL, article_id, Aricle_list):
         del Author[-1]

     # Submit_datetime and publish_datetime
+    def timeSet(time):
+        # Normalize a 'YYYY-MM-DD' string by dropping the month's leading zero.
+        time = time.split('-')
+        time[1] = time[1].lstrip('0')
+        time = time[0] + '-' + time[1] + '-' + time[2]
+        return time
+
     time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
     time = re.findall(r'\d+-\d+-\d+', str(time))
-    Submit_date = time[0] if time[0] else None
-    Publish_date = time[1] if time[1] else None
+    Submit_date = timeSet(time[0]) if len(time) > 0 else None
+    Publish_date = timeSet(time[1]) if len(time) > 1 else None
     # Keyword
     Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None

diff --git a/SpringerOpen_spider/SD_main.py b/SpringerOpen_spider/SD_main.py
index c56f993..a4b5c5d 100644
--- a/SpringerOpen_spider/SD_main.py
+++ b/SpringerOpen_spider/SD_main.py
@@ -22,10 +22,6 @@ from urllib.parse import urljoin
 Webs = []   # A list for web url
 Links = []  # A list for links

-# Empty lists for the scraped data
-Article_data = []
-Author_data = []
-
 # ========== Visit the article list pages ==========
 # Get the links of the mathematics journal sites
 headers = SD_link.header()
@@ -60,7 +56,7 @@ for web in Webs:
 print('\nThe links have been stored!\n')

 # Enter the thread pool and start scraping
-SD_threads.Threads(Links, Article_data, Author_data)
+SD_threads.Threads(Links)

 # Consolidate the JSON files
 SD_save.Transf()
diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py
index 0d7119d..dfa8df5 100644
--- a/SpringerOpen_spider/SD_save.py
+++ b/SpringerOpen_spider/SD_save.py
@@ -32,31 +32,34 @@ def Transf():
             data = json.load(file)

         for Dict in data:
-            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+            if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year') is not None:
                 # Filter articles
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                    data_oldest.append(Dict)

-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                if (isinstance(Dict, dict) and 2010 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                    data_2010_2014.append(Dict)

-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                if (isinstance(Dict, dict) and 2015 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                    data_2015_2020.append(Dict)

-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                    data_newest.append(Dict)

-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+    # Transfer
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

-    # Transfer
-    for index in range(0, 4):
-        with open(output_files[index], 'w', encoding='utf-8') as file:
-            json.dump(Data[index], file, indent=4)
+    for index in range(0, 4):
+        with open(output_files[index], 'w', encoding='utf-8') as file:
+            json.dump(Data[index], file, indent=4)

-    # Read paths
-
-    author_folder_path = './SpringerOpen_buffer/Author'
-    article_folder_path = './SpringerOpen_buffer/Article'
+    # Read paths
+    author_folder_path = './SpringerOpen_buffer/Author_TS'
+    article_folder_path = './SpringerOpen_buffer/Article_TS'

 # Output paths
 author_output_file = [
@@ -80,6 +83,7 @@ def Transf():
     # End
     print("\nData has been written into files.")

+
 # Delete the staging-area files
 def delete(folder_path):
     file_names = os.listdir(folder_path)
diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py
index 3f9099c..31dbee4 100644
--- a/SpringerOpen_spider/SD_scrawl.py
+++ b/SpringerOpen_spider/SD_scrawl.py
@@ -8,7 +8,11 @@ import SD_detail
 import SD_save

 # ========== Get the article detail page links ==========
-def Scrawl(Link, Article_data, Author_data):
+def Scrawl(Link):
+    # Empty lists for the scraped data
+    Article_data = []
+    Author_data = []
+
     # Visit the article list page
     headers = SD_link.header()
     soup = SD_link.Link(Link, headers)
@@ -35,6 +39,8 @@ def Scrawl(Link):
         Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
         Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)

+    print('Complete: ', Link)
+
     # Stage into a small JSON file
     if Article_data:
         index = str(uuid.uuid4())
diff --git a/SpringerOpen_spider/SD_threads.py b/SpringerOpen_spider/SD_threads.py
index d5d7527..74e649b 100644
--- a/SpringerOpen_spider/SD_threads.py
+++ b/SpringerOpen_spider/SD_threads.py
@@ -1,24 +1,19 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 import SD_scrawl

+
 # ========== Multithreading ==========
-def Threads(Links, Article_data, Author_data):
-    executor = ThreadPoolExecutor(max_workers=20)  # thread pool
+def Threads(Links):
+    executor = ThreadPoolExecutor(max_workers=25)  # thread pool

     # Dispatch the scraping tasks to the pool
-    futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links]
-
-    # max_iterations = 5  # Maximum number of simultaneous runs
-    # iteration_count = 0  # Counter
+    futures = [executor.submit(SD_scrawl.Scrawl, Link) for Link in Links]

     # Wait for all tasks to complete
     for future in as_completed(futures):
         try:
             future.result()
-            # # Limit the maximum number of simultaneous crawls
-            # iteration_count += 1  # Increment the counter
-            # if iteration_count >= max_iterations:
-            #     break
+
         except Exception as e:
             print("An error occurred:", str(e))
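Note on the SpringerOpen changes: `Article_data` and `Author_data` used to be module-level lists handed to all worker threads, so every thread appended into, and then re-staged, the same ever-growing lists; making them local to each `Scrawl` call removes that source of duplication. A minimal sketch of the per-task-local pattern under the same thread-pool setup (the `fetch_articles` stand-in and the `./buffer` path are hypothetical, not the project's real API):

    import json
    import os
    import uuid
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fetch_articles(link):
        # Hypothetical stand-in for the real page scraping done in SD_scrawl.
        return [{'source': link, 'article_id': str(uuid.uuid4())}]

    def scrawl(link):
        article_data = []  # local to this call: nothing is shared between threads
        article_data.extend(fetch_articles(link))
        if article_data:
            # Each task stages its own small JSON file under a fresh UUID.
            os.makedirs('./buffer', exist_ok=True)
            with open(f'./buffer/{uuid.uuid4()}.json', 'w', encoding='utf-8') as file:
                json.dump(article_data, file, indent=4)
        print('Complete:', link)

    def threads(links):
        with ThreadPoolExecutor(max_workers=25) as executor:
            futures = [executor.submit(scrawl, link) for link in links]
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    print('An error occurred:', str(e))

    threads(['https://example.org/journal/a', 'https://example.org/journal/b'])

With task-local lists, a record can only be staged by the task that scraped it, which is exactly the invariant the shared-list design broke.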