Corrected the loops; the program will no longer add the same data repeatedly.
This commit is contained in:
parent
01c1a7d978
commit
2fc3b85bab
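In outline, the change replaces list comprehensions over the whole dataset (re-evaluated on every loop iteration, so each matching record was appended once per pass) with append calls guarded by the year test, so each record is added exactly once. A minimal sketch of that bucketing pattern with hypothetical records; the 'volume' / affiliation 'year' fields mirror the files changed below:

```python
import json

# Hypothetical input: a list of record dicts, as loaded from one staged JSON file.
data = json.loads('[{"volume": 2008}, {"volume": 2012, "affiliation": [{"year": 2012}]}, {"volume": 2023}]')

data_oldest, data_2010_2014, data_2015_2020, data_newest = [], [], [], []

for record in data:
    if not isinstance(record, dict):
        continue
    # Prefer 'volume'; fall back to the first affiliation's 'year'.
    year = int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))
    if year <= 2009:
        data_oldest.append(record)        # appended once, not once per loop pass
    elif year <= 2014:
        data_2010_2014.append(record)
    elif year <= 2020:
        data_2015_2020.append(record)
    else:
        data_newest.append(record)

print(len(data_oldest), len(data_2010_2014), len(data_2015_2020), len(data_newest))  # 1 1 0 1
```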
@@ -31,27 +31,31 @@ def Transf():
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)

for Dict in data:
if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select data
data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
for Dict in data:
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select data
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
data_oldest.append(Dict)

data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
data_2010_2014.append(Dict)

data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
data_2015_2020.append(Dict)

data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
data_newest.append(Dict)

Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
# Transfer
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

# Transfer
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)

# The path of reading
author_folder_path = './ejde_buffer/Author_TS'
@@ -15,10 +15,10 @@ from urllib.parse import urljoin
Crawled site: 'https://www.math.u-szeged.hu/ejqtde'

==========Run order==========
1. ejqtde_main: fetch the journal links for each year
2. ejqtde_scrawler: scrape each paper's details and author information -> calls ejqtde_save -> stores them in small temporary JSON files
3. ejqtde_save: read the staged small files locally, filter them, and write them into the large per-year files
*4. ejqtde_save.delete() (optional): delete every file in the staging area (back up first)
1. ejqtde_main: fetch the journal links for each year
2. ejqtde_scrawler: scrape each paper's details and author information -> calls ejqtde_save -> stores them in small temporary JSON files
3. ejqtde_save: read the staged small files locally, filter them, and write them into the large per-year files
*4. ejqtde_save.delete() (optional): delete every file in the staging area (back up first)
'''
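A minimal driver sketch of that run order; only ejqtde_save.Transf(), ejqtde_save.delete(folder) and the './EJQTDE_buffer/Author_TS' path are visible in these diffs, everything else is assumed:

```python
# Hypothetical driver following the run order documented above.
import ejqtde_save

# 1.-2. Run ejqtde_main and ejqtde_scrawler first to stage the small JSON files.

# 3. Merge the staged files into the per-year output files.
ejqtde_save.Transf()

# 4. (optional) Clear the staging area once the merged files are backed up.
# ejqtde_save.delete('./EJQTDE_buffer/Author_TS')
```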
@@ -49,9 +49,10 @@ Article_list = []
hrefs = []

# Base web urls
baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
current_year = datetime.datetime.now().year
years = range(2009, 2011)  # years = range(2010, current_year + 1)

baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
url_list = ['https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
+ f'{year}' for year in years][::-1]
@@ -34,26 +34,30 @@ def Transf():
data = json.load(file)

for Dict in data:
if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select data
data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select the articles
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
data_oldest.append(Dict)

data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
data_2010_2014.append(Dict)

data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
data_2015_2020.append(Dict)

data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
data_newest.append(Dict)

Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
# Transfer
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

# Transfer
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)

# The path of reading
author_folder_path = './EJQTDE_buffer/Author_TS'
@@ -94,10 +94,16 @@ def article_detail(Data, URL, article_id, Aricle_list):
del Author[-1]

# Submit_datetime and publish_datetime
def timeSet(time):
time = time.split('-')
time[1] = time[1].strip('0')
time = time[0] + '-' + time[1] + '-' + time[2]
return time

time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
time = re.findall(r'\d+-\d+-\d+', str(time))
Submit_date = time[0] if time[0] else None
Publish_date = time[1] if time[1] else None
Submit_date = timeSet(time[0]) if time[0] else None
Publish_date = timeSet(time[1]) if time[1] else None

# Keyword
Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
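For illustration, a standalone check of the new date helper with hypothetical values. Note that str.strip('0') trims zeros from both ends of the month field, so '05' becomes '5' while '10' becomes '1':

```python
def timeSet(time):
    # Same logic as the helper above: drop the zero padding from the month field.
    time = time.split('-')
    time[1] = time[1].strip('0')
    return time[0] + '-' + time[1] + '-' + time[2]

print(timeSet('2009-05-12'))  # 2009-5-12
print(timeSet('2009-10-12'))  # 2009-1-12 (both zeros of '10' are stripped)
```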
@@ -22,10 +22,6 @@ from urllib.parse import urljoin
Webs = []  # A list for web url
Links = []  # A list for links

# Empty lists for the scraped data
Article_data = []
Author_data = []

# ==========Visit the article list pages==========
# Get the links to the mathematics journal sites
headers = SD_link.header()
@@ -60,7 +56,7 @@ for web in Webs:
print('\nThe links have been stored!\n')

# Enter the thread pool and start crawling
SD_threads.Threads(Links, Article_data, Author_data)
SD_threads.Threads(Links)

# Merge the JSON files
SD_save.Transf()
@@ -32,31 +32,34 @@ def Transf():
data = json.load(file)

for Dict in data:
if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select the articles
data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
data_oldest.append(Dict)

data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
if (isinstance(Dict, dict) and 2010 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
data_2010_2014.append(Dict)

data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
if (isinstance(Dict, dict) and 2015 <= int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
data_2015_2020.append(Dict)

data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
if (isinstance(Dict, dict) and int(
Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
data_newest.append(Dict)

Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
# Transfer
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

# Transfer
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)

# The path of reading

author_folder_path = './SpringerOpen_buffer/Author'
article_folder_path = './SpringerOpen_buffer/Article'
# The path of reading
author_folder_path = './SpringerOpen_buffer/Author_TS'
article_folder_path = './SpringerOpen_buffer/Article_TS'

# The path of storage
author_output_file = [
@@ -80,6 +83,7 @@ def Transf():
# End
print("\nData has been written into files.")


# Delete the files in the staging area
def delete(folder_path):
file_names = os.listdir(folder_path)
@@ -8,7 +8,11 @@ import SD_detail
import SD_save

# ==========Get the links to the article detail pages==========
def Scrawl(Link, Article_data, Author_data):
def Scrawl(Link):
# Empty lists for the scraped data
Article_data = []
Author_data = []

# Visit the article list page
headers = SD_link.header()
soup = SD_link.Link(Link, headers)
@@ -35,6 +39,8 @@ def Scrawl(Link, Article_data, Author_data):
Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)

print('Complete: ', Link)

# Stage in a small temporary JSON file
if Article_data:
index = str(uuid.uuid4())
@@ -1,24 +1,19 @@
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
import SD_scrawl


# ==========Multithreaded processing==========
def Threads(Links, Article_data, Author_data):
executor = ThreadPoolExecutor(max_workers=20)  # Thread pool
def Threads(Links):
executor = ThreadPoolExecutor(max_workers=25)  # Thread pool

# Run the multithreaded processing
futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links]

# max_iterations = 5  # Maximum number of tasks running at the same time
# iteration_count = 0  # Counter
futures = [executor.submit(SD_scrawl.Scrawl, Link) for Link in Links]

# Wait for all tasks to finish
for future in as_completed(futures):
try:
future.result()
# # Limit the maximum number of concurrent crawls
# iteration_count += 1  # Increment the counter
# if iteration_count >= max_iterations:
# break

except Exception as e:
print("An error occurred:", str(e))
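In SD_scrawl and SD_threads, each worker now builds its own result lists instead of appending to lists shared across all threads. A minimal, hypothetical sketch of that pattern (not the repository's code):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def scrape_one(link):
    # Local lists: nothing is shared between worker threads, so a record
    # can only be appended by the single call that produced it.
    article_data, author_data = [], []
    article_data.append({'link': link})  # placeholder for the real parsing
    return article_data, author_data

links = ['page-1', 'page-2', 'page-3']  # placeholder links
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(scrape_one, link) for link in links]
    for future in as_completed(futures):
        articles, authors = future.result()
        print(len(articles), len(authors))  # 1 0 per link, no duplicates
```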