Corrected the loops; the program no longer adds the same data repeatedly

This commit is contained in:
XCX 2023-08-01 19:11:24 +08:00
parent 01c1a7d978
commit 2fc3b85bab
8 changed files with 88 additions and 72 deletions
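
The core of the fix: each filtering list comprehension re-scanned the whole of data on every pass of the for Dict in data loop, so every matching record was appended once per iteration. A minimal sketch of the before/after pattern (illustrative only, not the project's actual code):

# Before: the comprehension iterates over all of data for every Dict in the loop,
# so each matching record ends up appended len(data) times.
data = [{"volume": "2008"}, {"volume": "2012"}]
buggy_oldest = []
for Dict in data:
    buggy_oldest += [d for d in data if int(d["volume"]) <= 2009]
print(len(buggy_oldest))  # 2 -- the single 2008 record is added twice

# After: only the current record is tested and appended at most once.
fixed_oldest = []
for Dict in data:
    if int(Dict["volume"]) <= 2009:
        fixed_oldest.append(Dict)
print(len(fixed_oldest))  # 1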

View File

@@ -32,23 +32,27 @@ def Transf():
             data = json.load(file)
             for Dict in data:
-                if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                     # Select data
-                    data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                        data_oldest.append(Dict)
-                    data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                    if (isinstance(Dict, dict) and 2010 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                        data_2010_2014.append(Dict)
-                    data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                    if (isinstance(Dict, dict) and 2015 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                        data_2015_2020.append(Dict)
-                    data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                        data_newest.append(Dict)
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     # Transfer
-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     for index in range(0, 4):
         with open(output_files[index], 'w', encoding='utf-8') as file:
             json.dump(Data[index], file, indent=4)

View File

@@ -49,9 +49,10 @@ Article_list = []
 hrefs = []
 # Base web urls
+baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 current_year = datetime.datetime.now().year
 years = range(2009, 2011)  # years = range(2010, current_year + 1)
-baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 url_list = ['https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
             + f'{year}' for year in years][::-1]
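
For reference, a quick standalone expansion of the comprehension above with the temporary test range; the [::-1] slice simply reverses the list so the newest year is crawled first:

years = range(2009, 2011)
base = 'https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek='
url_list = [base + f'{year}' for year in years][::-1]
# ['...param_ertek=2010', '...param_ertek=2009']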

View File

@@ -34,23 +34,27 @@ def Transf():
             data = json.load(file)
             for Dict in data:
-                if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
-                    # Select data
+                    # 筛选文章
-                    data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                        data_oldest.append(Dict)
-                    data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                    if (isinstance(Dict, dict) and 2010 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                        data_2010_2014.append(Dict)
-                    data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                    if (isinstance(Dict, dict) and 2015 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                        data_2015_2020.append(Dict)
-                    data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                        data_newest.append(Dict)
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     # Transfer
-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     for index in range(0, 4):
         with open(output_files[index], 'w', encoding='utf-8') as file:
             json.dump(Data[index], file, indent=4)

View File

@@ -94,10 +94,16 @@ def article_detail(Data, URL, article_id, Aricle_list):
         del Author[-1]
     # Submit_datetime and publish_datetime
+    def timeSet(time):
+        time = time.split('-')
+        time[1] = time[1].strip('0')
+        time = time[0] + '-' + time[1] + '-' + time[2]
+        return time
     time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
     time = re.findall(r'\d+-\d+-\d+', str(time))
-    Submit_date = time[0] if time[0] else None
-    Publish_date = time[1] if time[1] else None
+    Submit_date = timeSet(time[0]) if time[0] else None
+    Publish_date = timeSet(time[1]) if time[1] else None
     # Keyword
     Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None
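
The new timeSet helper trims the zero padding from the month field of the scraped 'YYYY-MM-DD' dates before they are stored. A quick standalone check of its behaviour; note that str.strip('0') removes zeros from both ends of the month, so a month of '10' also loses its trailing zero (lstrip('0') would avoid that):

def timeSet(time):
    # split 'YYYY-MM-DD' and drop the zero padding from the month part
    time = time.split('-')
    time[1] = time[1].strip('0')
    return time[0] + '-' + time[1] + '-' + time[2]

print(timeSet('2023-08-01'))  # 2023-8-01
print(timeSet('2023-10-05'))  # 2023-1-05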

View File

@@ -22,10 +22,6 @@ from urllib.parse import urljoin
 Webs = []  # A list for web url
 Links = []  # A list for links
-# 存放爬取数据的空列表
-Article_data = []
-Author_data = []
 # ==========访问论文列表页==========
 # 获取数学类期刊网站链接
 headers = SD_link.header()
@@ -60,7 +56,7 @@ for web in Webs:
 print('\nThe links have been stored!\n')
 # 进入多线程池开始爬取
-SD_threads.Threads(Links, Article_data, Author_data)
+SD_threads.Threads(Links)
 # json文件汇总
 SD_save.Transf()

View File

@@ -32,31 +32,34 @@ def Transf():
             data = json.load(file)
             for Dict in data:
-                if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                     # 筛选文章
-                    data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                        data_oldest.append(Dict)
-                    data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                    if (isinstance(Dict, dict) and 2010 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                        data_2010_2014.append(Dict)
-                    data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                    if (isinstance(Dict, dict) and 2015 <= int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                        data_2015_2020.append(Dict)
-                    data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                    if (isinstance(Dict, dict) and int(
+                            Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                        data_newest.append(Dict)
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     # 转存
-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
     for index in range(0, 4):
         with open(output_files[index], 'w', encoding='utf-8') as file:
             json.dump(Data[index], file, indent=4)
     # 读取路径
-    author_folder_path = './SpringerOpen_buffer/Author'
-    article_folder_path = './SpringerOpen_buffer/Article'
+    author_folder_path = './SpringerOpen_buffer/Author_TS'
+    article_folder_path = './SpringerOpen_buffer/Article_TS'
     # 存储路径
     author_output_file = [
@@ -80,6 +83,7 @@ def Transf():
     # End
     print("\nData has been written into files.")
# 删除暂存区文件
 def delete(folder_path):
     file_names = os.listdir(folder_path)

View File

@@ -8,7 +8,11 @@ import SD_detail
 import SD_save
 # ==========获取论文详情页链接==========
-def Scrawl(Link, Article_data, Author_data):
+def Scrawl(Link):
+    # 存放爬取数据的空列表
+    Article_data = []
+    Author_data = []
     # 访问论文列表页
     headers = SD_link.header()
     soup = SD_link.Link(Link, headers)
@@ -35,6 +39,8 @@ def Scrawl(Link, Article_data, Author_data):
         Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
         Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)
+    print('Complete: ', Link)
     # 放入json文件暂存小文件
     if Article_data:
         index = str(uuid.uuid4())

View File

@ -1,24 +1,19 @@
from concurrent.futures import ThreadPoolExecutor, as_completed, wait from concurrent.futures import ThreadPoolExecutor, as_completed, wait
import SD_scrawl import SD_scrawl
# ==========多线程处理========== # ==========多线程处理==========
def Threads(Links, Article_data, Author_data): def Threads(Links):
executor = ThreadPoolExecutor(max_workers=20) # 进程池 executor = ThreadPoolExecutor(max_workers=25) # 进程池
# 进行多线程处理 # 进行多线程处理
futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links] futures = [executor.submit(SD_scrawl.Scrawl, Link) for Link in Links]
# max_iterations = 5 # 最多同时进行数
# iteration_count = 0 # 计数器
# 等待所有进程完成 # 等待所有进程完成
for future in as_completed(futures): for future in as_completed(futures):
try: try:
future.result() future.result()
# # 限制最大同时爬取数
# iteration_count += 1 # Increment the counter
# if iteration_count >= max_iterations:
# break
except Exception as e: except Exception as e:
print("An error occurred:", str(e)) print("An error occurred:", str(e))