From 2fc3b85babaadbccac6908d1a07601afa9e60a75 Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Tue, 1 Aug 2023 19:11:24 +0800
Subject: [PATCH] Correct the loops so the same data is no longer added
 repeatedly

---
 EJDE_spider/ejde_save.py          | 36 +++++++++++++++-------------
 EJQTDE_spider/ejqtde_main.py      | 11 +++++----
 EJQTDE_spider/ejqtde_save.py      | 34 ++++++++++++++------------
 EJQTDE_spider/ejqtde_scrawler.py  | 10 ++++++--
 SpringerOpen_spider/SD_main.py    |  6 +----
 SpringerOpen_spider/SD_save.py    | 40 +++++++++++++++++--------------
 SpringerOpen_spider/SD_scrawl.py  |  8 ++++++-
 SpringerOpen_spider/SD_threads.py | 15 ++++--------
 8 files changed, 88 insertions(+), 72 deletions(-)

diff --git a/EJDE_spider/ejde_save.py b/EJDE_spider/ejde_save.py
index 5b67447..d00a89c 100644
--- a/EJDE_spider/ejde_save.py
+++ b/EJDE_spider/ejde_save.py
@@ -31,27 +31,31 @@ def Transf():
         with open(file_path, 'r', encoding='utf-8') as file:
             data = json.load(file)

-        for Dict in data:
-            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
-                # Select data
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+        for Dict in data:
+            if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year') is not None:
+                # Select data
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                    data_oldest.append(Dict)

-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                if (isinstance(Dict, dict) and 2010 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                    data_2010_2014.append(Dict)

-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                if (isinstance(Dict, dict) and 2015 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                    data_2015_2020.append(Dict)

-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                    data_newest.append(Dict)

-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+    # Transfer
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

-    # Transfer
-    for index in range(0, 4):
-        with open(output_files[index], 'w', encoding='utf-8') as file:
-            json.dump(Data[index], file, indent=4)
+    for index in range(0, 4):
+        with open(output_files[index], 'w', encoding='utf-8') as file:
+            json.dump(Data[index], file, indent=4)

 # The path of reading
 author_folder_path = './ejde_buffer/Author_TS'
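Note on this fix: the old code ran a full list comprehension over `data` inside `for Dict in data`, so every matching record was re-appended once per record in the file; the new code classifies each record exactly once. A minimal, self-contained sketch of the corrected single-pass bucketing (the `year_of` helper and the sample records are illustrative only, not part of this patch):

    def year_of(record):
        # Same fallback as the patch: prefer 'volume', else the first
        # affiliation's 'year' (0 when both are missing).
        return int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))

    def bucket_by_year(data):
        data_oldest, data_2010_2014, data_2015_2020, data_newest = [], [], [], []
        for record in data:  # single pass: each record is classified exactly once
            if not isinstance(record, dict):
                continue
            year = year_of(record)
            if year <= 2009:
                data_oldest.append(record)
            elif year <= 2014:
                data_2010_2014.append(record)
            elif year <= 2020:
                data_2015_2020.append(record)
            else:
                data_newest.append(record)
        return [data_oldest, data_2010_2014, data_2015_2020, data_newest]

    # Illustrative sample records
    sample = [{'volume': '2008'}, {'volume': '2012'}, {'affiliation': [{'year': 2022}]}]
    print([len(bucket) for bucket in bucket_by_year(sample)])  # -> [1, 1, 0, 1]

On this sample each record lands in exactly one bucket, regardless of how many records the file contains.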
diff --git a/EJQTDE_spider/ejqtde_main.py b/EJQTDE_spider/ejqtde_main.py
index 4917e6c..8d05b6f 100644
--- a/EJQTDE_spider/ejqtde_main.py
+++ b/EJQTDE_spider/ejqtde_main.py
@@ -15,10 +15,10 @@ from urllib.parse import urljoin
 Target site: 'https://www.math.u-szeged.hu/ejqtde'

 ========== Run order ==========
-    1. ejqtde_main: get the journal links for each year
-    2. ejqtde_scrawler: scrape each article's info and author info -> call ejqtde_save -> stage in small JSON files
-    3. ejqtde_save: read the staged small files locally, filter them, and store them in large per-year files
-    *4. ejqtde_save.delete() (optional): delete all files in the staging area (back up first)
+    1. ejqtde_main: get the journal links for each year
+    2. ejqtde_scrawler: scrape each article's info and author info -> call ejqtde_save -> stage in small JSON files
+    3. ejqtde_save: read the staged small files locally, filter them, and store them in large per-year files
+    *4. ejqtde_save.delete() (optional): delete all files in the staging area (back up first)
 '''


@@ -49,9 +49,10 @@ Article_list = []
 hrefs = []

 # Base web urls
-baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 current_year = datetime.datetime.now().year
 years = range(2009, 2011)
 # years = range(2010, current_year + 1)
+
+baseWeb = 'https://www.math.u-szeged.hu/ejqtde/'
 url_list = ['https://www.math.u-szeged.hu/ejqtde/periodica.html?periodica=1&paramtipus_ertek=publications&param_ertek=' +
             f'{year}' for year in years][::-1]

diff --git a/EJQTDE_spider/ejqtde_save.py b/EJQTDE_spider/ejqtde_save.py
index 5ae8e54..62c990e 100644
--- a/EJQTDE_spider/ejqtde_save.py
+++ b/EJQTDE_spider/ejqtde_save.py
@@ -34,26 +34,30 @@ def Transf():
             data = json.load(file)

         for Dict in data:
-            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
-                # Select data
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+            if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year') is not None:
+                # Filter articles
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                    data_oldest.append(Dict)

-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                if (isinstance(Dict, dict) and 2010 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                    data_2010_2014.append(Dict)

-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                if (isinstance(Dict, dict) and 2015 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                    data_2015_2020.append(Dict)

-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                    data_newest.append(Dict)

-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+    # Transfer
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

-    # Transfer
-    for index in range(0, 4):
-        with open(output_files[index], 'w', encoding='utf-8') as file:
-            json.dump(Data[index], file, indent=4)
+    for index in range(0, 4):
+        with open(output_files[index], 'w', encoding='utf-8') as file:
+            json.dump(Data[index], file, indent=4)

 # The path of reading
 author_folder_path = './EJQTDE_buffer/Author_TS'

diff --git a/EJQTDE_spider/ejqtde_scrawler.py b/EJQTDE_spider/ejqtde_scrawler.py
index 1f3975d..9601f6f 100644
--- a/EJQTDE_spider/ejqtde_scrawler.py
+++ b/EJQTDE_spider/ejqtde_scrawler.py
@@ -94,10 +94,16 @@ def article_detail(Data, URL, article_id, Aricle_list):
         del Author[-1]

     # Submit_datetime and publish_datetime
+    def timeSet(time):
+        # Normalize a 'YYYY-MM-DD' string by dropping the month's leading zero.
+        time = time.split('-')
+        time[1] = time[1].lstrip('0')
+        time = time[0] + '-' + time[1] + '-' + time[2]
+        return time
+
     time = Data.find('td', attrs={'align': 'right', 'width': '50%'})
     time = re.findall(r'\d+-\d+-\d+', str(time))
-    Submit_date = time[0] if time[0] else None
-    Publish_date = time[1] if time[1] else None
+    Submit_date = timeSet(time[0]) if len(time) > 0 else None
+    Publish_date = timeSet(time[1]) if len(time) > 1 else None
     # Keyword
     Keyword = Data.find('keywords').get_text() if Data.find('keywords') is not None else None

diff --git a/SpringerOpen_spider/SD_main.py b/SpringerOpen_spider/SD_main.py
index c56f993..a4b5c5d 100644
--- a/SpringerOpen_spider/SD_main.py
+++ b/SpringerOpen_spider/SD_main.py
@@ -22,10 +22,6 @@ from urllib.parse import urljoin
 Webs = []   # A list for web url
 Links = []  # A list for links

-# Empty lists for the scraped data
-Article_data = []
-Author_data = []
-
 # ========== Visit the article list pages ==========
 # Get the links of the mathematics journal sites
 headers = SD_link.header()
@@ -60,7 +56,7 @@ for web in Webs:
 print('\nThe links have been stored!\n')

 # Enter the thread pool and start scraping
-SD_threads.Threads(Links, Article_data, Author_data)
+SD_threads.Threads(Links)

 # Consolidate the JSON files
 SD_save.Transf()
diff --git a/SpringerOpen_spider/SD_save.py b/SpringerOpen_spider/SD_save.py
index 0d7119d..dfa8df5 100644
--- a/SpringerOpen_spider/SD_save.py
+++ b/SpringerOpen_spider/SD_save.py
@@ -32,31 +32,34 @@ def Transf():
             data = json.load(file)

         for Dict in data:
-            if Dict.get('volume') and Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+            if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year') is not None:
                 # Filter articles
-                data_oldest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
+                    data_oldest.append(Dict)

-                data_2010_2014 += [Dict for Dict in data if (isinstance(Dict, dict) and 2010 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014)]
+                if (isinstance(Dict, dict) and 2010 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014):
+                    data_2010_2014.append(Dict)

-                data_2015_2020 += [Dict for Dict in data if (isinstance(Dict, dict) and 2015 <= int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020)]
+                if (isinstance(Dict, dict) and 2015 <= int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020):
+                    data_2015_2020.append(Dict)

-                data_newest += [Dict for Dict in data if (isinstance(Dict, dict) and int(
-                    Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021)]
+                if (isinstance(Dict, dict) and int(
+                        Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) >= 2021):
+                    data_newest.append(Dict)

-    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+    # Transfer
+    Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

-    # Transfer
-    for index in range(0, 4):
-        with open(output_files[index], 'w', encoding='utf-8') as file:
-            json.dump(Data[index], file, indent=4)
+    for index in range(0, 4):
+        with open(output_files[index], 'w', encoding='utf-8') as file:
+            json.dump(Data[index], file, indent=4)

-    # Read paths
-
-    author_folder_path = './SpringerOpen_buffer/Author'
-    article_folder_path = './SpringerOpen_buffer/Article'
+    # Read paths
+    author_folder_path = './SpringerOpen_buffer/Author_TS'
+    article_folder_path = './SpringerOpen_buffer/Article_TS'

 # Output paths
 author_output_file = [
@@ -80,6 +83,7 @@ def Transf():
     # End
     print("\nData has been written into files.")

+
 # Delete the staging-area files
 def delete(folder_path):
     file_names = os.listdir(folder_path)
diff --git a/SpringerOpen_spider/SD_scrawl.py b/SpringerOpen_spider/SD_scrawl.py
index 3f9099c..31dbee4 100644
--- a/SpringerOpen_spider/SD_scrawl.py
+++ b/SpringerOpen_spider/SD_scrawl.py
@@ -8,7 +8,11 @@ import SD_detail
 import SD_save

 # ========== Get the article detail page links ==========
-def Scrawl(Link, Article_data, Author_data):
+def Scrawl(Link):
+    # Empty lists for the scraped data
+    Article_data = []
+    Author_data = []
+
     # Visit the article list page
     headers = SD_link.header()
     soup = SD_link.Link(Link, headers)
@@ -35,6 +39,8 @@ def Scrawl(Link):
         Article_data.append(SD_detail.Article_dict(sub_soup, sub_Link, article_id))
         Author_data = SD_detail.Author_dict(sub_soup, article_id, Author_data)

+    print('Complete: ', Link)
+
     # Stage into a small JSON file
     if Article_data:
         index = str(uuid.uuid4())
diff --git a/SpringerOpen_spider/SD_threads.py b/SpringerOpen_spider/SD_threads.py
index d5d7527..74e649b 100644
--- a/SpringerOpen_spider/SD_threads.py
+++ b/SpringerOpen_spider/SD_threads.py
@@ -1,24 +1,19 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 import SD_scrawl

+
 # ========== Multithreading ==========
-def Threads(Links, Article_data, Author_data):
-    executor = ThreadPoolExecutor(max_workers=20)  # thread pool
+def Threads(Links):
+    executor = ThreadPoolExecutor(max_workers=25)  # thread pool

     # Dispatch the scraping tasks to the pool
-    futures = [executor.submit(SD_scrawl.Scrawl, Link, Article_data, Author_data) for Link in Links]
-
-    # max_iterations = 5  # Maximum number of simultaneous runs
-    # iteration_count = 0  # Counter
+    futures = [executor.submit(SD_scrawl.Scrawl, Link) for Link in Links]

     # Wait for all tasks to complete
     for future in as_completed(futures):
         try:
             future.result()
-            # # Limit the maximum number of simultaneous crawls
-            # iteration_count += 1  # Increment the counter
-            # if iteration_count >= max_iterations:
-            #     break
+
         except Exception as e:
             print("An error occurred:", str(e))
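Note on the SpringerOpen changes: `Article_data` and `Author_data` used to be module-level lists handed to all worker threads, so every thread appended into, and then re-staged, the same ever-growing lists; making them local to each `Scrawl` call removes that source of duplication. A minimal sketch of the per-task-local pattern under the same thread-pool setup (the `fetch_articles` stand-in and the `./buffer` path are hypothetical, not the project's real API):

    import json
    import os
    import uuid
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fetch_articles(link):
        # Hypothetical stand-in for the real page scraping done in SD_scrawl.
        return [{'source': link, 'article_id': str(uuid.uuid4())}]

    def scrawl(link):
        article_data = []  # local to this call: nothing is shared between threads
        article_data.extend(fetch_articles(link))
        if article_data:
            # Each task stages its own small JSON file under a fresh UUID.
            os.makedirs('./buffer', exist_ok=True)
            with open(f'./buffer/{uuid.uuid4()}.json', 'w', encoding='utf-8') as file:
                json.dump(article_data, file, indent=4)
        print('Complete:', link)

    def threads(links):
        with ThreadPoolExecutor(max_workers=25) as executor:
            futures = [executor.submit(scrawl, link) for link in links]
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    print('An error occurred:', str(e))

    threads(['https://example.org/journal/a', 'https://example.org/journal/b'])

With task-local lists, a record can only be staged by the task that scraped it, which is exactly the invariant the shared-list design broke.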