From e504e7340970c1fd3358a1c3f77c0aef3277f72a Mon Sep 17 00:00:00 2001
From: XCX <1361986662@qq.com>
Date: Fri, 11 Aug 2023 12:22:48 +0800
Subject: [PATCH] Changed the file path of saving data

---
 01_EJDE_spider/ejde_merge.py      | 97 -------------------------------
 01_EJDE_spider/ejde_save.py       | 16 ++---
 02_EJQTDE_spider/ejqtde_save.py   | 19 +++---
 04_SpringerOpen_spider/SD_save.py | 32 +++++-----
 4 files changed, 35 insertions(+), 129 deletions(-)
 delete mode 100644 01_EJDE_spider/ejde_merge.py

diff --git a/01_EJDE_spider/ejde_merge.py b/01_EJDE_spider/ejde_merge.py
deleted file mode 100644
index 9903bbc..0000000
--- a/01_EJDE_spider/ejde_merge.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import json
-import os
-import unicodedata
-
-'''
-    ========== SameWeb_merge(folder_path) execution order ==========
-    1. Create a new list, read the JSON files in the folder one by one, add them to
-       the list, and sort the list by year.
-    2. Traverse with the structure
-       for a in Data:
-           Merge(a, count, Data):
-               for aa in Data:
-       Whenever an element aa that satisfies the conditions is found, it is merged
-       into element a, aa is removed, and Merge is called again to re-traverse,
-       until Data no longer contains a matching aa.
-    3. In the end every element in Data has had its identical authors merged.
-
-    *Tips: Merging author information from the same website is done by comparing
-           whether key values are identical. This works well for websites with a
-           fairly standardized data format and poorly for websites that do not
-           standardize their data. Typical symptoms are words of identical meaning
-           that mix Latin and English characters and therefore do not match, or
-           synonyms with arbitrary punctuation and extra information, so a strict
-           comparison cannot reliably decide whether two key values are the same.
-
-    Suggestions:
-    1. Before comparing, normalize Latin and other non-English characters to
-       English (already solved).
-    2. Partially clean data with arbitrary punctuation and extra information
-       before comparing (fairly complex, no good approach yet).
-'''
-
-
-def SameWeb_merge(folder_path):
-    # Function
-    def Merge(a, count, Data):
-        for aa in Data:
-            if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and a.get(
-                    "lastname") == aa.get("lastname") and a != aa:
-
-                # Add different key-elements of "from_article" into the first element
-                if isinstance(a["from_article"], list):
-                    if isinstance(aa["from_article"], list):
-                        a["from_article"] += aa["from_article"]
-                    else:
-                        a["from_article"].append(aa["from_article"])
-                elif isinstance(a["from_article"], str):
-                    if isinstance(aa["from_article"], list):
-                        a["from_article"] = [a["from_article"]] + aa["from_article"]
-                    else:
-                        a["from_article"] = [a["from_article"], aa["from_article"]]
-
-                # Add different key-elements of "affiliation" into the first element
-                a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year'])  # Sort by year
-
-                # Uniform characters in English
-                fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
-                faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
-
-                if fa != faa:
-                    a['affiliation'].extend(aa['affiliation'])
-                elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
-                    a['affiliation'].extend(aa['affiliation'])
-
-                # Delete extra elements
-                Data.remove(aa)
-
-                # Update the counter
-                count[0] += 1
-                return Merge(a, count, Data)
-
-    # ========== Main code ==========
-    Data = []  # Empty list
-    count = [0]  # counter
-
-    # Add data into list
-    for filename in os.listdir(folder_path):
-        if filename.endswith('.json'):
-            file_path = os.path.join(folder_path, filename)
-            with open(file_path, 'r', encoding='utf-8') as file:
-                data = json.load(file)
-                if len(data) > 0:
-                    Data.extend(data)
-
-    Database = len(Data)
-    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
-
-    # Same website data merge
-    for a in Data:
-        Merge(a, count, Data)
-
-    # Information
-    print(str(count[0]) + ' copies of data have been merged.')
-    print(str(Database) + ' copies of data in total, before')
-    print(str(len(Data)) + ' copies of data in total, now.')
-
-# =========== input the file path here ==========
-SameWeb_merge('./EJDE_buffer/Author_output')
-
-
diff --git a/01_EJDE_spider/ejde_save.py b/01_EJDE_spider/ejde_save.py
index 0091c1e..eee016b 100644
--- a/01_EJDE_spider/ejde_save.py
+++ b/01_EJDE_spider/ejde_save.py
@@ -63,17 +63,17 @@ def Transf():
 
     # The path of storage
     author_output_file = [
-        './ejde_buffer/Author_output/Author_output_file(oldest).json',
-        './ejde_buffer/Author_output/Author_output_file(2010-2014).json',
-        './ejde_buffer/Author_output/Author_output_file(2015-2020).json',
-        './ejde_buffer/Author_output/Author_output_file(newest).json'
+        './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
     ]
 
     article_output_file = [
-        './ejde_buffer/Article_output/Article_output_file(oldest).json',
-        './ejde_buffer/Article_output/Article_output_file(2010-2014).json',
-        './ejde_buffer/Article_output/Article_output_file(2015-2020).json',
-        './ejde_buffer/Article_output/Article_output_file(newest).json'
+        './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
     ]
 
     # Read and write into files
diff --git a/02_EJQTDE_spider/ejqtde_save.py b/02_EJQTDE_spider/ejqtde_save.py
index 62c990e..6a7b8b0 100644
--- a/02_EJQTDE_spider/ejqtde_save.py
+++ b/02_EJQTDE_spider/ejqtde_save.py
@@ -35,7 +35,7 @@ def Transf():
     for Dict in data:
         if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
 
-            # 筛选文章
+            # Select data
             if (isinstance(Dict, dict) and int(
                     Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
                 data_oldest.append(Dict)
@@ -65,17 +65,17 @@ def Transf():
 
     # The path of storage
    author_output_file = [
-        './EJQTDE_buffer/Author_output/Author_output_file(oldest).json',
-        './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json',
-        './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json',
-        './EJQTDE_buffer/Author_output/Author_output_file(newest).json'
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(oldest).json',
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(2010-2014).json',
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(2015-2020).json',
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(newest).json'
     ]
 
     article_output_file = [
-        './EJQTDE_buffer/Article_output/Article_output_file(oldest).json',
-        './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json',
-        './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json',
-        './EJQTDE_buffer/Article_output/Article_output_file(newest).json'
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(oldest).json',
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(2010-2014).json',
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(2015-2020).json',
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(newest).json'
     ]
 
     # Read and write into files
@@ -95,6 +95,7 @@ def delete():
             file_path = os.path.join(folder_path, file_name)
             if os.path.isfile(file_path):
                 os.remove(file_path)
+        os.rmdir(folder_path)
 
     print('\nAttention: The temporary storage files have been deleted!')
 
diff --git a/04_SpringerOpen_spider/SD_save.py b/04_SpringerOpen_spider/SD_save.py
index dfa8df5..a70ef86 100644
--- a/04_SpringerOpen_spider/SD_save.py
+++ b/04_SpringerOpen_spider/SD_save.py
@@ -63,17 +63,17 @@ def Transf():
 
     # The path of storage
     author_output_file = [
-        './SpringerOpen_buffer/Author_output/Author_output_file(oldest).json',
-        './SpringerOpen_buffer/Author_output/Author_output_file(2010-2014).json',
-        './SpringerOpen_buffer/Author_output/Author_output_file(2015-2020).json',
-        './SpringerOpen_buffer/Author_output/Author_output_file(newest).json'
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(oldest).json',
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(newest).json'
     ]
 
     article_output_file = [
-        './SpringerOpen_buffer/Article_output/Article_output_file(oldest).json',
-        './SpringerOpen_buffer/Article_output/Article_output_file(2010-2014).json',
-        './SpringerOpen_buffer/Article_output/Article_output_file(2015-2020).json',
-        './SpringerOpen_buffer/Article_output/Article_output_file(newest).json'
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(oldest).json',
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(newest).json'
     ]
 
     # Read and write into files
@@ -85,12 +85,14 @@ def Transf():
 
 
 # Delete the files in the temporary storage area
-def delete(folder_path):
-    file_names = os.listdir(folder_path)
-
-    for file_name in file_names:
-        file_path = os.path.join(folder_path, file_name)
-        if os.path.isfile(file_path):
-            os.remove(file_path)
+def delete():
+    folder_paths = ['./SpringerOpen_buffer/Author_TS', './SpringerOpen_buffer/Article_TS']
+    for folder_path in folder_paths:
+        file_names = os.listdir(folder_path)
+        for file_name in file_names:
+            file_path = os.path.join(folder_path, file_name)
+            if os.path.isfile(file_path):
+                os.remove(file_path)
+        os.rmdir(folder_path)
 
     print('\nAttention: The temporary storage files have been deleted!')
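
For reference, the reworked cleanup added in ejqtde_save.py and SD_save.py first empties each
temporary-storage folder and then removes the folder itself with os.rmdir. A minimal standalone
sketch of that pattern (the helper name clear_buffers is illustrative; the two buffer paths are the
ones named in the diff) might look as follows; os.rmdir only succeeds once the directory is empty,
which the preceding file loop guarantees as long as the folder holds only regular files:

    import os

    def clear_buffers(folder_paths):
        # Delete every regular file in each temporary-storage folder,
        # then remove the (now empty) folder itself.
        for folder_path in folder_paths:
            for file_name in os.listdir(folder_path):
                file_path = os.path.join(folder_path, file_name)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            os.rmdir(folder_path)  # raises OSError if anything is left inside

        print('\nAttention: The temporary storage files have been deleted!')

    clear_buffers(['./SpringerOpen_buffer/Author_TS', './SpringerOpen_buffer/Article_TS'])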