diff --git a/00_File_merge/Merge.py b/00_File_merge/Merge.py
index 082bdd2..52a0fdf 100644
--- a/00_File_merge/Merge.py
+++ b/00_File_merge/Merge.py
@@ -1,108 +1,98 @@
 import json
 import os
-from pprint import pprint
 import unicodedata
+'''
+    ========== SameWeb_merge(folder_path) execution flow ==========
+    1. Build a new list, read every JSON file in the folder into it one by
+       one, and sort the list by year.
+    2. Traverse it with the structure
+           for a in Data:
+               Merge(a, count, Data):
+                   for aa in Data:
+       Whenever an element aa matching a is found, aa is merged into a and
+       removed, and Merge is called again to rescan, until Data no longer
+       contains any aa that matches a.
+    3. When this finishes, all same-author entries in Data have been merged.
+
+    *Tips: Merging author records from the same website works by comparing
+       whether key values are identical. This works well for websites with a
+       fairly regular data format and poorly for those without one. Typical
+       failures: semantically identical words written with mixed Latin and
+       English characters are not told apart, and synonyms padded with
+       arbitrary punctuation or extra information defeat the strict
+       comparison, so equal values are not recognized as equal.
+
+    Suggestions:
+    1. Before comparing, normalize Latin and other non-English characters to
+       English in the merge step (solved).
+    2. Locally clean values that carry arbitrary punctuation or extra
+       information before comparing (fairly involved; no good approach yet).
+'''
 
 
-def Merge(folder_path):
-    Data = []  # Empty list
+def SameWeb_merge(folder_path):
+    # Recursively merge every record in Data sharing a's author name into a
+    def Merge(a, count, Data):
+        for aa in Data:
+            if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
+                    a.get("lastname") == aa.get("lastname") and a != aa:
+
+                # Add different key-elements of "from_article" into the first element
+                if isinstance(a["from_article"], list):
+                    if isinstance(aa["from_article"], list):
+                        a["from_article"] += aa["from_article"]
+                    else:
+                        a["from_article"].append(aa["from_article"])
+                elif isinstance(a["from_article"], str):
+                    if isinstance(aa["from_article"], list):
+                        a["from_article"] = [a["from_article"]] + aa["from_article"]
+                    else:
+                        a["from_article"] = [a["from_article"], aa["from_article"]]
+
+                # Add different key-elements of "affiliation" into the first element
+                a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year'])  # Sort by year
+
+                # Normalize Latin characters to plain ASCII before comparing
+                fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
+                faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
+
+                if fa != faa:
+                    a['affiliation'].extend(aa['affiliation'])
+                elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
+                    a['affiliation'].extend(aa['affiliation'])
+
+                # Delete extra elements
+                Data.remove(aa)
+
+                # Update the counter
+                count[0] += 1
+                return Merge(a, count, Data)
+
+    # ========== Main code ==========
+    Data = []    # Empty list
+    count = [0]  # counter
+
+    # Add data into list
     for filename in os.listdir(folder_path):
         if filename.endswith('.json'):
             file_path = os.path.join(folder_path, filename)
             with open(file_path, 'r', encoding='utf-8') as file:
                 data = json.load(file)
-                Data.append(data)
+                if len(data) > 0:
+                    Data.extend(data)
 
-    count = 0  # counter
+    Database = len(Data)
+    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
 
-    # Same file merge
-    for data in Data:
-        if len(data) > 0:
-            data = sorted(data, key=lambda x: x['affiliation'][0]['year'])
-            for a in data:
-                for aa in data:
-                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
-                            a.get("lastname") == aa.get("lastname"):
+    # Same website data merge
+    for a in Data:
+        Merge(a, count, Data)
 
-                        # Add different key-elements of "affiliation" into the first element
-                        if a.get('affiliation') != aa.get('affiliation'):
-                            # Uniform text formatting
-                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
-                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
-                            if ch_1 != ch_2:
-                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
-                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
-                                if hash_1 != hash_2:
-                                    a['affiliation'] += aa['affiliation']
-
-                        # Add different key-elements of "from_article" into the first element
-                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
-                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
-                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
-                            aa.get("from_article"))
-
-                        pprint(a)
-                        print('//////////////////////////////////////\n')
-
-                        # Delete extra elements
-                        data.remove(aa)
-
-                        # Update the counter
-                        count += 1
-
-    # Different files merge
-    A = Data[2]  # newest
-    B = Data[1]  # (2015-2020)
-    C = Data[0]  # (2010-2014)
-    D = Data[3]  # oldest
-
-    Data.clear()
-    Data = [B, C, D]
-
-    for data in Data:
-        if len(data) > 0:
-            for a in A:
-                for aa in data:
-                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
-                            a.get("lastname") == aa.get("lastname"):
-
-                        # Add different key-elements of "affiliation" into the first element
-                        if a.get('affiliation') != aa.get('affiliation'):
-                            # Uniform text formatting
-                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
-                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
-                            if ch_1 != ch_2:
-                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
-                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
-                                if hash_1 != hash_2:
-                                    a['affiliation'] += aa['affiliation']
-
-                        # Add different key-elements of "from_article" into the first element
-                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
-                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
-                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
-                            aa.get("from_article"))
-
-                        pprint(a)
-                        print('================================\n')
-
-                        # Delete extra elements
-                        data.remove(aa)
-
-                        # Update the counter
-                        count += 1
-
-            # Combined in one list
-            A += data
-
-    # Tips
-    print(str(count) + ' file(s) have been merged.')
-    print('There are ' + str(len(A)) + ' file(s) in total, now.')
+    # Information
+    print(str(count[0]) + ' copies of data have been merged.')
+    print(str(Database) + ' copies of data in total, before.')
+    print(str(len(Data)) + ' copies of data in total, now.')
 
 
-# # input the file path here !!!
-# Merge('./EJQTDE_buffer/Author_output')
+# ========== input the file path here ==========
+# SameWeb_merge('./EJQTDE_buffer/Author_output')
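
The accent handling the new code relies on is worth isolating: NFKD decomposition splits an accented Latin letter into a base letter plus a combining mark, and encoding to ASCII with errors ignored then drops the mark, so visually equivalent spellings compare equal as byte strings. A minimal sketch of that comparison (the sample strings are made up):

    import unicodedata

    def ascii_fold(text):
        # 'Szeged' and 'Szegéd' both normalize to b'Szeged'
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

    assert ascii_fold('University of Szeged') == ascii_fold('University of Szegéd')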
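
The code assumes each *.json file holds a list of author records keyed by "firstname", "middlename", "lastname", "from_article" (a string or a list of strings), and "affiliation" (a list of dicts with "year" and "affiliation" keys; the year values only need to be mutually comparable for sorting). A hypothetical record in that shape, with invented values:

    record = {
        "firstname": "Jane",
        "middlename": "Q",
        "lastname": "Doe",
        "from_article": "10.1000/example.001",  # str here; becomes a list on merge
        "affiliation": [
            {"year": 2015, "affiliation": "Bolyai Institute, University of Szeged"}
        ]
    }

    # SameWeb_merge('./EJQTDE_buffer/Author_output')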
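
count is a one-element list rather than a plain int so that the nested Merge can update it in place with count[0] += 1 across the closure boundary; rebinding an int from inside the nested function would instead require the nonlocal keyword. A small equivalent using nonlocal:

    def outer():
        count = 0

        def inner():
            nonlocal count  # rebind the enclosing int instead of mutating a list cell
            count += 1

        inner()
        return count

    assert outer() == 1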