import json
import os
from pprint import pprint
import unicodedata

# Separator lines printed after each merged record (same-file / cross-file pass).
_SAME_FILE_SEPARATOR = '//////////////////////////////////////\n'
_CROSS_FILE_SEPARATOR = '================================\n'


def _author_key(record):
    """Return the (firstname, middlename, lastname) triple used to match authors."""
    return (record.get("firstname"), record.get("middlename"), record.get("lastname"))


def _as_list(value):
    """Return a "from_article" value (missing, single string, or list) as a new flat list."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)


def _ascii_fold(value):
    """Return the text of *value* NFKD-normalized and reduced to ASCII bytes."""
    return unicodedata.normalize('NFKD', str(value)).encode('ascii', 'ignore')


def _merge_record(target, duplicate):
    """Merge the affiliations and articles of *duplicate* into *target* in place."""
    target_aff = target.get('affiliation')
    duplicate_aff = duplicate.get('affiliation')

    if duplicate_aff and target_aff != duplicate_aff:
        if not target_aff:
            # Nothing recorded yet on the target: adopt the duplicate's list.
            target['affiliation'] = list(duplicate_aff)
        else:
            last_known, first_new = target_aff[-1], duplicate_aff[0]
            # Append only when the boundary entries differ both as
            # accent-insensitive text and by their raw values, so a re-encoded
            # copy of the same affiliation is not appended twice.
            if (_ascii_fold(last_known) != _ascii_fold(first_new)
                    and tuple(last_known.values()) != tuple(first_new.values())):
                target_aff.extend(duplicate_aff)

    # Always produce a flat list, whatever mix of str / list the inputs use.
    target["from_article"] = (_as_list(target.get("from_article"))
                              + _as_list(duplicate.get("from_article")))


def _absorb(merged, index, records, separator):
    """Fold *records* into *merged*, merging same-name authors; return the merge count.

    *index* maps author keys to the records already in *merged* and is updated
    alongside it. Each merge prints the updated record followed by *separator*.
    """
    merges = 0
    for record in records:
        key = _author_key(record)
        existing = index.get(key)
        if existing is None:
            index[key] = record
            merged.append(record)
            continue
        _merge_record(existing, record)
        pprint(existing)
        print(separator)
        merges += 1
    return merges


def Merge(folder_path, *, merge_order=(2, 1, 0, 3)):
    """Merge the author records stored in the JSON files of *folder_path*.

    Every ``*.json`` file in the folder is loaded as a list of author dicts.
    Records sharing the same firstname / middlename / lastname are first merged
    within each file (in ascending order of their first affiliation's year),
    then the files are folded together in *merge_order*. A summary is printed.

    Args:
        folder_path: Directory containing the JSON files.
        merge_order: Indices into the name-sorted list of JSON files, giving the
            order in which files are combined; the first one is the base. The
            default keeps the historical order (index 2, then 1, 0, 3).

    Returns:
        The combined list of author records.

    Raises:
        IndexError: If *merge_order* refers to a file index that does not exist.
        KeyError, IndexError: If a record has no usable ``affiliation[0]['year']``.
    """
    # os.listdir returns names in arbitrary order; sort so that the index-based
    # merge order means the same thing on every platform.
    file_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.json'))

    datasets = []
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            datasets.append(json.load(file))

    out_of_range = [i for i in merge_order if not 0 <= i < len(datasets)]
    if out_of_range:
        raise IndexError(
            'merge_order refers to file index(es) ' + str(out_of_range)
            + ' but only ' + str(len(datasets)) + ' JSON file(s) were found in '
            + repr(folder_path)
        )

    count = 0  # number of records merged into another record

    # Same file merge: build a new de-duplicated list per file instead of
    # removing items from a list that is being iterated.
    deduplicated = []
    for records in datasets:
        ordered = sorted(records, key=lambda x: x['affiliation'][0]['year'])
        unique = []
        count += _absorb(unique, {}, ordered, _SAME_FILE_SEPARATOR)
        deduplicated.append(unique)

    # Different files merge: the first file in merge_order seeds the result and
    # each later file is matched against everything gathered so far.
    combined = []
    index = {}
    for position in merge_order:
        count += _absorb(combined, index, deduplicated[position], _CROSS_FILE_SEPARATOR)

    # Tips
    print(str(count) + ' file(s) have been merged.')
    print('There are ' + str(len(combined)) + ' file(s) in total, now.')
    return combined


if __name__ == "__main__":
    # input the file path
    Merge('./ejde_buffer/Author_output')