import json
import os


def SameWeb_merge(folder_path):
    def SameName_merge(i, count1, count2, Data, ml):
        # Merge entries that share the same full name as Data[i].
        # Exact duplicates (same name AND same source article) are deleted;
        # same-name entries from different articles are collected into ml.
        # An explicit index loop replaces the original recursion, which could
        # hit Python's recursion limit on large inputs.
        j = i + 1
        while j < len(Data):
            a = Data[i]
            aa = Data[j]
            # Only compare unmerged dict records (merged groups are lists)
            if (isinstance(a, dict) and isinstance(aa, dict)
                    and len(a['from_article']) == 1 and len(aa['from_article']) == 1
                    and a.get("first_name") == aa.get("first_name")
                    and a.get("middle_name") == aa.get("middle_name")
                    and a.get("last_name") == aa.get("last_name")):
                if a['from_article'] == aa['from_article']:
                    # Same name and same article: exact duplicate, drop it
                    del Data[j]
                    count2[0] += 1
                    continue  # re-check the element that shifted into index j
                else:
                    # Same name, different article: collect for merging
                    ml.append(aa)
                    # Update the counter
                    count1[0] += 1
                    if count1[0] % 100 == 0:
                        print(str(count1[0]) + ' copies of data have been merged by same name.')
                    del Data[j]
                    continue
            j += 1
        if len(ml) > 0:
            ml.append(Data[i])  # Add the anchor element itself
            ml.sort(key=lambda x: x['affiliation'][0]['year'])  # Sort by year
            # Replace the anchor entry with the merged group in place,
            # so the caller's index stays valid
            Data[i] = ml

    # ========== Main code ==========
    Data = []     # List of all data
    count1 = [0]  # Same-name merged data counter
    count2 = [0]  # Duplicate data counter
    num1 = 0      # Unique data counter
    num2 = 0      # Complete merged data counter
    num3 = 0      # Incomplete merged data counter

    # Add data into list
    for num_folder in os.listdir(folder_path):
        num_folder_path = os.path.join(folder_path, num_folder)
        if not os.path.isdir(num_folder_path):
            continue  # skip stray files at the top level
        for filename in os.listdir(num_folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(num_folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
                if len(data) > 0:
                    Data.extend(data)

    Database = len(Data)  # The length of the original data
    Data.sort(key=lambda x: x['affiliation'][0]['year'])

    # ========== Merge ==========
    # ----- Same-name data merge -----
    # Data shrinks while merging, so iterate with an explicit index instead
    # of range(len(Data)), which would be computed once and then run past
    # the end of the shrinking list
    i = 0
    while i < len(Data):
        ml = []
        SameName_merge(i, count1, count2, Data, ml)
        i += 1
    print('\n----- Same name data merge complete -----\n')

    # ----- Similarity algorithm merge -----
    # Move incomplete merged data (the list-valued groups) in front of
    # the other data
    temp_list = []  # Temp list for incomplete merged data
    for i in range(len(Data) - 1, -1, -1):
        if isinstance(Data[i], list):
            temp_list.append(Data[i])
            del Data[i]
    print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
    Data = temp_list + Data  # reinsert the groups ahead of the other data
    print('\n----- Similarity algorithm merge complete -----\n')

    # ========== Statistics ==========
    # Data counter update
    for a in Data:
        if isinstance(a, dict) and len(a['from_article']) == 1:
            num1 += 1
        elif isinstance(a, dict) and len(a['from_article']) > 1:
            num2 += 1
        else:
            num3 += 1

    # Information
    print('\n========== Complete ==========\n')
    print(str(Database) + ' copies of data in total, before.')
    print(str(count1[0]) + ' copies of data have been merged by same name.')
    print(str(count2[0]) + ' copies of duplicate data have been deleted.')
    print(str(len(Data)) + ' copies of data in total, now.\n')
    print(str(num1) + ' copies of data are unique.')
    print(str(num2) + ' copies of data are completely merged.')
    print(str(num3) + ' copies of data are incompletely merged.')

    # Save into file
    path = os.path.dirname(folder_path)  # parent path
    path = os.path.join(path, "Author_data(merged)")
    os.makedirs(path, exist_ok=True)
    path = os.path.join(path, "Author_data(merged).json")
    with open(path, 'w', encoding='utf-8') as file:
        # Dump the full merged list; dumping only temp_list would discard
        # every unique and completely merged record, contradicting the
        # statistics printed above
        json.dump(Data, file, indent=4)
    print('\nData has been added to ' + path)
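# ----------------------------------------------------------------------
# Expected input shape (a sketch for illustration; the field values are
# hypothetical). Each JSON file under folder_path/<num_folder>/ is
# assumed to hold a list of author records like the one below:
# 'from_article' lists the source article IDs, and 'affiliation' is a
# list of dicts that must carry a 'year' key, since both the sorting and
# the merge logic above index affiliation[0]['year'].
#
# _sample_record = {
#     "first_name": "Jane",
#     "middle_name": "Q",
#     "last_name": "Doe",
#     "from_article": ["PMID-12345"],
#     "affiliation": [{"name": "Example University", "year": 2019}],
# }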
# ========== input the file path here ==========
SameWeb_merge('.\\nameDivision')
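# ----------------------------------------------------------------------
# Optional smoke test (a sketch; 'demo_root' and its contents are
# hypothetical). It builds the minimal folder layout SameWeb_merge()
# walks -- folder_path/<num_folder>/<file>.json -- and merges two
# same-name records that come from different articles.
#
# import tempfile
# demo_root = os.path.join(tempfile.mkdtemp(), 'nameDivision')
# os.makedirs(os.path.join(demo_root, '0'))
# with open(os.path.join(demo_root, '0', 'authors.json'), 'w', encoding='utf-8') as f:
#     json.dump([
#         {"first_name": "Jane", "middle_name": "Q", "last_name": "Doe",
#          "from_article": ["A1"], "affiliation": [{"year": 2018}]},
#         {"first_name": "Jane", "middle_name": "Q", "last_name": "Doe",
#          "from_article": ["A2"], "affiliation": [{"year": 2020}]},
#     ], f)
# SameWeb_merge(demo_root)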