# ScholarDataMining/FileMerger/Merge_byNameAndEmail.py
# 2023-10-09 23:24:58 +08:00
#
# 133 lines
# 4.7 KiB
# Python
import json
import os
def SameWeb_merge(folder_path):
    """Merge scholar records that share the same first/middle/last name.

    Loads every ``*.json`` file found in the sub-folders of
    ``folder_path``, merges single-article records whose three name
    fields all match, deletes exact duplicates, prints statistics and
    writes the merged ("incomplete") groups to
    ``<parent of folder_path>/Author_data(merged)/Author_data(merged).json``.

    Parameters
    ----------
    folder_path : str
        Directory whose sub-folders contain the per-name JSON files.
        Each file holds a list of author dicts with at least
        ``first_name``/``middle_name``/``last_name``, ``from_article``
        (list) and ``affiliation`` (list of dicts with ``year``).
    """

    def SameName_merge(i, count1, count2, Data, ml):
        # Compare Data[i] against every later record.  Exact duplicates
        # (identical 'from_article') are deleted and counted in count2;
        # same-name records with a different article are moved into ml
        # and counted in count1.  When anything was merged, Data[i] is
        # replaced by the year-sorted group.  Counters are one-element
        # lists so the increments are visible to the caller.
        #
        # Iterative rewrite of the original self-recursion: recursing
        # once per removed record overflows the interpreter stack
        # (RecursionError) once ~1000 same-name records occur.
        j = i + 1
        while j < len(Data):
            a = Data[i]
            aa = Data[j]
            # Only merge plain single-article records; already-merged
            # groups (lists) are skipped.
            if (isinstance(a, dict) and isinstance(aa, dict)
                    and len(a['from_article']) == 1
                    and len(aa['from_article']) == 1
                    and a.get("first_name") == aa.get("first_name")
                    and a.get("middle_name") == aa.get("middle_name")
                    and a.get("last_name") == aa.get("last_name")):
                if a['from_article'] == aa['from_article']:
                    # Exact duplicate.  Delete by index: list.remove()
                    # would delete the first *equal* dict, which is not
                    # necessarily the one at j.
                    del Data[j]
                    count2[0] += 1
                else:
                    ml.append(aa)
                    count1[0] += 1
                    if count1[0] % 100 == 0 and count1[0] != 0:
                        print(str(count1[0]) + ' copies of data have been merged by same name.')
                    del Data[j]
                # Do not advance j: the next record has slid into slot j.
            else:
                j += 1
        if len(ml) > 0:
            ml.append(Data[i])  # the anchor record joins its own group
            del Data[i]
            ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year'))  # sort group by year
            # NOTE(review): insert(-1, ...) puts the group *before* the
            # last element of Data, not at the end — confirm intended.
            if len(ml) == 1:
                Data.insert(-1, ml[0])
            else:
                Data.insert(-1, ml)

    # ========== Main code ==========
    Data = []     # all loaded records
    count1 = [0]  # same-name merged data counter
    count2 = [0]  # duplicate data counter
    num1 = 0      # unique data counter
    num2 = 0      # complete merged data counter (counted, never printed)
    num3 = 0      # incomplete merged data counter
    # Load every JSON file from the sub-folders into one list
    for num_folder in os.listdir(folder_path):
        num_folder_path = os.path.join(folder_path, num_folder)
        for filename in os.listdir(num_folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(num_folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
                if data:
                    Data.extend(data)
    Database = len(Data)  # size of the original data set
    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
    # ========== Merge ==========
    # ----- Same name data merge -----
    ml = []
    if len(Data) > 1:
        # range() is evaluated once; for indices beyond the shrunken
        # list SameName_merge simply scans nothing.
        for i in range(len(Data)):
            ml.clear()
            SameName_merge(i, count1, count2, Data, ml)
    print('\n----- Same name data merge complete -----\n')
    # ----- Similarity algorithm merge -----
    # Move the merged groups (lists) out of Data into a temporary list.
    temp_list = []  # incomplete merged data
    if len(Data) > 1:
        # Iterate backwards so deletions do not shift unvisited indices;
        # delete by index, not by value (see note in SameName_merge).
        for i in range(len(Data) - 1, -1, -1):
            if isinstance(Data[i], list):
                temp_list.append(Data[i])
                del Data[i]
        print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
    print('\n----- Similarity algorithm merge complete -----\n')
    # ========== Statistic data ==========
    # NOTE(review): every list was just moved into temp_list, so num3 is
    # always 0 here — it probably should count temp_list instead.
    for a in Data:
        if isinstance(a, dict) and len(a['from_article']) == 1:
            num1 += 1
        elif isinstance(a, dict) and len(a['from_article']) > 1:
            num2 += 1
        else:
            num3 += 1
    # Information
    print('\n========== Complete ==========\n')
    print(str(Database) + ' copies of data in total, before')
    print(str(count1[0]) + ' copies of data have been merged by same name.')
    print(str(count2[0]) + ' copies of duplicate data have been deleted')
    print(str(len(Data)) + ' copies of data in total, now.\n')
    print(str(num1) + ' copies of data are unique.')
    print(str(num3) + ' copies of data are incomplete merged')
    # Save into file
    path = os.path.dirname(folder_path)  # parent directory
    path = os.path.join(path, "Author_data(merged)")
    os.makedirs(path, exist_ok=True)
    path = os.path.join(path, "Author_data(merged).json")
    # NOTE(review): only temp_list (the merged groups) is written; the
    # records still in Data are not saved, even though the message below
    # says "Data has been added" — confirm this is intended.
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(temp_list, file, indent=4)
    print('\nData has been added to ' + path)
# =========== input the file path here ==========
# Guard the entry point so importing this module does not trigger the
# whole merge run as a side effect.
if __name__ == "__main__":
    SameWeb_merge('.\\nameDivision')