133 lines
4.7 KiB
Python
133 lines
4.7 KiB
Python
import json
|
|
import os
|
|
|
|
|
|
_NAME_KEYS = ("first_name", "middle_name", "last_name")


def _is_single_article_record(record):
    """Return True when *record* is a dict whose 'from_article' holds exactly one entry."""
    return isinstance(record, dict) and len(record['from_article']) == 1


def _has_same_name(a, b):
    """Return True when both records agree on first, middle and last name."""
    return all(a.get(key) == b.get(key) for key in _NAME_KEYS)


def _merge_same_name(i, count1, count2, Data):
    """Merge every later record that shares the name of ``Data[i]`` into one group.

    Only dict records whose 'from_article' has exactly one entry take part.
    A later record with the same name and the same 'from_article' as the base
    is dropped as a duplicate; one with the same name but a different
    'from_article' is collected into the group.

    Args:
        i: Index of the base record in ``Data``.
        count1: One-element list used as a mutable counter of merged records.
        count2: One-element list used as a mutable counter of dropped duplicates.
        Data: Working list of records; modified in place.

    Returns:
        True when a group was formed (the base record was then removed from
        slot ``i``, so the caller must re-examine index ``i``), else False.
    """
    base = Data[i]
    if not _is_single_article_record(base):
        return False

    matches = []
    j = i + 1
    # Iterative scan instead of one recursive call per removal, so a large
    # same-name cluster cannot exhaust the recursion limit.
    while j < len(Data):
        candidate = Data[j]
        if not (_is_single_article_record(candidate) and _has_same_name(base, candidate)):
            j += 1
            continue

        if base['from_article'] == candidate['from_article']:
            count2[0] += 1  # Same name and same article: plain duplicate
        else:
            matches.append(candidate)
            count1[0] += 1
            if count1[0] % 100 == 0:
                print(str(count1[0]) + ' copies of data have been merged by same name.')

        # Delete by index: list.remove() deletes the first *equal* element,
        # which may be an earlier identical record rather than slot j.
        # j is not advanced because the next element has shifted into slot j.
        del Data[j]

    if not matches:
        return False

    matches.append(base)
    del Data[i]
    matches.sort(key=lambda x: x.get('affiliation')[0].get('year'))  # Sorted by year
    # insert(-1) puts the group just before the final element (index 0 when
    # the list is empty). Groups are pulled out by type later, so the exact
    # slot only influences the order of the output.
    Data.insert(-1, matches)
    return True


def _load_records(folder_path):
    """Return the concatenated contents of every ``*.json`` file one level below *folder_path*."""
    records = []
    # Sorted listings make the load order (and therefore tie-breaking in the
    # later stable sort) independent of the file system.
    for num_folder in sorted(os.listdir(folder_path)):
        num_folder_path = os.path.join(folder_path, num_folder)
        if not os.path.isdir(num_folder_path):
            continue  # Stray files next to the sub-folders are ignored
        for filename in sorted(os.listdir(num_folder_path)):
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(num_folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            if data:
                records.extend(data)
    return records


def SameWeb_merge(folder_path):
    """Group same-name author records found under *folder_path* and save the groups.

    Every ``*.json`` file inside each sub-folder of *folder_path* is loaded and
    its items are pooled. The pool is sorted by ``affiliation[0]['year']``;
    single-article dict records that share first, middle and last name are
    collected into year-sorted lists, while records that also share the same
    'from_article' are dropped as duplicates. Progress and summary statistics
    are printed.

    The resulting groups (and only the groups) are written as JSON to
    ``<parent of folder_path>/Author_data(merged)/Author_data(merged).json``.

    Args:
        folder_path: Directory whose sub-folders contain the JSON files.

    Returns:
        None.
    """
    count1 = [0]  # Same name merged data counter
    count2 = [0]  # Duplicate data counter

    Data = _load_records(folder_path)
    Database = len(Data)  # The length of the original data
    Data.sort(key=lambda x: x['affiliation'][0]['year'])

    # ========== Merge ==========
    # ----- Same name data merge -----
    i = 0
    while i < len(Data):
        # When a group is formed the base record leaves slot i and another
        # record takes its place, so i only advances when nothing was removed.
        if not _merge_same_name(i, count1, count2, Data):
            i += 1

    print('\n----- Same name data merge complete -----\n')

    # ----- Similarity algorithm merge -----
    # Move the groups (lists) out of Data, walking from the end to the start.
    # This also covers the case where everything collapsed into one group.
    temp_list = [entry for entry in reversed(Data) if isinstance(entry, list)]
    Data = [entry for entry in Data if not isinstance(entry, list)]

    print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')

    print('\n----- Similarity algorithm merge complete -----\n')

    # ========== Statistic data ==========
    # NOTE(review): the groups were moved to temp_list above, so this loop no
    # longer sees them; confirm whether num3 should include len(temp_list).
    num1 = 0  # Unique data counter
    num3 = 0  # Incomplete merged data counter
    for record in Data:
        if isinstance(record, dict):
            articles = len(record['from_article'])
            if articles == 1:
                num1 += 1
            elif articles == 0:
                num3 += 1
            # Dicts citing several articles are not part of the printed report
        else:
            num3 += 1

    # Information
    print('\n========== Complete ==========\n')
    print(f'{Database} copies of data in total, before')
    print(f'{count1[0]} copies of data have been merged by same name.')
    print(f'{count2[0]} copies of duplicate data have been deleted')
    print(f'{len(Data)} copies of data in total, now.\n')

    print(f'{num1} copies of data are unique.')
    print(f'{num3} copies of data are incomplete merged')

    # Save into file
    path = os.path.dirname(folder_path)  # parent path
    path = os.path.join(path, "Author_data(merged)")
    os.makedirs(path, exist_ok=True)
    path = os.path.join(path, "Author_data(merged).json")

    with open(path, 'w', encoding='utf-8') as file:
        json.dump(temp_list, file, indent=4)

    print('\nData has been added to ' + path)
|
|
|
|
|
|
# =========== input the file path here ==========
# Guarded so that importing this module does not start a merge run;
# executing the file as a script behaves as before.
if __name__ == "__main__":
    SameWeb_merge('.\\nameDivision')
|
|
|
|
|