109 lines
4.8 KiB
Python

import json
import os
from pprint import pprint
import unicodedata
def _author_key(record):
    """Return the (firstname, middlename, lastname) triple that identifies an author."""
    return (record.get("firstname"), record.get("middlename"), record.get("lastname"))


def _first_year(record):
    """Sort key: the 'year' of the record's first affiliation entry."""
    return record['affiliation'][0]['year']


def _ascii_fold(value):
    """Render *value* as text, NFKD-normalize it and drop every non-ASCII character."""
    return unicodedata.normalize('NFKD', str(value)).encode('ascii', 'ignore')


def _same_affiliation(left, right):
    """Tell whether two affiliation entries (dicts) describe the same affiliation.

    Entries match when their ASCII-folded text is equal or when their values
    are equal in order. Comparing the value tuples directly replaces the
    former hash comparison, which failed on unhashable values.
    """
    if _ascii_fold(left) == _ascii_fold(right):
        return True
    return tuple(left.values()) == tuple(right.values())


def _as_list(value):
    """Return a ``from_article`` value as a flat list (None -> [], scalar -> [scalar])."""
    if value is None:
        return []
    if isinstance(value, list):
        return list(value)
    return [value]


def _absorb(kept, duplicate):
    """Fold *duplicate* into *kept* in place (both are same-author records)."""
    if kept['affiliation'] != duplicate['affiliation']:
        # Only the boundary entries are compared: the last affiliation already
        # stored against the first affiliation of the incoming record.
        # NOTE(review): when they match, none of the duplicate's affiliations
        # are copied -- this keeps the original rule.
        if not _same_affiliation(kept['affiliation'][-1], duplicate['affiliation'][0]):
            kept['affiliation'] += duplicate['affiliation']
    # Always produce one flat list, whatever mix of str / list / None comes in.
    kept['from_article'] = (
        _as_list(kept.get('from_article')) + _as_list(duplicate.get('from_article'))
    )


def _fold_records(merged, index, records, separator):
    """Append *records* to *merged*, folding same-author records together.

    *index* maps author keys to the record already kept in *merged* and is
    updated in place. Each fold pretty-prints the surviving record followed
    by *separator*. Returns the number of records that were folded away.
    """
    folded = 0
    for record in records:
        key = _author_key(record)
        kept = index.get(key)
        if kept is None:
            index[key] = record
            merged.append(record)
            continue
        _absorb(kept, record)
        pprint(kept)
        print(separator)
        folded += 1
    return folded


def Merge(folder_path, merge_order=(2, 1, 0, 3)):
    """Merge the author records stored in the ``.json`` files of *folder_path*.

    Every JSON file is expected to hold a list of author records (dicts) with
    the keys ``firstname``, ``middlename``, ``lastname``, a non-empty
    ``affiliation`` list of dicts that each carry a ``year``, and
    ``from_article`` (a string or a list).

    The merge runs in two passes:

    1. Inside each file the records are sorted by the year of their first
       affiliation, and records sharing the same name triple are folded into
       the earliest one.
    2. The de-duplicated files are folded into one list. The first file in
       *merge_order* is the base; records from the following files are folded
       into a same-name record of the base or appended when none exists.

    Folding appends the duplicate's affiliations unless the boundary entries
    match after ASCII folding, and concatenates the ``from_article`` values
    into one flat list. Records are modified in place.

    NOTE(review): the indentation of the original source was lost. This
    implementation assumes ``from_article`` merging and duplicate removal
    apply to every same-name pair, not only to pairs whose affiliations
    differ -- confirm against the intended behaviour.

    Args:
        folder_path: Directory containing the ``.json`` files. Files are read
            in sorted filename order, because ``os.listdir`` gives no ordering
            guarantee.
        merge_order: Positions (in the sorted file listing) giving the order
            in which files are combined. The default reproduces the
            previously hard-coded arrangement. Positions that do not exist
            are skipped, and files not mentioned are appended afterwards in
            listing order, so any number of files is accepted.

    Returns:
        The merged list of author records (empty when there are no records).

    Raises:
        KeyError, IndexError: If a record lacks ``affiliation[0]['year']``.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    loaded = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            loaded.append(json.load(file))

    count = 0  # number of records folded into another record

    # Same file merge: build a fresh list per file instead of removing from
    # the list being iterated, and keep the result for the second pass.
    deduped = []
    for records in loaded:
        unique = []
        if len(records) > 0:
            ordered = sorted(records, key=_first_year)
            count += _fold_records(
                unique, {}, ordered, '//////////////////////////////////////\n')
        deduped.append(unique)

    # Different files merge: requested positions first, then everything else.
    positions = [p for p in dict.fromkeys(merge_order) if 0 <= p < len(deduped)]
    requested = set(positions)
    positions.extend(p for p in range(len(deduped)) if p not in requested)

    merged = []
    index = {}
    for position in positions:
        count += _fold_records(
            merged, index, deduped[position], '================================\n')

    # Tips
    print(f'{count} file(s) have been merged.')
    print(f'There are {len(merged)} file(s) in total, now.')
    return merged
# Example usage: set the folder path below and uncomment the call.
# Merge('./EJQTDE_buffer/Author_output')