Fix a bug that added duplicate data

This commit is contained in:
XCX 2023-08-11 12:19:55 +08:00
parent 7726650eaa
commit 8ea31d08f4

View File

@ -1,8 +1,7 @@
import json import json
import os import os
from pprint import pprint
import unicodedata import unicodedata
from pprint import pprint
''' '''
========== SameWeb_merge(folder_path) 运行顺序 ========== ========== SameWeb_merge(folder_path) 运行顺序 ==========
@ -53,13 +52,16 @@ def SameWeb_merge(folder_path):
# Uniform characters in English # Uniform characters in English
fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore') fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore') faa = unicodedata.normalize('NFKD', aa['affiliation'][0]["affiliation"]).encode('ascii', 'ignore')
if fa != faa: if fa != faa:
a['affiliation'].extend(aa['affiliation']) a['affiliation'].extend(aa['affiliation'])
elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']: elif fa == faa and a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
a['affiliation'].extend(aa['affiliation']) a['affiliation'].extend(aa['affiliation'])
if len(a['affiliation']) > 1 and a['affiliation'][0] == a['affiliation'][1]:
a['affiliation'].remove(a['affiliation'][0])
# Delete extra elements # Delete extra elements
Data.remove(aa) Data.remove(aa)
@ -82,7 +84,7 @@ def SameWeb_merge(folder_path):
if len(data) > 0: if len(data) > 0:
Data.extend(data) Data.extend(data)
Database = len(Data) Database = len(Data) # The length of the original data
Data = sorted(Data, key=lambda x: x['affiliation'][0]['year']) Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
# Same website data merge # Same website data merge
@ -96,7 +98,7 @@ def SameWeb_merge(folder_path):
print(str(len(Data)) + ' copies of data in total, now.') print(str(len(Data)) + ' copies of data in total, now.')
# Save into file # Save into file
path = os.path.dirname(folder_path) path = os.path.dirname(folder_path) # parent path
path = os.path.join(path, "Author_data(merged)") path = os.path.join(path, "Author_data(merged)")
os.makedirs(path, exist_ok=True) os.makedirs(path, exist_ok=True)
path = os.path.join(path, "Author_data(merged).json") path = os.path.join(path, "Author_data(merged).json")
@ -106,10 +108,10 @@ def SameWeb_merge(folder_path):
print('\nData has been added to ' + path + '\Author_data(merged).json') print('\nData has been added to ' + path + '\Author_data(merged).json')
# =========== input the file path here ========== # =========== input the file path here ==========
# SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output') # SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output') # SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
# SameWeb_merge('.\ejde\ejde_buffer\Author_output')