Replace the code for merging data

This commit is contained in:
XCX 2023-08-08 22:57:29 +08:00
parent 73cf15980f
commit 9ee9bc4462

@@ -1,105 +1,97 @@
New file:

import json
import os
import unicodedata

'''
========== SameWeb_merge(folder_path) execution order ==========
1. Create a new list, read each JSON file in the folder into it, and sort the
   list by year.
2. Traverse with the structure
       for a in Data:
           Merge(a, count, Data):
               for aa in Data:
   Each time an element aa that matches a is found, it is merged into a and
   then deleted, and Merge is called again to rescan, until Data no longer
   contains a matching aa.
3. When this finishes, all records in Data that belong to the same author
   have been merged.

*Tips: merging author records from the same website relies on comparing key
values for exact equality. This works well for websites with a consistent
data format and poorly for websites without one. Typical failures: words with
the same meaning written in a mixture of Latin-script variants and plain
English are not told apart, and synonyms carrying arbitrary punctuation or
extra information defeat the strict comparison, so equality of key values
cannot be judged reliably.
Suggestions:
1. In the merge step, first fold Latin-script and other non-English
   characters to English, then compare (solved).
2. Locally clean records that carry arbitrary punctuation and extra
   information before comparing (fairly involved; no good approach yet).
'''


def SameWeb_merge(folder_path):
    # Recursively merge into `a` every other record with the same author name
    def Merge(a, count, Data):
        for aa in Data:
            if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and a.get(
                    "lastname") == aa.get("lastname") and a != aa:
                # Add different key-elements of "from_article" into the first element
                if isinstance(a["from_article"], list):
                    if isinstance(aa["from_article"], list):
                        a["from_article"] += aa["from_article"]
                    else:
                        a["from_article"].append(aa["from_article"])
                elif isinstance(a["from_article"], str):
                    if isinstance(aa["from_article"], list):
                        a["from_article"] = [a["from_article"]] + aa["from_article"]
                    else:
                        a["from_article"] = [a["from_article"], aa["from_article"]]
                # Add different key-elements of "affiliation" into the first element
                a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year'])  # Sort by year
                # Uniform characters in English
                fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
                faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
                if fa != faa:
                    a['affiliation'].extend(aa['affiliation'])
                elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
                    a['affiliation'].extend(aa['affiliation'])
                # Delete extra elements
                Data.remove(aa)
                # Update the counter
                count[0] += 1
                return Merge(a, count, Data)

    # ========== Main code ==========
    Data = []  # Empty list
    count = [0]  # counter
    # Add data into list
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if len(data) > 0:
                    Data.extend(data)
    Database = len(Data)  # record count before merging
    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
    # Same website data merge
    for a in Data:
        Merge(a, count, Data)
    # Information
    print(str(count[0]) + ' copies of data have been merged.')
    print(str(Database) + ' copies of data in total, before.')
    print(str(len(Data)) + ' copies of data in total, now.')


# ========== input the file path here ==========
SameWeb_merge('./EJDE_buffer/Author_output')
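
For reference, the docstring's first suggestion (fold Latin-script characters to English before comparing) is what the normalize(...).encode('ascii', 'ignore') calls above implement. A minimal sketch of that fold, not part of the commit, with made-up affiliation strings for illustration:

import unicodedata

def ascii_fold(s):
    # NFKD splits accented letters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then drops the marks.
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

# Two spellings that a strict string comparison would treat as different:
print(ascii_fold('Universidade de São Paulo'))  # b'Universidade de Sao Paulo'
print(ascii_fold('Universidade de São Paulo') == ascii_fold('Universidade de Sao Paulo'))  # True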

Old file (replaced):

import json
import os
from pprint import pprint
import unicodedata


def Merge(folder_path):
    Data = []  # Empty list
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                Data.append(data)
    count = 0  # counter
    # Same file merge
    for data in Data:
        if len(data) > 0:
            data = sorted(data, key=lambda x: x['affiliation'][0]['year'])
            for a in data:
                for aa in data:
                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
                            a.get("lastname") == aa.get("lastname"):
                        # Add different key-elements of "affiliation" into the first element
                        if a.get('affiliation') != aa.get('affiliation'):
                            # Uniform text formatting
                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
                            if ch_1 != ch_2:
                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
                                if hash_1 != hash_2:
                                    a['affiliation'] += aa['affiliation']
                        # Add different key-elements of "from_article" into the first element
                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
                            aa.get("from_article"))
                        pprint(a)
                        print('//////////////////////////////////////\n')
                        # Delete extra elements
                        data.remove(aa)
                        # Update the counter
                        count += 1
    # Different files merge
    A = Data[2]  # newest
    B = Data[1]  # (2015-2020)
    C = Data[0]  # (2010-2014)
    D = Data[3]  # oldest
    Data.clear()
    Data = [B, C, D]
    for data in Data:
        if len(data) > 0:
            for a in A:
                for aa in data:
                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
                            a.get("lastname") == aa.get("lastname"):
                        # Add different key-elements of "affiliation" into the first element
                        if a.get('affiliation') != aa.get('affiliation'):
                            # Uniform text formatting
                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
                            if ch_1 != ch_2:
                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
                                if hash_1 != hash_2:
                                    a['affiliation'] += aa['affiliation']
                        # Add different key-elements of "from_article" into the first element
                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
                            aa.get("from_article"))
                        pprint(a)
                        print('================================\n')
                        # Delete extra elements
                        data.remove(aa)
                        # Update the counter
                        count += 1
        # Combined in one list
        A += data
    # Tips
    print(str(count) + ' file(s) have been merged.')
    print('There are ' + str(len(A)) + ' file(s) in total, now.')


# input the file path
Merge('./ejde_buffer/Author_output')
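
The docstring's second suggestion (records that differ only by stray punctuation or appended extras) remains open in this commit. One possible direction, sketched here as an assumption rather than anything the committed code does: strip punctuation and collapse whitespace after the ASCII fold, then compare the cleaned keys.

import re
import string
import unicodedata

_PUNCT = str.maketrans('', '', string.punctuation)

def normalize_key(s):
    # ASCII-fold first (as the committed code already does), then drop
    # punctuation and collapse whitespace runs before comparing.
    folded = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
    return re.sub(r'\s+', ' ', folded.translate(_PUNCT)).strip().lower()

print(normalize_key('Dept. of Mathematics, MIT') == normalize_key('Dept of Mathematics MIT'))  # True

This would not help when one record carries extra appended information the other lacks; that case likely needs token-level or fuzzy matching, which is why the docstring leaves it unsolved.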