import json
import os
import unicodedata
'''
========== SameWeb_merge(folder_path) execution order ==========
1. Create a new list, read each JSON file in the folder into it in turn,
   and sort the list by year.
2. Traverse with the structure
       for a in Data:
           Merge(a, count, Data):
               for aa in Data:
   Whenever an aa element matching a is found, its contents are added to
   a, aa is removed from Data, and Merge is called again to rescan from
   the start, until Data contains no further aa element matching a.
3. When the traversal finishes, all records in Data belonging to the same
   author have been merged.
*Tips: Merging author records from the same website relies on comparing
       key values for exact equality. This works well for websites with
       fairly regular data formats and poorly for websites without them.
       Typical failures: words of identical meaning written in a mix of
       Latin and English characters are not recognized as equal, and
       synonyms padded with arbitrary punctuation or extra information
       defeat the strict comparison, so equal key values go undetected.
Suggestions:
1. In the merge step, first normalize Latin and other non-English
   characters to English, then compare (done).
2. Locally clean data padded with arbitrary punctuation and extra
   information before comparing (fairly complex; no good approach yet).
'''
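# A minimal illustration of suggestion 1 above (the helper name
# _fold_to_ascii is ours and is not called by the code below): NFKD
# normalization followed by an ASCII round-trip folds accented Latin
# letters to their plain-English equivalents, so two spellings of the
# same name compare equal after folding.
def _fold_to_ascii(text):
    # e.g. 'Gödel' -> b'Godel', 'Ibañez' -> b'Ibanez'
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')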
def SameWeb_merge(folder_path):
    # Recursively fold every record aa that shares a's first/middle/last
    # name into a, then rescan until no matching aa remains in Data
    # (the a != aa test below also skips exact duplicates of a)
def Merge(a, count, Data):
        for aa in Data:
            if (a.get("firstname") == aa.get("firstname")
                    and a.get("middlename") == aa.get("middlename")
                    and a.get("lastname") == aa.get("lastname")
                    and a != aa):
# Add different key-elements of "from_article" into the first element
if isinstance(a["from_article"], list):
if isinstance(aa["from_article"], list):
a["from_article"] += aa["from_article"]
else:
a["from_article"].append(aa["from_article"])
elif isinstance(a["from_article"], str):
if isinstance(aa["from_article"], list):
a["from_article"] = [a["from_article"]] + aa["from_article"]
else:
a["from_article"] = [a["from_article"], aa["from_article"]]
# Add different key-elements of "affiliation" into the first element
a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year']) # Sort by year
                # Fold Latin characters to plain ASCII before comparing affiliation strings
fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
faa = unicodedata.normalize('NFKD', aa['affiliation'][0]["affiliation"]).encode('ascii', 'ignore')
if fa != faa:
a['affiliation'].extend(aa['affiliation'])
elif fa == faa and a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
a['affiliation'].extend(aa['affiliation'])
                # If the merge left two identical entries at the head of the list, drop the leading duplicate
                if len(a['affiliation']) > 1 and a['affiliation'][0] == a['affiliation'][1]:
                    a['affiliation'].remove(a['affiliation'][0])
# Delete extra elements
Data.remove(aa)
# Update the counter
count[0] += 1
if count[0] % 100 == 0 and count[0] != 0:
                    print(str(count[0]) + ' records have been merged.')
return Merge(a, count, Data)
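    # For reference, the record shape Merge assumes, inferred from the keys
    # accessed above (the field values here are made up for illustration):
    # {
    #     "firstname": "John", "middlename": "Q", "lastname": "Doe",
    #     "from_article": "article title" (str) or ["title 1", "title 2"],
    #     "affiliation": [{"year": 2021, "affiliation": "Some University"}]
    # }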
# ========== Main code ==========
    Data = []  # working list of author records
    count = [0]  # merge counter (a list so Merge can update it in place)
# Add data into list
for filename in os.listdir(folder_path):
if filename.endswith('.json'):
file_path = os.path.join(folder_path, filename)
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
if len(data) > 0:
Data.extend(data)
    original_total = len(Data)  # number of records before merging
Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
# Same website data merge
    for a in list(Data):  # iterate over a copy; Merge removes items from Data
        if a in Data:  # skip records already merged into an earlier record
            Merge(a, count, Data)
# Information
print('\n========== Complete ==========\n')
    print(str(original_total) + ' records in total before merging.')
    print(str(count[0]) + ' records have been merged.')
    print(str(len(Data)) + ' records in total now.')
# Save into file
path = os.path.dirname(folder_path) # parent path
path = os.path.join(path, "Author_data(merged)")
os.makedirs(path, exist_ok=True)
path = os.path.join(path, "Author_data(merged).json")
with open(path, 'w', encoding='utf-8') as file:
json.dump(Data, file, indent=4)
    print('\nData has been added to ' + path)
# =========== input the file path here ==========
# SameWeb_merge(r'.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge(r'.\SpringerOpen\SpringerOpen_buffer\Author_output')
# SameWeb_merge(r'.\ejde\ejde_buffer\Author_output')
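# A minimal sketch of a direct invocation (the path below is a placeholder;
# substitute one of the example folders above):
# if __name__ == '__main__':
#     SameWeb_merge(r'.\path\to\Author_output')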