import json
import os
import re
import time
import unicodedata
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed, wait

from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

'''
========== Execution order of SameWeb_merge(folder_path) ==========
1. Create a new list, read the JSON files in the folder into it one by one,
   and sort the list by year.

2. Merge in stages: elements that need to be merged are placed into stacked
   (nested) lists according to how confident the merge is, e.g.
   [[[a, b], c], d, e]. In each stage, the outer elements are gradually merged
   into the inner ones under different merge conditions. Elements that still
   cannot be merged with full confidence at the end are kept together as a
   single list.

3. Merge order: records with the same author name are merged first (cases were
   found where different authors share the same email address, so for the same
   website same-name merging takes priority). A second merge pass then checks
   whether affiliation and email are exactly identical. Finally, text
   embeddings are used for similarity comparison, and a third merge pass is
   applied to records with a similarity greater than 0.8.

4. The similarity comparison uses a pre-trained BERT model, which has to be
   installed in advance. This lengthens the running time considerably: with 10
   threads the similarity comparison runs at roughly 50 records per minute, but
   judging by the final results the accuracy is fairly good. For large datasets
   (>1500 records) processing may be slow; restructuring the code or other
   optimisations could be considered later (one possibility is sketched right
   after this docstring: loading the model once instead of once per comparison).

5. Most functions traverse the data with the structure

       for i in range(len(Data)):
           Merge(i, Data):
               for j in range(i+1, len(Data)):

   Whenever a Data[j] element meeting the merge condition is found, it is added
   to the Data[i] element, Data[j] is deleted, and the Merge function is called
   again to re-traverse, until Data no longer contains any Data[j] element that
   meets the condition.

*Tips: On some websites different authors share the same email account, so when
merging within the same website, same-name merging is done first, followed by
same-email merging and the later steps.

Suggestions:
1. In the merge step, normalise Latin letters and other non-English characters
   to plain English characters before comparing (done).
2. For records with arbitrarily added punctuation or extra information, apply
   local clean-up before comparing (fairly complex, no good approach yet).
'''

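
# A minimal sketch (not called anywhere in this file) of the similarity step with the
# BERT tokenizer and model loaded once and reused, instead of being re-loaded for every
# pair of affiliations as Similarity_merge below does. The helper name is an
# illustrative assumption and is not part of the original pipeline.
def _affiliation_similarity_sketch(text1, text2, tokenizer=None, model=None):
    """Return the cosine similarity of two affiliation strings (illustrative helper)."""
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    if model is None:
        model = BertModel.from_pretrained('bert-base-uncased')

    # Same normalisation as Similarity_merge: ASCII-fold, strip punctuation,
    # lower-case, collapse whitespace.
    def clean(t):
        t = unicodedata.normalize('NFKD', t).encode('ascii', 'ignore').decode('utf-8')
        t = re.sub(r'[^\w\s]', '', t).lower()
        return re.sub(r'\s+', ' ', t).strip()

    inputs1 = tokenizer.encode_plus(clean(text1), add_special_tokens=True, return_tensors='pt')
    inputs2 = tokenizer.encode_plus(clean(text2), add_special_tokens=True, return_tensors='pt')
    with torch.no_grad():
        emb1 = torch.mean(model(**inputs1).last_hidden_state, dim=1)  # shape (1, hidden_size)
        emb2 = torch.mean(model(**inputs2).last_hidden_state, dim=1)
    return cosine_similarity(emb1.numpy(), emb2.numpy())[0][0]

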
def SameWeb_merge(folder_path):
    # Function
    def SameName_merge(i, count1, count2, Data, ml):
        # Same name merge
        for j in range(i+1, len(Data)):
            if j < len(Data):
                a = Data[i]
                aa = Data[j]

                if isinstance(a, dict) and isinstance(aa, dict):
                    if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
                        if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
                                "middle_name") and a.get("last_name") == aa.get("last_name"):
                            if a['from_article'] == aa['from_article']:
                                # Exact duplicate: delete it and re-traverse
                                Data.remove(Data[j])
                                count2[0] += 1
                                return SameName_merge(i, count1, count2, Data, ml)

                            else:
                                ml.append(aa)

                                # Update the counter
                                count1[0] += 1
                                if count1[0] % 100 == 0 and count1[0] != 0:
                                    print(str(count1[0]) + ' copies of data have been merged by same name.')

                                # Delete extra elements
                                Data.remove(Data[j])

                                return SameName_merge(i, count1, count2, Data, ml)

        # Detail merge
        if len(ml) > 0:
            ml.append(Data[i])  # Add first element
            Data.remove(Data[i])
            ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year'))  # Sorted by year

            # Merge same affiliation data
            for i in range(len(ml)):
                for j in range(i+1, len(ml)):
                    if j < len(ml):
                        m = ml[i]
                        n = ml[j]

                        if m.get('affiliation')[-1].get('affiliation') == n.get('affiliation')[0].get('affiliation'):
                            if m.get('affiliation')[-1].get('year') != n.get('affiliation')[0].get('year'):
                                m['from_article'] += n['from_article']
                                m['affiliation'] += n['affiliation']
                                ml.remove(ml[j])

                            elif m.get('affiliation')[-1].get('year') == n.get('affiliation')[0].get('year'):
                                m['from_article'] += n['from_article']
                                ml.remove(ml[j])

            # Merge same email data
            def SameEmail_merge(i, ml):
                for j in range(i + 1, len(ml)):
                    if j < len(ml):
                        m = ml[i]
                        n = ml[j]
                        A = m.get('affiliation')
                        AA = n.get('affiliation')
                        num = 0  # Merge counter

                        for a in A:
                            if num == 0:
                                for aa in AA:
                                    if a.get('email') == aa.get('email'):
                                        m['from_article'] += n['from_article']
                                        m['affiliation'] += n['affiliation']

                                        ml.remove(n)  # Delete merged element
                                        num += 1  # Update counter

                                        # Sorted by year
                                        m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
                                        break
                            else:
                                # A merge happened for the previous entry: re-traverse from scratch
                                return SameEmail_merge(i, ml)

            # Loop of merging data by same email
            for i in range(len(ml)):
                SameEmail_merge(i, ml)

            # Add into Data list
            if len(ml) == 1:
                Data.insert(-1, ml[0])
            else:
                Data.insert(-1, ml)

    def Similarity_merge(M):
        for i in range(len(M)):
            for j in range(i+1, len(M)):
                if j < len(M):
                    m = M[i]
                    n = M[j]
                    A = m.get('affiliation')
                    AA = n.get('affiliation')
                    num = 0  # Merge counter

                    for a in A:
                        if num == 0:
                            for aa in AA:
                                # ========== Comparing document embeddings for similarity ==========
                                # Get the affiliation strings
                                text1 = a['affiliation']
                                text2 = aa['affiliation']

                                # Normalise characters to plain English (ASCII)
                                text1 = unicodedata.normalize('NFKD', text1).encode('ascii', 'ignore').decode('utf-8')
                                text2 = unicodedata.normalize('NFKD', text2).encode('ascii', 'ignore').decode('utf-8')

                                # Delete punctuation and lower-case the text
                                text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
                                text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()

                                # Collapse whitespace
                                text1 = re.sub(r'\s+', ' ', text1).strip()
                                text2 = re.sub(r'\s+', ' ', text2).strip()

                                # Load the pre-trained BERT model and tokenizer
                                # (note: they are re-loaded for every pair, which dominates the running time)
                                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                                model = BertModel.from_pretrained('bert-base-uncased')

                                # Tokenize and encode the text
                                inputs1 = tokenizer.encode_plus(text1, add_special_tokens=True, return_tensors='pt')
                                inputs2 = tokenizer.encode_plus(text2, add_special_tokens=True, return_tensors='pt')

                                # Get the embedding vectors of the text
                                with torch.no_grad():
                                    outputs1 = model(**inputs1)
                                    outputs2 = model(**inputs2)
                                    embeddings1 = torch.mean(outputs1.last_hidden_state, dim=1).squeeze()
                                    embeddings2 = torch.mean(outputs2.last_hidden_state, dim=1).squeeze()

                                # Calculate text similarity (cosine similarity)
                                similarity = cosine_similarity(embeddings1.unsqueeze(0), embeddings2.unsqueeze(0))[0][0]
                                print('Similarity algorithm complete: the similarity score is', similarity)

                                if similarity >= 0.8:
                                    m['from_article'] += n['from_article']
                                    m['affiliation'] += n['affiliation']

                                    M.remove(n)  # Delete merged element
                                    num += 1  # Update counter

                                    # Sorted by year
                                    m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
                                    break

                        else:
                            return Similarity_merge(M)

    # ========== Main code ==========
    Data = []  # List of all data

    count1 = [0]  # Same name merged data counter
    count2 = [0]  # Duplicate data counter

    num1 = 0  # Unique data counter
    num2 = 0  # Fully merged data counter
    num3 = 0  # Partially merged data counter

    # Add data into list
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if len(data) > 0:
                    Data.extend(data)

    Database = len(Data)  # The length of the original data
    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])

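    # The sort above and the merge passes below assume each author record in the JSON
    # files is a dict shaped roughly as follows (field names taken from the code in this
    # file; the values are illustrative placeholders, not real data):
    #
    #     {
    #         "first_name": "A.", "middle_name": "", "last_name": "Author",
    #         "from_article": ["<article id or reference>"],
    #         "affiliation": [
    #             {"year": "2001", "affiliation": "Some University", "email": "a.author@example.org"}
    #         ]
    #     }
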
    # ========== Merge ==========
    # ----- Same name data merge -----
    ml = []
    if len(Data) > 1:
        for i in range(len(Data)):
            ml.clear()
            SameName_merge(i, count1, count2, Data, ml)

    print('\n----- Same name data merge complete -----\n')

    # ----- Similarity algorithm merge -----
    # Pull the partially merged data (nested lists) out of Data into a temporary list
    temp_list = []  # Temp list for partially merged data

    if len(Data) > 1:
        for i in range(len(Data)-1, -1, -1):
            if isinstance(Data[i], list):
                temp = Data[i]
                Data.remove(Data[i])
                temp_list.append(temp)

    print(str(len(temp_list)) + ' copies of partially merged data have been added to the temporary list\n')

    st = time.time()  # Start time

    if len(temp_list) > 1:
        executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
        futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
        for future in as_completed(futures):
            pass
        wait(futures)
        executor.shutdown()  # Release the worker threads

    et = time.time()  # End time
    print('\nThread pool ran for ' + str(et-st) + 's')

    # Combine Data with temp_list
    for i in temp_list:
        if len(i) == 1:
            Data.append(i[0])
        else:
            Data.append(i)

    print('\n----- Similarity algorithm merge complete -----\n')

    # ========== Statistics ==========
    # Update the data counters
    for a in Data:
        if isinstance(a, dict) and len(a['from_article']) == 1:
            num1 += 1
        elif isinstance(a, dict) and len(a['from_article']) > 1:
            num2 += 1
        else:
            num3 += 1

    # Information
    print('\n========== Complete ==========\n')
    print(str(Database) + ' copies of data in total before merging.')
    print(str(count1[0]) + ' copies of data have been merged by same name.')
    print(str(count2[0]) + ' copies of duplicate data have been deleted.')
    print(str(len(Data)) + ' copies of data in total now.\n')

    print(str(num1) + ' copies of data are unique.')
    print(str(num2) + ' copies of data are fully merged.')
    print(str(num3) + ' copies of data are partially merged.')

    # Save into file
    path = os.path.dirname(folder_path)  # Parent path
    path = os.path.join(path, "Author_data(merged)")
    os.makedirs(path, exist_ok=True)
    path = os.path.join(path, "Author_data(merged).json")

    with open(path, 'w', encoding='utf-8') as file:
        json.dump(Data, file, indent=4)

    print('\nData has been added to ' + path)


# ========== Input the file path here ==========
# SameWeb_merge(r'.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge(r'.\SpringerOpen\SpringerOpen_buffer\Author_output')
SameWeb_merge(r'.\ejde\ejde_buffer\Author_output')