import json
import os
import re
import time
import unicodedata
import torch
from concurrent.futures import ThreadPoolExecutor, wait
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
'''
========== SameWeb_merge(folder_path) execution order ==========
1. Build a list by reading every JSON file in the folder, then sort the
   records by year.
2. Merge in tiers: records that need merging are grouped into nested lists
   according to how confident the merge is, e.g. [[[a, b], c], d, e]. Each
   pass folds the outer elements into the inner ones under different merge
   conditions. Records that still cannot be merged with full confidence at
   the end are kept together as a single list.
3. Merge order: same-name merging comes first (different authors were found
   sharing the same email address, so within one website, records with the
   same name are merged before anything else). A second pass then merges
   records whose affiliation and email match exactly. Finally, a text-
   embedding similarity comparison merges records whose similarity exceeds
   0.8 in a third pass.
4. The similarity comparison uses a pre-trained BERT model, which must be
   installed in advance. This lengthens the runtime considerably: with 10
   threads the comparison handles roughly 50 records per minute, but the
   final results show fairly good accuracy. For large datasets (>1500
   records) processing may be slow; optimizing the code structure or other
   approaches could improve efficiency later.
5. Most functions traverse the data with the structure
       for i in range(len(Data)):
           Merge(i, Data):
               for j in range(i+1, len(Data)):
   Whenever a Data[j] matching the merge condition is found, it is folded
   into Data[i], Data[j] is deleted, and the Merge function is called again
   to re-scan until Data no longer contains a matching Data[j].
*Tips: On some websites different authors share one email account, so when
   merging within the same website, merge by same name first, then by same
   email, before the later passes.
Suggestions:
1. In the merge step, normalize Latin-script and other non-English
   characters to plain English before comparing (solved).
2. Locally clean records that add arbitrary punctuation or extra
   information before comparing (solved: extra spaces and all punctuation
   are removed before the similarity comparison).
3. The similarity comparison needs a faster approach; comparing through the
   pre-trained model takes too long and CPU usage is also high.
'''
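# A minimal illustration (hypothetical values, not real scraped data) of the
# record shape this merger assumes; the field names match the accesses made in
# the functions below.
EXAMPLE_RECORD = {
    "first_name": "Jane",
    "middle_name": "",
    "last_name": "Doe",
    "from_article": ["article_0001"],
    "affiliation": [
        {"affiliation": "Department of Mathematics, Example University",
         "email": "jdoe@example.edu",
         "year": 2019},
    ],
}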
def SameWeb_merge(folder_path):
    # ----- Helper functions -----
def SameName_merge(i, count1, count2, Data, ml):
# Same name merge
for j in range(i+1, len(Data)):
if j < len(Data):
a = Data[i]
aa = Data[j]
if isinstance(a, dict) and isinstance(aa, dict):
if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
"middle_name") and a.get("last_name") == aa.get("last_name"):
if a['from_article'] == aa['from_article']: # Remove same data
                                del Data[j]
count2[0] += 1
return SameName_merge(i, count1, count2, Data, ml)
else:
ml.append(aa)
# Update the counter
count1[0] += 1
if count1[0] % 100 == 0 and count1[0] != 0:
print(str(count1[0]) + ' copies of data have been merged by same name.')
                                # Delete the merged element
                                del Data[j]
return SameName_merge(i, count1, count2, Data, ml)
# Detail merge
if len(ml) > 0:
ml.append(Data[i]) # Add first element
            del Data[i]
ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year')) # Sorted by year
# Merge same affiliation data
for i in range(len(ml)):
for j in range(i+1, len(ml)):
if j < len(ml):
m = ml[i]
n = ml[j]
if m.get('affiliation')[-1].get('affiliation') == n.get('affiliation')[0].get('affiliation'):
if m.get('affiliation')[-1].get('year') != n.get('affiliation')[0].get('year'):
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
                                del ml[j]
elif m.get('affiliation')[-1].get('year') == n.get('affiliation')[0].get('year'):
m['from_article'] += n['from_article']
                                del ml[j]
# Merge same email data
def SameEmail_merge(i, ml):
for j in range(i + 1, len(ml)):
if j < len(ml):
m = ml[i]
n = ml[j]
A = m.get('affiliation')
AA = n.get('affiliation')
                        num = 0  # Becomes nonzero once a merge has happened for this pair
for a in A:
if num == 0:
for aa in AA:
if a.get('email') == aa.get('email'):
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
ml.remove(n) # Delete merged element
num += 1 # Update counter
# Sorted by year
m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
break
else:
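                                # A merge already happened for this pair; restart the scan over the shrunken list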
return SameEmail_merge(i, ml)
# Loop of merging data by same email
for i in range(len(ml)):
SameEmail_merge(i, ml)
# Add into Data list
if len(ml) == 1:
Data.insert(-1, ml[0])
else:
Data.insert(-1, ml)
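    # After the passes above, Data holds plain dicts (fully merged records) and
    # lists of dicts (groups that still need the similarity pass), e.g.
    # [rec1, [recA, recB], rec2] with hypothetical records rec*.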
    # Load the pre-trained BERT model and tokenizer once, shared by every
    # Similarity_merge call (re-loading them for every pair dominated the runtime)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    def Similarity_merge(M):
        for i in range(len(M)):
for j in range(i+1, len(M)):
if j < len(M):
m = M[i]
n = M[j]
A = m.get('affiliation')
AA = n.get('affiliation')
                    num = 0  # Becomes nonzero once a merge has happened for this pair
for a in A:
if num == 0:
for aa in AA:
# ========== Comparing document embeddings for similarity ==========
# Get the data of affiliation
text1 = a['affiliation']
text2 = aa['affiliation']
# Uniform characters in English
text1 = unicodedata.normalize('NFKD', text1).encode('ascii', 'ignore').decode('utf-8')
text2 = unicodedata.normalize('NFKD', text2).encode('ascii', 'ignore').decode('utf-8')
# Delete punctuation and lower the character
text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()
# Delete extra spaces
text1 = re.sub(r'\s+', ' ', text1).strip()
text2 = re.sub(r'\s+', ' ', text2).strip()
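                                # Hypothetical example of the three steps above:
                                # 'Universität Wien,  Dept. of  Math.' -> 'universitat wien dept of math'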
                                # (tokenizer and model were loaded once above, outside the loops)
# Tokenize and encode the text
inputs1 = tokenizer.encode_plus(text1, add_special_tokens=True, return_tensors='pt')
inputs2 = tokenizer.encode_plus(text2, add_special_tokens=True, return_tensors='pt')
# Get the embedding vectors of the text
with torch.no_grad():
outputs1 = model(**inputs1)
outputs2 = model(**inputs2)
embeddings1 = torch.mean(outputs1.last_hidden_state, dim=1).squeeze()
embeddings2 = torch.mean(outputs2.last_hidden_state, dim=1).squeeze()
# Calculate text similarity (cosine similarity)
similarity = cosine_similarity(embeddings1.unsqueeze(0), embeddings2.unsqueeze(0))[0][0]
print('Similarity algorithm complete: the similarity score is', similarity)
if similarity >= 0.8:
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
M.remove(n) # Delete merged element
num += 1 # Update counter
# Sorted by year
m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
break
else:
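                            # A merge already happened for this pair; restart the scan over the shrunken list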
return Similarity_merge(M)
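    # A possible answer to suggestion 3 in the header (a sketch, not yet wired
    # into Similarity_merge): embed each normalized affiliation string once and
    # cache the vector, so each pair comparison costs only a cheap cosine
    # similarity instead of two full BERT forward passes.
    embedding_cache = {}
    def Embed_cached(text, tokenizer, model):
        # Return the mean-pooled BERT embedding for text, computing it at most once
        if text not in embedding_cache:
            inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
            with torch.no_grad():
                outputs = model(**inputs)
            embedding_cache[text] = torch.mean(outputs.last_hidden_state, dim=1).squeeze()
        return embedding_cache[text]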
# ========== Main code ==========
Data = [] # List of all data
count1 = [0] # Same name merged data counter
count2 = [0] # Duplicate data counter
num1 = 0 # Unique data counter
num2 = 0 # Complete merged data counter
num3 = 0 # Incomplete merged data counter
num4 = 0 # Similarity algorithm merged data counter
# Add data into list
for filename in os.listdir(folder_path):
if filename.endswith('.json'):
file_path = os.path.join(folder_path, filename)
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
if len(data) > 0:
Data.extend(data)
Database = len(Data) # The length of the original data
Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
# ========== Merge ==========
# ----- Same name data merge -----
ml = []
if len(Data) > 1:
for i in range(len(Data)):
ml.clear()
SameName_merge(i, count1, count2, Data, ml)
print('\n----- Same name data merge complete -----\n')
# ----- Similarity algorithm merge -----
    # Move incomplete (nested-list) entries out of Data into a temporary list
    temp_list = []  # Temp list for incomplete merged data
    if len(Data) > 1:
        for i in range(len(Data)-1, -1, -1):
            if isinstance(Data[i], list):
                temp_list.append(Data.pop(i))
print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
st = time.time() # Start time
    if len(temp_list) > 0:  # A single group still needs its internal merge
        with ThreadPoolExecutor(max_workers=10) as executor:  # Thread pool
            futures = [executor.submit(Similarity_merge, group) for group in temp_list]
            wait(futures)  # Block until every similarity merge has finished
et = time.time() # End time
print('\nThread pool has been run for ' + str(et-st) + 's')
    # Combine Data with temp_list
    for group in temp_list:
        if len(group) == 1:
            num4 += 1
            Data.insert(0, group[0])
        else:
            Data.insert(-1, group)
print('\n----- Similarity algorithm merge complete -----\n')
# ========== Statistic data ==========
# Data counter update
for a in Data:
if isinstance(a, dict) and len(a['from_article']) == 1:
num1 += 1
elif isinstance(a, dict) and len(a['from_article']) > 1:
num2 += 1
else:
num3 += 1
# Information
    print('\n========== Complete ==========\n')
    print(str(Database) + ' copies of data in total before merging.')
    print(str(count1[0]) + ' copies of data have been merged by same name.')
    print(str(count2[0]) + ' copies of duplicate data have been deleted.')
    print(str(len(Data)) + ' copies of data in total now.\n')
    print(str(num1) + ' copies of data are unique.')
    print(str(num2) + ' copies of data are completely merged.')
    print(str(num4) + ' copies of data are completely merged by the similarity algorithm.')
    print(str(num3) + ' copies of data are incompletely merged.')
# Save into file
path = os.path.dirname(folder_path) # parent path
path = os.path.join(path, "Author_data(merged)")
os.makedirs(path, exist_ok=True)
path = os.path.join(path, "Author_data(merged).json")
with open(path, 'w', encoding='utf-8') as file:
json.dump(Data, file, indent=4)
print('\nData has been added to ' + path)
# =========== Input the file path here ==========
# SameWeb_merge(r'.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge(r'.\SpringerOpen\SpringerOpen_buffer\Author_output')
SameWeb_merge(r'.\ejde\ejde_buffer\Author_output')