Full same web merge code

XCX 2023-08-20 00:07:47 +08:00
parent ba3671b5fd
commit e217342ce2


@@ -1,25 +1,45 @@
import json
import os
import re
import time
import unicodedata
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
'''
========== SameWeb_merge(folder_path) execution order ==========
1. Create a new list, read the JSON files in the folder one by one into
   the list, and sort the list by year.
2. Traverse with the structure
   for a in Data:
       Merge(a, count, Data):
           for aa in Data:
   Each time an element aa that meets the merge conditions is found, it is
   added to element a, aa is removed, and Merge is called again to
   re-traverse, until Data no longer contains any qualifying aa element.
3. Finally, every element in the Data list has been merged by identical
   author.
*Tips: Merging author information from the same website is done by checking
   whether key values are identical. This works well for websites with a
   fairly regular data format, but poorly for websites with an irregular
   format. Typical failures: words with the same meaning written in a mix
   of Latin script and English cannot be told apart, and synonyms with
   arbitrary punctuation or extra information make the strict comparison
   unable to decide whether the key values are the same.
2. A gradient merge strategy is used: elements that need merging are placed
   into nested lists according to how confident the merge is, in the form
   [[[a,b],c],d,e]. On each pass the outer elements are gradually merged
   into the inner elements under different merge conditions; elements that
   cannot be merged with full confidence are kept together as a single
   list at the end.
3. Merge order: same author name is merged first. Different authors were
   found sharing the same email address, so name-based merging takes
   priority within a single website. A second merge then checks whether
   the affiliation and email are exactly identical, and a third merge uses
   text-embedding similarity, merging data whose similarity is above 0.8.
4. The similarity comparison uses a pre-trained BERT model, which must be
   installed in advance. Runtime grows considerably because of this: with
   10 threads, similarity comparison runs at roughly 50 records/minute,
   but the final accuracy is fairly good. For large datasets (>1500
   records) processing may be slow; optimizing the code structure or other
   approaches could improve efficiency later.
5. Most functions traverse with the structure
   for i in range(len(Data)):
       Merge(i, Data):
           for j in range(i+1, len(Data)):
   Each time an element Data[j] that meets the conditions is found, it is
   added to Data[i], Data[j] is removed, and Merge is called again to
   re-traverse, until Data no longer contains any qualifying Data[j]
   element.
*Tips: On some websites different authors share the same email account, so
   when merging within one website, same-name merging is done first,
   followed by same-email merging and the remaining steps.
Suggestions:
1. In the merge step, first normalize Latin and other non-English characters
   to English before comparing (resolved; see the sketch below).
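For illustration only, a minimal stand-alone sketch (a hypothetical helper,
not code from this commit) of the normalization that Suggestion 1 refers to
and that the merge code below already applies:
    import re
    import unicodedata
    def normalize(text):
        # Fold accented Latin characters to plain ASCII, drop punctuation,
        # lowercase, and collapse whitespace before strict comparison
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
        text = re.sub(r'[^\w\s]', '', text).lower()
        return re.sub(r'\s+', ' ', text).strip()
    # normalize('Universität Wien,') == normalize('universitat  wien')  -> True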
@@ -30,50 +50,167 @@ from pprint import pprint
def SameWeb_merge(folder_path):
# Helper functions
def Merge(a, count, Data):
for aa in Data:
if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and a.get(
"lastname") == aa.get("lastname") and a != aa:
def SameName_merge(i, count1, count2, Data, ml):
# Same name merge
for j in range(i+1, len(Data)):
if j < len(Data):
a = Data[i]
aa = Data[j]
# Add different key-elements of "from_article" into the first element
if isinstance(a["from_article"], list):
if isinstance(aa["from_article"], list):
a["from_article"] += aa["from_article"]
else:
a["from_article"].append(aa["from_article"])
elif isinstance(a["from_article"], str):
if isinstance(aa["from_article"], list):
a["from_article"] = [a["from_article"]] + aa["from_article"]
else:
a["from_article"] = [a["from_article"], aa["from_article"]]
if isinstance(a, dict) and isinstance(aa, dict):
if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
"middle_name") and a.get("last_name") == aa.get("last_name"):
if a['from_article'] == aa['from_article']:
Data.remove(Data[j])
count2[0] += 1
return SameName_merge(i, count1, count2, Data, ml)
# Add different key-elements of "affiliation" into the first element
a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year']) # Sort by year
else:
ml.append(aa)
# Normalize non-English characters to ASCII English
fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
faa = unicodedata.normalize('NFKD', aa['affiliation'][0]["affiliation"]).encode('ascii', 'ignore')
# Update the counter
count1[0] += 1
if count1[0] % 100 == 0 and count1[0] != 0:
print(str(count1[0]) + ' copies of data have been merged by same name.')
if fa != faa:
a['affiliation'].extend(aa['affiliation'])
elif fa == faa and a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
a['affiliation'].extend(aa['affiliation'])
# Delete extra elements
Data.remove(Data[j])
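# If the merge left two identical affiliation entries at the front, keep only one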
if len(a['affiliation']) > 1 and a['affiliation'][0] == a['affiliation'][1]:
a['affiliation'].remove(a['affiliation'][0])
return SameName_merge(i, count1, count2, Data, ml)
# Delete extra elements
Data.remove(aa)
# Detail merge
if len(ml) > 0:
ml.append(Data[i]) # Add first element
Data.remove(Data[i])
ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year')) # Sorted by year
# Update the counter
count[0] += 1
if count[0] % 100 == 0 and count[0] != 0:
print(str(count[0]) + ' copies of data have been merged.')
return Merge(a, count, Data)
# Merge same affiliation data
for i in range(len(ml)):
for j in range(i+1, len(ml)):
if j < len(ml):
m = ml[i]
n = ml[j]
if m.get('affiliation')[-1].get('affiliation') == n.get('affiliation')[0].get('affiliation'):
if m.get('affiliation')[-1].get('year') != n.get('affiliation')[0].get('year'):
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
ml.remove(ml[j])
elif m.get('affiliation')[-1].get('year') == n.get('affiliation')[0].get('year'):
m['from_article'] += n['from_article']
ml.remove(ml[j])
# Merge same email data
def SameEmail_merge(i, ml):
for j in range(i + 1, len(ml)):
if j < len(ml):
m = ml[i]
n = ml[j]
A = m.get('affiliation')
AA = n.get('affiliation')
num = 0 # Merge counter
for a in A:
if num == 0:
for aa in AA:
if a.get('email') == aa.get('email'):
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
ml.remove(n) # Delete merged element
num += 1 # Update counter
# Sorted by year
m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
break
else:
return SameEmail_merge(i, ml)
# Loop of merging data by same email
for i in range(len(ml)):
SameEmail_merge(i, ml)
# Add into Data list
if len(ml) == 1:
Data.insert(-1, ml[0])
else:
Data.insert(-1, ml)
def Similarity_merge(M):
for i in range(len(M)):
for j in range(i+1, len(M)):
if j < len(M):
m = M[i]
n = M[j]
A = m.get('affiliation')
AA = n.get('affiliation')
num = 0 # Merge counter
for a in A:
if num == 0:
for aa in AA:
# ========== Comparing document embeddings for similarity ==========
# Get the data of affiliation
text1 = a['affiliation']
text2 = aa['affiliation']
# Normalize non-English characters to ASCII English
text1 = unicodedata.normalize('NFKD', text1).encode('ascii', 'ignore').decode('utf-8')
text2 = unicodedata.normalize('NFKD', text2).encode('ascii', 'ignore').decode('utf-8')
# Delete punctuation and lower the character
text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()
# Delete space
text1 = re.sub(r'\s+', ' ', text1).strip()
text2 = re.sub(r'\s+', ' ', text2).strip()
# Load the pre-trained BERT model and tokenizer
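# Note (suggestion, not part of this commit): the tokenizer and model are re-loaded
# for every comparison; loading them once outside the loops would likely cut the
# long runtime mentioned in the header comment.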
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Tokenize and encode the text
inputs1 = tokenizer.encode_plus(text1, add_special_tokens=True, return_tensors='pt')
inputs2 = tokenizer.encode_plus(text2, add_special_tokens=True, return_tensors='pt')
# Get the embedding vectors of the text
with torch.no_grad():
outputs1 = model(**inputs1)
outputs2 = model(**inputs2)
embeddings1 = torch.mean(outputs1.last_hidden_state, dim=1).squeeze()
embeddings2 = torch.mean(outputs2.last_hidden_state, dim=1).squeeze()
# Calculate text similarity (cosine similarity)
similarity = cosine_similarity(embeddings1.unsqueeze(0), embeddings2.unsqueeze(0))[0][0]
print('Similarity algorithm complete: the similarity score is', similarity)
if similarity >= 0.8:
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
M.remove(n) # Delete merged element
num += 1 # Update counter
# Sorted by year
m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
break
else:
return Similarity_merge(M)
# ========== Main code ==========
Data = [] # Empty list
count = [0] # counter
Data = [] # List of all data
count1 = [0] # Same name merged data counter
count2 = [0] # Duplicate data counter
num1 = 0 # Unique data counter
num2 = 0 # Complete merged data counter
num3 = 0 # Incomplete merged data counter
# Add data into list
for filename in os.listdir(folder_path):
@@ -87,15 +224,70 @@ def SameWeb_merge(folder_path):
Database = len(Data) # The length of the original data
Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
# Same website data merge
# ========== Merge ==========
# ----- Same name data merge -----
ml = []
if len(Data) > 1:
for i in range(len(Data)):
ml.clear()
SameName_merge(i, count1, count2, Data, ml)
print('\n----- Same name data merge complete -----\n')
# ----- Similarity algorithm merge -----
# Change the index of incomplete data before other data
temp_list = [] # Temp list for incomplete merged data
if len(Data) > 1:
for i in range(len(Data)-1, -1, -1):
if isinstance(Data[i], list):
temp = Data[i]
Data.remove(Data[i])
temp_list.append(temp)
print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
st = time.time() # Start time
if len(temp_list) > 1:
executor = ThreadPoolExecutor(max_workers=10) # Thread pool
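# 10 worker threads: per the header comment, similarity comparison runs at roughly 50 records/minute with this setting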
futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
for future in as_completed(futures):
pass
wait(futures)
et = time.time() # End time
print('\nThread pool has been run for ' + str(et-st) + 's')
# Combine Data with temp_list
for i in temp_list:
if len(i) == 1:
Data.append(i[0])
else:
Data.append(i)
print('\n----- Similarity algorithm merge complete -----\n')
# ========== Statistic data ==========
# Data counter update
for a in Data:
Merge(a, count, Data)
if isinstance(a, dict) and len(a['from_article']) == 1:
num1 += 1
elif isinstance(a, dict) and len(a['from_article']) > 1:
num2 += 1
else:
num3 += 1
# Information
print('\n========== Complete ==========\n')
print(str(Database) + ' copies of data in total, before')
print(str(count[0]) + ' copies of data have been merged.')
print(str(len(Data)) + ' copies of data in total, now.')
print(str(count1[0]) + ' copies of data have been merged by same name.')
print(str(count2[0]) + ' copies of duplicate data have been deleted')
print(str(len(Data)) + ' copies of data in total, now.\n')
print(str(num1) + ' copies of data are unique.')
print(str(num2) + ' copies of data are complete merged')
print(str(num3) + ' copies of data are incomplete merged')
# Save into file
path = os.path.dirname(folder_path) # parent path
@@ -106,12 +298,12 @@ def SameWeb_merge(folder_path):
with open(path, 'w', encoding='utf-8') as file:
json.dump(Data, file, indent=4)
print('\nData has been added to ' + path + '\Author_data(merged).json')
print('\nData has been added to ' + path)
# =========== input the file path here ==========
# SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
# SameWeb_merge('.\ejde\ejde_buffer\Author_output')
SameWeb_merge('.\ejde\ejde_buffer\Author_output')