import json
import os
import re
import time
import unicodedata
from concurrent.futures import ThreadPoolExecutor, wait

import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

'''
========== SameWeb_merge(folder_path): processing order ==========
1. Build a new list, read every JSON file in the folder into it, and sort the
   list by year.
2. Merge in stages: elements that need to be merged are placed into nested
   lists according to how confident the merge is, e.g. [[[a, b], c], d, e];
   each pass merges the outer elements into the inner ones under a different
   condition. Elements that cannot be merged with full confidence at the end
   are kept together as a single list.
3. Merge order: records with the same author name are merged first (different
   authors were found sharing the same email address, so same-name merging is
   preferred within a single website); a second merge is then done on records
   whose affiliation and email are exactly identical; finally text embeddings
   are compared for similarity, and records with a similarity score of 0.8 or
   higher are merged in a third pass.
4. The similarity comparison uses a pre-trained BERT model, which must be
   installed in advance. This lengthens the run time considerably: with 10
   threads the similarity comparison handles roughly 50 records per minute,
   but the final accuracy looks fairly good. For large datasets (>1500
   records) processing may be slow; the code structure or the approach could
   be optimized later to improve efficiency.
5. Most functions traverse the data with the structure
       for i in range(len(Data)):
           Merge(i, Data):
               for j in range(i+1, len(Data)):
   Whenever a Data[j] element that satisfies the condition is found, it is
   added to the Data[i] element, Data[j] is deleted, and Merge is called again
   to rescan, until Data no longer contains any matching Data[j] element.

*Tips: on some websites different authors share the same email account, so
when merging within one website, same-name merging is performed first,
followed by same-email merging and the later steps.

Suggestions:
1. Before comparison, normalize Latin-script and other non-English characters
   to plain English characters (resolved).
2. Pre-process records that add arbitrary punctuation or extra information
   before comparing (resolved: extra spaces and all punctuation are removed
   before the similarity comparison).
3. The similarity comparison needs a faster approach; comparing with the
   pre-trained model takes too long and CPU usage is high.
'''
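
# ----- Illustrative input record (assumed shape) -----
# The input schema is not documented in this file; the shape below is only
# inferred from the fields that SameWeb_merge reads, so real files may carry
# extra keys. Each JSON file in folder_path is expected to contain a list of
# author records roughly like this:
#
# {
#     "first_name": "John",
#     "middle_name": "",
#     "last_name": "Doe",
#     "from_article": ["article-id-1"],
#     "affiliation": [
#         {
#             "year": 2015,
#             "affiliation": "Department of Mathematics, Example University",
#             "email": "jdoe@example.edu"
#         }
#     ]
# }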


def SameWeb_merge(folder_path):
    # Nested helper: merge records that share the same author name
    def SameName_merge(i, count1, count2, Data, ml):
        # Same name merge
        for j in range(i+1, len(Data)):
            if j < len(Data):
                a = Data[i]
                aa = Data[j]
                if isinstance(a, dict) and isinstance(aa, dict):
                    if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
                        if a.get("first_name") == aa.get("first_name") \
                                and a.get("middle_name") == aa.get("middle_name") \
                                and a.get("last_name") == aa.get("last_name"):
                            if a['from_article'] == aa['from_article']:
                                # Remove duplicate data (same name and same article)
                                Data.remove(Data[j])
                                count2[0] += 1
                                return SameName_merge(i, count1, count2, Data, ml)
                            else:
                                ml.append(aa)
                                # Update the counter
                                count1[0] += 1
                                if count1[0] % 100 == 0 and count1[0] != 0:
                                    print(str(count1[0]) + ' copies of data have been merged by same name.')
                                # Delete the extra element and rescan
                                Data.remove(Data[j])
                                return SameName_merge(i, count1, count2, Data, ml)

        # Detail merge
        if len(ml) > 0:
            ml.append(Data[i])  # Add the first element
            Data.remove(Data[i])
            ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year'))  # Sort by year

            # Merge data with the same affiliation
            for i in range(len(ml)):
                for j in range(i+1, len(ml)):
                    if j < len(ml):
                        m = ml[i]
                        n = ml[j]
                        if m.get('affiliation')[-1].get('affiliation') == n.get('affiliation')[0].get('affiliation'):
                            if m.get('affiliation')[-1].get('year') != n.get('affiliation')[0].get('year'):
                                m['from_article'] += n['from_article']
                                m['affiliation'] += n['affiliation']
                                ml.remove(ml[j])
                            elif m.get('affiliation')[-1].get('year') == n.get('affiliation')[0].get('year'):
                                m['from_article'] += n['from_article']
                                ml.remove(ml[j])

            # Merge data with the same email
            def SameEmail_merge(i, ml):
                for j in range(i + 1, len(ml)):
                    if j < len(ml):
                        m = ml[i]
                        n = ml[j]
                        A = m.get('affiliation')
                        AA = n.get('affiliation')
                        num = 0  # Merge counter
                        for a in A:
                            if num == 0:
                                for aa in AA:
                                    # Missing or empty emails are not treated as a match
                                    if a.get('email') and a.get('email') == aa.get('email'):
                                        m['from_article'] += n['from_article']
                                        m['affiliation'] += n['affiliation']
                                        ml.remove(n)  # Delete the merged element
                                        num += 1  # Update the counter
                                        # Sort by year
                                        m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
                                        break
                            else:
                                return SameEmail_merge(i, ml)

            # Loop of merging data by same email
            for i in range(len(ml)):
                SameEmail_merge(i, ml)

            # Add the group back into the Data list
            if len(ml) == 1:
                Data.insert(-1, ml[0])
            else:
                Data.insert(-1, ml)

    # Nested helper: merge records whose affiliations are semantically similar
    def Similarity_merge(M):
        for i in range(len(M)):
            for j in range(i+1, len(M)):
                if j < len(M):
                    m = M[i]
                    n = M[j]
                    A = m.get('affiliation')
                    AA = n.get('affiliation')
                    num = 0  # Merge counter
                    for a in A:
                        if num == 0:
                            for aa in AA:
                                # ========== Comparing document embeddings for similarity ==========
                                # Get the affiliation strings
                                text1 = a['affiliation']
                                text2 = aa['affiliation']
                                # Normalize non-English (e.g. accented Latin) characters to ASCII
                                text1 = unicodedata.normalize('NFKD', text1).encode('ascii', 'ignore').decode('utf-8')
                                text2 = unicodedata.normalize('NFKD', text2).encode('ascii', 'ignore').decode('utf-8')
                                # Delete punctuation and lowercase the text
                                text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
                                text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()
                                # Collapse extra spaces
                                text1 = re.sub(r'\s+', ' ', text1).strip()
                                text2 = re.sub(r'\s+', ' ', text2).strip()
                                # Load the pre-trained BERT model and tokenizer
                                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                                model = BertModel.from_pretrained('bert-base-uncased')
                                # Tokenize and encode the text
                                inputs1 = tokenizer.encode_plus(text1, add_special_tokens=True, return_tensors='pt')
                                inputs2 = tokenizer.encode_plus(text2, add_special_tokens=True, return_tensors='pt')
                                # Get the embedding vectors of the text
                                with torch.no_grad():
                                    outputs1 = model(**inputs1)
                                    outputs2 = model(**inputs2)
                                    embeddings1 = torch.mean(outputs1.last_hidden_state, dim=1).squeeze()
                                    embeddings2 = torch.mean(outputs2.last_hidden_state, dim=1).squeeze()
                                # Calculate text similarity (cosine similarity)
                                similarity = cosine_similarity(embeddings1.unsqueeze(0), embeddings2.unsqueeze(0))[0][0]
                                print('Similarity algorithm complete: the similarity score is', similarity)
                                if similarity >= 0.8:
                                    m['from_article'] += n['from_article']
                                    m['affiliation'] += n['affiliation']
                                    M.remove(n)  # Delete the merged element
                                    num += 1  # Update the counter
                                    # Sort by year
                                    m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
                                    break
                        else:
                            return Similarity_merge(M)
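
    # ----- Optional sketch: cache the BERT model (not wired into the merge flow) -----
    # Suggestion 3 in the module docstring notes that the similarity step is
    # slow; Similarity_merge above reloads the tokenizer and model for every
    # pair of records. A minimal sketch of one possible fix, assuming the same
    # 'bert-base-uncased' checkpoint: load both objects once, cache them, and
    # embed each cleaned affiliation string through a single helper.
    def Embed_affiliation(text, _cache={}):
        # Lazily load and cache the tokenizer/model on first use
        if 'model' not in _cache:
            _cache['tokenizer'] = BertTokenizer.from_pretrained('bert-base-uncased')
            _cache['model'] = BertModel.from_pretrained('bert-base-uncased')
        inputs = _cache['tokenizer'].encode_plus(text, add_special_tokens=True, return_tensors='pt')
        with torch.no_grad():
            outputs = _cache['model'](**inputs)
        # Mean-pool the last hidden state into one vector per text
        return torch.mean(outputs.last_hidden_state, dim=1).squeeze()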

    # ========== Main code ==========
    Data = []     # List of all data
    count1 = [0]  # Same-name merged data counter
    count2 = [0]  # Duplicate data counter
    num1 = 0      # Unique data counter
    num2 = 0      # Completely merged data counter
    num3 = 0      # Incompletely merged data counter
    num4 = 0      # Similarity-algorithm merged data counter

    # Read every JSON file in the folder into the list
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if len(data) > 0:
                    Data.extend(data)
    Database = len(Data)  # The length of the original data
    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])  # Sort by year

    # ========== Merge ==========
    # ----- Same name data merge -----
    ml = []
    if len(Data) > 1:
        for i in range(len(Data)):
            ml.clear()
            SameName_merge(i, count1, count2, Data, ml)
    print('\n----- Same name data merge complete -----\n')

    # ----- Similarity algorithm merge -----
    # Move the incompletely merged groups (nested lists) into a temporary list
    temp_list = []  # Temp list for incompletely merged data
    if len(Data) > 1:
        for i in range(len(Data)-1, -1, -1):
            if isinstance(Data[i], list):
                temp = Data[i]
                Data.remove(Data[i])
                temp_list.append(temp)
    print(str(len(temp_list)) + ' copies of incompletely merged data have been added to the temporary list\n')

    st = time.time()  # Start time
    if len(temp_list) > 1:
        # Run Similarity_merge on each group in a thread pool (10 workers)
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(Similarity_merge, group) for group in temp_list]
            wait(futures)  # Block until every group has been processed
    et = time.time()  # End time
    print('\nThread pool has been run for ' + str(et-st) + 's')

    # Combine Data with temp_list
    for i in temp_list:
        if len(i) == 1:
            num4 += 1
            Data.insert(0, i[0])
        else:
            Data.insert(-1, i)
    print('\n----- Similarity algorithm merge complete -----\n')

    # ========== Statistics ==========
    # Update the data counters
    for a in Data:
        if isinstance(a, dict) and len(a['from_article']) == 1:
            num1 += 1
        elif isinstance(a, dict) and len(a['from_article']) > 1:
            num2 += 1
        else:
            num3 += 1

    # Information
    print('\n========== Complete ==========\n')
    print(str(Database) + ' copies of data in total, before merging.')
    print(str(count1[0]) + ' copies of data have been merged by same name.')
    print(str(count2[0]) + ' copies of duplicate data have been deleted.')
    print(str(len(Data)) + ' copies of data in total, now.\n')
    print(str(num1) + ' copies of data are unique.')
    print(str(num2) + ' copies of data are completely merged.')
    print(str(num4) + ' copies of data are completely merged by the similarity algorithm.')
    print(str(num3) + ' copies of data are incompletely merged.')

    # Save into file
    path = os.path.dirname(folder_path)  # Parent path
    path = os.path.join(path, "Author_data(merged)")
    os.makedirs(path, exist_ok=True)
    path = os.path.join(path, "Author_data(merged).json")
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(Data, file, indent=4)
    print('\nData has been added to ' + path)


# =========== Input the folder path here ==========
# SameWeb_merge(r'.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge(r'.\SpringerOpen\SpringerOpen_buffer\Author_output')
SameWeb_merge(r'.\ejde\ejde_buffer\Author_output')
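

# ----- Optional sanity check (illustrative) -----
# Mirrors the output path that SameWeb_merge builds for the ejde run above;
# adjust the folder if a different call is used.
merged_path = os.path.join(r'.\ejde\ejde_buffer', 'Author_data(merged)', 'Author_data(merged).json')
if os.path.exists(merged_path):
    with open(merged_path, 'r', encoding='utf-8') as file:
        print(str(len(json.load(file))) + ' merged records found in ' + merged_path)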