Full same web merge code

XCX 2023-08-20 00:07:47 +08:00
parent ba3671b5fd
commit e217342ce2


@@ -1,25 +1,45 @@
import json
import os
import re
import time
import unicodedata
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
'''
========== SameWeb_merge(folder_path) execution order ==========
1. Create a new list, read the JSON files in the folder one by one into
   the list, and sort the list by year.
2. Traverse with the structure
   for a in Data:
       Merge(a, count, Data):
           for aa in Data:
   Each time an element aa that meets the merge conditions is found, it is
   added to element a, aa is removed, and Merge is called again to
   re-traverse, until Data no longer contains any qualifying aa element.
3. Finally, every element in the Data list has been merged by identical
   author.
*Tips: Merging author information from the same website is done by checking
   whether key values are identical. This works well for websites with a
   fairly regular data format, but poorly for websites with an irregular
   format. Typical failures: words with the same meaning written in a mix
   of Latin script and English cannot be told apart, and synonyms with
   arbitrary punctuation or extra information make the strict comparison
   unable to decide whether the key values are the same.
2. A gradient merge strategy is used: elements that need merging are placed
   into nested lists according to how confident the merge is, in the form
   [[[a,b],c],d,e]. On each pass the outer elements are gradually merged
   into the inner elements under different merge conditions; elements that
   cannot be merged with full confidence are kept together as a single
   list at the end.
3. Merge order: same author name is merged first. Different authors were
   found sharing the same email address, so name-based merging takes
   priority within a single website. A second merge then checks whether
   the affiliation and email are exactly identical, and a third merge uses
   text-embedding similarity, merging data whose similarity is above 0.8.
4. The similarity comparison uses a pre-trained BERT model, which must be
   installed in advance. Runtime grows considerably because of this: with
   10 threads, similarity comparison runs at roughly 50 records/minute,
   but the final accuracy is fairly good. For large datasets (>1500
   records) processing may be slow; optimizing the code structure or other
   approaches could improve efficiency later.
5. Most functions traverse with the structure
   for i in range(len(Data)):
       Merge(i, Data):
           for j in range(i+1, len(Data)):
   Each time an element Data[j] that meets the conditions is found, it is
   added to Data[i], Data[j] is removed, and Merge is called again to
   re-traverse, until Data no longer contains any qualifying Data[j]
   element.
*Tips: On some websites different authors share the same email account, so
   when merging within one website, same-name merging is done first,
   followed by same-email merging and the remaining steps.
Suggestions:
1. In the merge step, first normalize Latin and other non-English characters
   to English before comparing (resolved; see the sketch below).
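For illustration only, a minimal stand-alone sketch (a hypothetical helper,
not code from this commit) of the normalization that Suggestion 1 refers to
and that the merge code below already applies:
    import re
    import unicodedata
    def normalize(text):
        # Fold accented Latin characters to plain ASCII, drop punctuation,
        # lowercase, and collapse whitespace before strict comparison
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
        text = re.sub(r'[^\w\s]', '', text).lower()
        return re.sub(r'\s+', ' ', text).strip()
    # normalize('Universität Wien,') == normalize('universitat  wien')  -> True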
@@ -30,50 +50,167 @@ from pprint import pprint
def SameWeb_merge(folder_path):
# Helper functions
def Merge(a, count, Data):
for aa in Data:
if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and a.get(
"lastname") == aa.get("lastname") and a != aa:
def SameName_merge(i, count1, count2, Data, ml):
# Same name merge
for j in range(i+1, len(Data)):
if j < len(Data):
a = Data[i]
aa = Data[j]
# Add different key-elements of "from_article" into the first element
if isinstance(a["from_article"], list):
if isinstance(aa["from_article"], list):
a["from_article"] += aa["from_article"]
else:
a["from_article"].append(aa["from_article"])
elif isinstance(a["from_article"], str):
if isinstance(aa["from_article"], list):
a["from_article"] = [a["from_article"]] + aa["from_article"]
else:
a["from_article"] = [a["from_article"], aa["from_article"]]
if isinstance(a, dict) and isinstance(aa, dict):
if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
"middle_name") and a.get("last_name") == aa.get("last_name"):
if a['from_article'] == aa['from_article']:
Data.remove(Data[j])
count2[0] += 1
return SameName_merge(i, count1, count2, Data, ml)
# Add different key-elements of "affiliation" into the first element
a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year']) # Sort by year
else:
ml.append(aa)
# Normalize non-English characters to ASCII English
fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
faa = unicodedata.normalize('NFKD', aa['affiliation'][0]["affiliation"]).encode('ascii', 'ignore')
# Update the counter
count1[0] += 1
if count1[0] % 100 == 0 and count1[0] != 0:
print(str(count1[0]) + ' copies of data have been merged by same name.')
if fa != faa:
a['affiliation'].extend(aa['affiliation'])
elif fa == faa and a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
a['affiliation'].extend(aa['affiliation'])
# Delete extra elements
Data.remove(Data[j])
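# If the merge left two identical affiliation entries at the front, keep only one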
if len(a['affiliation']) > 1 and a['affiliation'][0] == a['affiliation'][1]:
a['affiliation'].remove(a['affiliation'][0])
return SameName_merge(i, count1, count2, Data, ml)
# Delete extra elements
Data.remove(aa)
# Detail merge
if len(ml) > 0:
ml.append(Data[i]) # Add first element
Data.remove(Data[i])
ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year')) # Sorted by year
# Update the counter
count[0] += 1
if count[0] % 100 == 0 and count[0] != 0:
print(str(count[0]) + ' copies of data have been merged.')
return Merge(a, count, Data)
# Merge same affiliation data
for i in range(len(ml)):
for j in range(i+1, len(ml)):
if j < len(ml):
m = ml[i]
n = ml[j]
if m.get('affiliation')[-1].get('affiliation') == n.get('affiliation')[0].get('affiliation'):
if m.get('affiliation')[-1].get('year') != n.get('affiliation')[0].get('year'):
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
ml.remove(ml[j])
elif m.get('affiliation')[-1].get('year') == n.get('affiliation')[0].get('year'):
m['from_article'] += n['from_article']
ml.remove(ml[j])
# Merge same email data
def SameEmail_merge(i, ml):
for j in range(i + 1, len(ml)):
if j < len(ml):
m = ml[i]
n = ml[j]
A = m.get('affiliation')
AA = n.get('affiliation')
num = 0 # Merge counter
for a in A:
if num == 0:
for aa in AA:
if a.get('email') == aa.get('email'):
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
ml.remove(n) # Delete merged element
num += 1 # Update counter
# Sorted by year
m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
break
else:
return SameEmail_merge(i, ml)
# Loop of merging data by same email
for i in range(len(ml)):
SameEmail_merge(i, ml)
# Add into Data list
if len(ml) == 1:
Data.insert(-1, ml[0])
else:
Data.insert(-1, ml)
def Similarity_merge(M):
for i in range(len(M)):
for j in range(i+1, len(M)):
if j < len(M):
m = M[i]
n = M[j]
A = m.get('affiliation')
AA = n.get('affiliation')
num = 0 # Merge counter
for a in A:
if num == 0:
for aa in AA:
# ========== Comparing document embeddings for similarity ==========
# Get the data of affiliation
text1 = a['affiliation']
text2 = aa['affiliation']
# Normalize non-English characters to ASCII English
text1 = unicodedata.normalize('NFKD', text1).encode('ascii', 'ignore').decode('utf-8')
text2 = unicodedata.normalize('NFKD', text2).encode('ascii', 'ignore').decode('utf-8')
# Delete punctuation and lower the character
text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()
# Delete space
text1 = re.sub(r'\s+', ' ', text1).strip()
text2 = re.sub(r'\s+', ' ', text2).strip()
# Load the pre-trained BERT model and tokenizer
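# Note (suggestion, not part of this commit): the tokenizer and model are re-loaded
# for every comparison; loading them once outside the loops would likely cut the
# long runtime mentioned in the header comment.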
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Tokenize and encode the text
inputs1 = tokenizer.encode_plus(text1, add_special_tokens=True, return_tensors='pt')
inputs2 = tokenizer.encode_plus(text2, add_special_tokens=True, return_tensors='pt')
# Get the embedding vectors of the text
with torch.no_grad():
outputs1 = model(**inputs1)
outputs2 = model(**inputs2)
embeddings1 = torch.mean(outputs1.last_hidden_state, dim=1).squeeze()
embeddings2 = torch.mean(outputs2.last_hidden_state, dim=1).squeeze()
# Calculate text similarity (cosine similarity)
similarity = cosine_similarity(embeddings1.unsqueeze(0), embeddings2.unsqueeze(0))[0][0]
print('Similarity algorithm complete: the similarity score is', similarity)
if similarity >= 0.8:
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
M.remove(n) # Delete merged element
num += 1 # Update counter
# Sorted by year
m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
break
else:
return Similarity_merge(M)
# ========== Main code ==========
Data = [] # Empty list
count = [0] # counter
Data = [] # List of all data
count1 = [0] # Same name merged data counter
count2 = [0] # Duplicate data counter
num1 = 0 # Unique data counter
num2 = 0 # Complete merged data counter
num3 = 0 # Incomplete merged data counter
# Add data into list
for filename in os.listdir(folder_path):
@@ -87,15 +224,70 @@ def SameWeb_merge(folder_path):
Database = len(Data) # The length of the original data
Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
# Same website data merge
# ========== Merge ==========
# ----- Same name data merge -----
ml = []
if len(Data) > 1:
for i in range(len(Data)):
ml.clear()
SameName_merge(i, count1, count2, Data, ml)
print('\n----- Same name data merge complete -----\n')
# ----- Similarity algorithm merge -----
# Change the index of incomplete data before other data
temp_list = [] # Temp list for incomplete merged data
if len(Data) > 1:
for i in range(len(Data)-1, -1, -1):
if isinstance(Data[i], list):
temp = Data[i]
Data.remove(Data[i])
temp_list.append(temp)
print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
st = time.time() # Start time
if len(temp_list) > 1:
executor = ThreadPoolExecutor(max_workers=10) # Thread pool
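# 10 worker threads: per the header comment, similarity comparison runs at roughly 50 records/minute with this setting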
futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
for future in as_completed(futures):
pass
wait(futures)
et = time.time() # End time
print('\nThread pool has been run for ' + str(et-st) + 's')
# Combine Data with temp_list
for i in temp_list:
if len(i) == 1:
Data.append(i[0])
else:
Data.append(i)
print('\n----- Similarity algorithm merge complete -----\n')
# ========== Statistic data ==========
# Data counter update
for a in Data:
Merge(a, count, Data)
if isinstance(a, dict) and len(a['from_article']) == 1:
num1 += 1
elif isinstance(a, dict) and len(a['from_article']) > 1:
num2 += 1
else:
num3 += 1
# Information
print('\n========== Complete ==========\n')
print(str(Database) + ' copies of data in total, before')
print(str(count[0]) + ' copies of data have been merged.')
print(str(len(Data)) + ' copies of data in total, now.')
print(str(count1[0]) + ' copies of data have been merged by same name.')
print(str(count2[0]) + ' copies of duplicate data have been deleted')
print(str(len(Data)) + ' copies of data in total, now.\n')
print(str(num1) + ' copies of data are unique.')
print(str(num2) + ' copies of data are complete merged')
print(str(num3) + ' copies of data are incomplete merged')
# Save into file
path = os.path.dirname(folder_path) # parent path
@@ -106,12 +298,12 @@ def SameWeb_merge(folder_path):
with open(path, 'w', encoding='utf-8') as file:
json.dump(Data, file, indent=4)
print('\nData has been added to ' + path + '\Author_data(merged).json')
print('\nData has been added to ' + path)
# =========== input the file path here ==========
# SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
# SameWeb_merge('.\ejde\ejde_buffer\Author_output')
SameWeb_merge('.\ejde\ejde_buffer\Author_output')