Full same web merge code

parent ba3671b5fd
commit e217342ce2
@@ -1,25 +1,45 @@
import json
import os
import re
import time

import unicodedata
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed, wait

from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

'''
========== SameWeb_merge(folder_path) execution order ==========
1. Create a new list, read the JSON files in the folder into it one by one, and sort the list by year.

2. Merge in stages ("gradient merge"): elements that still need merging are placed into nested lists according to
   how confident the merge is, e.g. [[[a, b], c], d, e]; under each successive merge condition the outer elements
   are gradually merged into the inner ones. Elements that cannot be merged with full confidence at the end are
   kept together as a single list (a short sketch of this nesting follows the docstring).

3. Merge order: authors with the same name are merged first (different authors sharing the same email address have
   been observed, so same-name merging takes priority within a single website); a second merge then checks whether
   affiliation and email are exactly identical; finally, text embeddings are used for similarity comparison, and
   records with a similarity above 0.8 are merged in a third pass.

4. The similarity comparison uses a pre-trained BERT model, which must be installed in advance. This lengthens the
   run time considerably: with 10 threads the similarity comparison handles roughly 50 records per minute. The
   final accuracy looks fairly good, but for large datasets (>1500 records) processing may be slow; optimizing the
   code structure or other approaches to improve efficiency can be considered later.

5. Most functions traverse the data with the structure
       for i in range(len(Data)):
           Merge(i, Data):
               for j in range(i+1, len(Data)):
   Whenever a qualifying Data[j] element is found, it is added into Data[i], Data[j] is deleted, and Merge is
   called again to traverse from the start, until Data no longer contains a qualifying Data[j] element.

*Tips: On some websites different authors share the same email account, so when merging within one website,
same-name merging is performed first, followed by same-email merging and the later steps.

Suggestions:
1. In the merge step, first normalize Latin letters and other non-English characters to plain English characters
   before comparing (resolved).
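A minimal sketch (not part of the committed file) of the stacked list shape described in step 2 of the docstring: records merged with high confidence sit deeper in the nesting, e.g. [[[a, b], c], d, e]. The helper below only illustrates how such a group can be walked back into a flat list of records; the name flatten_group and the records a-e are hypothetical.

def flatten_group(group):
    # Recursively pull nested sub-groups out into one flat list of author records.
    flat = []
    for item in group:
        if isinstance(item, list):
            flat.extend(flatten_group(item))
        else:
            flat.append(item)
    return flat

# Example: flatten_group([[['a', 'b'], 'c'], 'd', 'e']) returns ['a', 'b', 'c', 'd', 'e'].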
@@ -30,50 +50,167 @@ from pprint import pprint

def SameWeb_merge(folder_path):
    # Function
    def SameName_merge(i, count1, count2, Data, ml):
        # Same name merge
        for j in range(i+1, len(Data)):
            if j < len(Data):
                a = Data[i]
                aa = Data[j]

                if isinstance(a, dict) and isinstance(aa, dict):
                    if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
                        if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
                                "middle_name") and a.get("last_name") == aa.get("last_name"):
                            if a['from_article'] == aa['from_article']:
                                Data.remove(Data[j])
                                count2[0] += 1
                                return SameName_merge(i, count1, count2, Data, ml)

                            else:
                                ml.append(aa)

                                # Update the counter
                                count1[0] += 1
                                if count1[0] % 100 == 0 and count1[0] != 0:
                                    print(str(count1[0]) + ' copies of data have been merged by same name.')

                                # Delete extra elements
                                Data.remove(Data[j])

                                return SameName_merge(i, count1, count2, Data, ml)

        # Detail merge
        if len(ml) > 0:
            ml.append(Data[i])  # Add first element
            Data.remove(Data[i])
            ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year'))  # Sorted by year

            # Merge same affiliation data
            for i in range(len(ml)):
                for j in range(i+1, len(ml)):
                    if j < len(ml):
                        m = ml[i]
                        n = ml[j]

                        if m.get('affiliation')[-1].get('affiliation') == n.get('affiliation')[0].get('affiliation'):
                            if m.get('affiliation')[-1].get('year') != n.get('affiliation')[0].get('year'):
                                m['from_article'] += n['from_article']
                                m['affiliation'] += n['affiliation']
                                ml.remove(ml[j])

                            elif m.get('affiliation')[-1].get('year') == n.get('affiliation')[0].get('year'):
                                m['from_article'] += n['from_article']
                                ml.remove(ml[j])

            # Merge same email data
            def SameEmail_merge(i, ml):
                for j in range(i + 1, len(ml)):
                    if j < len(ml):
                        m = ml[i]
                        n = ml[j]
                        A = m.get('affiliation')
                        AA = n.get('affiliation')
                        num = 0  # Merge counter

                        for a in A:
                            if num == 0:
                                for aa in AA:
                                    if a.get('email') == aa.get('email'):
                                        m['from_article'] += n['from_article']
                                        m['affiliation'] += n['affiliation']

                                        ml.remove(n)  # Delete merged element
                                        num += 1  # Update counter

                                        # Sorted by year
                                        m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
                                        break
                            else:
                                return SameEmail_merge(i, ml)

            # Loop of merging data by same email
            for i in range(len(ml)):
                SameEmail_merge(i, ml)

            # Add into Data list
            if len(ml) == 1:
                Data.insert(-1, ml[0])
            else:
                Data.insert(-1, ml)

    def Similarity_merge(M):
        for i in range(len(M)):
            for j in range(i+1, len(M)):
                if j < len(M):
                    m = M[i]
                    n = M[j]
                    A = m.get('affiliation')
                    AA = n.get('affiliation')
                    num = 0  # Merge counter

                    for a in A:
                        if num == 0:
                            for aa in AA:
                                # ========== Comparing document embeddings for similarity ==========
                                # Get the data of affiliation
                                text1 = a['affiliation']
                                text2 = aa['affiliation']

                                # Uniform characters in English
                                text1 = unicodedata.normalize('NFKD', text1).encode('ascii', 'ignore').decode('utf-8')
                                text2 = unicodedata.normalize('NFKD', text2).encode('ascii', 'ignore').decode('utf-8')

                                # Delete punctuation and lowercase the characters
                                text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
                                text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()

                                # Collapse whitespace
                                text1 = re.sub(r'\s+', ' ', text1).strip()
                                text2 = re.sub(r'\s+', ' ', text2).strip()

                                # Load the pre-trained BERT model and tokenizer
                                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                                model = BertModel.from_pretrained('bert-base-uncased')

                                # Tokenize and encode the text
                                inputs1 = tokenizer.encode_plus(text1, add_special_tokens=True, return_tensors='pt')
                                inputs2 = tokenizer.encode_plus(text2, add_special_tokens=True, return_tensors='pt')

                                # Get the embedding vectors of the text
                                with torch.no_grad():
                                    outputs1 = model(**inputs1)
                                    outputs2 = model(**inputs2)
                                embeddings1 = torch.mean(outputs1.last_hidden_state, dim=1).squeeze()
                                embeddings2 = torch.mean(outputs2.last_hidden_state, dim=1).squeeze()

                                # Calculate text similarity (cosine similarity)
                                similarity = cosine_similarity(embeddings1.unsqueeze(0), embeddings2.unsqueeze(0))[0][0]
                                print('Similarity algorithm complete: the similarity score is', similarity)

                                if similarity >= 0.8:
                                    m['from_article'] += n['from_article']
                                    m['affiliation'] += n['affiliation']

                                    M.remove(n)  # Delete merged element
                                    num += 1  # Update counter

                                    # Sorted by year
                                    m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
                                    break

                        else:
                            return Similarity_merge(M)

    # ========== Main code ==========
    Data = []  # List of all data

    count1 = [0]  # Same name merged data counter
    count2 = [0]  # Duplicate data counter

    num1 = 0  # Unique data counter
    num2 = 0  # Complete merged data counter
    num3 = 0  # Incomplete merged data counter

    # Add data into list
    for filename in os.listdir(folder_path):
@@ -87,15 +224,70 @@ def SameWeb_merge(folder_path):
    Database = len(Data)  # The length of the original data
    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])

    # ========== Merge ==========
    # ----- Same name data merge -----
    ml = []
    if len(Data) > 1:
        for i in range(len(Data)):
            ml.clear()
            SameName_merge(i, count1, count2, Data, ml)

    print('\n----- Same name data merge complete -----\n')

    # ----- Similarity algorithm merge -----
    # Move the incompletely merged (still nested) groups out of Data into a temporary list
    temp_list = []  # Temp list for incomplete merged data

    if len(Data) > 1:
        for i in range(len(Data)-1, -1, -1):
            if isinstance(Data[i], list):
                temp = Data[i]
                Data.remove(Data[i])
                temp_list.append(temp)

    print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')

    st = time.time()  # Start time

    if len(temp_list) > 1:
        executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
        futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
        for future in as_completed(futures):
            pass
        wait(futures)

    et = time.time()  # End time
    print('\nThread pool has been run for ' + str(et-st) + 's')

    # Combine Data with temp_list
    for i in temp_list:
        if len(i) == 1:
            Data.append(i[0])
        else:
            Data.append(i)

    print('\n----- Similarity algorithm merge complete -----\n')

    # ========== Statistic data ==========
    # Data counter update
    for a in Data:
        if isinstance(a, dict) and len(a['from_article']) == 1:
            num1 += 1
        elif isinstance(a, dict) and len(a['from_article']) > 1:
            num2 += 1
        else:
            num3 += 1

    # Information
    print('\n========== Complete ==========\n')
    print(str(Database) + ' copies of data in total, before')
    print(str(count1[0]) + ' copies of data have been merged by same name.')
    print(str(count2[0]) + ' copies of duplicate data have been deleted')
    print(str(len(Data)) + ' copies of data in total, now.\n')

    print(str(num1) + ' copies of data are unique.')
    print(str(num2) + ' copies of data are complete merged')
    print(str(num3) + ' copies of data are incomplete merged')

    # Save into file
    path = os.path.dirname(folder_path)  # parent path
@@ -106,12 +298,12 @@ def SameWeb_merge(folder_path):
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(Data, file, indent=4)

    print('\nData has been added to ' + path)

# =========== input the file path here ==========
# SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
SameWeb_merge('.\ejde\ejde_buffer\Author_output')
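A hedged follow-up to point 4 of the docstring (the similarity pass runs at roughly 50 records per minute on 10 threads): Similarity_merge reloads BertTokenizer and BertModel for every pair of affiliations, which is likely where most of the time goes. The sketch below assumes the same 'bert-base-uncased' checkpoint and shows one way to load the model once and reuse it; the helper name embed_affiliation is hypothetical and not part of this commit.

def embed_affiliation(text, _cache={}):
    # Load the BERT tokenizer and model only once and reuse them across calls.
    if 'model' not in _cache:
        _cache['tokenizer'] = BertTokenizer.from_pretrained('bert-base-uncased')
        _cache['model'] = BertModel.from_pretrained('bert-base-uncased')
        _cache['model'].eval()
    inputs = _cache['tokenizer'].encode_plus(text, add_special_tokens=True, return_tensors='pt')
    with torch.no_grad():
        outputs = _cache['model'](**inputs)
    # Mean-pool the last hidden state, matching what Similarity_merge computes per text.
    return torch.mean(outputs.last_hidden_state, dim=1).squeeze()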