import json
import os
import re
import time
import unicodedata
import torch
from concurrent.futures import ThreadPoolExecutor, wait
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
'''
========== SameWeb_merge(folder_path) execution order ==========
1. Build a list by reading every JSON file in the folder, then sort the
   records by year.
2. Merge in tiers: records that need merging are grouped into nested lists
   according to how confident the merge is, e.g. [[[a, b], c], d, e]. Each
   pass folds the outer elements into the inner ones under different merge
   conditions. Records that still cannot be merged with full confidence at
   the end are kept together as a single list.
3. Merge order: same-name merging comes first (different authors were found
   sharing the same email address, so within one website, records with the
   same name are merged before anything else). A second pass then merges
   records whose affiliation and email match exactly. Finally, a text-
   embedding similarity comparison merges records whose similarity exceeds
   0.8 in a third pass.
4. The similarity comparison uses a pre-trained BERT model, which must be
   installed in advance. This lengthens the runtime considerably: with 10
   threads the comparison handles roughly 50 records per minute, but the
   final results show fairly good accuracy. For large datasets (>1500
   records) processing may be slow; optimizing the code structure or other
   approaches could improve efficiency later.
5. Most functions traverse the data with the structure
       for i in range(len(Data)):
           Merge(i, Data):
               for j in range(i+1, len(Data)):
   Whenever a Data[j] matching the merge condition is found, it is folded
   into Data[i], Data[j] is deleted, and the Merge function is called again
   to re-scan until Data no longer contains a matching Data[j].
*Tips: On some websites different authors share one email account, so when
   merging within the same website, merge by same name first, then by same
   email, before the later passes.
Suggestions:
1. In the merge step, normalize Latin-script and other non-English
   characters to plain English before comparing (solved).
2. Locally clean records that add arbitrary punctuation or extra
   information before comparing (solved: extra spaces and all punctuation
   are removed before the similarity comparison).
3. The similarity comparison needs a faster approach; comparing through the
   pre-trained model takes too long and CPU usage is also high.
'''
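# A minimal illustration (hypothetical values, not real scraped data) of the
# record shape this merger assumes; the field names match the accesses made in
# the functions below.
EXAMPLE_RECORD = {
    "first_name": "Jane",
    "middle_name": "",
    "last_name": "Doe",
    "from_article": ["article_0001"],
    "affiliation": [
        {"affiliation": "Department of Mathematics, Example University",
         "email": "jdoe@example.edu",
         "year": 2019},
    ],
}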
def SameWeb_merge(folder_path):
    # ----- Helper functions -----
def SameName_merge(i, count1, count2, Data, ml):
# Same name merge
for j in range(i+1, len(Data)):
if j < len(Data):
a = Data[i]
aa = Data[j]
if isinstance(a, dict) and isinstance(aa, dict):
if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
"middle_name") and a.get("last_name") == aa.get("last_name"):
if a['from_article'] == aa['from_article']: # Remove same data
                                del Data[j]
count2[0] += 1
return SameName_merge(i, count1, count2, Data, ml)
else:
ml.append(aa)
# Update the counter
count1[0] += 1
if count1[0] % 100 == 0 and count1[0] != 0:
print(str(count1[0]) + ' copies of data have been merged by same name.')
                                # Delete the merged element
                                del Data[j]
return SameName_merge(i, count1, count2, Data, ml)
# Detail merge
if len(ml) > 0:
ml.append(Data[i]) # Add first element
            del Data[i]
ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year')) # Sorted by year
# Merge same affiliation data
for i in range(len(ml)):
for j in range(i+1, len(ml)):
if j < len(ml):
m = ml[i]
n = ml[j]
if m.get('affiliation')[-1].get('affiliation') == n.get('affiliation')[0].get('affiliation'):
if m.get('affiliation')[-1].get('year') != n.get('affiliation')[0].get('year'):
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
                                del ml[j]
elif m.get('affiliation')[-1].get('year') == n.get('affiliation')[0].get('year'):
m['from_article'] += n['from_article']
                                del ml[j]
# Merge same email data
def SameEmail_merge(i, ml):
for j in range(i + 1, len(ml)):
if j < len(ml):
m = ml[i]
n = ml[j]
A = m.get('affiliation')
AA = n.get('affiliation')
                        num = 0  # Becomes nonzero once a merge has happened for this pair
for a in A:
if num == 0:
for aa in AA:
if a.get('email') == aa.get('email'):
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
ml.remove(n) # Delete merged element
num += 1 # Update counter
# Sorted by year
m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
break
else:
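                                # A merge already happened for this pair; restart the scan over the shrunken list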
return SameEmail_merge(i, ml)
# Loop of merging data by same email
for i in range(len(ml)):
SameEmail_merge(i, ml)
# Add into Data list
if len(ml) == 1:
Data.insert(-1, ml[0])
else:
Data.insert(-1, ml)
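    # After the passes above, Data holds plain dicts (fully merged records) and
    # lists of dicts (groups that still need the similarity pass), e.g.
    # [rec1, [recA, recB], rec2] with hypothetical records rec*.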
    # Load the pre-trained BERT model and tokenizer once, shared by every
    # Similarity_merge call (re-loading them for every pair dominated the runtime)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    def Similarity_merge(M):
        for i in range(len(M)):
for j in range(i+1, len(M)):
if j < len(M):
m = M[i]
n = M[j]
A = m.get('affiliation')
AA = n.get('affiliation')
                    num = 0  # Becomes nonzero once a merge has happened for this pair
for a in A:
if num == 0:
for aa in AA:
# ========== Comparing document embeddings for similarity ==========
# Get the data of affiliation
text1 = a['affiliation']
text2 = aa['affiliation']
# Uniform characters in English
text1 = unicodedata.normalize('NFKD', text1).encode('ascii', 'ignore').decode('utf-8')
text2 = unicodedata.normalize('NFKD', text2).encode('ascii', 'ignore').decode('utf-8')
# Delete punctuation and lower the character
text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()
# Delete extra spaces
text1 = re.sub(r'\s+', ' ', text1).strip()
text2 = re.sub(r'\s+', ' ', text2).strip()
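                                # Hypothetical example of the three steps above:
                                # 'Universität Wien,  Dept. of  Math.' -> 'universitat wien dept of math'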
                                # (tokenizer and model were loaded once above, outside the loops)
# Tokenize and encode the text
inputs1 = tokenizer.encode_plus(text1, add_special_tokens=True, return_tensors='pt')
inputs2 = tokenizer.encode_plus(text2, add_special_tokens=True, return_tensors='pt')
# Get the embedding vectors of the text
with torch.no_grad():
outputs1 = model(**inputs1)
outputs2 = model(**inputs2)
embeddings1 = torch.mean(outputs1.last_hidden_state, dim=1).squeeze()
embeddings2 = torch.mean(outputs2.last_hidden_state, dim=1).squeeze()
# Calculate text similarity (cosine similarity)
similarity = cosine_similarity(embeddings1.unsqueeze(0), embeddings2.unsqueeze(0))[0][0]
print('Similarity algorithm complete: the similarity score is', similarity)
if similarity >= 0.8:
m['from_article'] += n['from_article']
m['affiliation'] += n['affiliation']
M.remove(n) # Delete merged element
num += 1 # Update counter
# Sorted by year
m['affiliation'] = sorted(m['affiliation'], key=lambda x: x['year'])
break
else:
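                            # A merge already happened for this pair; restart the scan over the shrunken list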
return Similarity_merge(M)
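    # A possible answer to suggestion 3 in the header (a sketch, not yet wired
    # into Similarity_merge): embed each normalized affiliation string once and
    # cache the vector, so each pair comparison costs only a cheap cosine
    # similarity instead of two full BERT forward passes.
    embedding_cache = {}
    def Embed_cached(text, tokenizer, model):
        # Return the mean-pooled BERT embedding for text, computing it at most once
        if text not in embedding_cache:
            inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
            with torch.no_grad():
                outputs = model(**inputs)
            embedding_cache[text] = torch.mean(outputs.last_hidden_state, dim=1).squeeze()
        return embedding_cache[text]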
# ========== Main code ==========
Data = [] # List of all data
count1 = [0] # Same name merged data counter
count2 = [0] # Duplicate data counter
num1 = 0 # Unique data counter
num2 = 0 # Complete merged data counter
num3 = 0 # Incomplete merged data counter
num4 = 0 # Similarity algorithm merged data counter
# Add data into list
for filename in os.listdir(folder_path):
if filename.endswith('.json'):
file_path = os.path.join(folder_path, filename)
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
if len(data) > 0:
Data.extend(data)
Database = len(Data) # The length of the original data
Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
# ========== Merge ==========
# ----- Same name data merge -----
ml = []
if len(Data) > 1:
for i in range(len(Data)):
ml.clear()
SameName_merge(i, count1, count2, Data, ml)
print('\n----- Same name data merge complete -----\n')
# ----- Similarity algorithm merge -----
    # Move incomplete (nested-list) entries out of Data into a temporary list
    temp_list = []  # Temp list for incomplete merged data
    if len(Data) > 1:
        for i in range(len(Data)-1, -1, -1):
            if isinstance(Data[i], list):
                temp_list.append(Data.pop(i))
print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
st = time.time() # Start time
    if len(temp_list) > 0:  # A single group still needs its internal merge
        with ThreadPoolExecutor(max_workers=10) as executor:  # Thread pool
            futures = [executor.submit(Similarity_merge, group) for group in temp_list]
            wait(futures)  # Block until every similarity merge has finished
et = time.time() # End time
print('\nThread pool has been run for ' + str(et-st) + 's')
    # Combine Data with temp_list
    for group in temp_list:
        if len(group) == 1:
            num4 += 1
            Data.insert(0, group[0])
        else:
            Data.insert(-1, group)
print('\n----- Similarity algorithm merge complete -----\n')
# ========== Statistic data ==========
# Data counter update
for a in Data:
if isinstance(a, dict) and len(a['from_article']) == 1:
num1 += 1
elif isinstance(a, dict) and len(a['from_article']) > 1:
num2 += 1
else:
num3 += 1
# Information
    print('\n========== Complete ==========\n')
    print(str(Database) + ' copies of data in total before merging.')
    print(str(count1[0]) + ' copies of data have been merged by same name.')
    print(str(count2[0]) + ' copies of duplicate data have been deleted.')
    print(str(len(Data)) + ' copies of data in total now.\n')
    print(str(num1) + ' copies of data are unique.')
    print(str(num2) + ' copies of data are completely merged.')
    print(str(num4) + ' copies of data are completely merged by the similarity algorithm.')
    print(str(num3) + ' copies of data are incompletely merged.')
# Save into file
path = os.path.dirname(folder_path) # parent path
path = os.path.join(path, "Author_data(merged)")
os.makedirs(path, exist_ok=True)
path = os.path.join(path, "Author_data(merged).json")
with open(path, 'w', encoding='utf-8') as file:
json.dump(Data, file, indent=4)
print('\nData has been added to ' + path)
# =========== Input the file path here ==========
# SameWeb_merge(r'.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge(r'.\SpringerOpen\SpringerOpen_buffer\Author_output')
SameWeb_merge(r'.\ejde\ejde_buffer\Author_output')