# ScholarDataMining/FileMerger/Merge_byNameAndEmail.py
# 2023-10-09 23:24:58 +08:00
#
# 133 lines
# 4.7 KiB
# Python
import json
import os
def SameWeb_merge(folder_path):
    """Merge scholar records that share the same first/middle/last name.

    Loads every ``*.json`` file found in the sub-folders of
    ``folder_path``, merges single-article records whose three name
    fields all match, deletes exact duplicates, prints statistics and
    writes the merged ("incomplete") groups to
    ``<parent of folder_path>/Author_data(merged)/Author_data(merged).json``.

    Parameters
    ----------
    folder_path : str
        Directory whose sub-folders contain the per-name JSON files.
        Each file holds a list of author dicts with at least
        ``first_name``/``middle_name``/``last_name``, ``from_article``
        (list) and ``affiliation`` (list of dicts with ``year``).
    """

    def SameName_merge(i, count1, count2, Data, ml):
        # Compare Data[i] against every later record.  Exact duplicates
        # (identical 'from_article') are deleted and counted in count2;
        # same-name records with a different article are moved into ml
        # and counted in count1.  When anything was merged, Data[i] is
        # replaced by the year-sorted group.  Counters are one-element
        # lists so the increments are visible to the caller.
        #
        # Iterative rewrite of the original self-recursion: recursing
        # once per removed record overflows the interpreter stack
        # (RecursionError) once ~1000 same-name records occur.
        j = i + 1
        while j < len(Data):
            a = Data[i]
            aa = Data[j]
            # Only merge plain single-article records; already-merged
            # groups (lists) are skipped.
            if (isinstance(a, dict) and isinstance(aa, dict)
                    and len(a['from_article']) == 1
                    and len(aa['from_article']) == 1
                    and a.get("first_name") == aa.get("first_name")
                    and a.get("middle_name") == aa.get("middle_name")
                    and a.get("last_name") == aa.get("last_name")):
                if a['from_article'] == aa['from_article']:
                    # Exact duplicate.  Delete by index: list.remove()
                    # would delete the first *equal* dict, which is not
                    # necessarily the one at j.
                    del Data[j]
                    count2[0] += 1
                else:
                    ml.append(aa)
                    count1[0] += 1
                    if count1[0] % 100 == 0 and count1[0] != 0:
                        print(str(count1[0]) + ' copies of data have been merged by same name.')
                    del Data[j]
                # Do not advance j: the next record has slid into slot j.
            else:
                j += 1
        if len(ml) > 0:
            ml.append(Data[i])  # the anchor record joins its own group
            del Data[i]
            ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year'))  # sort group by year
            # NOTE(review): insert(-1, ...) puts the group *before* the
            # last element of Data, not at the end — confirm intended.
            if len(ml) == 1:
                Data.insert(-1, ml[0])
            else:
                Data.insert(-1, ml)

    # ========== Main code ==========
    Data = []     # all loaded records
    count1 = [0]  # same-name merged data counter
    count2 = [0]  # duplicate data counter
    num1 = 0      # unique data counter
    num2 = 0      # complete merged data counter (counted, never printed)
    num3 = 0      # incomplete merged data counter
    # Load every JSON file from the sub-folders into one list
    for num_folder in os.listdir(folder_path):
        num_folder_path = os.path.join(folder_path, num_folder)
        for filename in os.listdir(num_folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(num_folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
                if data:
                    Data.extend(data)
    Database = len(Data)  # size of the original data set
    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
    # ========== Merge ==========
    # ----- Same name data merge -----
    ml = []
    if len(Data) > 1:
        # range() is evaluated once; for indices beyond the shrunken
        # list SameName_merge simply scans nothing.
        for i in range(len(Data)):
            ml.clear()
            SameName_merge(i, count1, count2, Data, ml)
    print('\n----- Same name data merge complete -----\n')
    # ----- Similarity algorithm merge -----
    # Move the merged groups (lists) out of Data into a temporary list.
    temp_list = []  # incomplete merged data
    if len(Data) > 1:
        # Iterate backwards so deletions do not shift unvisited indices;
        # delete by index, not by value (see note in SameName_merge).
        for i in range(len(Data) - 1, -1, -1):
            if isinstance(Data[i], list):
                temp_list.append(Data[i])
                del Data[i]
        print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
    print('\n----- Similarity algorithm merge complete -----\n')
    # ========== Statistic data ==========
    # NOTE(review): every list was just moved into temp_list, so num3 is
    # always 0 here — it probably should count temp_list instead.
    for a in Data:
        if isinstance(a, dict) and len(a['from_article']) == 1:
            num1 += 1
        elif isinstance(a, dict) and len(a['from_article']) > 1:
            num2 += 1
        else:
            num3 += 1
    # Information
    print('\n========== Complete ==========\n')
    print(str(Database) + ' copies of data in total, before')
    print(str(count1[0]) + ' copies of data have been merged by same name.')
    print(str(count2[0]) + ' copies of duplicate data have been deleted')
    print(str(len(Data)) + ' copies of data in total, now.\n')
    print(str(num1) + ' copies of data are unique.')
    print(str(num3) + ' copies of data are incomplete merged')
    # Save into file
    path = os.path.dirname(folder_path)  # parent directory
    path = os.path.join(path, "Author_data(merged)")
    os.makedirs(path, exist_ok=True)
    path = os.path.join(path, "Author_data(merged).json")
    # NOTE(review): only temp_list (the merged groups) is written; the
    # records still in Data are not saved, even though the message below
    # says "Data has been added" — confirm this is intended.
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(temp_list, file, indent=4)
    print('\nData has been added to ' + path)
# =========== input the file path here ==========
# Guard the entry point so importing this module does not trigger the
# whole merge run as a side effect.
if __name__ == "__main__":
    SameWeb_merge('.\\nameDivision')