import json
import os
import unicodedata

'''
========== SameWeb_merge(folder_path) execution order ==========

1. Create a new list, read every JSON file in the folder into it, and
   sort the list by year.
2. Traverse with the structure

       for each a in Data:
           Merge(a, count, Data):
               for aa in Data:

   Whenever an element aa matching the conditions is found, it is merged
   into element a; aa is then removed and Merge is called again to
   restart the scan, until Data no longer contains a matching aa.
3. Afterwards every element in Data has completed same-author merging.

*Tips: Author records from the same website are merged by checking key
    values for exact equality. This works well for websites with a
    fairly standardized data format and poorly for websites without
    one. Typical failures: words of identical meaning written in a mix
    of Latin-script variants and plain English are not told apart, and
    synonyms carrying arbitrary punctuation or extra information defeat
    the strict comparison, so equal key values cannot be recognized.

Suggestions:
1. Before comparing, normalize Latin and other non-English characters
   to plain English characters (solved; see the sketch below).
2. Locally clean data carrying arbitrary punctuation and extra
   information before comparing (fairly complex; no good approach yet,
   though one possible direction is sketched below).
'''
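
# A minimal sketch of suggestion 1 above. NFKD decomposition followed by an
# ASCII encode strips combining accents, so 'José' and 'Jose' compare equal.
# The helper name to_ascii is illustrative; the original script inlines the
# same normalization inside Merge.
def to_ascii(text):
    # Decompose accented characters, drop the combining marks, and return
    # a plain-ASCII string suitable for strict comparison.
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

# Example: to_ascii('José García') == to_ascii('Jose Garcia')  ->  True


# A hedged sketch of one possible direction for suggestion 2 (an assumption,
# not the script's method): lowercase, strip punctuation, and collapse
# whitespace before comparing, so 'Univ. of X,' and 'univ of x' compare equal.
import string

def loose_key(text):
    # Fold to ASCII and lowercase first, then remove all punctuation and
    # squeeze runs of whitespace into single spaces.
    text = to_ascii(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(text.split())

# Example: loose_key('Univ. of X,') == loose_key('univ of x')  ->  True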


def SameWeb_merge(folder_path):
    # Recursive helper: fold every duplicate of author a into a, removing
    # the absorbed copies from Data
    def Merge(a, count, Data):
        for aa in Data:
            if (a.get("firstname") == aa.get("firstname")
                    and a.get("middlename") == aa.get("middlename")
                    and a.get("lastname") == aa.get("lastname")
                    and a is not aa):

                # Fold aa's "from_article" entries into the first element,
                # normalizing the result to a list
                if isinstance(a["from_article"], list):
                    if isinstance(aa["from_article"], list):
                        a["from_article"] += aa["from_article"]
                    else:
                        a["from_article"].append(aa["from_article"])
                elif isinstance(a["from_article"], str):
                    if isinstance(aa["from_article"], list):
                        a["from_article"] = [a["from_article"]] + aa["from_article"]
                    else:
                        a["from_article"] = [a["from_article"], aa["from_article"]]

                # Merge the differing "affiliation" entries of aa into the
                # first element
                a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year'])  # Sort by year

                # Fold accented Latin characters to plain ASCII so spelling
                # variants of the same affiliation compare equal
                fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
                faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')

                if fa != faa:
                    a['affiliation'].extend(aa['affiliation'])
                elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
                    a['affiliation'].extend(aa['affiliation'])
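                # i.e. aa's history is appended unless its latest affiliation
                # matches a's latest one and aa's earliest year equals a's
                # latest year, in which case aa's history is treated as an
                # exact duplicate and dropped.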

                # Remove the absorbed duplicate from Data
                Data.remove(aa)

                # Update the counter and report progress every 100 merges
                count[0] += 1
                if count[0] % 100 == 0:
                    print(str(count[0]) + ' duplicate records have been merged.')

                # Restart the scan: Data was modified mid-iteration
                return Merge(a, count, Data)
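
    # NOTE: Merge recurses once per absorbed duplicate, so an author with
    # more than ~1000 duplicates would hit Python's default recursion limit;
    # sys.setrecursionlimit can raise it, or the recursion could be rewritten
    # as a loop if that ever happens.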

    # ========== Main code ==========
    Data = []    # All author records read from the folder
    count = [0]  # Merge counter (a one-element list so Merge can update it)

    # Read every JSON file in the folder into the list
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if len(data) > 0:
                    Data.extend(data)

    total_before = len(Data)
    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])  # Sort by earliest year
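
    # Record shape assumed by the sort key and by Merge (inferred from the
    # field accesses in this script, not from a documented schema):
    #     {"firstname": ..., "middlename": ..., "lastname": ...,
    #      "from_article": str or list of str,
    #      "affiliation": [{"year": ..., "affiliation": ...}, ...]}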

    # Same-website data merge. Merge removes elements from Data, so walk the
    # list by index instead of iterating it directly, which would skip
    # entries after each removal.
    i = 0
    while i < len(Data):
        Merge(Data[i], count, Data)
        i += 1

    # Information
    print('\n========== Complete ==========\n')
    print(str(total_before) + ' records in total before merging.')
    print(str(count[0]) + ' duplicate records have been merged.')
    print(str(len(Data)) + ' records in total now.')

    # Save the merged list next to the input folder
    path = os.path.dirname(folder_path)
    path = os.path.join(path, "Author_data(merged)")
    os.makedirs(path, exist_ok=True)
    path = os.path.join(path, "Author_data(merged).json")

    with open(path, 'w', encoding='utf-8') as file:
        json.dump(Data, file, indent=4)
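        # NOTE: json.dump escapes non-ASCII characters by default
        # (ensure_ascii=True); passing ensure_ascii=False would keep accented
        # author names human-readable in the output file.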

    print('\nData has been saved to ' + path)


# =========== Input the file path here ==========
# SameWeb_merge(r'.\EJQTDE\EJQTDE_buffer\Author_output')
# SameWeb_merge(r'.\SpringerOpen\SpringerOpen_buffer\Author_output')
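
# A hedged usage sketch (not in the original script): guard the call so the
# module can be imported without running the merge. The path is illustrative.
# if __name__ == '__main__':
#     SameWeb_merge(r'.\EJQTDE\EJQTDE_buffer\Author_output')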