# ScholarDataMining/01_EJDE_spider/ejde_merge.py

import json
import os
from pprint import pprint
import unicodedata


def _author_key(record):
    """Return the (firstname, middlename, lastname) triple identifying an author."""
    # Assumes the name fields are hashable (strings or None) -- TODO confirm.
    return (record.get("firstname"), record.get("middlename"), record.get("lastname"))


def _ascii_fold(value):
    """Return ``str(value)`` NFKD-normalized and reduced to ASCII bytes.

    Used so that two affiliations differing only by accents compare equal.
    """
    return unicodedata.normalize('NFKD', str(value)).encode('ascii', 'ignore')


def _as_list(value):
    """Return *value* as a new list: ``None`` -> ``[]``, list -> copy, else ``[value]``."""
    if value is None:
        return []
    if isinstance(value, list):
        return list(value)
    return [value]


def _year_sort_key(record):
    """Sort key: the ``year`` of the record's first affiliation, missing years last."""
    affiliations = record.get('affiliation') or []
    year = affiliations[0].get('year') if affiliations else None
    # The leading flag decides the comparison whenever a year is missing, so a
    # real year is never compared against the placeholder 0.
    return (year is None, 0 if year is None else year)


def _merge_into(target, source):
    """Fold *source*'s affiliations and articles into *target*.

    The source is merged only when its affiliation list differs from the
    target's AND its first affiliation differs (after ASCII folding, and by
    value) from the target's last one.  Returns True when *target* changed.
    """
    target_affs = target.get('affiliation') or []
    source_affs = source.get('affiliation') or []
    if not source_affs or target_affs == source_affs:
        return False

    if target_affs:
        last, first = target_affs[-1], source_affs[0]
        if _ascii_fold(last) == _ascii_fold(first):
            return False
        # Direct value comparison instead of hash(): no collisions and no
        # TypeError when a value is unhashable.
        if tuple(last.values()) == tuple(first.values()):
            return False

    target['affiliation'] = target_affs + source_affs
    # NOTE(review): as in the original logic, articles are combined only when
    # affiliations are combined -- confirm that dropping the article of an
    # otherwise identical record is intended.
    target['from_article'] = (_as_list(target.get('from_article'))
                              + _as_list(source.get('from_article')))
    return True


def _absorb(records, merged, index, separator):
    """Add *records* to *merged*, folding repeated authors into the kept record.

    *index* maps author keys to the record already kept in *merged*; both are
    updated in place.  Returns the number of records folded away.
    """
    absorbed = 0
    for record in records:
        key = _author_key(record)
        kept = index.get(key)
        if kept is None:
            index[key] = record
            merged.append(record)
            continue
        if _merge_into(kept, record):
            pprint(kept)
            print(separator)
        absorbed += 1
    return absorbed


def Merge(folder_path):
    """Merge the author records of every ``.json`` file in *folder_path*.

    Each file is expected to contain a list of author dicts with the keys
    ``firstname``, ``middlename``, ``lastname``, ``affiliation`` (a list of
    dicts carrying a ``year``) and ``from_article`` (a string or a list).

    Records are first de-duplicated per file (ordered by the year of their
    first affiliation), then the files are folded into one list.  Records that
    share the same three name fields are treated as one author.

    Args:
        folder_path: Directory containing the JSON files to merge.

    Returns:
        The merged list of author records (empty when no JSON file is found).
    """
    files = []
    # os.listdir() returns names in arbitrary order; sorting makes the index
    # based file roles below deterministic across platforms.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                files.append(json.load(file))

    count = 0  # number of records folded into another record

    # Same file merge: keep the first record per author, fold the rest into it.
    deduped = []
    for records in files:
        merged, index = [], {}
        count += _absorb(sorted(records, key=_year_sort_key), merged, index,
                         '//////////////////////////////////////\n')
        deduped.append((merged, index))

    # Different files merge.  The original processing order is kept:
    # index 2 = newest (base), 1 = (2015-2020), 0 = (2010-2014), 3 = oldest.
    # Missing indices are skipped and any extra files are appended afterwards.
    preferred = [2, 1, 0, 3]
    order = [i for i in preferred if i < len(deduped)]
    order += range(len(preferred), len(deduped))

    if order:
        combined, combined_index = deduped[order[0]]
        for position in order[1:]:
            count += _absorb(deduped[position][0], combined, combined_index,
                             '================================\n')
    else:
        combined = []

    # Tips
    print(f'{count} file(s) have been merged.')
    print(f'There are {len(combined)} file(s) in total, now.')

    return combined


# Run the merge on the folder that holds the author JSON files.
Merge(folder_path='./ejde_buffer/Author_output')