# ScholarDataMining/FileMerger/fileReader.py

import os
import json
from pprint import pprint

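'''
    ========== fileReader =========
    1. This program reads the author records, looks up the title of the article each
       author came from, and stores it in a new dictionary.
    2. For each author record it takes author_id and from_article, finds the title of
       the source article, and stores the author information together with that title
       in the new dictionary.
    3. The new dictionary is saved to a JSON file.
'''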


# Function that supplements every author record with the title of its source article
def Read(author_path, article_path):
    """Attach the source-article title to each author record and save the result.

    The JSON document at ``author_path`` is read as a list of groups, each
    group being a list of author dictionaries. For every author, the first
    element of ``from_article`` is looked up among the ``article_id`` values
    of one article file inside ``article_path``; on a match, that article's
    ``title`` is stored on the author under ``from_article_title``.

    The article file that is searched depends on the ``year`` of the author's
    first ``affiliation`` entry (0 when it is missing or unreadable):

    * ``year <= 2009`` -> 4th directory entry
    * ``year == 2010`` -> 1st directory entry
    * ``year <= 2020`` -> 2nd directory entry
    * otherwise        -> 3rd directory entry

    Side effects: prints a progress line after every 100 matched authors,
    writes all groups (in reverse input order) to
    ``./Author_data(merged)/Author_data(info_supplementary).json`` and prints
    a completion message.

    Args:
        author_path: Path of the merged author JSON file.
        article_path: Directory that contains the article JSON files.

    Returns:
        None.

    Raises:
        IndexError: If ``article_path`` holds fewer files than the year
            bucket of some author requires.
        OSError: If an input file cannot be read or the output cannot be
            written.
    """
    output_path = './Author_data(merged)/Author_data(info_supplementary).json'

    # NOTE(review): os.listdir() returns entries in an OS-dependent order, yet
    # the year buckets below pick files by position. The intended mapping
    # cannot be derived from this file, so the order is left untouched;
    # selecting the files by name would make this reproducible.
    ar_names = os.listdir(article_path)

    # Cache: article file name -> {article_id: title}. Each article file is
    # parsed once instead of once per author.
    title_indexes = {}

    def load_author_groups(path):
        """Return the author groups from *path* in reverse file order."""
        with open(path, 'r', encoding='utf-8') as file:
            groups = json.load(file)
        # The output has always been written back-to-front; keep that order.
        return list(reversed(groups))

    def title_index(file_name):
        """Return the (cached) ``article_id -> title`` map of one article file."""
        index = title_indexes.get(file_name)
        if index is None:
            file_path = os.path.join(article_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                articles = json.load(file)
            # Later duplicates overwrite earlier ones, matching the old
            # "last match wins" scan. Assumes article ids are hashable
            # (strings or numbers) -- TODO confirm against the data.
            index = {article.get('article_id'): article.get('title')
                     for article in articles}
            title_indexes[file_name] = index
        return index

    def affiliation_year(au_data):
        """Return the year of the first affiliation, or 0 when unavailable."""
        affiliations = au_data.get('affiliation') or [{}]
        try:
            return int(affiliations[0].get('year', 0))
        except (TypeError, ValueError):
            # A null or non-numeric year is treated like a missing one.
            return 0

    def article_file_for(year):
        """Map an affiliation year to the article file that covers it."""
        if year <= 2009:
            return ar_names[3]
        if year <= 2010:
            return ar_names[0]
        if year <= 2020:
            return ar_names[1]
        return ar_names[2]

    # ========== Main code ==========
    au_list = load_author_groups(author_path)
    matched = 0     # Number of authors that received a title

    for au_group in au_list:
        for au_data in au_group:
            source = au_data.get('from_article')
            if not source:
                # No source article recorded: nothing to look up.
                continue

            index = title_index(article_file_for(affiliation_year(au_data)))
            article_id = source[0]
            if article_id not in index:
                continue

            au_data['from_article_title'] = index[article_id]
            matched += 1
            # Report right after the increment so a run of unmatched authors
            # does not repeat the same progress line.
            if matched % 100 == 0:
                print(str(matched) + " copies of data have been done.")

    # Create the target directory on demand so a fresh checkout does not fail.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(au_list, file, indent=4)

    print('All data have been stored into ./Author_data(merged)/Author_data(info_supplementary).json')

# ========== Test code ==========
# Runs the merge on the sample data set, using paths relative to the current
# working directory.
# NOTE(review): this executes on every import of the module; consider moving
# it under an `if __name__ == '__main__':` guard.
Read(
    author_path='./Author_data(merged)/Author_data(merged).json',
    article_path='./test_buffer/Article_output',
)