# ScholarDataMining/FileMerger/fileReader.py

import os
import json

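'''
    ========== fileReader =========
    1. This program reads the records that have no email from the author
       information JSON files.
    2. For each such author it takes author_id and from_article, looks up the
       title of the source article referenced by the author record, and stores
       author_id and title together as a dict in the pending list ar_list.
    3. ar_list is returned as the result.
'''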


def Read(author_path, article_path):
    """Collect article titles for authors whose first affiliation has a null email.

    Every JSON file in ``author_path`` is loaded and the author records whose
    first ``affiliation`` entry carries ``"email": null`` are kept.  For each
    kept author, the article file chosen by the affiliation ``year`` is
    searched for entries whose ``article_id`` equals the author's
    ``from_article``; each match yields ``{'title': ..., 'author_id': ...}``.

    Args:
        author_path: Directory containing the author JSON files.
        article_path: Directory containing the article JSON files.

    Returns:
        A list of batches.  Each batch is an independent list holding at most
        100 ``{'title', 'author_id'}`` dicts; only the last batch may be
        shorter.

    Raises:
        IndexError: If an author needs an article file index that
            ``os.listdir(article_path)`` does not provide.
        OSError, json.JSONDecodeError: If a file cannot be read or parsed.
    """
    batch_size = 100

    # Parsed article files, keyed by file name, so each file is read from
    # disk once instead of once per author.
    loaded_articles = {}

    def au_read(path, file_names, records):
        """Append every author record with a null first-affiliation email."""
        for file_name in file_names:
            file_path = os.path.join(path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            # Walk back-to-front to keep the record order of the original
            # index-based reverse loop.
            for author in reversed(data):
                affiliation = author.get('affiliation', [{}])
                # An empty or null affiliation list has no email entry to
                # inspect; skip it instead of failing on affiliation[0].
                if affiliation and affiliation[0].get('email', 0) is None:
                    records.append(author)
        return records

    def ar_read(path, file_name, records, ar_id, au_id):
        """Append a {'title', 'author_id'} dict for each article matching ar_id."""
        if file_name not in loaded_articles:
            file_path = os.path.join(path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                loaded_articles[file_name] = json.load(file)
        for article in loaded_articles[file_name]:
            if article.get('article_id') == ar_id:
                records.append({
                    'title': article.get('title'),
                    'author_id': au_id,
                })
        return records

    # ========== Main code ==========
    ar_list = []    # Completed batches of article records
    ar_temp = []    # Batch currently being filled
    num = 0         # Number of records flushed into full batches so far

    au_names = os.listdir(author_path)
    ar_names = os.listdir(article_path)

    # Author records that have no "email" information
    au_list = au_read(author_path, au_names, [])

    for au_data in au_list:
        year = int(au_data.get('affiliation', [{}])[0].get('year', 0))

        # NOTE(review): the indices below rely on the order returned by
        # os.listdir(), which is not guaranteed to be stable across
        # platforms -- confirm which article file each year range should use.
        if year <= 2009:
            file_name = ar_names[3]
        elif year <= 2010:
            file_name = ar_names[0]
        elif year <= 2020:
            file_name = ar_names[1]
        else:
            file_name = ar_names[2]

        ar_temp = ar_read(article_path, file_name, ar_temp,
                          au_data.get('from_article'), au_data.get('author_id'))

        # Flush only after the current author has been processed so no author
        # is skipped.  Slicing creates new lists, so stored batches never
        # alias the working buffer, and ">=" copes with an author that
        # contributes several records at once.
        while len(ar_temp) >= batch_size:
            ar_list.append(ar_temp[:batch_size])
            ar_temp = ar_temp[batch_size:]
            num += batch_size
            print(str(num) + " copies of data has been stored.")

    if ar_temp:                     # Store the remaining partial batch
        ar_list.append(ar_temp)

    print(len(ar_list))
    return ar_list


# ========== Test code ==========
# Read('./test_buffer/Author_output', './test_buffer/Article_output')