"""
========== fileReader =========
1. Reads the author-information JSON files and selects the records that have
   no email.
2. For each selected author, takes ``author_id`` and ``from_article``, looks
   up the title of that source article, and stores ``author_id`` and
   ``title`` together as one dict in the pending list ``ar_list``.
3. Returns ``ar_list`` as the result.
"""
import os
import json


# Function to find the author data which does not have "email" information
def Read(author_path, article_path, *, batch_size=100):
    """Collect ``{'title', 'author_id'}`` records for authors without an email.

    Every file in ``author_path`` is parsed as a JSON array of author dicts.
    An author is selected when the ``email`` field of its first affiliation
    is present and null. For each selected author, one of the files in
    ``article_path`` is chosen from the affiliation ``year`` and scanned for
    articles whose ``article_id`` equals the author's ``from_article``; each
    match contributes one ``{'title': ..., 'author_id': ...}`` dict.

    Args:
        author_path: Directory whose entries are all author JSON files.
        article_path: Directory holding the article JSON files.
        batch_size: Number of records per inner list of the result
            (default 100).

    Returns:
        A list of batches. Every batch is an independent list holding
        ``batch_size`` records, except the last one, which may be shorter.
        An empty list is returned when nothing matched.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        IndexError: If the article file selected for an author does not
            exist because ``article_path`` holds too few entries.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    def first_affiliation(author):
        # A missing, null or empty 'affiliation' is treated like an empty
        # first affiliation instead of raising on the [0] lookup.
        affiliations = author.get('affiliation') or [{}]
        return affiliations[0]

    # Read data list
    def au_read(path, file_names, found):
        for file_name in file_names:
            file_path = os.path.join(path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            # Each file is walked from its last record to its first; this
            # ordering determines the order of the returned records.
            for author in reversed(data):
                # NOTE(review): only an explicit null email selects the
                # author; a record with no 'email' key at all yields the
                # default 0 and is skipped. Confirm this matches the data.
                if first_affiliation(author).get('email', 0) is None:
                    found.append(author)
        return found

    # Parsed article files, keyed by file name, so that every file is read
    # from disk once rather than once per author.
    article_cache = {}

    def load_articles(file_name):
        if file_name not in article_cache:
            file_path = os.path.join(article_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                article_cache[file_name] = json.load(file)
        return article_cache[file_name]

    def ar_read(file_name, ar_id, au_id):
        # A new dictionary per match to store the key information
        return [
            {'title': article.get('title'), 'author_id': au_id}
            for article in load_articles(file_name)
            if article.get('article_id') == ar_id
        ]

    def article_file_index(year):
        # Position, within the article directory listing, of the file that
        # is searched for an affiliation of the given year.
        if year <= 2009:
            return 3
        if year <= 2010:
            return 0
        if year <= 2020:
            return 1
        return 2

    # ========== Main code ==========
    ar_list = []  # Finished batches of article data
    ar_temp = []  # Records waiting to fill the next batch
    num = 0       # Data number counter

    # Read the file
    au_names = os.listdir(author_path)
    # NOTE(review): the article file is picked by its position in this
    # listing, but os.listdir() returns entries in arbitrary order. Confirm
    # the index-to-file mapping on the target platform.
    ar_names = os.listdir(article_path)

    # Store the author data which has no "email" information
    au_list = au_read(author_path, au_names, [])

    # Search the articles the authors come from
    for au_data in au_list:
        # A missing or null year counts as 0.
        year = int(first_affiliation(au_data).get('year') or 0)
        ar_temp.extend(ar_read(ar_names[article_file_index(year)],
                               au_data.get('from_article'),
                               au_data.get('author_id')))

        # One author may match several articles, so more than one batch can
        # become ready here. Each batch is a fresh list, so later work on
        # ar_temp cannot alter batches that were already stored.
        while len(ar_temp) >= batch_size:
            ar_list.append(ar_temp[:batch_size])
            ar_temp = ar_temp[batch_size:]
            num += batch_size
            print(str(num) + " copies of data has been stored.")

    if ar_temp:  # Store remaining data
        ar_list.append(ar_temp)

    print(len(ar_list))
    return ar_list


# ========== Test code ==========
# Read('./test_buffer/Author_output', './test_buffer/Article_output')