# Exported 2023-09-26 23:37:43 +08:00 — 88 lines, 3.2 KiB, Python.
# (Viewer note: file contained Unicode characters that may be confused
# with other characters; retained as-is.)
import os
import json
'''
========== fileReader =========
1. This program reads records from the author-info JSON files that lack an
   "email" field.
2. For each such author it takes author_id and from_article, looks up the
   source article's title, and stores {author_id, title} dicts in the
   pending list ar_list.
3. ar_list is returned as the result.
'''
# Function to find the author data which does not have "email" information
def Read(author_path, article_path):
# Read data list
def au_read(path, file_names, list):
for file_name in file_names:
file_path = os.path.join(path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
for Dict in range(len(data)-1, -1, -1):
if data[Dict].get('affiliation', [{}])[0].get('email', 0) is None:
list.append(data[Dict])
# del data[Dict]
return list
def ar_read(path, file_name, list, ar_id, au_id):
file_path = os.path.join(path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
for Dict in data:
if Dict.get('article_id') == ar_id:
# A new dictionary to stored key information
temp_data = {
'title': Dict.get('title'),
'author_id': au_id
}
list.append(temp_data) # Add into list
return list
# ========== Main code ==========
au_list = [] # List for author data
ar_list = [] # List for article data
ar_temp = [] # List for temp stored
num = 0 # Data number counter
# Read the file
au_names = os.listdir(author_path)
ar_names = os.listdir(article_path)
# Stored the author data which has no "email" information
au_list = au_read(author_path, au_names, au_list)
# Search the articles where the authors from
for au_data in au_list:
if len(ar_temp) == 100:
num += 100
ar_list.append(ar_temp)
ar_temp.clear()
print(str(num) + " copies of data has been stored.")
elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
ar_temp = ar_read(article_path, ar_names[3], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
ar_temp = ar_read(article_path, ar_names[0], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
ar_temp = ar_read(article_path, ar_names[1], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
else:
ar_temp = ar_read(article_path, ar_names[2], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
if len(ar_temp) > 0: # Stored remaining data
ar_list.append(ar_temp)
print(len(ar_list))
return ar_list
# ========== Test code ==========
# Read('./test_buffer/Author_output', './test_buffer/Article_output')