88 lines
3.2 KiB
Python
88 lines
3.2 KiB
Python
import os
|
||
import json
|
||
|
||
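'''
========== fileReader =========
1. This program reads, from the author-information JSON files, the entries that do not contain an email.
2. For each such author it looks up author_id and from_article, finds the title of the source article
   the author was retrieved from, and stores author_id and title together as a dictionary in the
   pending list ar_list.
3. ar_list is returned as the result.
'''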
|
||
|
||
|
||
# Function to find the author data which does not have "email" information
|
||
def Read(author_path, article_path):
    """Collect article titles for authors whose affiliation email is null.

    Every file in ``author_path`` is parsed as a JSON array of author
    dictionaries.  Authors whose first ``affiliation`` entry has ``email``
    explicitly set to ``null`` are kept.  For each kept author, an article
    file from ``article_path`` is chosen by the author's affiliation year and
    scanned for records whose ``article_id`` equals the author's
    ``from_article``; each match yields ``{'title': ..., 'author_id': ...}``.

    Args:
        author_path: Directory containing the author JSON files.
        article_path: Directory containing the article JSON files.

    Returns:
        A list of batches.  Each batch is an independent list of at most 100
        ``{'title', 'author_id'}`` dictionaries; only the last batch may be
        shorter.

    Side effects:
        Prints a progress line each time a full batch is stored, and the
        number of batches before returning.
    """
    batch_size = 100

    # Parsed article files keyed by full path, so each file is read from disk
    # once instead of once per author.
    article_cache = {}

    def au_read(path, file_names, authors):
        """Append every author with a null affiliation email to *authors*."""
        for file_name in file_names:
            file_path = os.path.join(path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

            # Entries are visited last-to-first, matching the original index
            # loop, so the order of the returned authors is unchanged.
            for author in reversed(data):
                # NOTE(review): only an explicit ``"email": null`` matches.
                # An author with no "email" key gets the default 0 and is
                # skipped -- confirm that this is the intended filter.
                if author.get('affiliation', [{}])[0].get('email', 0) is None:
                    authors.append(author)
        return authors

    def ar_read(path, file_name, ar_id, au_id):
        """Return title/author_id records for articles whose id is *ar_id*."""
        file_path = os.path.join(path, file_name)
        if file_path not in article_cache:
            with open(file_path, 'r', encoding='utf-8') as file:
                article_cache[file_path] = json.load(file)

        return [
            {'title': article.get('title'), 'author_id': au_id}
            for article in article_cache[file_path]
            if article.get('article_id') == ar_id
        ]

    # ========== Main code ==========
    ar_list = []  # Finished batches
    ar_temp = []  # Batch currently being filled
    num = 0       # Number of records stored in full batches so far

    # Read the directory listings
    au_names = os.listdir(author_path)
    ar_names = os.listdir(article_path)

    # Authors that have no "email" information
    au_list = au_read(author_path, au_names, [])

    # Search the articles the authors come from
    for au_data in au_list:
        year = int(au_data.get('affiliation', [{}])[0].get('year', 0))

        # NOTE(review): the indices below rely on the order returned by
        # os.listdir(), which is arbitrary -- confirm the intended
        # file-to-year mapping and select the files by name instead.
        if year <= 2009:
            file_name = ar_names[3]
        elif year <= 2010:
            file_name = ar_names[0]
        elif year <= 2020:
            file_name = ar_names[1]
        else:
            file_name = ar_names[2]

        ar_temp.extend(
            ar_read(article_path, file_name,
                    au_data.get('from_article'), au_data.get('author_id'))
        )

        # Flush after the lookup so no author is skipped, and use >= because
        # one lookup may add several records.  Slicing creates new list
        # objects, so stored batches are never emptied or shared.
        while len(ar_temp) >= batch_size:
            ar_list.append(ar_temp[:batch_size])
            ar_temp = ar_temp[batch_size:]
            num += batch_size
            print(str(num) + " copies of data has been stored.")

    if ar_temp:  # Store the remaining partial batch
        ar_list.append(ar_temp)

    print(len(ar_list))
    return ar_list
|
||
|
||
|
||
# ========== Test code ==========
|
||
# Read('./test_buffer/Author_output', './test_buffer/Article_output')
|
||
|
||
|