# ScholarDataMining/DataTransformer/FileStructureTansfer(EJQTDE).py
import json
import os
import unicodedata
'''
========== FileStructureTransfer ==========
1. This program restructures the crawled data.
2. Based on the year the paper was published, the final data are stored in four JSON files:
   1 newest     published after 2020
   2 oldest     published before 2010
   3 2010-2014  published between 2010 and 2014
   4 2015-2020  published between 2015 and 2020
3. Because the total amount of data from some sites is too large, it is split into several files.
4. The program runs in the following order:
   1 fileReader()       read the locally crawled data into the to-be-processed lists
   2 arDataTransform()  transform the article data structure
   3 auDataTransform()  transform the author data structure
   4 write the transformed data into the output folders
'''
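# The record layouts below are an illustrative sketch only, inferred from the fields this
# script reads; exact value formats in the crawled files may differ.
# Article record (one entry in a crawled Article_output JSON file), assumed shape:
#   {"article_id": "...", "title": "...",
#    "authors": ["Smith, John", ...], "corresponding_authors": "Smith, John",
#    "submit_datetime": "...", "publish_datetime": "...", "keywords": [...], "MSC": [...],
#    "URL": "...", "DOI": "...", "publisher": "...", "journal": "...",
#    "volume": "2016", "issue": "...", "page": "..."}
# Author record (one entry in a crawled Author_output JSON file), assumed shape:
#   {"author_id": "...", "from_article": ["<article_id>", ...],
#    "first_name": "...", "last_name": "...", "middle_name": null,
#    "affiliation": "..."}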
# Read the data
def fileReader(folder, dataset):
    # Sort the file names so the order is deterministic and matches the period-ordered
    # output paths used later (assumes the crawled files are named by period)
    files = sorted(os.listdir(folder))
    for file in files:
        file_path = os.path.join(folder, file)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            Data = json.load(json_file)
            dataset.append(Data)
    return dataset
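# fileReader() appends one sub-list per input JSON file, so the returned dataset is a
# list of lists (one list of records per file), which is what the loops below expect.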
# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    def auInfoFind(path, file_name, ar_data, num):
        authors = ar_data.get('authors')
        authors.append(ar_data.get('corresponding_authors'))
        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            Data = json.load(file)
        au_ID = []  # A new list to store author_id
        # Find the author_id
        for author in authors:
            author = author.replace(" ", "")
            for Dict in Data:
                Dict_name = Dict.get('first_name') + "," + Dict.get('last_name')
                # Strip combining marks (diacritics) so accented and plain spellings compare equal
                Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name)
                                    if unicodedata.category(char) != 'Mn')
                if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))
        # Rebuild each author name with its comma-separated parts reversed,
        # e.g. "Smith, John" becomes "John, Smith"
        author_names_new = []
        author_names = ar_data['authors']
        for author_name in author_names:
            author_name_new = ''
            author_name = author_name.split(", ")
            # Walk the parts from last to first (index 0 inclusive)
            for i in range(len(author_name) - 1, -1, -1):
                author_name_new += author_name[i]
                if i != 0:
                    author_name_new += ', '
            print(author_name_new)
            author_names_new.append(author_name_new)
        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": author_names_new,
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }
        num[0] += 1  # Update the counter
        return ar_data_transform
    # ====== Main code for function =====
    # Sorted so that indices 0-3 line up with the period order of the output paths below
    # (assumes the author files are named by period: 2010-2014, 2015-2020, newest, oldest)
    ar_names = sorted(os.listdir(au_folder))
    for ar_list in ar_dataset:
        for Dict in ar_list:
            if num[0] % 100 == 0 and num[0] != 0:  # Progress report
                print(str(num[0]) + " copies of article data structure have been transformed.")
            if int(Dict.get('volume')) <= 2009:
                Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
                ar_dataset_new[3].append(Dict)
            elif 2010 <= int(Dict.get('volume')) <= 2014:
                Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
                ar_dataset_new[0].append(Dict)
            elif 2015 <= int(Dict.get('volume')) <= 2020:
                Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
                ar_dataset_new[1].append(Dict)
            else:
                Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
                ar_dataset_new[2].append(Dict)
    # Store into the new file
    filepaths = [
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2010-2014).json",
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2015-2020).json",
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(newest).json",
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
    ]
    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)
    print("\nComplete: All of the article data structures have been transformed.")
# Author data structure transfer
def auDataTransform(au_dataset, num):
    def transform(au_records, num):
        new_list = []  # New list to store transformed data
        for au_data in au_records:
            if num[0] % 100 == 0 and num[0] != 0:  # Progress report
                print(str(num[0]) + " copies of author data structure have been transformed.\n")
            if au_data['middle_name'] is not None:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']
            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'][0],
                # Note: first_name and last_name are swapped here, flipping the name order
                # stored in the crawled record
                "first_name": au_data['last_name'],
                "last_name": au_data['first_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            }
            new_list.append(au_data_transform)
            num[0] += 1  # Update the counter
        return new_list

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].append(au_list)
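    # Note: append() nests each transformed list, so every Author_output JSON file written
    # below contains a single-element array wrapping the list of author records.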
    # Store into the new file
    filepaths = [
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2010-2014).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2015-2020).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(newest).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(oldest).json",
    ]
    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(au_dataset_new[i], json_file, indent=4)
    print("\nComplete: All of the author data structures have been transformed.")
# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []
ar_dataset_new = [[] for _ in range(4)] # New list for transformed data
au_dataset_new = [[] for _ in range(4)] # New list to store transformed data
num1 = [0]  # Counter for completed ar_data
num2 = [0]  # Counter for completed au_data
os.makedirs('./EJQTDE_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./EJQTDE_buffer_transform/Author_output/', exist_ok=True)
# Read the data
ar_dataset = fileReader('./EJQTDE_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)
# Change the structure
arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
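# Assumed directory layout (illustrative): ./EJQTDE_buffer/Article_output and
# ./EJQTDE_buffer/Author_output each contain exactly four crawled JSON files, one per
# period (2010-2014, 2015-2020, newest, oldest), so that the sorted file order lines up
# with the index-based mapping and the four output paths used above.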