# ScholarDataMining/DataTransformer/FileStructureTansfer.py
# 2023-10-29 15:21:01 +08:00
#
# 182 lines
# 6.5 KiB
# Python

import json
import os
import unicodedata
from collections import OrderedDict
from pprint import pprint
# Read the data
def fileReader(folder, dataset):
    """Load every JSON file in ``folder`` and append its content to ``dataset``.

    Files are visited in sorted name order. ``os.listdir`` yields entries in
    arbitrary order, so sorting makes the order of the loaded documents
    reproducible across platforms and file systems.

    Args:
        folder: Directory whose files are parsed as UTF-8 encoded JSON.
        dataset: List that receives one parsed document per file; it is
            extended in place.

    Returns:
        The same ``dataset`` object that was passed in.

    Raises:
        OSError: If ``folder`` or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)
        if not os.path.isfile(file_path):
            # A sub-directory cannot be opened as a JSON document; skip it.
            continue
        with open(file_path, 'r', encoding='utf-8') as json_file:
            dataset.append(json.load(json_file))
    return dataset
# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    """Restructure article records, resolve author ids and save one file per period.

    Every article is routed to a period bucket according to ``int(volume)``:
    bucket 3 for values up to 2009, bucket 0 for 2010-2014, bucket 1 for
    2015-2020 and bucket 2 otherwise. The author file found at the same index
    in the sorted listing of ``au_folder`` is used to look up author ids. The
    restructured record is appended to the module-level
    ``ar_dataset_new[bucket]`` and each bucket is finally written to its own
    JSON file in ``./EJQTDE_buffer_transform/Article_output/``.

    Args:
        au_folder: Directory containing the author JSON files. The index based
            pairing above assumes their sorted names run 2010-2014, 2015-2020,
            newest, oldest -- TODO confirm against the real folder content.
        ar_dataset: Iterable of article lists; every article is a dict.
        num: One-element list used as a mutable counter; ``num[0]`` is
            incremented once per transformed article.

    Returns:
        None. Results are appended to the global ``ar_dataset_new`` and
        written to disk.

    Note:
        ``corresponding_authors`` (when it is not None) is appended in place
        to the article's ``authors`` list, so it also appears in the
        ``authors_name`` field of the output record.
    """
    # (bucket index, output file). Bucket indices follow the volume routing in
    # period_bucket(), so every bucket ends up in the file named after it.
    bucket_outputs = [
        (3, "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json"),
        (0, "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2010-2014).json"),
        (1, "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2015-2020).json"),
        (2, "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(newest).json"),
    ]
    index_cache = {}  # author file name -> lookup table, so each file is parsed once

    def strip_accents(text):
        """Drop combining marks (category Mn) after NFKD decomposition."""
        decomposed = unicodedata.normalize('NFKD', text)
        return ''.join(char for char in decomposed if unicodedata.category(char) != 'Mn')

    def author_index(file_name):
        """Map (article_id, accent-free 'first,last') to author ids in file order."""
        if file_name not in index_cache:
            file_path = os.path.join(au_folder, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                records = json.load(file)
            index = {}
            for record in records:
                name = strip_accents(record.get('first_name') + "," + record.get('last_name'))
                key = (record.get('from_article')[0], name)
                index.setdefault(key, []).append(record.get('author_id'))
            index_cache[file_name] = index
        return index_cache[file_name]

    def period_bucket(volume):
        """Return the bucket index for a volume value (compared against years)."""
        year = int(volume)
        if year <= 2009:
            return 3
        if year <= 2014:
            return 0
        if year <= 2020:
            return 1
        return 2

    def auInfoFind(index, ar_data, num):
        """Build the restructured article record with resolved author ids."""
        authors = ar_data.get('authors')
        corresponding = ar_data.get('corresponding_authors')
        if corresponding is not None:  # A missing corresponding author is not a name
            authors.append(corresponding)
        article_id = ar_data.get('article_id')
        au_ID = []
        for author in authors:
            # Names are compared without blanks, e.g. "a, b" matches "a,b".
            au_ID.extend(index.get((article_id, author.replace(" ", "")), []))
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": ar_data['authors'],
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }
        num[0] += 1  # Update the counter
        return ar_data_transform

    # ====== Main code for function =====
    # os.listdir order is arbitrary; sort so the index based pairing is stable.
    ar_names = sorted(os.listdir(au_folder))
    for ar_list in ar_dataset:
        for article in ar_list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of article data structure have been transformed.")
            bucket = period_bucket(article.get('volume'))
            transformed = auInfoFind(author_index(ar_names[bucket]), article, num)
            ar_dataset_new[bucket].append(transformed)

    # Store every bucket into its own file
    for bucket, filepath in bucket_outputs:
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[bucket], json_file, indent=4)
    print("\nComplete: All of the article data structure have been transformed.")
# Author data structure transfer
def auDataTransform(au_dataset, num):
    """Restructure author records and save every input list to its own file.

    Each author dict is rebuilt with ``from_article`` reduced to its first
    element and a new ``raw_name`` field ("first [middle] last"). The i-th
    list of ``au_dataset`` is written to the i-th output file under
    ``./EJQTDE_buffer_transform/Author_output/``.

    Args:
        au_dataset: Iterable of author lists. The lists are assumed to be in
            the same order that ``arDataTransform`` assumes for the author
            folder listing (2010-2014, 2015-2020, newest, oldest) -- TODO
            confirm against the caller. Lists beyond the fourth have no
            output file and are not written.
        num: One-element list used as a mutable counter; ``num[0]`` is
            incremented once per transformed author.

    Returns:
        None. The transformed data is written to disk.
    """
    def transform(records, num):
        """Return the restructured copies of all author dicts in ``records``."""
        new_list = []  # New list to store transformed data
        for au_data in records:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of author data structure have been transformed.\n")
            name_parts = [au_data['first_name'], au_data['last_name']]
            if au_data['middle_name'] is not None:
                name_parts.insert(1, au_data['middle_name'])
            new_list.append({
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'][0],
                "first_name": au_data['first_name'],
                "last_name": au_data['last_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": ' '.join(name_parts),
                "affiliation": au_data['affiliation']
            })
            num[0] += 1  # Update the counter
        return new_list

    # Transform the author data structure
    au_dataset_new = [transform(au_list, num) for au_list in au_dataset]

    # Store into the new files; position i of the input goes to position i here
    filepaths = [
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2010-2014).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2015-2020).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(newest).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(oldest).json",
    ]
    for filepath, records in zip(filepaths, au_dataset_new):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w", encoding='utf-8') as json_file:
            json.dump(records, json_file, indent=4)
    print("\nComplete: All of the author data structure have been transformed.")
# ========== Main code ========== #
# Shared state used by the transform functions
ar_dataset_new = [[], [], [], []]  # Four buckets that arDataTransform appends to
num1, num2 = [0], [0]  # Mutable counters: transformed articles / transformed authors
# Make sure both output folders exist before anything is written
os.makedirs('./EJQTDE_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./EJQTDE_buffer_transform/Author_output/', exist_ok=True)
# Load the raw article and author files into fresh lists
ar_dataset = fileReader('./EJQTDE_buffer/Article_output', [])
au_dataset = fileReader('./EJQTDE_buffer/Author_output', [])
# Restructure both datasets and store the results
arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)