From e07617bebc7a7a4994fff9314481ad8b89b567ff Mon Sep 17 00:00:00 2001
From: Chenxiao Xia
Date: Sun, 29 Oct 2023 15:21:01 +0800
Subject: [PATCH] Add a script for transforming the article and author data
 structures

---
 DataTransformer/FileStructureTansfer.py | 177 ++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 DataTransformer/FileStructureTansfer.py

diff --git a/DataTransformer/FileStructureTansfer.py b/DataTransformer/FileStructureTansfer.py
new file mode 100644
index 0000000..86e7c68
--- /dev/null
+++ b/DataTransformer/FileStructureTansfer.py
@@ -0,0 +1,177 @@
+import json
+import os
+import unicodedata
+
+
+# Read every JSON file in a folder and append its contents to dataset.
+# The files are read in sorted filename order so that downstream code
+# can rely on a stable ordering.
+def fileReader(folder, dataset):
+    files = sorted(os.listdir(folder))
+    for file in files:
+        file_path = os.path.join(folder, file)
+        with open(file_path, 'r', encoding='utf-8') as json_file:
+            data = json.load(json_file)
+            dataset.append(data)
+
+    return dataset
+
+
+# Article data structure transformation
+def arDataTransform(au_folder, ar_dataset, num):
+    # Resolve the author IDs for one article from the matching author file
+    def auInfoFind(path, file_name, ar_data, num):
+        # Copy the list so the original article record is not modified
+        authors = list(ar_data.get('authors'))
+        authors.append(ar_data.get('corresponding_authors'))
+
+        file_path = os.path.join(path, file_name)
+        with open(file_path, 'r', encoding='utf-8') as file:
+            au_records = json.load(file)
+
+        au_ID = []  # A new list to store author_id
+
+        # Find the author_id: match on the article ID and on the
+        # accent-stripped "first,last" name with spaces removed
+        for author in authors:
+            author = author.replace(" ", "")
+
+            for record in au_records:
+                record_name = record.get('first_name') + "," + record.get('last_name')
+                record_name = ''.join(char for char in unicodedata.normalize('NFKD', record_name)
+                                      if unicodedata.category(char) != 'Mn')
+
+                if record.get('from_article')[0] == ar_data.get('article_id') and record_name == author:
+                    au_ID.append(record.get('author_id'))
+
+        # Change the structure
+        ar_data_transform = {
+            "article_id": ar_data['article_id'],
+            "title": ar_data['title'],
+            "authors": au_ID,
+            "authors_name": ar_data['authors'],
+            "submit_datetime": ar_data['submit_datetime'],
+            "publish_datetime": ar_data['publish_datetime'],
+            "keywords": ar_data['keywords'],
+            "MSC": ar_data['MSC'],
+            "URL": ar_data['URL'],
+            "DOI": ar_data['DOI'],
+            "publisher": ar_data['publisher'],
+            "journal": ar_data['journal'],
+            "volume": ar_data['volume'],
+            "issue": ar_data['issue'],
+            "page": ar_data['page']
+        }
+
+        num[0] += 1  # Update the counter
+        return ar_data_transform
+
+    # ====== Main code for function =====
+    # Author files in sorted filename order:
+    # [0] (2010-2014), [1] (2015-2020), [2] (newest), [3] (oldest)
+    ar_names = sorted(os.listdir(au_folder))
+
+    # Buckets for the transformed articles, in the same index order
+    ar_dataset_new = [[] for _ in range(4)]
+
+    for ar_list in ar_dataset:
+        for article in ar_list:
+            if num[0] % 100 == 0 and num[0] != 0:  # Progress report
+                print(f"{num[0]} article records have been transformed.")
+
+            if int(article.get('volume')) <= 2009:
+                ar_dataset_new[3].append(auInfoFind(au_folder, ar_names[3], article, num))
+
+            elif 2010 <= int(article.get('volume')) <= 2014:
+                ar_dataset_new[0].append(auInfoFind(au_folder, ar_names[0], article, num))
+
+            elif 2015 <= int(article.get('volume')) <= 2020:
+                ar_dataset_new[1].append(auInfoFind(au_folder, ar_names[1], article, num))
+
+            else:
+                ar_dataset_new[2].append(auInfoFind(au_folder, ar_names[2], article, num))
+
+    # Store into the new files; the path order matches the bucket indices above
+    filepaths = [
+        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2010-2014).json",
+        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2015-2020).json",
+        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(newest).json",
+        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
+    ]
+
+    for filepath, bucket in zip(filepaths, ar_dataset_new):
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(bucket, json_file, indent=4)
+
+    print("\nComplete: All of the article data structures have been transformed.")
+
+
+# Author data structure transformation
+def auDataTransform(au_dataset, num):
+    def transform(au_list, num):
+        new_list = []  # New list to store transformed data
+
+        for au_data in au_list:
+            if num[0] % 100 == 0 and num[0] != 0:  # Progress report
+                print(f"{num[0]} author records have been transformed.\n")
+
+            if au_data['middle_name'] is not None:
+                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
+            else:
+                raw_name = au_data['first_name'] + ' ' + au_data['last_name']
+
+            au_data_transform = {
+                "author_id": au_data['author_id'],
+                "from_article": au_data['from_article'][0],
+                "first_name": au_data['first_name'],
+                "last_name": au_data['last_name'],
+                "middle_name": au_data['middle_name'],
+                "raw_name": raw_name,
+                "affiliation": au_data['affiliation']
+            }
+
+            new_list.append(au_data_transform)
+            num[0] += 1  # Update the counter
+
+        return new_list
+
+    # Transform the author data structure
+    au_dataset_new = []  # New list to store transformed data
+
+    for au_list in au_dataset:
+        au_dataset_new.append(transform(au_list, num))
+
+    # Store into the new files; au_dataset was read in sorted filename order,
+    # so the output paths are listed in the same order
+    filepaths = [
+        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2010-2014).json",
+        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2015-2020).json",
+        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(newest).json",
+        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(oldest).json",
+    ]
+
+    for filepath, au_list_new in zip(filepaths, au_dataset_new):
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(au_list_new, json_file, indent=4)
+
+    print("\nComplete: All of the author data structures have been transformed.")
+
+
+# ========== Main code ========== #
+# New lists for storing the raw data
+ar_dataset = []
+au_dataset = []
+
+num1 = [0]  # Counter for transformed ar_data
+num2 = [0]  # Counter for transformed au_data
+
+os.makedirs('./EJQTDE_buffer_transform/Article_output/', exist_ok=True)
+os.makedirs('./EJQTDE_buffer_transform/Author_output/', exist_ok=True)
+
+# Read the data
+ar_dataset = fileReader('./EJQTDE_buffer/Article_output', ar_dataset)
+au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)
+
+# Change the structure
+arDataTransform('./EJQTDE_buffer/Article_output', ar_dataset, num1)
+auDataTransform(au_dataset, num2)