"""Restructure buffered EJQTDE article and author JSON data.

The script reads the per-period JSON files from ``./EJQTDE_buffer``, rewrites
each article/author record into the output structure and stores the results
under ``./EJQTDE_buffer_transform``.
"""

import json
import os
import unicodedata
from collections import OrderedDict
from pprint import pprint

# Buckets of transformed records. The bucket index matches the position of the
# corresponding output file (alphabetical order of the file names):
# 0 = 2010-2014, 1 = 2015-2020, 2 = newest, 3 = oldest.
ar_dataset_new = [[] for _ in range(4)]  # New list for transformed data
au_dataset_new = [[] for _ in range(4)]  # New list to store transformed data


# Read the data
def fileReader(folder, dataset):
    """Load every JSON file in *folder* and append its content to *dataset*.

    Files are read in sorted file-name order. ``os.listdir`` gives no ordering
    guarantee, while the rest of the script addresses the loaded data by
    position, so a deterministic order is required.

    Args:
        folder: Directory containing the JSON files.
        dataset: List that receives one entry per file (mutated in place).

    Returns:
        The same *dataset* list.
    """
    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            dataset.append(json.load(json_file))
    return dataset


def _normalized_author_key(first_name, last_name):
    """Return ``"first,last"`` with combining accent marks stripped."""
    joined = (first_name or "") + "," + (last_name or "")
    decomposed = unicodedata.normalize('NFKD', joined)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


def _load_author_index(file_path):
    """Map ``(article_id, normalized name)`` to author ids from one author file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        records = json.load(file)

    index = {}
    for record in records:
        from_article = record.get('from_article')
        if not from_article:
            # A record without a source article can never match an article.
            continue
        key = (
            from_article[0],
            _normalized_author_key(record.get('first_name'), record.get('last_name')),
        )
        # Keep file order so that several matches come out in the same order
        # as a linear scan over the file would produce.
        index.setdefault(key, []).append(record.get('author_id'))
    return index


def _as_name_list(value):
    """Return *value* (``None``, a single name or a list of names) as a new list."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _volume_bucket(volume):
    """Return the bucket index (see ``ar_dataset_new``) for a volume number."""
    if volume <= 2009:
        return 3
    if volume <= 2014:
        return 0
    if volume <= 2020:
        return 1
    return 2


def _transform_article(ar_data, author_index):
    """Build the output record for one article, resolving its author ids."""
    # Work on a fresh list: the input record must not be modified, and the
    # corresponding author(s) may be missing, a single name or a list.
    names = _as_name_list(ar_data.get('authors'))
    names += _as_name_list(ar_data.get('corresponding_authors'))

    # Find the author_id
    article_id = ar_data.get('article_id')
    au_ID = []  # A new list to store author_id
    for name in names:
        if not isinstance(name, str):
            continue
        # Only the spaces of the article-side name are removed before the
        # comparison; accents are stripped on the author-file side only.
        au_ID.extend(author_index.get((article_id, name.replace(" ", "")), []))

    # Change the structure
    return {
        "article_id": ar_data['article_id'],
        "title": ar_data['title'],
        "authors": au_ID,
        "authors_name": names,
        "submit_datetime": ar_data['submit_datetime'],
        "publish_datetime": ar_data['publish_datetime'],
        "keywords": ar_data['keywords'],
        "MSC": ar_data['MSC'],
        "URL": ar_data['URL'],
        "DOI": ar_data['DOI'],
        "publisher": ar_data['publisher'],
        "journal": ar_data['journal'],
        "volume": ar_data['volume'],
        "issue": ar_data['issue'],
        "page": ar_data['page'],
    }


def _dump_buckets(filepaths, buckets):
    """Write each bucket to the output file at the same position."""
    for filepath, bucket in zip(filepaths, buckets):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as json_file:
            json.dump(bucket, json_file, indent=4)


# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    """Transform all article records and write them to the article output files.

    Each article is assigned to a bucket by its ``volume`` value, its author
    names are resolved to author ids using the author file of the same bucket,
    and the transformed record is appended to the module-level
    ``ar_dataset_new``. Finally every bucket is dumped to its JSON file.

    Args:
        au_folder: Directory holding the author JSON files. After sorting the
            file names, position ``i`` must belong to bucket ``i``.
        ar_dataset: List of article lists, as returned by ``fileReader``.
        num: One-element list used as a mutable counter of processed articles.

    Raises:
        IndexError: If *au_folder* holds no file for a bucket that is needed.
    """
    # Sorted so that the fixed bucket positions do not depend on the
    # arbitrary order in which the OS lists the directory.
    ar_names = sorted(os.listdir(au_folder))  # Read the folder
    index_cache = {}  # bucket -> author index, so each file is parsed once

    for ar_list in ar_dataset:
        for article in ar_list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of article data structure have been transformed.")

            bucket = _volume_bucket(int(article.get('volume')))
            if bucket not in index_cache:
                index_cache[bucket] = _load_author_index(
                    os.path.join(au_folder, ar_names[bucket])
                )

            ar_dataset_new[bucket].append(
                _transform_article(article, index_cache[bucket])
            )
            num[0] += 1  # Update the counter

    # Store into the new file
    filepaths = [
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2010-2014).json",
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2015-2020).json",
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(newest).json",
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
    ]
    _dump_buckets(filepaths, ar_dataset_new)

    print("\nComplete: All of the article data structure have been transformed.")


# Author data structure transfer
def auDataTransform(au_dataset, num):
    """Transform all author records and write them to the author output files.

    The transformed records of ``au_dataset[i]`` are added to the module-level
    ``au_dataset_new[i]`` as a flat list of records, and every bucket is then
    dumped to its JSON file.

    Args:
        au_dataset: List of author lists, as returned by ``fileReader``.
        num: One-element list used as a mutable counter of processed authors.
    """
    def transform(records, num):
        new_list = []  # New list to store transformed data

        for au_data in records:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of author data structure have been transformed.\n")

            # A missing or empty middle name is left out so that the raw name
            # never contains a double space.
            if au_data['middle_name']:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']

            new_list.append({
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'][0],
                "first_name": au_data['first_name'],
                "last_name": au_data['last_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation'],
            })

            num[0] += 1  # Update the counter

        return new_list

    # extend (not append) keeps every bucket a flat list of records, like the
    # article buckets.
    for bucket, au_list in zip(au_dataset_new, au_dataset):
        bucket.extend(transform(au_list, num))

    # Store into the new file
    filepaths = [
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2010-2014).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2015-2020).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(newest).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(oldest).json",
    ]
    _dump_buckets(filepaths, au_dataset_new)

    print("\nComplete: All of the author data structure have been transformed.")


# ========== Main code ========== #
def main():
    """Read the buffered data and run the article transformation."""
    # New list for storing data
    ar_dataset = []
    au_dataset = []

    num1 = [0]  # Counter for complete ar_data
    num2 = [0]  # Counter for complete au_data

    os.makedirs('./EJQTDE_buffer_transform/Article_output/', exist_ok=True)
    os.makedirs('./EJQTDE_buffer_transform/Author_output/', exist_ok=True)

    # Read the data
    ar_dataset = fileReader('./EJQTDE_buffer/Article_output', ar_dataset)
    au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)

    # Change the structure
    arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1)
    # The author transformation is currently switched off:
    # auDataTransform(au_dataset, num2)


if __name__ == "__main__":
    main()