From e07617bebc7a7a4994fff9314481ad8b89b567ff Mon Sep 17 00:00:00 2001
From: Chenxiao Xia
Date: Sun, 29 Oct 2023 15:21:01 +0800
Subject: [PATCH] Add a script for transforming the article and author data
 structures

---
 DataTransformer/FileStructureTansfer.py | 177 ++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 DataTransformer/FileStructureTansfer.py

diff --git a/DataTransformer/FileStructureTansfer.py b/DataTransformer/FileStructureTansfer.py
new file mode 100644
index 0000000..86e7c68
--- /dev/null
+++ b/DataTransformer/FileStructureTansfer.py
@@ -0,0 +1,177 @@
+import json
+import os
+import unicodedata
+
+
+# Read every JSON file in a folder and append its contents to dataset.
+# The files are read in sorted filename order so that downstream code
+# can rely on a stable ordering.
+def fileReader(folder, dataset):
+    files = sorted(os.listdir(folder))
+    for file in files:
+        file_path = os.path.join(folder, file)
+        with open(file_path, 'r', encoding='utf-8') as json_file:
+            data = json.load(json_file)
+            dataset.append(data)
+
+    return dataset
+
+
+# Article data structure transformation
+def arDataTransform(au_folder, ar_dataset, num):
+    # Resolve the author IDs for one article from the matching author file
+    def auInfoFind(path, file_name, ar_data, num):
+        # Copy the list so the original article record is not modified
+        authors = list(ar_data.get('authors'))
+        authors.append(ar_data.get('corresponding_authors'))
+
+        file_path = os.path.join(path, file_name)
+        with open(file_path, 'r', encoding='utf-8') as file:
+            au_records = json.load(file)
+
+        au_ID = []  # A new list to store author_id
+
+        # Find the author_id: match on the article ID and on the
+        # accent-stripped "first,last" name with spaces removed
+        for author in authors:
+            author = author.replace(" ", "")
+
+            for record in au_records:
+                record_name = record.get('first_name') + "," + record.get('last_name')
+                record_name = ''.join(char for char in unicodedata.normalize('NFKD', record_name)
+                                      if unicodedata.category(char) != 'Mn')
+
+                if record.get('from_article')[0] == ar_data.get('article_id') and record_name == author:
+                    au_ID.append(record.get('author_id'))
+
+        # Change the structure
+        ar_data_transform = {
+            "article_id": ar_data['article_id'],
+            "title": ar_data['title'],
+            "authors": au_ID,
+            "authors_name": ar_data['authors'],
+            "submit_datetime": ar_data['submit_datetime'],
+            "publish_datetime": ar_data['publish_datetime'],
+            "keywords": ar_data['keywords'],
+            "MSC": ar_data['MSC'],
+            "URL": ar_data['URL'],
+            "DOI": ar_data['DOI'],
+            "publisher": ar_data['publisher'],
+            "journal": ar_data['journal'],
+            "volume": ar_data['volume'],
+            "issue": ar_data['issue'],
+            "page": ar_data['page']
+        }
+
+        num[0] += 1  # Update the counter
+        return ar_data_transform
+
+    # ====== Main code for function =====
+    # Author files in sorted filename order:
+    # [0] (2010-2014), [1] (2015-2020), [2] (newest), [3] (oldest)
+    ar_names = sorted(os.listdir(au_folder))
+
+    # Buckets for the transformed articles, in the same index order
+    ar_dataset_new = [[] for _ in range(4)]
+
+    for ar_list in ar_dataset:
+        for article in ar_list:
+            if num[0] % 100 == 0 and num[0] != 0:  # Progress report
+                print(f"{num[0]} article records have been transformed.")
+
+            if int(article.get('volume')) <= 2009:
+                ar_dataset_new[3].append(auInfoFind(au_folder, ar_names[3], article, num))
+
+            elif 2010 <= int(article.get('volume')) <= 2014:
+                ar_dataset_new[0].append(auInfoFind(au_folder, ar_names[0], article, num))
+
+            elif 2015 <= int(article.get('volume')) <= 2020:
+                ar_dataset_new[1].append(auInfoFind(au_folder, ar_names[1], article, num))
+
+            else:
+                ar_dataset_new[2].append(auInfoFind(au_folder, ar_names[2], article, num))
+
+    # Store into the new files; the path order matches the bucket indices above
+    filepaths = [
+        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2010-2014).json",
+        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2015-2020).json",
+        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(newest).json",
+        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
+    ]
+
+    for filepath, bucket in zip(filepaths, ar_dataset_new):
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(bucket, json_file, indent=4)
+
+    print("\nComplete: All of the article data structures have been transformed.")
+
+
+# Author data structure transformation
+def auDataTransform(au_dataset, num):
+    def transform(au_list, num):
+        new_list = []  # New list to store transformed data
+
+        for au_data in au_list:
+            if num[0] % 100 == 0 and num[0] != 0:  # Progress report
+                print(f"{num[0]} author records have been transformed.\n")
+
+            if au_data['middle_name'] is not None:
+                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
+            else:
+                raw_name = au_data['first_name'] + ' ' + au_data['last_name']
+
+            au_data_transform = {
+                "author_id": au_data['author_id'],
+                "from_article": au_data['from_article'][0],
+                "first_name": au_data['first_name'],
+                "last_name": au_data['last_name'],
+                "middle_name": au_data['middle_name'],
+                "raw_name": raw_name,
+                "affiliation": au_data['affiliation']
+            }
+
+            new_list.append(au_data_transform)
+            num[0] += 1  # Update the counter
+
+        return new_list
+
+    # Transform the author data structure
+    au_dataset_new = []  # New list to store transformed data
+
+    for au_list in au_dataset:
+        au_dataset_new.append(transform(au_list, num))
+
+    # Store into the new files; au_dataset was read in sorted filename order,
+    # so the output paths are listed in the same order
+    filepaths = [
+        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2010-2014).json",
+        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2015-2020).json",
+        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(newest).json",
+        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(oldest).json",
+    ]
+
+    for filepath, au_list_new in zip(filepaths, au_dataset_new):
+        with open(filepath, "w", encoding='utf-8') as json_file:
+            json.dump(au_list_new, json_file, indent=4)
+
+    print("\nComplete: All of the author data structures have been transformed.")
+
+
+# ========== Main code ========== #
+# New lists for storing the raw data
+ar_dataset = []
+au_dataset = []
+
+num1 = [0]  # Counter for transformed ar_data
+num2 = [0]  # Counter for transformed au_data
+
+os.makedirs('./EJQTDE_buffer_transform/Article_output/', exist_ok=True)
+os.makedirs('./EJQTDE_buffer_transform/Author_output/', exist_ok=True)
+
+# Read the data
+ar_dataset = fileReader('./EJQTDE_buffer/Article_output', ar_dataset)
+au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)
+
+# Change the structure
+arDataTransform('./EJQTDE_buffer/Article_output', ar_dataset, num1)
+auDataTransform(au_dataset, num2)