New code for transformation. Transform the structure of SpringerOpen data

2023-11-01 20:11:20 +08:00 · 2023-11-01 20:11:20 +08:00 · 50e30e105b
commit 50e30e105b
parent 7f9ab94adc
2 changed files with 189 additions and 0 deletions
--- a/Data/Transform/SpringerOpen_buffer_transform.zip
+++ b/Data/Transform/SpringerOpen_buffer_transform.zip
--- a/DataTransformer/FileStructureTansfer(SprigerOpen).py
+++ b/DataTransformer/FileStructureTansfer(SprigerOpen).py
@ -0,0 +1,189 @@
 import json
 import os
 import unicodedata
 from collections import OrderedDict
 from pprint import pprint
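 '''
    ========== FileStructureTransfer ==========
    1. This program restructures the data that has already been collected.
    2. Depending on the year in which each paper was published, the final data is stored in four json files:
        (1) newest: published after 2020
        (2) oldest: published before 2010
        (3) 2010-2014: published from 2010 to 2014
        (4) 2015-2020: published from 2015 to 2020
    3. The data is split into several files because the total amount of data from some websites is too large.
    4. The program runs in the following order:
        (1) fileReader() reads the locally stored crawled data into the lists awaiting processing
        (2) arDataTransform() converts the article data format
        (3) auDataTransform() converts the author data format
        (4) The transformed data is written to its storage folder
 '''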
 # Read the data
 def fileReader(folder, dataset):
    """Load every JSON file directly inside *folder* and append it to *dataset*.

    Files are read in sorted name order so that the position of each loaded
    document in *dataset* is the same on every run and every platform.
    Sub-directories are skipped.

    Args:
        folder: Path of the directory that holds the JSON files.
        dataset: List that receives one parsed JSON document per file.
            It is extended in place.

    Returns:
        The same *dataset* list that was passed in.

    Raises:
        OSError: If *folder* or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    # os.listdir() yields names in arbitrary order, while the author transform
    # pairs the loaded documents with its output files by position.
    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)
        if not os.path.isfile(file_path):
            continue    # A directory cannot be opened and parsed as JSON
        with open(file_path, 'r', encoding='utf-8') as json_file:
            dataset.append(json.load(json_file))
    return dataset
 # Article data structure transfer
 def arDataTransform(au_folder, ar_dataset, num):
    """Restructure article records and write them to four per-period JSON files.

    Each article is assigned to a period by the integer value of its 'volume'
    field: up to 2009, 2010-2014, 2015-2020, or anything later. Its author
    names are resolved to author ids through the author file of the same
    period, and the restructured record is appended to the matching list of
    the module-level ``ar_dataset_new``. Afterwards each of those four lists
    is dumped to ./SpringerOpen_buffer_transform/Article_output/.

    Unlike the previous implementation, the 'authors' list of the input
    records is no longer extended in place.

    Args:
        au_folder: Folder that holds the author JSON files. Sorted by name,
            the files are assumed to be the (2010-2014), (2015-2020),
            (newest) and (oldest) author files, in that order.
        ar_dataset: List of article lists, one per loaded article file.
        num: One-element list used as a mutable progress counter; num[0] is
            incremented once per transformed article.

    Raises:
        IndexError: If *au_folder* has no file at a required position.
        KeyError: If an article lacks one of the copied fields.
        TypeError, ValueError: If 'volume' is missing or not an integer.
    """
    # os.listdir() returns names in arbitrary order, but the author files are
    # addressed by position below, so sort them to make the positions stable.
    au_file_names = sorted(os.listdir(au_folder))
    lookup_cache = {}       # period index -> {(article_id, author name): [author_id, ...]}

    def strip_accents(text):
        # Decompose accented characters and drop the combining marks
        decomposed = unicodedata.normalize('NFKD', text)
        return ''.join(char for char in decomposed if unicodedata.category(char) != 'Mn')

    def author_lookup(period):
        # Parse each author file once instead of once per article.
        # Assumes 'from_article' values are hashable (strings or numbers).
        if period not in lookup_cache:
            file_path = os.path.join(au_folder, au_file_names[period])
            with open(file_path, 'r', encoding='utf-8') as file:
                records = json.load(file)
            lookup = {}
            for record in records:
                full_name = (record.get('first_name') or '') + ' ' + (record.get('last_name') or '')
                key = (record.get('from_article'), strip_accents(full_name))
                # Appending keeps the ids in file order, as a linear scan would
                lookup.setdefault(key, []).append(record.get('author_id'))
            lookup_cache[period] = lookup
        return lookup_cache[period]

    def period_index(volume):
        # Position of the period in ar_dataset_new, the author files and filepaths
        if volume <= 2009:
            return 3        # oldest
        if volume <= 2014:
            return 0        # 2010-2014
        if volume <= 2020:
            return 1        # 2015-2020
        return 2            # newest

    def restructure(ar_data, lookup):
        # Build a new list so the caller's article record is left untouched
        authors = list(ar_data.get('authors') or [])
        authors.extend(ar_data.get('corresponding_authors') or [])
        article_id = ar_data.get('article_id')
        au_ID = []      # Author ids, ordered by author name and then by file order
        for author in authors:
            au_ID.extend(lookup.get((article_id, author), []))
        return {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": authors,
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }

    # ====== Main code for function =====
    for ar_list in ar_dataset:
        for ar_data in ar_list:
            if num[0] % 100 == 0 and num[0] != 0:       # Alert for complete data
                print(str(num[0]) + " copies of article data structure have been transformed.")
            period = period_index(int(ar_data.get('volume')))
            transformed = restructure(ar_data, author_lookup(period))
            num[0] += 1     # Update the counter
            ar_dataset_new[period].append(transformed)
    # Store into the new files; the order matches the period indices above
    filepaths = [
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2010-2014).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2015-2020).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(newest).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(oldest).json",
    ]
    for filepath, records in zip(filepaths, ar_dataset_new):
        with open(filepath, 'w', encoding='utf-8') as json_file:
            json.dump(records, json_file, indent=4)
    print("\nComplete: All of the article data structure have been transformed.")
 # Author data structure transfer
 def auDataTransform(au_dataset, num):
    """Restructure author records and write them to four per-period JSON files.

    The first four lists of *au_dataset* are transformed in order and appended
    to the matching lists of the module-level ``au_dataset_new``, which are
    then dumped to ./SpringerOpen_buffer_transform/Author_output/.

    Each transformed record keeps the original fields and gains 'raw_name',
    the first, middle and last name joined by single spaces. Name parts that
    are None or empty are left out, so an empty middle name no longer yields
    a double space.

    Args:
        au_dataset: List of author lists, one per loaded author file. They are
            assumed to be in the same period order as the output files:
            (2010-2014), (2015-2020), (newest), (oldest).
        num: One-element list used as a mutable progress counter; num[0] is
            incremented once per transformed author.

    Raises:
        IndexError: If *au_dataset* holds fewer than four lists.
        KeyError: If an author record lacks one of the copied fields.
    """
    def transform(records, num):
        new_list = []   # New list to store transformed data
        for au_data in records:
            if num[0] % 100 == 0 and num[0] != 0:       # Alert for complete data
                print(str(num[0]) + " copies of author data structure have been transformed.\n")
            name_parts = (au_data['first_name'], au_data['middle_name'], au_data['last_name'])
            # Skip missing parts instead of concatenating None or an empty string
            raw_name = ' '.join(part for part in name_parts if part)
            new_list.append({
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'],
                "first_name": au_data['first_name'],
                "last_name": au_data['last_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            })
            num[0] += 1         # Update the counter
        return new_list

    # Transform everything first so nothing is written if a record is malformed
    for i, bucket in enumerate(au_dataset_new):
        bucket.extend(transform(au_dataset[i], num))
    # Store into the new files
    filepaths = [
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2010-2014).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2015-2020).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(newest).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(oldest).json",
    ]
    for filepath, bucket in zip(filepaths, au_dataset_new):
        with open(filepath, 'w', encoding='utf-8') as json_file:
            json.dump(bucket, json_file, indent=4)
    print("\nComplete: All of the author data structure have been transformed.")
 # ========== Main code ========== #
 # Make sure both output folders exist before anything is written
 os.makedirs('./SpringerOpen_buffer_transform/Article_output/', exist_ok=True)
 os.makedirs('./SpringerOpen_buffer_transform/Author_output/', exist_ok=True)
 # Buckets filled by the transform functions, one list per output file
 ar_dataset_new = [[], [], [], []]
 au_dataset_new = [[], [], [], []]
 # One-element lists serve as mutable progress counters
 num1 = [0]      # Number of article records transformed so far
 num2 = [0]      # Number of author records transformed so far
 # Load the previously stored article and author data
 ar_dataset = fileReader('./SpringerOpen_buffer/Article_output', [])
 au_dataset = fileReader('./SpringerOpen_buffer/Author_output', [])
 # Run the transformation; the article pass is currently switched off
 # arDataTransform('./SpringerOpen_buffer/Author_output', ar_dataset, num1)
 auDataTransform(au_dataset, num2)