# ScholarDataMining/SpringerOpen_spider/SD_save.py

import os
import json


# Temporarily store data in a JSON file (small file, 50 articles)
def save_data(dataset, filetype, filename):
    """Dump *dataset* as JSON into ./SpringerOpen_buffer/<filetype>/<filename>.

    Nothing is created or written when *dataset* is falsy (e.g. an empty
    list). Otherwise the target directory is created if needed, the file is
    (over)written with 4-space indented JSON, and a confirmation line is
    printed.

    Args:
        dataset: JSON-serialisable data to store.
        filetype: Name of the sub-folder under ./SpringerOpen_buffer/.
        filename: Name of the JSON file inside that sub-folder.
    """
    if not dataset:
        return

    # Build the folder path by plain string joining so the resulting text
    # (including the trailing slash) is exactly what gets printed below.
    target_dir = "".join(("./SpringerOpen_buffer/", filetype, "/"))
    os.makedirs(target_dir, exist_ok=True)

    destination = os.path.join(target_dir, filename)
    with open(destination, "w", encoding='utf-8') as handle:
        json.dump(dataset, handle, indent=4)

    print(filetype + " data have been added to", destination)


# Final filtering and aggregation of the buffered files
def _record_year(record):
    """Return the integer year used to classify *record*, or None.

    The year is taken from the record's 'volume' field when that is truthy,
    otherwise from the 'year' field of the first entry of 'affiliation'.
    None is returned for non-dict records, for records where neither source
    yields a truthy value, and for values that ``int()`` cannot convert.
    """
    if not isinstance(record, dict):
        return None

    raw_year = record.get('volume')
    if not raw_year:
        affiliations = record.get('affiliation')
        # Only index into the list when it is non-empty and holds a dict;
        # an empty list or unexpected type means "no year available".
        if (isinstance(affiliations, list) and affiliations
                and isinstance(affiliations[0], dict)):
            raw_year = affiliations[0].get('year')

    if not raw_year:
        return None
    try:
        return int(raw_year)
    except (TypeError, ValueError):
        return None


def _bucket_index(year):
    """Map *year* to an output slot: 0 (<=2009), 1 (2010-2014), 2 (2015-2020), 3 (>=2021)."""
    if year <= 2009:
        return 0
    if year <= 2014:
        return 1
    if year <= 2020:
        return 2
    return 3


def Transf():
    """Split the buffered Author and Article JSON files into four year ranges.

    Every ``.json`` file under ./SpringerOpen_buffer/Author and
    ./SpringerOpen_buffer/Article is read, each record is assigned to exactly
    one range (oldest, 2010-2014, 2015-2020, newest) and the four ranges are
    written to the matching files under Author_output/ and Article_output/.
    A confirmation line is printed at the end.

    NOTE(review): the previous guard was
    ``volume and affiliation[0].year is not None``, which only lets records
    with a truthy 'volume' through even though the year expression falls back
    to the affiliation year. Records without 'volume' are now classified by
    that fallback; confirm this matches the intended behaviour.
    """
    def Read(folder_path, output_files):
        """Aggregate the JSON files in *folder_path* into the four *output_files*."""
        # Create the output folders
        os.makedirs('./SpringerOpen_buffer/Article_output/', exist_ok=True)
        os.makedirs('./SpringerOpen_buffer/Author_output/', exist_ok=True)

        # One list per output file, in the same order as output_files.
        buckets = [[], [], [], []]

        # A missing buffer folder (nothing was saved for this type) is
        # treated like an empty one; sorting keeps the output order stable.
        filenames = sorted(os.listdir(folder_path)) if os.path.isdir(folder_path) else []

        for filename in filenames:
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as source:
                data = json.load(source)

            if not isinstance(data, list):
                continue

            # Classify every record exactly once; records without a usable
            # year are skipped instead of aborting the whole run.
            for record in data:
                year = _record_year(record)
                if year is None:
                    continue
                buckets[_bucket_index(year)].append(record)

        # Write the results once, after all input files have been read.
        for output_path, bucket in zip(output_files, buckets):
            with open(output_path, 'w', encoding='utf-8') as target:
                json.dump(bucket, target, indent=4)

    # Input folders
    author_folder_path = './SpringerOpen_buffer/Author'
    article_folder_path = './SpringerOpen_buffer/Article'

    # Output files
    author_output_file = [
        './SpringerOpen_buffer/Author_output/Author_output_file(oldest).json',
        './SpringerOpen_buffer/Author_output/Author_output_file(2010-2014).json',
        './SpringerOpen_buffer/Author_output/Author_output_file(2015-2020).json',
        './SpringerOpen_buffer/Author_output/Author_output_file(newest).json'
    ]

    article_output_file = [
        './SpringerOpen_buffer/Article_output/Article_output_file(oldest).json',
        './SpringerOpen_buffer/Article_output/Article_output_file(2010-2014).json',
        './SpringerOpen_buffer/Article_output/Article_output_file(2015-2020).json',
        './SpringerOpen_buffer/Article_output/Article_output_file(newest).json'
    ]

    # Read the buffers and write the aggregated files
    Read(author_folder_path, author_output_file)
    Read(article_folder_path, article_output_file)

    # End
    print("\nData has been written into files.")

# Delete the files in the temporary buffer folder
def delete(folder_path):
    """Remove every regular file directly inside *folder_path*.

    Sub-directories and their contents are left untouched. A notice is
    printed once the files have been removed.

    Args:
        folder_path: Directory whose files should be deleted.
    """
    # The directory listing is taken once, up front; each candidate is then
    # checked with os.path.isfile right before it is removed.
    candidates = (os.path.join(folder_path, entry) for entry in os.listdir(folder_path))
    for path in filter(os.path.isfile, candidates):
        os.remove(path)

    print('\nAttention: The temporary storage files have been deleted!')