From 35ea1dd424c118da1ee27aefaa6a3f5e3c4dd855 Mon Sep 17 00:00:00 2001 From: Chenxiao Xia Date: Wed, 1 Nov 2023 13:12:38 +0800 Subject: [PATCH] Remove error code --- DataTransformer/FileStructureTansfer(EJDE).py | 199 ----------------- .../FileStructureTansfer(EJQTDE).py | 206 ------------------ 2 files changed, 405 deletions(-) delete mode 100644 DataTransformer/FileStructureTansfer(EJDE).py delete mode 100644 DataTransformer/FileStructureTansfer(EJQTDE).py diff --git a/DataTransformer/FileStructureTansfer(EJDE).py b/DataTransformer/FileStructureTansfer(EJDE).py deleted file mode 100644 index bc9197b..0000000 --- a/DataTransformer/FileStructureTansfer(EJDE).py +++ /dev/null @@ -1,199 +0,0 @@ -import json -import os -import unicodedata - -from collections import OrderedDict -from pprint import pprint - -''' - ========== FileStructureTransfer ========== - 1. 本程序用于将获取的数据进行结构调整 - 2. 根据论文发表的时间年限,分别将最后的数据存储在四个 json 文件中 - (1) newest: 发表于 2020 年之后 - (2) oldest: 发表于 2010 年之前 - (3) 2010-2014: 发表于 2010 年至 2014 年 - (4) 2015-2020: 发表于 2015 年至 2020 年 - 3. 考虑到部分网站的总数据量过大,所以分成多份 - 4. 本程序运行顺序为: - (1) fileReader() 读取本地已爬取数据,存入待处理列表 - (2) arDataTransform() 转换论文数据格式 - (3) auDataTransform() 转换作者数据格式 - (4) 存入转换后数据的存储文件夹 -''' - - -# Read the data -def fileReader(folder, dataset): - files = os.listdir(folder) - for file in files: - file_path = os.path.join(folder, file) - with open(file_path, 'r', encoding='utf-8') as json_file: - Data = json.load(json_file) - dataset.append(Data) - - return dataset - - -# Article data structure transfer -def arDataTransform(au_folder, ar_dataset, num): - def auInfoFind(path, file_name, ar_data, num): - authors = ar_data.get('authors') - authors.append(ar_data.get('corresponding_authors')) - - file_path = os.path.join(path, file_name) - with open(file_path, 'r', encoding='utf-8') as file: - Data = json.load(file) - - au_ID = [] # A new list to store author_id - - # Find the author_id - for author in authors: - if author is None: - continue - - author = author.replace(" ", "") - - for Dict in Data: - Dict_name = Dict.get('first_name') + "," + Dict.get('last_name') - Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if - unicodedata.category(char) != 'Mn') - - if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author: - au_ID.append(Dict.get('author_id')) - - # Change the structure - ar_data_transform = { - "article_id": ar_data['article_id'], - "title": ar_data['title'], - "authors": au_ID, - "authors_name": ar_data['authors'], - "submit_datetime": ar_data['submit_datetime'], - "publish_datetime": ar_data['publish_datetime'], - "keywords": ar_data['keywords'], - "MSC": ar_data['MSC'], - "URL": ar_data['URL'], - "DOI": ar_data['DOI'], - "publisher": ar_data['publisher'], - "journal": ar_data['journal'], - "volume": ar_data['volume'], - "issue": ar_data['issue'], - "page": ar_data['page'] - } - - num[0] += 1 # Update the counter - return ar_data_transform - - # ====== Main code for function ===== - ar_names = os.listdir(au_folder) # Read the folder - - for ar_list in ar_dataset: - for Dict in ar_list: - year = Dict.get('publish_datetime') - if year is None: - continue - - year = year.split('-') - - if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data - print(str(num[0]) + " copies of article data structure have been transformed.") - - if int(year[0]) <= 2009: - Dict = auInfoFind(au_folder, ar_names[3], Dict, num) - ar_dataset_new[3].append(Dict) - - elif 2010 <= int(year[0]) <= 2014: - Dict = auInfoFind(au_folder, ar_names[0], Dict, num) - ar_dataset_new[0].append(Dict) - - elif 2015 <= int(year[0]) <= 2020: - Dict = auInfoFind(au_folder, ar_names[1], Dict, num) - ar_dataset_new[1].append(Dict) - - else: - Dict = auInfoFind(au_folder, ar_names[2], Dict, num) - ar_dataset_new[2].append(Dict) - - # Store into the new file - filepaths = [ - "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2010-2014).json", - "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2015-2020).json", - "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(newest).json", - "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(oldest).json", - ] - - for i in range(4): - with open(filepaths[i], 'w', encoding='utf-8') as json_file: - json.dump(ar_dataset_new[i], json_file, indent=4) - - print("\nComplete: All of the article data structure have been transformed.") - - -# Author data structure transfer -def auDataTransform(au_dataset, num): - def transform(list, num): - new_list = [] # New list to store transformed data - - for au_data in list: - if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data - print(str(num[0]) + " copies of author data structure have been transformed.\n") - - if au_data['middle_name'] is not None: - raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name'] - else: - raw_name = au_data['first_name'] + ' ' + au_data['last_name'] - - au_data_transform = { - "author_id": au_data['author_id'], - "from_article": au_data['from_article'][0], - "first_name": au_data['last_name'], - "last_name": au_data['first_name'], - "middle_name": au_data['middle_name'], - "raw_name": raw_name, - "affiliation": au_data['affiliation'] - } - - new_list.append(au_data_transform) - num[0] += 1 # Update the counter - - return new_list - - for i in range(4): - au_list = transform(au_dataset[i], num) - au_dataset_new[i].append(au_list) - - # Store into the new file - filepaths = [ - "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2010-2014).json", - "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2015-2020).json", - "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(newest).json", - "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(oldest).json", - ] - - for i in range(4): - with open(filepaths[i], 'w', encoding='utf-8') as json_file: - json.dump(au_dataset_new[i], json_file, indent=4) - - print("\nComplete: All of the author data structure have been transformed.") - - -# ========== Main code ========== # -# New list for storing data -ar_dataset = [] -au_dataset = [] - -ar_dataset_new = [[] for _ in range(4)] # New list for transformed data -au_dataset_new = [[] for _ in range(4)] # New list to store transformed data - -num1 = [0] # Counter for complete ar_date -num2 = [0] # Counter for complete au_data - -os.makedirs('./EJDE_buffer_transform/Article_output/', exist_ok=True) -os.makedirs('./EJDE_buffer_transform/Author_output/', exist_ok=True) - -# Read the data -ar_dataset = fileReader('./EJDE_buffer/Article_output', ar_dataset) -au_dataset = fileReader('./EJDE_buffer/Author_output', au_dataset) - -# Change the structure -arDataTransform('./EJDE_buffer/Author_output', ar_dataset, num1) -auDataTransform(au_dataset, num2) \ No newline at end of file diff --git a/DataTransformer/FileStructureTansfer(EJQTDE).py b/DataTransformer/FileStructureTansfer(EJQTDE).py deleted file mode 100644 index 69c2dd4..0000000 --- a/DataTransformer/FileStructureTansfer(EJQTDE).py +++ /dev/null @@ -1,206 +0,0 @@ -import json -import os -import unicodedata - -from collections import OrderedDict -from pprint import pprint - -''' - ========== FileStructureTransfer ========== - 1. 本程序用于将获取的数据进行结构调整 - 2. 根据论文发表的时间年限,分别将最后的数据存储在四个 json 文件中 - (1) newest: 发表于 2020 年之后 - (2) oldest: 发表于 2010 年之前 - (3) 2010-2014: 发表于 2010 年至 2014 年 - (4) 2015-2020: 发表于 2015 年至 2020 年 - 3. 考虑到部分网站的总数据量过大,所以分成多份 - 4. 本程序运行顺序为: - (1) fileReader() 读取本地已爬取数据,存入待处理列表 - (2) arDataTransform() 转换论文数据格式 - (3) auDataTransform() 转换作者数据格式 - (4) 存入转换后数据的存储文件夹 -''' - - -# Read the data -def fileReader(folder, dataset): - files = os.listdir(folder) - for file in files: - file_path = os.path.join(folder, file) - with open(file_path, 'r', encoding='utf-8') as json_file: - Data = json.load(json_file) - dataset.append(Data) - - return dataset - - -# Article data structure transfer -def arDataTransform(au_folder, ar_dataset, num): - def auInfoFind(path, file_name, ar_data, num): - authors = ar_data.get('authors') - authors.append(ar_data.get('corresponding_authors')) - - file_path = os.path.join(path, file_name) - with open(file_path, 'r', encoding='utf-8') as file: - Data = json.load(file) - - au_ID = [] # A new list to store author_id - - # Find the author_id - for author in authors: - author = author.replace(" ", "") - - for Dict in Data: - Dict_name = Dict.get('first_name') + "," + Dict.get('last_name') - Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if - unicodedata.category(char) != 'Mn') - - if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author: - au_ID.append(Dict.get('author_id')) - - author_names_new = [] - author_names = ar_data['authors'] - - for author_name in author_names: - author_name_new = '' - author_name = author_name.split(", ") - - for i in range(len(author_name)-1, 0, -1): - # print(author_name[i]) - author_name_new += author_name[i] - if i != 0: - author_name_new += ', ' - - print(author_name_new) - author_names_new.append(author_name_new) - - # Change the structure - ar_data_transform = { - "article_id": ar_data['article_id'], - "title": ar_data['title'], - "authors": au_ID, - "authors_name": author_names_new, - "submit_datetime": ar_data['submit_datetime'], - "publish_datetime": ar_data['publish_datetime'], - "keywords": ar_data['keywords'], - "MSC": ar_data['MSC'], - "URL": ar_data['URL'], - "DOI": ar_data['DOI'], - "publisher": ar_data['publisher'], - "journal": ar_data['journal'], - "volume": ar_data['volume'], - "issue": ar_data['issue'], - "page": ar_data['page'] - } - - num[0] += 1 # Update the counter - return ar_data_transform - - # ====== Main code for function ===== - ar_names = os.listdir(au_folder) # Read the folder - - for ar_list in ar_dataset: - for Dict in ar_list: - if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data - print(str(num[0]) + " copies of article data structure have been transformed.") - - if int(Dict.get('volume')) <= 2009: - Dict = auInfoFind(au_folder, ar_names[3], Dict, num) - ar_dataset_new[3].append(Dict) - - elif 2010 <= int(Dict.get('volume')) <= 2014: - Dict = auInfoFind(au_folder, ar_names[0], Dict, num) - ar_dataset_new[0].append(Dict) - - elif 2015 <= int(Dict.get('volume')) <= 2020: - Dict = auInfoFind(au_folder, ar_names[1], Dict, num) - ar_dataset_new[1].append(Dict) - - else: - Dict = auInfoFind(au_folder, ar_names[2], Dict, num) - ar_dataset_new[2].append(Dict) - - # Store into the new file - filepaths = [ - "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2010-2014).json", - "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2015-2020).json", - "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(newest).json", - "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json", - ] - - for i in range(4): - with open(filepaths[i], 'w', encoding='utf-8') as json_file: - json.dump(ar_dataset_new[i], json_file, indent=4) - - print("\nComplete: All of the article data structure have been transformed.") - - -# Author data structure transfer -def auDataTransform(au_dataset, num): - def transform(list, num): - new_list = [] # New list to store transformed data - - for au_data in list: - if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data - print(str(num[0]) + " copies of author data structure have been transformed.\n") - - if au_data['middle_name'] is not None: - raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name'] - else: - raw_name = au_data['first_name'] + ' ' + au_data['last_name'] - - au_data_transform = { - "author_id": au_data['author_id'], - "from_article": au_data['from_article'][0], - "first_name": au_data['last_name'], - "last_name": au_data['first_name'], - "middle_name": au_data['middle_name'], - "raw_name": raw_name, - "affiliation": au_data['affiliation'] - } - - new_list.append(au_data_transform) - num[0] += 1 # Update the counter - - return new_list - - for i in range(4): - au_list = transform(au_dataset[i], num) - au_dataset_new[i].append(au_list) - - # Store into the new file - filepaths = [ - "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2010-2014).json", - "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2015-2020).json", - "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(newest).json", - "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(oldest).json", - ] - - for i in range(4): - with open(filepaths[i], 'w', encoding='utf-8') as json_file: - json.dump(au_dataset_new[i], json_file, indent=4) - - print("\nComplete: All of the author data structure have been transformed.") - - -# ========== Main code ========== # -# New list for storing data -ar_dataset = [] -au_dataset = [] - -ar_dataset_new = [[] for _ in range(4)] # New list for transformed data -au_dataset_new = [[] for _ in range(4)] # New list to store transformed data - -num1 = [0] # Counter for complete ar_date -num2 = [0] # Counter for complete au_data - -os.makedirs('./EJQTDE_buffer_transform/Article_output/', exist_ok=True) -os.makedirs('./EJQTDE_buffer_transform/Author_output/', exist_ok=True) - -# Read the data -ar_dataset = fileReader('./EJQTDE_buffer/Article_output', ar_dataset) -au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset) - -# Change the structure -arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1) -auDataTransform(au_dataset, num2) \ No newline at end of file