Remove error code
This commit is contained in:
parent ad6ba8832a
commit 35ea1dd424
@@ -1,199 +0,0 @@
import json
import os
import unicodedata

from collections import OrderedDict
from pprint import pprint

'''
========== FileStructureTransfer ==========
1. This program restructures the scraped data.
2. The final data are stored in four json files according to each paper's year of publication:
    (1) newest: published after 2020
    (2) oldest: published before 2010
    (3) 2010-2014: published from 2010 to 2014
    (4) 2015-2020: published from 2015 to 2020
3. Because the total amount of data from some sites is too large, the data are split into several parts.
4. The program runs in the following order:
    (1) fileReader() reads the locally scraped data into a list for processing
    (2) arDataTransform() converts the article data format
    (3) auDataTransform() converts the author data format
    (4) the converted data are stored in the output folder
'''


# Read the data
def fileReader(folder, dataset):
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            Data = json.load(json_file)
            dataset.append(Data)

    return dataset

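# Note: each parsed file is appended whole, so the returned dataset is a list
# of per-file record lists; the callers below therefore iterate two levels deep
# (for ar_list in ar_dataset: for Dict in ar_list: ...).
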
# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    def auInfoFind(path, file_name, ar_data, num):
        authors = ar_data.get('authors')
        # Note: 'authors' aliases ar_data['authors'], so the corresponding
        # author appended here also appears in the 'authors_name' field below
        authors.append(ar_data.get('corresponding_authors'))

        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            Data = json.load(file)

        au_ID = []  # A new list to store author_id

        # Find the author_id
        for author in authors:
            if author is None:
                continue

            author = author.replace(" ", "")

            for Dict in Data:
                Dict_name = Dict.get('first_name') + "," + Dict.get('last_name')
                Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name)
                                    if unicodedata.category(char) != 'Mn')

                if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))

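        # The NFKD pass above strips diacritics so scraped names compare equal:
        # normalize('NFKD', 'José') decomposes 'é' into 'e' plus a combining
        # accent (category 'Mn'), which the filter drops, leaving 'Jose'.
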
        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": ar_data['authors'],
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }

        num[0] += 1  # Update the counter
        return ar_data_transform

    # ====== Main code for function =====
    ar_names = os.listdir(au_folder)  # Read the folder

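    # Caution: os.listdir() returns entries in arbitrary, platform-dependent
    # order, while the branches below assume ar_names[0..3] line up with the
    # (2010-2014), (2015-2020), (newest), (oldest) author files. Sorting would
    # pin that order down (a defensive tweak, not in the original flow):
    #
    #   ar_names = sorted(os.listdir(au_folder))
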
    for ar_list in ar_dataset:
        for Dict in ar_list:
            year = Dict.get('publish_datetime')
            if year is None:
                continue

            year = year.split('-')

            if num[0] % 100 == 0 and num[0] != 0:  # Alert for progress
                print(str(num[0]) + " article records have been transformed.")

            if int(year[0]) <= 2009:
                Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
                ar_dataset_new[3].append(Dict)

            elif 2010 <= int(year[0]) <= 2014:
                Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
                ar_dataset_new[0].append(Dict)

            elif 2015 <= int(year[0]) <= 2020:
                Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
                ar_dataset_new[1].append(Dict)

            else:
                Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
                ar_dataset_new[2].append(Dict)

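    # The chain above maps a publication year to a bucket index (0: 2010-2014,
    # 1: 2015-2020, 2: newest, 3: oldest). The same rule as a standalone helper,
    # shown only as an illustrative sketch:
    #
    #   def year_bucket(year):
    #       if year <= 2009:
    #           return 3          # oldest
    #       if year <= 2014:
    #           return 0          # 2010-2014
    #       if year <= 2020:
    #           return 1          # 2015-2020
    #       return 2              # newest
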
    # Store into the new file
    filepaths = [
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2010-2014).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2015-2020).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(newest).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the article data structures have been transformed.")


# Author data structure transfer
def auDataTransform(au_dataset, num):
    def transform(au_records, num):
        new_list = []  # New list to store transformed data

        for au_data in au_records:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for progress
                print(str(num[0]) + " author records have been transformed.\n")

            if au_data['middle_name'] is not None:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']

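            # The branch above could be collapsed with filter(None, ...), which
            # drops the missing middle name (illustrative one-liner only):
            #
            #   raw_name = ' '.join(filter(None, (au_data['first_name'],
            #                                     au_data['middle_name'],
            #                                     au_data['last_name'])))
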
            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'][0],
                # first/last name fields are swapped relative to the input record
                "first_name": au_data['last_name'],
                "last_name": au_data['first_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            }

            new_list.append(au_data_transform)
            num[0] += 1  # Update the counter

        return new_list

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].append(au_list)

    # Store into the new file
    filepaths = [
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2010-2014).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2015-2020).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(newest).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(au_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the author data structures have been transformed.")


# ========== Main code ========== #
# New lists for storing data
ar_dataset = []
au_dataset = []

ar_dataset_new = [[] for _ in range(4)]  # New list for transformed article data
au_dataset_new = [[] for _ in range(4)]  # New list for transformed author data

num1 = [0]  # Counter for completed ar_data
num2 = [0]  # Counter for completed au_data

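# num1/num2 are single-element lists rather than plain ints so the transform
# functions can update the shared counter in place (an int argument would be
# rebound locally instead).
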
os.makedirs('./EJDE_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./EJDE_buffer_transform/Author_output/', exist_ok=True)

# Read the data
ar_dataset = fileReader('./EJDE_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./EJDE_buffer/Author_output', au_dataset)

# Change the structure
arDataTransform('./EJDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
@@ -1,206 +0,0 @@
import json
import os
import unicodedata

from collections import OrderedDict
from pprint import pprint

'''
========== FileStructureTransfer ==========
1. This program restructures the scraped data.
2. The final data are stored in four json files according to each paper's year of publication:
    (1) newest: published after 2020
    (2) oldest: published before 2010
    (3) 2010-2014: published from 2010 to 2014
    (4) 2015-2020: published from 2015 to 2020
3. Because the total amount of data from some sites is too large, the data are split into several parts.
4. The program runs in the following order:
    (1) fileReader() reads the locally scraped data into a list for processing
    (2) arDataTransform() converts the article data format
    (3) auDataTransform() converts the author data format
    (4) the converted data are stored in the output folder
'''


# Read the data
def fileReader(folder, dataset):
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            Data = json.load(json_file)
            dataset.append(Data)

    return dataset


# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    def auInfoFind(path, file_name, ar_data, num):
        authors = ar_data.get('authors')
        # Note: 'authors' aliases ar_data['authors'], so the corresponding
        # author appended here is also seen by the name-reversal loop below
        authors.append(ar_data.get('corresponding_authors'))

        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            Data = json.load(file)

        au_ID = []  # A new list to store author_id

        # Find the author_id
        for author in authors:
            if author is None:  # skip a missing corresponding author
                continue

            author = author.replace(" ", "")

            for Dict in Data:
                Dict_name = Dict.get('first_name') + "," + Dict.get('last_name')
                Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name)
                                    if unicodedata.category(char) != 'Mn')

                if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))

        # Reverse the comma-separated name parts (e.g. 'A, B' -> 'B, A')
        author_names_new = []
        author_names = ar_data['authors']

        for author_name in author_names:
            author_name_new = ''
            author_name = author_name.split(", ")

            for i in range(len(author_name) - 1, -1, -1):
                author_name_new += author_name[i]
                if i != 0:
                    author_name_new += ', '

            author_names_new.append(author_name_new)

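        # The reversal loop above is equivalent to this list comprehension
        # (illustrative sketch only):
        #
        #   author_names_new = [', '.join(reversed(name.split(', ')))
        #                       for name in author_names]
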
        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": author_names_new,
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }

        num[0] += 1  # Update the counter
        return ar_data_transform

    # ====== Main code for function =====
    ar_names = os.listdir(au_folder)  # Read the folder

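    # As in the EJDE script, os.listdir() gives no ordering guarantee; the
    # branches below assume the four author files arrive in sorted order, which
    # sorted(os.listdir(au_folder)) would make explicit.
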
    for ar_list in ar_dataset:
        for Dict in ar_list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for progress
                print(str(num[0]) + " article records have been transformed.")

            # The volume field is compared against calendar years here, i.e.
            # the scraped EJQTDE volume is assumed to hold the publication year
            if int(Dict.get('volume')) <= 2009:
                Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
                ar_dataset_new[3].append(Dict)

            elif 2010 <= int(Dict.get('volume')) <= 2014:
                Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
                ar_dataset_new[0].append(Dict)

            elif 2015 <= int(Dict.get('volume')) <= 2020:
                Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
                ar_dataset_new[1].append(Dict)

            else:
                Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
                ar_dataset_new[2].append(Dict)

    # Store into the new file
    filepaths = [
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2010-2014).json",
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(2015-2020).json",
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(newest).json",
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the article data structures have been transformed.")


# Author data structure transfer
def auDataTransform(au_dataset, num):
    def transform(au_records, num):
        new_list = []  # New list to store transformed data

        for au_data in au_records:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for progress
                print(str(num[0]) + " author records have been transformed.\n")

            if au_data['middle_name'] is not None:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']

            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'][0],
                # first/last name fields are swapped relative to the input record
                "first_name": au_data['last_name'],
                "last_name": au_data['first_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            }

            new_list.append(au_data_transform)
            num[0] += 1  # Update the counter

        return new_list

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].append(au_list)

    # Store into the new file
    filepaths = [
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2010-2014).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(2015-2020).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(newest).json",
        "./EJQTDE_buffer_transform/Author_output/EJQTDE_Author_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(au_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the author data structures have been transformed.")


# ========== Main code ========== #
# New lists for storing data
ar_dataset = []
au_dataset = []

ar_dataset_new = [[] for _ in range(4)]  # New list for transformed article data
au_dataset_new = [[] for _ in range(4)]  # New list for transformed author data

num1 = [0]  # Counter for completed ar_data
num2 = [0]  # Counter for completed au_data

os.makedirs('./EJQTDE_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./EJQTDE_buffer_transform/Author_output/', exist_ok=True)

# Read the data
ar_dataset = fileReader('./EJQTDE_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)

# Change the structure
arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)