Compare commits: 24aa62c8db ... ad427c24dc

10 Commits

| SHA1 |
|---|
| ad427c24dc |
| 61ef0081d8 |
| ad63bcf6c4 |
| 50e30e105b |
| 7f9ab94adc |
| 35ea1dd424 |
| ad6ba8832a |
| dd0c4379da |
| c3c460a4dc |
| 4c2c68feca |
BIN  Data/Transform/EJDE_buffer_transform.zip  Normal file (binary file not shown)
BIN  Data/Transform/SpringerOpen_buffer_transform.zip  Normal file (binary file not shown)
199  DataTransformer/FileStructureTansfer(EJDE).py  Normal file
@@ -0,0 +1,199 @@
import json
import os
import unicodedata

from collections import OrderedDict
from pprint import pprint

'''
========== FileStructureTransfer ==========
1. This program restructures the scraped data.
2. Based on the publication year, the final data are stored in four json files:
   (1) newest: published after 2020
   (2) oldest: published before 2010
   (3) 2010-2014: published between 2010 and 2014
   (4) 2015-2020: published between 2015 and 2020
3. Because the total amount of data from some sites is too large, it is split into several parts.
4. Execution order:
   (1) fileReader() reads the locally scraped data into a list for processing
   (2) arDataTransform() converts the article data format
   (3) auDataTransform() converts the author data format
   (4) The converted data are written to the output folder
'''


# Read the data
def fileReader(folder, dataset):
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            Data = json.load(json_file)
        dataset.append(Data)

    return dataset


# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    def auInfoFind(path, file_name, ar_data, num):
        authors = ar_data.get('authors')
        authors.append(ar_data.get('corresponding_authors'))

        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            Data = json.load(file)

        au_ID = []  # A new list to store author_id

        # Find the author_id
        for author in authors:
            if author is None:
                continue

            author = author.replace(" ", "")

            for Dict in Data:
                Dict_name = Dict.get('first_name') + "," + Dict.get('last_name')
                Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
                                    unicodedata.category(char) != 'Mn')

                if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))

        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": ar_data['authors'],
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }

        num[0] += 1  # Update the counter
        return ar_data_transform

    # ====== Main code for function =====
    ar_names = os.listdir(au_folder)  # Read the folder

    for ar_list in ar_dataset:
        for Dict in ar_list:
            year = Dict.get('publish_datetime')
            if year is None:
                continue

            year = year.split('-')

            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of article data structure have been transformed.")

            if int(year[0]) <= 2009:
                Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
                ar_dataset_new[3].append(Dict)

            elif 2010 <= int(year[0]) <= 2014:
                Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
                ar_dataset_new[0].append(Dict)

            elif 2015 <= int(year[0]) <= 2020:
                Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
                ar_dataset_new[1].append(Dict)

            else:
                Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
                ar_dataset_new[2].append(Dict)

    # Store into the new file
    filepaths = [
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2010-2014).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2015-2020).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(newest).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the article data structure have been transformed.")


# Author data structure transfer
def auDataTransform(au_dataset, num):
    def transform(list, num):
        new_list = []  # New list to store transformed data

        for au_data in list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of author data structure have been transformed.\n")

            if au_data['middle_name'] is not None:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']

            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'][0],
                "first_name": au_data['last_name'],
                "last_name": au_data['first_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            }

            new_list.append(au_data_transform)
            num[0] += 1  # Update the counter

        return new_list

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].append(au_list)

    # Store into the new file
    filepaths = [
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2010-2014).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2015-2020).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(newest).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(au_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the author data structure have been transformed.")


# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []

ar_dataset_new = [[] for _ in range(4)]  # New list for transformed data
au_dataset_new = [[] for _ in range(4)]  # New list to store transformed data

num1 = [0]  # Counter for complete ar_data
num2 = [0]  # Counter for complete au_data

os.makedirs('./EJDE_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./EJDE_buffer_transform/Author_output/', exist_ok=True)

# Read the data
ar_dataset = fileReader('./EJDE_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./EJDE_buffer/Author_output', au_dataset)

# Change the structure
arDataTransform('./EJDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
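The script above routes each article into one of four buckets from the year prefix of publish_datetime. A minimal sketch of that bucketing rule, with a hypothetical helper name (year_bucket) and labels taken from the output file names; it is an illustration, not part of the committed script:

def year_bucket(publish_datetime):
    # Map a 'YYYY-MM-DD' string to the same four groups used above.
    if publish_datetime is None:
        return None
    year = int(publish_datetime.split('-')[0])
    if year <= 2009:
        return "oldest"
    elif year <= 2014:
        return "2010-2014"
    elif year <= 2020:
        return "2015-2020"
    return "newest"

print(year_bucket("2016-03-01"))  # -> "2015-2020"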
@@ -5,6 +5,22 @@ import unicodedata
 from collections import OrderedDict
 from pprint import pprint
 
+'''
+========== FileStructureTransfer ==========
+1. This program restructures the scraped data.
+2. Based on the publication year, the final data are stored in four json files:
+   (1) newest: published after 2020
+   (2) oldest: published before 2010
+   (3) 2010-2014: published between 2010 and 2014
+   (4) 2015-2020: published between 2015 and 2020
+3. Because the total amount of data from some sites is too large, it is split into several parts.
+4. Execution order:
+   (1) fileReader() reads the locally scraped data into a list for processing
+   (2) arDataTransform() converts the article data format
+   (3) auDataTransform() converts the author data format
+   (4) The converted data are written to the output folder
+'''
+
 
 # Read the data
 def fileReader(folder, dataset):
@@ -42,12 +58,28 @@ def arDataTransform(au_folder, ar_dataset, num):
                 if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
                     au_ID.append(Dict.get('author_id'))
 
+        author_names_new = []
+        author_names = ar_data['authors']
+
+        for author_name in author_names:
+            author_name_new = ''
+            author_name = author_name.split(", ")
+
+            for i in range(len(author_name)-1, 0, -1):
+                # print(author_name[i])
+                author_name_new += author_name[i]
+                if i != 0:
+                    author_name_new += ', '
+
+            print(author_name_new)
+            author_names_new.append(author_name_new)
+
         # Change the structure
         ar_data_transform = {
             "article_id": ar_data['article_id'],
             "title": ar_data['title'],
             "authors": au_ID,
-            "authors_name": ar_data['authors'],
+            "authors_name": author_names_new,
             "submit_datetime": ar_data['submit_datetime'],
             "publish_datetime": ar_data['publish_datetime'],
             "keywords": ar_data['keywords'],
@@ -96,13 +128,6 @@ def arDataTransform(au_folder, ar_dataset, num):
         "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
     ]
 
-    # for filepath in filepaths:
-    #     for list in ar_dataset_new:
-    #         with open(filepath, "w", encoding='utf-8') as json_file:
-    #             json.dump(list, json_file, indent=4)
-    #
-    #     break
-
     for i in range(4):
         with open(filepaths[i], 'w', encoding='utf-8') as json_file:
             json.dump(ar_dataset_new[i], json_file, indent=4)
@@ -127,8 +152,8 @@ def auDataTransform(au_dataset, num):
             au_data_transform = {
                 "author_id": au_data['author_id'],
                 "from_article": au_data['from_article'][0],
-                "first_name": au_data['first_name'],
-                "last_name": au_data['last_name'],
+                "first_name": au_data['last_name'],
+                "last_name": au_data['first_name'],
                 "middle_name": au_data['middle_name'],
                 "raw_name": raw_name,
                 "affiliation": au_data['affiliation']
@@ -139,13 +164,6 @@ def auDataTransform(au_dataset, num):
 
         return new_list
 
-    # # Transform the author data structure
-    # au_dataset_new = []  # New list to store transformed data
-
-    # for au_list in au_dataset:
-    #     au_list_new = transform(au_list, num)
-    #     au_dataset_new.append(au_list_new)
-
     for i in range(4):
         au_list = transform(au_dataset[i], num)
         au_dataset_new[i].append(au_list)
@@ -185,4 +203,4 @@ au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)
 
 # Change the structure
 arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1)
-# auDataTransform(au_dataset, num2)
+auDataTransform(au_dataset, num2)
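The author_names_new loop added in the hunk above rebuilds each "Last, First" style name by walking its comma-separated parts in reverse. A minimal sketch of that idea (illustrative only; reorder_name is a hypothetical helper, and unlike the committed loop it also keeps the first part and avoids a trailing separator):

def reorder_name(name):
    # "Doe, Jane" -> "Jane, Doe"; single-part names pass through unchanged.
    parts = name.split(", ")
    return ", ".join(reversed(parts))

print(reorder_name("Doe, Jane"))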
189  DataTransformer/FileStructureTansfer(SprigerOpen).py  Normal file
@@ -0,0 +1,189 @@
import json
import os
import unicodedata

from collections import OrderedDict
from pprint import pprint

'''
========== FileStructureTransfer ==========
1. This program restructures the scraped data.
2. Based on the publication year, the final data are stored in four json files:
   (1) newest: published after 2020
   (2) oldest: published before 2010
   (3) 2010-2014: published between 2010 and 2014
   (4) 2015-2020: published between 2015 and 2020
3. Because the total amount of data from some sites is too large, it is split into several parts.
4. Execution order:
   (1) fileReader() reads the locally scraped data into a list for processing
   (2) arDataTransform() converts the article data format
   (3) auDataTransform() converts the author data format
   (4) The converted data are written to the output folder
'''


# Read the data
def fileReader(folder, dataset):
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            Data = json.load(json_file)
        dataset.append(Data)

    return dataset


# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    def auInfoFind(path, file_name, ar_data, num):
        authors = ar_data.get('authors')
        authors.extend(ar_data.get('corresponding_authors'))

        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            Data = json.load(file)

        au_ID = []  # A new list to store author_id

        # Find the author_id
        for author in authors:

            for Dict in Data:
                Dict_name = Dict.get('first_name') + ' ' + Dict.get('last_name')
                Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
                                    unicodedata.category(char) != 'Mn')

                if Dict.get('from_article') == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))

        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": authors,
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }

        num[0] += 1  # Update the counter
        return ar_data_transform

    # ====== Main code for function =====
    ar_names = os.listdir(au_folder)  # Read the folder

    for ar_list in ar_dataset:
        for Dict in ar_list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of article data structure have been transformed.")

            if int(Dict.get('volume')) <= 2009:
                Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
                ar_dataset_new[3].append(Dict)

            elif 2010 <= int(Dict.get('volume')) <= 2014:
                Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
                ar_dataset_new[0].append(Dict)

            elif 2015 <= int(Dict.get('volume')) <= 2020:
                Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
                ar_dataset_new[1].append(Dict)

            else:
                Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
                ar_dataset_new[2].append(Dict)

    # Store into the new file
    filepaths = [
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2010-2014).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2015-2020).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(newest).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the article data structure have been transformed.")


# Author data structure transfer
def auDataTransform(au_dataset, num):
    def transform(list, num):
        new_list = []  # New list to store transformed data

        for au_data in list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of author data structure have been transformed.\n")

            if au_data['middle_name'] is not None:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']

            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'],
                "first_name": au_data['first_name'],
                "last_name": au_data['last_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            }

            new_list.append(au_data_transform)
            num[0] += 1  # Update the counter

        return new_list

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].extend(au_list)

    # Store into the new file
    filepaths = [
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2010-2014).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2015-2020).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(newest).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(au_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the author data structure have been transformed.")


# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []

ar_dataset_new = [[] for _ in range(4)]  # New list for transformed data
au_dataset_new = [[] for _ in range(4)]  # New list to store transformed data

num1 = [0]  # Counter for complete ar_data
num2 = [0]  # Counter for complete au_data

os.makedirs('./SpringerOpen_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./SpringerOpen_buffer_transform/Author_output/', exist_ok=True)

# Read the data
ar_dataset = fileReader('./SpringerOpen_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./SpringerOpen_buffer/Author_output', au_dataset)

# Change the structure
# arDataTransform('./SpringerOpen_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
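Both transformers match scraped author names against the stored author records after stripping diacritics: the name is decomposed with NFKD and the combining marks (Unicode category 'Mn') are dropped. A small self-contained sketch of that normalization step:

import unicodedata

def strip_accents(text):
    # Decompose accented characters, then drop the combining marks.
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(char for char in decomposed if unicodedata.category(char) != 'Mn')

print(strip_accents("Rene Hernández"))  # -> "Rene Hernandez"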
@@ -26,11 +26,12 @@ payload = {
 jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
 
 # Aminer API
-api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
-api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
+api_paper_detail = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_author_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"
 
 
-def aminer_get_id(title):
+def aminer_get_paper_id(title):
     headers = {
         "Authorization": f"Bearer {jwt_token}"
     }
@@ -39,7 +40,7 @@ def aminer_get_id(title):
         "size": "",
         "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
     }
-    response = requests.get(api_get_id, headers=headers, params=params)
+    response = requests.get(api_paper_id, headers=headers, params=params)
 
     if response.status_code == 200:
         data = response.json()
@@ -49,7 +50,7 @@ def aminer_get_id(title):
         not_on_aminer.append(title)
 
 
-def aminer_post_citation(aminer_id):
+def aminer_post_paper_citation(aminer_id):
     headers = {
         "Content-Type": "application/json;charset=utf-8",
         "Authorization": f"Bearer {jwt_token}"
@@ -57,7 +58,7 @@ def aminer_post_citation(aminer_id):
     request_data = {
         "ids": aminer_id
     }
-    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
 
     if response.status_code == 200:
         data = response.json()
@@ -72,6 +73,31 @@ def aminer_post_citation(aminer_id):
         aminer_paper_citation_retry.append(aminer_id)
 
 
+def aminer_author_info(author_aminer_id, author_name, offset):
+    headers = {
+        "Content-Type": "application/json;charset=utf-8",
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    request_data = {
+        "ids": author_aminer_id,
+        "query": author_name,
+        "offset": offset
+    }
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success"):
+            for item in data.get('data', []):
+                if 'n_citation' in item:
+                    n_citation = item['n_citation']
+                else:
+                    n_citation = 0
+                aminer_paper_citation.append(n_citation)
+        else:
+            aminer_paper_citation_retry.append(author_aminer_id)
+
+
 def scholarly_get_citation(title):
     # # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
     pg = ProxyGenerator()
@@ -92,8 +118,7 @@ aminer_paper_citation = []
 aminer_paper_citation_retry = []
 
 # scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
-aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
-
+aminer_get_paper_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
 if aminer_paper_id:
-    aminer_post_citation(aminer_paper_id)
+    aminer_post_paper_citation(aminer_paper_id)
     print(aminer_paper_citation)
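The citation script signs a payload with PyJWT and sends it as a Bearer token on every Aminer request. A minimal sketch of that pattern, assuming PyJWT and requests are installed; the secret, claim fields, and query values below are placeholders, not the project's real credentials:

import time

import jwt  # PyJWT
import requests

secret_key = "YOUR_SECRET"                   # placeholder secret
payload = {"exp": int(time.time()) + 3600}   # placeholder claims
head = {"alg": "HS256", "typ": "JWT"}

jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
headers = {"Authorization": f"Bearer {jwt_token}"}

# Example query against the paper-id endpoint named above.
api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
response = requests.get(api_paper_id, headers=headers, params={"page": "", "size": "", "title": "example title"})
print(response.status_code)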
@@ -1,7 +1,8 @@
+import re
 import time
 import uuid
 import requests
-import re
+import threading
 import ejde_save
 
 from retrying import retry
@@ -13,8 +14,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Crawled site: 'ejde.math.txstate.edu'
 
-Total number of papers: 2023/08/08 - 4761
-Total Time via VPN w/100ms-delay: 306.73s
+Total number of papers: 2023/08/08 - 4785
+Total Time via VPN w/100ms-delay: 96.30s
 
 ========== Execution order ==========
 1. ejde_main gets the journal links for each year -> scrapes each article's information and author information -> calls ejde_save -> temporarily stores the data in small json files
@@ -23,6 +24,22 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 
 
+def append_data_thread_safe(from_list, to_list, data_lock):
+    with data_lock:
+        to_list.append(from_list)
+
+
+def save_data_thread_safe(data, data_lock, data_type):
+    global articleNum, authorNum
+    with data_lock:
+        ejde_save.save_data(data, f"{data_type}_TS", str(uuid.uuid4()) + ".json")
+        if data_type == "Article":
+            articleNum += len(data)
+        else:
+            authorNum += len(data)
+        data.clear()
+
+
 def datetime_transform(date):
     month_typo = {
         "Janaury": "January",
@@ -120,7 +137,7 @@ def process_html_article(baseweb, article):
     # Get article title & url
     try:
         title = article.text.strip()
-        title = re.sub(r'\s+', ' ', title).strip()
+        title = str(re.sub(r'\s+', ' ', title).strip())
         article_url = baseweb + article.find_next("a")["href"]
         if "../../index.html" in article_url:
             print("Redundant URL:", article_url)
@@ -148,7 +165,6 @@ def process_html_article(baseweb, article):
 
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(title, article_url):
-    global articleNum, authorNum
     headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
     article_response = requests.get(article_url, headers=headers)
@@ -162,11 +178,11 @@ def process_article(title, article_url):
     # Extract title if title == None
     if not title:
         title_match = re.search(r"<h3>(.*?)<p>", article_text)
-        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""
 
     # Extract issue
     issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
+    issue = issue_match.group(1) if issue_match else ""
 
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+)', article_text)
@@ -189,21 +205,23 @@ def process_article(title, article_url):
         volume = str(volume)
         issue = "Conference " + str(issue_number)
     else:
-        volume = None
+        volume = ""
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
-    pp = pp_match.group(1) if pp_match else None
+    pp = pp_match.group(1) if pp_match else ""
 
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    submitted_date = match.group(1) if match else None
+    if not match:
+        match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
+    submitted_date = match.group(1) if match else ""
     if submitted_date:
         submitted_date = datetime_transform(submitted_date)
 
     # Extract publication date
     match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    publish_date = match.group(1) if match else None
+    publish_date = match.group(1) if match else ""
     if publish_date:
         publish_date = datetime_transform(publish_date)
 
@@ -234,25 +252,25 @@ def process_article(title, article_url):
     doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
-    doi = doi_match.group(1) if doi_match else None
-    doi = doi.replace('https://doi.org/', '')  # strip doi website header
+    doi = doi_match.group(1) if doi_match else ""
 
     # Article_id
     article_id = str(uuid.uuid4())
 
     # Author info
     authors = []
+    author_names = []
     table = article_soup.find('table')
    if table:
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
-                if "email:" in cell:
-                    cell = cell.split("email:")
+                if "email" in cell:
+                    cell = cell.split("email")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
-                elif "e-mail:" in cell:
-                    cell = cell.split("e-mail:")
+                elif "e-mail" in cell:
+                    cell = cell.split("e-mail")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
                 else:
@@ -264,8 +282,11 @@ def process_article(title, article_url):
 
             # Data processing
             if cell[0]:
-                authors.append(unidecode(cell[0]))
-                name = re.split(r'[ .]', cell[0])
+                author_id = str(uuid.uuid4())
+                authors.append(author_id)
+                author_names.append(unidecode(cell[0]))
+                name = re.split(r'\s+', cell[0])
+                name = [item for item in name if item != '']
                 affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
                 affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                 affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -276,19 +297,21 @@ def process_article(title, article_url):
                 emails.append(unidecode(email_match.group())) if email_match else None
 
                 author_data = {
-                    "author_id": str(uuid.uuid4()),
-                    "from_article": [article_id],
+                    "author_id": author_id,
+                    "from_article": article_id,
                     "first_name": unidecode(name[0]),
                     "last_name": unidecode(name[-1]),
-                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
-                    "affiliation": [{
+                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
+                    "raw_name": unidecode(cell[0]),
+                    "affiliation": [
+                        {
                         "year": volume,
                         "affiliation": unidecode(affiliation),
-                        "email": emails
-                    }]
+                            "email": ", ".join(emails)
+                        }
+                    ]
                 }
-                authorData.append(author_data)
-                authorNum += 1
+                append_data_thread_safe(author_data, authorData, authorDataLock)
     # If no author table
     else:
         match_type = 0
@@ -308,12 +331,12 @@ def process_article(title, article_url):
             matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
             matches = matches.split("<p>")
             for match in matches:
-                if "email:" in match:
-                    match = match.split("email:")
+                if "email" in match:
+                    match = match.split("email")
                     email_list = str(match[1]).split(',')
                     match = match[0]
-                elif "e-mail:" in match:
-                    match = match.split("e-mail:")
+                elif "e-mail" in match:
+                    match = match.split("e-mail")
                     email_list = str(match[1]).split(',')
                     match = match[0]
                 else:
@@ -330,8 +353,11 @@ def process_article(title, article_url):
 
                 # Data processing
                 if match[0]:
+                    author_id = str(uuid.uuid4())
+                    authors.append(author_id)
                     authors.append(unidecode(match[0]))
-                    name = re.split(r'[ .]', match[0])
+                    name = re.split(r'\s+', match[0])
+                    name = [item for item in name if item != '']
                     affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
                     affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                     affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -342,19 +368,21 @@ def process_article(title, article_url):
                     emails.append(unidecode(email_match.group())) if email_match else None
 
                     author_data = {
-                        "author_id": str(uuid.uuid4()),
-                        "from_article": [article_id],
+                        "author_id": author_id,
+                        "from_article": article_id,
                         "first_name": unidecode(name[0]),
                         "last_name": unidecode(name[-1]),
-                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
-                        "affiliation": [{
+                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
+                        "raw_name": unidecode(match[0]),
+                        "affiliation": [
+                            {
                             "year": volume,
                             "affiliation": unidecode(affiliation),
-                            "email": emails
-                        }]
+                                "email": ", ".join(emails)
+                            }
+                        ]
                     }
-                    authorData.append(author_data)
-                    authorNum += 1
+                    append_data_thread_safe(author_data, authorData, authorDataLock)
         else:
             print("AUTHOR SEARCHING ERROR:", article_url)
             fail = {
@@ -368,7 +396,7 @@ def process_article(title, article_url):
         "article_id": article_id,
         "title": unidecode(title),
         "authors": authors,
-        "corresponding_authors": None,
+        "author_names": author_names,
         "submit_datetime": submitted_date,
         "publish_datetime": publish_date,
         "keywords": keywords,
@@ -381,17 +409,14 @@ def process_article(title, article_url):
         "issue": issue,
         "page": pp
     }
-    articleData.append(article_data)
-    articleNum += 1
+    append_data_thread_safe(article_data, articleData, articleDataLock)
 
     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleData.clear()
+        save_data_thread_safe(articleData, articleDataLock, "Article")
 
     if len(authorData) % batch_size == 0:
-        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorData.clear()
+        save_data_thread_safe(authorData, authorDataLock, "Author")
 
 
 start_time = time.time()
@@ -429,6 +454,8 @@ authorNum = 0
 articleNum = 0
 
 batch_size = 100  # Number of articles to process before saving
+authorDataLock = threading.Lock()
+articleDataLock = threading.Lock()
 executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))  # Set the number of worker threads
 
 # Process each URL using multithreading
@@ -442,11 +469,12 @@ for future in as_completed(futures):
         print("VOLUME PROCESSING ERROR:", str(vol_err))
 
 # Retry failed processing paper
+if len(failedData):
     print("START RETRYING:", len(failedData))
 while failedData:
-    data = failedData.pop(0)
-    articleTitle = data["title"]
-    articleUrl = data["URL"]
+    fail_data = failedData.pop(0)
+    articleTitle = fail_data["title"]
+    articleUrl = fail_data["URL"]
     try:
         process_article(articleTitle, articleUrl)
     except Exception as retry_err:
@@ -459,11 +487,11 @@ while failedData:
 
 # Save remaining data
 if len(articleData) > 0:
-    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+    save_data_thread_safe(articleData, articleDataLock, "Article")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
 
 if len(authorData) > 0:
-    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+    save_data_thread_safe(authorData, authorDataLock, "Author")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
 
 # Save error record
@@ -485,5 +513,5 @@ print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
-ejde_save.Transf()
-ejde_save.delete()
+ejde_save.transform_data()
+ejde_save.delete_data()
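The new append_data_thread_safe / save_data_thread_safe helpers serialize all access to the shared buffers behind a threading.Lock, so worker threads never append to and flush the same batch at the same time. A stripped-down sketch of the same pattern (the flush step here simply writes a uuid-named JSON file; ejde_save itself is not imported):

import json
import threading
import uuid

buffer = []
buffer_lock = threading.Lock()

def append_thread_safe(item, shared_list, lock):
    # Only one thread may touch the shared list at a time.
    with lock:
        shared_list.append(item)

def flush_thread_safe(shared_list, lock, prefix):
    # Write the current batch to its own file, then clear it under the same lock.
    with lock:
        with open(prefix + "_" + str(uuid.uuid4()) + ".json", "w", encoding="utf-8") as fh:
            json.dump(shared_list, fh, indent=4)
        shared_list.clear()

append_thread_safe({"title": "example"}, buffer, buffer_lock)
flush_thread_safe(buffer, buffer_lock, "Article_TS")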
@@ -14,8 +14,8 @@ def save_data(dataset, filetype, filename):
 
 
 # Write into output files
-def Transf():
-    def Read(folder_path, output_files):
+def transform_data():
+    def read(folder_path, output_files):
         # Create new folders
         os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
         os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
@@ -24,6 +24,8 @@ def Transf():
         data_2010_2014 = []
         data_2015_2020 = []
         data_newest = []
+        data_no_date = []
+        data_integrate = []
 
         for filename in os.listdir(folder_path):
             if filename.endswith('.json'):
@@ -31,24 +33,29 @@ def Transf():
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)
                 for Dict in data:
-                    if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                    if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) != '':
                         # Select data
                         if int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
                             data_oldest.append(Dict)
                         elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014:
                             data_2010_2014.append(Dict)
                         elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
                             data_2015_2020.append(Dict)
                         else:
                             data_newest.append(Dict)
+                    else:
+                        data_no_date.append(Dict)
+
+        data_integrate.append(data_oldest)
+        data_integrate.append(data_2010_2014)
+        data_integrate.append(data_2015_2020)
+        data_integrate.append(data_newest)
+        data_integrate.append(data_no_date)
 
         # Transfer
-        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest, data_no_date, data_integrate]
 
-        for index in range(0, 4):
+        for index in range(0, 6):
             with open(output_files[index], 'w', encoding='utf-8') as file:
                 json.dump(Data[index], file, indent=4)
 
@@ -61,26 +68,30 @@ def Transf():
         './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
-        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
+        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(no date).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(integration).json'
     ]
 
     article_output_file = [
         './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
-        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
+        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(no date).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(integration).json'
     ]
 
     # Read and write into files
-    Read(author_folder_path, author_output_file)
-    Read(article_folder_path, article_output_file)
+    read(author_folder_path, author_output_file)
+    read(article_folder_path, article_output_file)
 
     # End
     print("\nData has been written into files.")
 
 
 # Delete files in temporary storage area
-def delete():
+def delete_data():
     folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
     for folder_path in folder_paths:
         file_names = os.listdir(folder_path)
@@ -89,5 +100,4 @@ def delete():
         if os.path.isfile(file_path):
             os.remove(file_path)
         os.rmdir(folder_path)
-
     print('\nAttention: The temporary storage files have been deleted!')
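transform_data() above merges the uuid-named chunk files from the temporary *_TS folders into a handful of per-period output files, and delete_data() then removes the chunks. A minimal sketch of that merge step, assuming a folder of JSON lists (folder and output names below are illustrative):

import json
import os

def merge_json_chunks(folder, output_path):
    # Concatenate every JSON list found in `folder` into a single output file.
    merged = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            with open(os.path.join(folder, filename), 'r', encoding='utf-8') as fh:
                merged.extend(json.load(fh))
    with open(output_path, 'w', encoding='utf-8') as fh:
        json.dump(merged, fh, indent=4)
    return len(merged)

# merge_json_chunks('./ejde_buffer/Article_TS', './ejde_buffer/Article_output/merged.json')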
@@ -67,7 +67,6 @@ with ThreadPoolExecutor(max_workers=25) as executor:
     futures = [executor.submit(extract_href, url) for url in url_list]
     for future in as_completed(futures):
         pass
-
     wait(futures)
 print('\nAll links have been got.\n')
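The link-collection step shown above submits one extract_href job per volume URL, drains as_completed, and then calls wait before moving on. A small self-contained sketch of that submit-and-collect pattern (fetch_links stands in for the real extract_href worker):

from concurrent.futures import ThreadPoolExecutor, as_completed, wait

def fetch_links(url):
    # Stand-in for extract_href; a real worker would request and parse the page.
    return [url + "/paper-1", url + "/paper-2"]

url_list = ["https://example.org/vol-2019", "https://example.org/vol-2020"]

with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(fetch_links, url) for url in url_list]
    for future in as_completed(futures):
        print(len(future.result()), "links collected")
    wait(futures)

print('\nAll links have been got.\n')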