Compare commits
No commits in common. "ad427c24dce6a8d74b9e2b1c03c79b8ffa890341" and "24aa62c8db13b93444777cf261b2681809b16675" have entirely different histories.
ad427c24dc
...
24aa62c8db
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,199 +0,0 @@
import json
import os
import unicodedata

from collections import OrderedDict
from pprint import pprint

'''
========== FileStructureTransfer ==========
1. This script restructures the collected data.
2. Based on each paper's publication year, the final data are stored in four json files:
    (1) newest: published after 2020
    (2) oldest: published before 2010
    (3) 2010-2014: published between 2010 and 2014
    (4) 2015-2020: published between 2015 and 2020
3. Because some sites' total data volume is too large, it is split into several parts.
4. The script runs in this order:
    (1) fileReader() reads the locally crawled data into a list for processing
    (2) arDataTransform() converts the article data format
    (3) auDataTransform() converts the author data format
    (4) the converted data are written to the output folder
'''


# Read the data
def fileReader(folder, dataset):
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            Data = json.load(json_file)
            dataset.append(Data)

    return dataset


# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    def auInfoFind(path, file_name, ar_data, num):
        authors = ar_data.get('authors')
        authors.append(ar_data.get('corresponding_authors'))

        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            Data = json.load(file)

        au_ID = []  # A new list to store author_id

        # Find the author_id
        for author in authors:
            if author is None:
                continue

            author = author.replace(" ", "")

            for Dict in Data:
                Dict_name = Dict.get('first_name') + "," + Dict.get('last_name')
                Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
                                    unicodedata.category(char) != 'Mn')

                if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))

        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": ar_data['authors'],
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }

        num[0] += 1  # Update the counter
        return ar_data_transform

    # ====== Main code for function =====
    ar_names = os.listdir(au_folder)  # Read the folder

    for ar_list in ar_dataset:
        for Dict in ar_list:
            year = Dict.get('publish_datetime')
            if year is None:
                continue

            year = year.split('-')

            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of article data structure have been transformed.")

            if int(year[0]) <= 2009:
                Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
                ar_dataset_new[3].append(Dict)

            elif 2010 <= int(year[0]) <= 2014:
                Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
                ar_dataset_new[0].append(Dict)

            elif 2015 <= int(year[0]) <= 2020:
                Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
                ar_dataset_new[1].append(Dict)

            else:
                Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
                ar_dataset_new[2].append(Dict)

    # Store into the new file
    filepaths = [
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2010-2014).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2015-2020).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(newest).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the article data structure have been transformed.")


# Author data structure transfer
def auDataTransform(au_dataset, num):
    def transform(list, num):
        new_list = []  # New list to store transformed data

        for au_data in list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of author data structure have been transformed.\n")

            if au_data['middle_name'] is not None:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']

            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'][0],
                "first_name": au_data['last_name'],
                "last_name": au_data['first_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            }

            new_list.append(au_data_transform)
            num[0] += 1  # Update the counter

        return new_list

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].append(au_list)

    # Store into the new file
    filepaths = [
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2010-2014).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2015-2020).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(newest).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(au_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the author data structure have been transformed.")


# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []

ar_dataset_new = [[] for _ in range(4)]  # New list for transformed data
au_dataset_new = [[] for _ in range(4)]  # New list to store transformed data

num1 = [0]  # Counter for completed ar_data
num2 = [0]  # Counter for completed au_data

os.makedirs('./EJDE_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./EJDE_buffer_transform/Author_output/', exist_ok=True)

# Read the data
ar_dataset = fileReader('./EJDE_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./EJDE_buffer/Author_output', au_dataset)

# Change the structure
arDataTransform('./EJDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
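Note: all three FileStructureTransfer scripts in this compare (EJDE above, SpringerOpen and EJQTDE below) apply the same year-bucketing rule described in the docstring. A minimal standalone sketch of that rule, assuming records carry a 'publish_datetime' string such as "2016-03-01"; the helper name bucket_by_year is illustrative and not part of either commit:

def bucket_by_year(records):
    # Partition records into the four output groups: oldest, 2010-2014, 2015-2020, newest.
    buckets = {"oldest": [], "2010-2014": [], "2015-2020": [], "newest": []}
    for record in records:
        date = record.get('publish_datetime')
        if date is None:
            continue
        year = int(date.split('-')[0])
        if year <= 2009:
            buckets["oldest"].append(record)
        elif year <= 2014:
            buckets["2010-2014"].append(record)
        elif year <= 2020:
            buckets["2015-2020"].append(record)
        else:
            buckets["newest"].append(record)
    return buckets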
@@ -1,189 +0,0 @@
import json
import os
import unicodedata

from collections import OrderedDict
from pprint import pprint

'''
========== FileStructureTransfer ==========
1. This script restructures the collected data.
2. Based on each paper's publication year, the final data are stored in four json files:
    (1) newest: published after 2020
    (2) oldest: published before 2010
    (3) 2010-2014: published between 2010 and 2014
    (4) 2015-2020: published between 2015 and 2020
3. Because some sites' total data volume is too large, it is split into several parts.
4. The script runs in this order:
    (1) fileReader() reads the locally crawled data into a list for processing
    (2) arDataTransform() converts the article data format
    (3) auDataTransform() converts the author data format
    (4) the converted data are written to the output folder
'''


# Read the data
def fileReader(folder, dataset):
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            Data = json.load(json_file)
            dataset.append(Data)

    return dataset


# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    def auInfoFind(path, file_name, ar_data, num):
        authors = ar_data.get('authors')
        authors.extend(ar_data.get('corresponding_authors'))

        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            Data = json.load(file)

        au_ID = []  # A new list to store author_id

        # Find the author_id
        for author in authors:

            for Dict in Data:
                Dict_name = Dict.get('first_name') + ' ' + Dict.get('last_name')
                Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
                                    unicodedata.category(char) != 'Mn')

                if Dict.get('from_article') == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))

        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": authors,
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }

        num[0] += 1  # Update the counter
        return ar_data_transform

    # ====== Main code for function =====
    ar_names = os.listdir(au_folder)  # Read the folder

    for ar_list in ar_dataset:
        for Dict in ar_list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of article data structure have been transformed.")

            if int(Dict.get('volume')) <= 2009:
                Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
                ar_dataset_new[3].append(Dict)

            elif 2010 <= int(Dict.get('volume')) <= 2014:
                Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
                ar_dataset_new[0].append(Dict)

            elif 2015 <= int(Dict.get('volume')) <= 2020:
                Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
                ar_dataset_new[1].append(Dict)

            else:
                Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
                ar_dataset_new[2].append(Dict)

    # Store into the new file
    filepaths = [
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2010-2014).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2015-2020).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(newest).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the article data structure have been transformed.")


# Author data structure transfer
def auDataTransform(au_dataset, num):
    def transform(list, num):
        new_list = []  # New list to store transformed data

        for au_data in list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of author data structure have been transformed.\n")

            if au_data['middle_name'] is not None:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']

            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'],
                "first_name": au_data['first_name'],
                "last_name": au_data['last_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            }

            new_list.append(au_data_transform)
            num[0] += 1  # Update the counter

        return new_list

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].extend(au_list)

    # Store into the new file
    filepaths = [
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2010-2014).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2015-2020).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(newest).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(au_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the author data structure have been transformed.")


# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []

ar_dataset_new = [[] for _ in range(4)]  # New list for transformed data
au_dataset_new = [[] for _ in range(4)]  # New list to store transformed data

num1 = [0]  # Counter for completed ar_data
num2 = [0]  # Counter for completed au_data

os.makedirs('./SpringerOpen_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./SpringerOpen_buffer_transform/Author_output/', exist_ok=True)

# Read the data
ar_dataset = fileReader('./SpringerOpen_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./SpringerOpen_buffer/Author_output', au_dataset)

# Change the structure
# arDataTransform('./SpringerOpen_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
@@ -5,22 +5,6 @@ import unicodedata
from collections import OrderedDict
from pprint import pprint

'''
========== FileStructureTransfer ==========
1. This script restructures the collected data.
2. Based on each paper's publication year, the final data are stored in four json files:
    (1) newest: published after 2020
    (2) oldest: published before 2010
    (3) 2010-2014: published between 2010 and 2014
    (4) 2015-2020: published between 2015 and 2020
3. Because some sites' total data volume is too large, it is split into several parts.
4. The script runs in this order:
    (1) fileReader() reads the locally crawled data into a list for processing
    (2) arDataTransform() converts the article data format
    (3) auDataTransform() converts the author data format
    (4) the converted data are written to the output folder
'''


# Read the data
def fileReader(folder, dataset):
@@ -58,28 +42,12 @@ def arDataTransform(au_folder, ar_dataset, num):
                if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))

        author_names_new = []
        author_names = ar_data['authors']

        for author_name in author_names:
            author_name_new = ''
            author_name = author_name.split(", ")

            for i in range(len(author_name)-1, 0, -1):
                # print(author_name[i])
                author_name_new += author_name[i]
                if i != 0:
                    author_name_new += ', '

            print(author_name_new)
            author_names_new.append(author_name_new)

        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": author_names_new,
            "authors_name": ar_data['authors'],
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
@@ -128,6 +96,13 @@ def arDataTransform(au_folder, ar_dataset, num):
        "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
    ]

    # for filepath in filepaths:
    # for list in ar_dataset_new:
    # with open(filepath, "w", encoding='utf-8') as json_file:
    # json.dump(list, json_file, indent=4)
    #
    # break

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)
@@ -152,8 +127,8 @@ def auDataTransform(au_dataset, num):
            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'][0],
                "first_name": au_data['last_name'],
                "last_name": au_data['first_name'],
                "first_name": au_data['first_name'],
                "last_name": au_data['last_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
@@ -164,6 +139,13 @@ def auDataTransform(au_dataset, num):

        return new_list

    # # Transform the author data structure
    # au_dataset_new = []  # New list to store transformed data

    # for au_list in au_dataset:
    # au_list_new = transform(au_list, num)
    # au_dataset_new.append(au_list_new)

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].append(au_list)
@@ -203,4 +185,4 @@ au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)

# Change the structure
arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
# auDataTransform(au_dataset, num2)
@@ -26,12 +26,11 @@ payload = {
jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)

# Aminer API
api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
api_paper_detail = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
api_author_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"
api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"


def aminer_get_paper_id(title):
def aminer_get_id(title):
    headers = {
        "Authorization": f"Bearer {jwt_token}"
    }
@@ -40,7 +39,7 @@ def aminer_get_paper_id(title):
        "size": "",
        "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
    }
    response = requests.get(api_paper_id, headers=headers, params=params)
    response = requests.get(api_get_id, headers=headers, params=params)

    if response.status_code == 200:
        data = response.json()
@@ -50,7 +49,7 @@ def aminer_get_paper_id(title):
        not_on_aminer.append(title)


def aminer_post_paper_citation(aminer_id):
def aminer_post_citation(aminer_id):
    headers = {
        "Content-Type": "application/json;charset=utf-8",
        "Authorization": f"Bearer {jwt_token}"
@@ -58,7 +57,7 @@ def aminer_post_paper_citation(aminer_id):
    request_data = {
        "ids": aminer_id
    }
    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))

    if response.status_code == 200:
        data = response.json()
@@ -73,31 +72,6 @@ def aminer_post_paper_citation(aminer_id):
        aminer_paper_citation_retry.append(aminer_id)


def aminer_author_info(author_aminer_id, author_name, offset):
    headers = {
        "Content-Type": "application/json;charset=utf-8",
        "Authorization": f"Bearer {jwt_token}"
    }
    request_data = {
        "ids": author_aminer_id,
        "query": author_name,
        "offset": offset
    }
    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))

    if response.status_code == 200:
        data = response.json()
        if data.get("success"):
            for item in data.get('data', []):
                if 'n_citation' in item:
                    n_citation = item['n_citation']
                else:
                    n_citation = 0
                aminer_paper_citation.append(n_citation)
        else:
            aminer_paper_citation_retry.append(author_aminer_id)


def scholarly_get_citation(title):
    # # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
    pg = ProxyGenerator()
@@ -118,7 +92,8 @@ aminer_paper_citation = []
aminer_paper_citation_retry = []

# scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
aminer_get_paper_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")

aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
if aminer_paper_id:
    aminer_post_paper_citation(aminer_paper_id)
    aminer_post_citation(aminer_paper_id)
    print(aminer_paper_citation)

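Note: the citation script above follows a two-step pattern: encode a JWT with PyJWT, send it as a Bearer token to the paper-list endpoint, then POST the returned ids to the detail endpoint and read 'n_citation'. A minimal sketch of that pattern, assuming a prepared jwt_token and the endpoint URLs defined above; the 'id' key used to collect paper ids is an assumption, since that part of the response handling is not shown in this diff:

import json
import requests

def fetch_citations(title, jwt_token, list_url, detail_url):
    # Step 1: look up the paper by title with a Bearer JWT.
    headers = {"Authorization": f"Bearer {jwt_token}"}
    listing = requests.get(list_url, headers=headers, params={"title": title})
    if listing.status_code != 200:
        return None
    ids = [item["id"] for item in listing.json().get("data", []) if "id" in item]  # 'id' key assumed
    # Step 2: post the ids for details and read the citation counts.
    headers["Content-Type"] = "application/json;charset=utf-8"
    detail = requests.post(detail_url, headers=headers, data=json.dumps({"ids": ids}))
    if detail.status_code != 200:
        return None
    return [item.get("n_citation", 0) for item in detail.json().get("data", [])]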
@@ -1,8 +1,7 @@
import re
import time
import uuid
import requests
import threading
import re
import ejde_save

from retrying import retry
@@ -14,8 +13,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
'''
Target site: 'ejde.math.txstate.edu'

Total number of papers: 2023/08/08 - 4785
Total Time via VPN w/100ms-delay: 96.30s
Total number of papers: 2023/08/08 - 4761
Total Time via VPN w/100ms-delay: 306.73s

========== Run order ==========
1. ejde_main fetches the journal links for each year -> scrapes each paper's info and author info -> calls ejde_save -> stores it temporarily in small json files
@@ -24,22 +23,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
'''


def append_data_thread_safe(from_list, to_list, data_lock):
    with data_lock:
        to_list.append(from_list)


def save_data_thread_safe(data, data_lock, data_type):
    global articleNum, authorNum
    with data_lock:
        ejde_save.save_data(data, f"{data_type}_TS", str(uuid.uuid4()) + ".json")
        if data_type == "Article":
            articleNum += len(data)
        else:
            authorNum += len(data)
        data.clear()


def datetime_transform(date):
    month_typo = {
        "Janaury": "January",
@@ -137,7 +120,7 @@ def process_html_article(baseweb, article):
    # Get article title & url
    try:
        title = article.text.strip()
        title = str(re.sub(r'\s+', ' ', title).strip())
        title = re.sub(r'\s+', ' ', title).strip()
        article_url = baseweb + article.find_next("a")["href"]
        if "../../index.html" in article_url:
            print("Redundant URL:", article_url)
@@ -165,6 +148,7 @@ def process_html_article(baseweb, article):

@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
    global articleNum, authorNum
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    article_response = requests.get(article_url, headers=headers)
@@ -178,11 +162,11 @@ def process_article(title, article_url):
    # Extract title if title == None
    if not title:
        title_match = re.search(r"<h3>(.*?)<p>", article_text)
        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""
        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None

    # Extract issue
    issue_match = re.search(r'No\. (\d+)', article_text)
    issue = issue_match.group(1) if issue_match else ""
    issue = issue_match.group(1) if issue_match else None

    # Extract volume
    volume_match = re.search(r'Vol\. (\d+)', article_text)
@@ -205,23 +189,21 @@ def process_article(title, article_url):
        volume = str(volume)
        issue = "Conference " + str(issue_number)
    else:
        volume = ""
        volume = None

    # Extract pp
    pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
    pp = pp_match.group(1) if pp_match else ""
    pp = pp_match.group(1) if pp_match else None

    # Extract submission date
    match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
    if not match:
        match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
    submitted_date = match.group(1) if match else ""
    submitted_date = match.group(1) if match else None
    if submitted_date:
        submitted_date = datetime_transform(submitted_date)

    # Extract publication date
    match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
    publish_date = match.group(1) if match else ""
    publish_date = match.group(1) if match else None
    if publish_date:
        publish_date = datetime_transform(publish_date)

@@ -252,25 +234,25 @@ def process_article(title, article_url):
    doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
    if not doi_match:
        doi_match = re.search(r'DOI: (.+)', html)
    doi = doi_match.group(1) if doi_match else ""
    doi = doi_match.group(1) if doi_match else None
    doi = doi.replace('https://doi.org/', '')  # strip doi website header

    # Article_id
    article_id = str(uuid.uuid4())

    # Author info
    authors = []
    author_names = []
    table = article_soup.find('table')
    if table:
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
                if "email" in cell:
                    cell = cell.split("email")
                if "email:" in cell:
                    cell = cell.split("email:")
                    email_list = str(cell[1]).split(',')
                    cell = cell[0]
                elif "e-mail" in cell:
                    cell = cell.split("e-mail")
                elif "e-mail:" in cell:
                    cell = cell.split("e-mail:")
                    email_list = str(cell[1]).split(',')
                    cell = cell[0]
                else:
@@ -282,11 +264,8 @@ def process_article(title, article_url):

                # Data processing
                if cell[0]:
                    author_id = str(uuid.uuid4())
                    authors.append(author_id)
                    author_names.append(unidecode(cell[0]))
                    name = re.split(r'\s+', cell[0])
                    name = [item for item in name if item != '']
                    authors.append(unidecode(cell[0]))
                    name = re.split(r'[ .]', cell[0])
                    affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                    affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -297,21 +276,19 @@ def process_article(title, article_url):
                    emails.append(unidecode(email_match.group())) if email_match else None

                    author_data = {
                        "author_id": author_id,
                        "from_article": article_id,
                        "author_id": str(uuid.uuid4()),
                        "from_article": [article_id],
                        "first_name": unidecode(name[0]),
                        "last_name": unidecode(name[-1]),
                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
                        "raw_name": unidecode(cell[0]),
                        "affiliation": [
                            {
                                "year": volume,
                                "affiliation": unidecode(affiliation),
                                "email": ", ".join(emails)
                            }
                        ]
                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
                        "affiliation": [{
                            "year": volume,
                            "affiliation": unidecode(affiliation),
                            "email": emails
                        }]
                    }
                    append_data_thread_safe(author_data, authorData, authorDataLock)
                    authorData.append(author_data)
                    authorNum += 1
    # If no author table
    else:
        match_type = 0
@@ -331,12 +308,12 @@ def process_article(title, article_url):
            matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
            matches = matches.split("<p>")
            for match in matches:
                if "email" in match:
                    match = match.split("email")
                if "email:" in match:
                    match = match.split("email:")
                    email_list = str(match[1]).split(',')
                    match = match[0]
                elif "e-mail" in match:
                    match = match.split("e-mail")
                elif "e-mail:" in match:
                    match = match.split("e-mail:")
                    email_list = str(match[1]).split(',')
                    match = match[0]
                else:
@@ -353,11 +330,8 @@ def process_article(title, article_url):

                # Data processing
                if match[0]:
                    author_id = str(uuid.uuid4())
                    authors.append(author_id)
                    authors.append(unidecode(match[0]))
                    name = re.split(r'\s+', match[0])
                    name = [item for item in name if item != '']
                    name = re.split(r'[ .]', match[0])
                    affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
                    affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                    affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -368,21 +342,19 @@ def process_article(title, article_url):
                    emails.append(unidecode(email_match.group())) if email_match else None

                    author_data = {
                        "author_id": author_id,
                        "from_article": article_id,
                        "author_id": str(uuid.uuid4()),
                        "from_article": [article_id],
                        "first_name": unidecode(name[0]),
                        "last_name": unidecode(name[-1]),
                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
                        "raw_name": unidecode(match[0]),
                        "affiliation": [
                            {
                                "year": volume,
                                "affiliation": unidecode(affiliation),
                                "email": ", ".join(emails)
                            }
                        ]
                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
                        "affiliation": [{
                            "year": volume,
                            "affiliation": unidecode(affiliation),
                            "email": emails
                        }]
                    }
                    append_data_thread_safe(author_data, authorData, authorDataLock)
                    authorData.append(author_data)
                    authorNum += 1
        else:
            print("AUTHOR SEARCHING ERROR:", article_url)
            fail = {
@@ -396,7 +368,7 @@ def process_article(title, article_url):
        "article_id": article_id,
        "title": unidecode(title),
        "authors": authors,
        "author_names": author_names,
        "corresponding_authors": None,
        "submit_datetime": submitted_date,
        "publish_datetime": publish_date,
        "keywords": keywords,
@@ -409,14 +381,17 @@ def process_article(title, article_url):
        "issue": issue,
        "page": pp
    }
    append_data_thread_safe(article_data, articleData, articleDataLock)
    articleData.append(article_data)
    articleNum += 1

    # Save the data periodically based on batch size
    if len(articleData) % batch_size == 0:
        save_data_thread_safe(articleData, articleDataLock, "Article")
        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
        articleData.clear()

    if len(authorData) % batch_size == 0:
        save_data_thread_safe(authorData, authorDataLock, "Author")
        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
        authorData.clear()


start_time = time.time()
@@ -454,8 +429,6 @@ authorNum = 0
articleNum = 0

batch_size = 100  # Number of articles to process before saving
authorDataLock = threading.Lock()
articleDataLock = threading.Lock()
executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))  # Set the number of worker threads

# Process each URL using multithreading
@@ -469,30 +442,29 @@ for future in as_completed(futures):
        print("VOLUME PROCESSING ERROR:", str(vol_err))

# Retry failed processing paper
if len(failedData):
    print("START RETRYING:", len(failedData))
    while failedData:
        fail_data = failedData.pop(0)
        articleTitle = fail_data["title"]
        articleUrl = fail_data["URL"]
        try:
            process_article(articleTitle, articleUrl)
        except Exception as retry_err:
            print("ARTICLE RETRYING FAILURE:", str(retry_err))
            totally_fail = {
                "title": articleTitle,
                "URL": articleUrl
            }
            totallyFailedData.append(totally_fail)
print("START RETRYING:", len(failedData))
while failedData:
    data = failedData.pop(0)
    articleTitle = data["title"]
    articleUrl = data["URL"]
    try:
        process_article(articleTitle, articleUrl)
    except Exception as retry_err:
        print("ARTICLE RETRYING FAILURE:", str(retry_err))
        totally_fail = {
            "title": articleTitle,
            "URL": articleUrl
        }
        totallyFailedData.append(totally_fail)

# Save remaining data
if len(articleData) > 0:
    save_data_thread_safe(articleData, articleDataLock, "Article")
    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")

if len(authorData) > 0:
    save_data_thread_safe(authorData, authorDataLock, "Author")
    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")

# Save error record
if len(totallyFailedData) > 0:
@@ -513,5 +485,5 @@ print("Total fetched author:", authorNum)
print("time elapsed: {:.2f}s".format(time.time() - start_time))

# Transfer to large file and delete the temporary storage files
ejde_save.transform_data()
ejde_save.delete_data()
ejde_save.Transf()
ejde_save.delete()

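Note: in the diff above, the newer commit replaces the lock-guarded helpers (append_data_thread_safe / save_data_thread_safe) with plain list appends and direct ejde_save.save_data calls, even though process_article still runs in a thread pool. A minimal sketch of the lock-guarded batched flush that the removed helpers implemented, assuming a save callback with the same (data, subfolder, filename) shape as ejde_save.save_data; the names here are illustrative:

import threading
import uuid

buffer = []
buffer_lock = threading.Lock()
BATCH_SIZE = 100

def append_and_flush(record, save_func, subfolder):
    # Append under the lock and flush a full batch to its own json file.
    with buffer_lock:
        buffer.append(record)
        if len(buffer) >= BATCH_SIZE:
            save_func(list(buffer), subfolder, str(uuid.uuid4()) + ".json")
            buffer.clear()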
@@ -14,8 +14,8 @@ def save_data(dataset, filetype, filename):


# Write into output files
def transform_data():
    def read(folder_path, output_files):
def Transf():
    def Read(folder_path, output_files):
        # Create new folders
        os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
        os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
@@ -24,8 +24,6 @@ def transform_data():
        data_2010_2014 = []
        data_2015_2020 = []
        data_newest = []
        data_no_date = []
        data_integrate = []

        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
@@ -33,29 +31,24 @@ def transform_data():
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
                    for Dict in data:
                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) != '':
                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
                            # Select data
                            if int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
                                data_oldest.append(Dict)

                            elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014:
                                data_2010_2014.append(Dict)

                            elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
                                data_2015_2020.append(Dict)

                            else:
                                data_newest.append(Dict)
                        else:
                            data_no_date.append(Dict)

        data_integrate.append(data_oldest)
        data_integrate.append(data_2010_2014)
        data_integrate.append(data_2015_2020)
        data_integrate.append(data_newest)
        data_integrate.append(data_no_date)

        # Transfer
        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest, data_no_date, data_integrate]
        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]

        for index in range(0, 6):
        for index in range(0, 4):
            with open(output_files[index], 'w', encoding='utf-8') as file:
                json.dump(Data[index], file, indent=4)

@@ -68,30 +61,26 @@ def transform_data():
        './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
        './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
        './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json',
        './ejde_buffer/Author_output/EJDE_Author_output_file(no date).json',
        './ejde_buffer/Author_output/EJDE_Author_output_file(integration).json'
        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
    ]

    article_output_file = [
        './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
        './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
        './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json',
        './ejde_buffer/Article_output/EJDE_Article_output_file(no date).json',
        './ejde_buffer/Article_output/EJDE_Article_output_file(integration).json'
        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
    ]

    # Read and write into files
    read(author_folder_path, author_output_file)
    read(article_folder_path, article_output_file)
    Read(author_folder_path, author_output_file)
    Read(article_folder_path, article_output_file)

    # End
    print("\nData has been written into files.")


# Delete files in temporary storage area
def delete_data():
def delete():
    folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
    for folder_path in folder_paths:
        file_names = os.listdir(folder_path)
@@ -100,4 +89,5 @@ def delete_data():
            if os.path.isfile(file_path):
                os.remove(file_path)
        os.rmdir(folder_path)

    print('\nAttention: The temporary storage files have been deleted!')

@@ -67,6 +67,7 @@ with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(extract_href, url) for url in url_list]
    for future in as_completed(futures):
        pass

wait(futures)
print('\nAll links have been got.\n')

@@ -110,4 +111,4 @@ print(count2, ' author_data has been stored.')

# Transfer to large file and delete the temporary storage files
ejqtde_save.Transf()
ejqtde_save.delete()
ejqtde_save.delete()