Compare commits


No commits in common. "ad427c24dce6a8d74b9e2b1c03c79b8ffa890341" and "24aa62c8db13b93444777cf261b2681809b16675" have entirely different histories.

10 changed files with 114 additions and 582 deletions

Binary file not shown.

View File

@@ -1,199 +0,0 @@
import json
import os
import unicodedata
from collections import OrderedDict
from pprint import pprint
'''
========== FileStructureTransfer ==========
1. This program restructures the crawled data.
2. Based on each paper's publication year, the final data are stored in four JSON files:
(1) newest: published after 2020
(2) oldest: published before 2010
(3) 2010-2014: published from 2010 to 2014
(4) 2015-2020: published from 2015 to 2020
3. Because the total amount of data for some sites is too large, the output is split into multiple files.
4. The program runs in the following order:
(1) fileReader() reads the locally crawled data into a list for processing
(2) arDataTransform() transforms the article data structure
(3) auDataTransform() transforms the author data structure
(4) the transformed data are written to the output folder
'''
# Read the data
def fileReader(folder, dataset):
files = os.listdir(folder)
for file in files:
file_path = os.path.join(folder, file)
with open(file_path, 'r', encoding='utf-8') as json_file:
Data = json.load(json_file)
dataset.append(Data)
return dataset
# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
def auInfoFind(path, file_name, ar_data, num):
authors = ar_data.get('authors')
authors.append(ar_data.get('corresponding_authors'))
file_path = os.path.join(path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
Data = json.load(file)
au_ID = [] # A new list to store author_id
# Find the author_id
for author in authors:
if author is None:
continue
author = author.replace(" ", "")
for Dict in Data:
Dict_name = Dict.get('first_name') + "," + Dict.get('last_name')
Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
unicodedata.category(char) != 'Mn')
if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
au_ID.append(Dict.get('author_id'))
# Change the structure
ar_data_transform = {
"article_id": ar_data['article_id'],
"title": ar_data['title'],
"authors": au_ID,
"authors_name": ar_data['authors'],
"submit_datetime": ar_data['submit_datetime'],
"publish_datetime": ar_data['publish_datetime'],
"keywords": ar_data['keywords'],
"MSC": ar_data['MSC'],
"URL": ar_data['URL'],
"DOI": ar_data['DOI'],
"publisher": ar_data['publisher'],
"journal": ar_data['journal'],
"volume": ar_data['volume'],
"issue": ar_data['issue'],
"page": ar_data['page']
}
num[0] += 1 # Update the counter
return ar_data_transform
# ====== Main code for function =====
ar_names = os.listdir(au_folder) # Read the folder
for ar_list in ar_dataset:
for Dict in ar_list:
year = Dict.get('publish_datetime')
if year is None:
continue
year = year.split('-')
if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data
print(str(num[0]) + " copies of article data structure have been transformed.")
if int(year[0]) <= 2009:
Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
ar_dataset_new[3].append(Dict)
elif 2010 <= int(year[0]) <= 2014:
Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
ar_dataset_new[0].append(Dict)
elif 2015 <= int(year[0]) <= 2020:
Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
ar_dataset_new[1].append(Dict)
else:
Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
ar_dataset_new[2].append(Dict)
# Store into the new file
filepaths = [
"./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2010-2014).json",
"./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2015-2020).json",
"./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(newest).json",
"./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(oldest).json",
]
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(ar_dataset_new[i], json_file, indent=4)
print("\nComplete: All of the article data structure have been transformed.")
# Author data structure transfer
def auDataTransform(au_dataset, num):
def transform(list, num):
new_list = [] # New list to store transformed data
for au_data in list:
if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data
print(str(num[0]) + " copies of author data structure have been transformed.\n")
if au_data['middle_name'] is not None:
raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
else:
raw_name = au_data['first_name'] + ' ' + au_data['last_name']
au_data_transform = {
"author_id": au_data['author_id'],
"from_article": au_data['from_article'][0],
"first_name": au_data['last_name'],
"last_name": au_data['first_name'],
"middle_name": au_data['middle_name'],
"raw_name": raw_name,
"affiliation": au_data['affiliation']
}
new_list.append(au_data_transform)
num[0] += 1 # Update the counter
return new_list
for i in range(4):
au_list = transform(au_dataset[i], num)
au_dataset_new[i].append(au_list)
# Store into the new file
filepaths = [
"./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2010-2014).json",
"./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2015-2020).json",
"./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(newest).json",
"./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(oldest).json",
]
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(au_dataset_new[i], json_file, indent=4)
print("\nComplete: All of the author data structure have been transformed.")
# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []
ar_dataset_new = [[] for _ in range(4)] # New list for transformed data
au_dataset_new = [[] for _ in range(4)] # New list to store transformed data
num1 = [0] # Counter for processed ar_data
num2 = [0] # Counter for processed au_data
os.makedirs('./EJDE_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./EJDE_buffer_transform/Author_output/', exist_ok=True)
# Read the data
ar_dataset = fileReader('./EJDE_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./EJDE_buffer/Author_output', au_dataset)
# Change the structure
arDataTransform('./EJDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
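A note on the name matching above: auInfoFind() compares crawled author names against the author records only after stripping diacritics via Unicode NFKD normalization and removing spaces. A minimal, self-contained sketch of that comparison (the helper name strip_accents is illustrative, not part of the repo):

import unicodedata

def strip_accents(name: str) -> str:
    # Decompose accented characters (NFKD), then drop the combining marks (category 'Mn').
    decomposed = unicodedata.normalize('NFKD', name)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

# "José,Muñoz" matches "Jose, Munoz" once accents and spaces are removed.
record_name = strip_accents("José" + "," + "Muñoz")
crawled_name = "Jose, Munoz".replace(" ", "")
print(record_name == crawled_name)  # True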

View File

@@ -1,189 +0,0 @@
import json
import os
import unicodedata
from collections import OrderedDict
from pprint import pprint
'''
========== FileStructureTransfer ==========
1. This program restructures the crawled data.
2. Based on each paper's publication year, the final data are stored in four JSON files:
(1) newest: published after 2020
(2) oldest: published before 2010
(3) 2010-2014: published from 2010 to 2014
(4) 2015-2020: published from 2015 to 2020
3. Because the total amount of data for some sites is too large, the output is split into multiple files.
4. The program runs in the following order:
(1) fileReader() reads the locally crawled data into a list for processing
(2) arDataTransform() transforms the article data structure
(3) auDataTransform() transforms the author data structure
(4) the transformed data are written to the output folder
'''
# Read the data
def fileReader(folder, dataset):
files = os.listdir(folder)
for file in files:
file_path = os.path.join(folder, file)
with open(file_path, 'r', encoding='utf-8') as json_file:
Data = json.load(json_file)
dataset.append(Data)
return dataset
# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
def auInfoFind(path, file_name, ar_data, num):
authors = ar_data.get('authors')
authors.extend(ar_data.get('corresponding_authors'))
file_path = os.path.join(path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
Data = json.load(file)
au_ID = [] # A new list to store author_id
# Find the author_id
for author in authors:
for Dict in Data:
Dict_name = Dict.get('first_name') + ' ' + Dict.get('last_name')
Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
unicodedata.category(char) != 'Mn')
if Dict.get('from_article') == ar_data.get('article_id') and Dict_name == author:
au_ID.append(Dict.get('author_id'))
# Change the structure
ar_data_transform = {
"article_id": ar_data['article_id'],
"title": ar_data['title'],
"authors": au_ID,
"authors_name": authors,
"submit_datetime": ar_data['submit_datetime'],
"publish_datetime": ar_data['publish_datetime'],
"keywords": ar_data['keywords'],
"MSC": ar_data['MSC'],
"URL": ar_data['URL'],
"DOI": ar_data['DOI'],
"publisher": ar_data['publisher'],
"journal": ar_data['journal'],
"volume": ar_data['volume'],
"issue": ar_data['issue'],
"page": ar_data['page']
}
num[0] += 1 # Update the counter
return ar_data_transform
# ====== Main code for function =====
ar_names = os.listdir(au_folder) # Read the folder
for ar_list in ar_dataset:
for Dict in ar_list:
if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data
print(str(num[0]) + " copies of article data structure have been transformed.")
if int(Dict.get('volume')) <= 2009:
Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
ar_dataset_new[3].append(Dict)
elif 2010 <= int(Dict.get('volume')) <= 2014:
Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
ar_dataset_new[0].append(Dict)
elif 2015 <= int(Dict.get('volume')) <= 2020:
Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
ar_dataset_new[1].append(Dict)
else:
Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
ar_dataset_new[2].append(Dict)
# Store into the new file
filepaths = [
"./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2010-2014).json",
"./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2015-2020).json",
"./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(newest).json",
"./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(oldest).json",
]
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(ar_dataset_new[i], json_file, indent=4)
print("\nComplete: All of the article data structure have been transformed.")
# Author data structure transfer
def auDataTransform(au_dataset, num):
def transform(list, num):
new_list = [] # New list to store transformed data
for au_data in list:
if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data
print(str(num[0]) + " copies of author data structure have been transformed.\n")
if au_data['middle_name'] is not None:
raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
else:
raw_name = au_data['first_name'] + ' ' + au_data['last_name']
au_data_transform = {
"author_id": au_data['author_id'],
"from_article": au_data['from_article'],
"first_name": au_data['first_name'],
"last_name": au_data['last_name'],
"middle_name": au_data['middle_name'],
"raw_name": raw_name,
"affiliation": au_data['affiliation']
}
new_list.append(au_data_transform)
num[0] += 1 # Update the counter
return new_list
for i in range(4):
au_list = transform(au_dataset[i], num)
au_dataset_new[i].extend(au_list)
# Store into the new file
filepaths = [
"./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2010-2014).json",
"./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2015-2020).json",
"./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(newest).json",
"./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(oldest).json",
]
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(au_dataset_new[i], json_file, indent=4)
print("\nComplete: All of the author data structure have been transformed.")
# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []
ar_dataset_new = [[] for _ in range(4)] # New list for transformed data
au_dataset_new = [[] for _ in range(4)] # New list to store transformed data
num1 = [0] # Counter for processed ar_data
num2 = [0] # Counter for processed au_data
os.makedirs('./SpringerOpen_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./SpringerOpen_buffer_transform/Author_output/', exist_ok=True)
# Read the data
ar_dataset = fileReader('./SpringerOpen_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./SpringerOpen_buffer/Author_output', au_dataset)
# Change the structure
# arDataTransform('./SpringerOpen_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
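Both of the transfer scripts above split records into four buckets by year (the EJDE variant branches on publish_datetime, the SpringerOpen variant on volume). A minimal sketch of that four-way split as a standalone helper (the function name year_bucket is illustrative):

def year_bucket(year: int) -> str:
    # Mirrors the branching in arDataTransform(): oldest (<= 2009), 2010-2014, 2015-2020, newest (> 2020).
    if year <= 2009:
        return "oldest"
    elif year <= 2014:
        return "2010-2014"
    elif year <= 2020:
        return "2015-2020"
    return "newest"

print(year_bucket(2008), year_bucket(2012), year_bucket(2019), year_bucket(2023))
# oldest 2010-2014 2015-2020 newest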

View File

@@ -5,22 +5,6 @@ import unicodedata
from collections import OrderedDict
from pprint import pprint
'''
========== FileStructureTransfer ==========
1. This program restructures the crawled data.
2. Based on each paper's publication year, the final data are stored in four JSON files:
(1) newest: published after 2020
(2) oldest: published before 2010
(3) 2010-2014: published from 2010 to 2014
(4) 2015-2020: published from 2015 to 2020
3. Because the total amount of data for some sites is too large, the output is split into multiple files.
4. The program runs in the following order:
(1) fileReader() reads the locally crawled data into a list for processing
(2) arDataTransform() transforms the article data structure
(3) auDataTransform() transforms the author data structure
(4) the transformed data are written to the output folder
'''
# Read the data
def fileReader(folder, dataset):
@@ -58,28 +42,12 @@ def arDataTransform(au_folder, ar_dataset, num):
if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
au_ID.append(Dict.get('author_id'))
author_names_new = []
author_names = ar_data['authors']
for author_name in author_names:
author_name_new = ''
author_name = author_name.split(", ")
for i in range(len(author_name)-1, 0, -1):
# print(author_name[i])
author_name_new += author_name[i]
if i != 0:
author_name_new += ', '
print(author_name_new)
author_names_new.append(author_name_new)
# Change the structure
ar_data_transform = {
"article_id": ar_data['article_id'],
"title": ar_data['title'],
"authors": au_ID,
"authors_name": author_names_new,
"authors_name": ar_data['authors'],
"submit_datetime": ar_data['submit_datetime'],
"publish_datetime": ar_data['publish_datetime'],
"keywords": ar_data['keywords'],
@@ -128,6 +96,13 @@ def arDataTransform(au_folder, ar_dataset, num):
"./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
]
# for filepath in filepaths:
# for list in ar_dataset_new:
# with open(filepath, "w", encoding='utf-8') as json_file:
# json.dump(list, json_file, indent=4)
#
# break
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(ar_dataset_new[i], json_file, indent=4)
@@ -152,8 +127,8 @@ def auDataTransform(au_dataset, num):
au_data_transform = {
"author_id": au_data['author_id'],
"from_article": au_data['from_article'][0],
"first_name": au_data['last_name'],
"last_name": au_data['first_name'],
"first_name": au_data['first_name'],
"last_name": au_data['last_name'],
"middle_name": au_data['middle_name'],
"raw_name": raw_name,
"affiliation": au_data['affiliation']
@@ -164,6 +139,13 @@ def auDataTransform(au_dataset, num):
return new_list
# # Transform the author data structure
# au_dataset_new = [] # New list to store transformed data
# for au_list in au_dataset:
# au_list_new = transform(au_list, num)
# au_dataset_new.append(au_list_new)
for i in range(4):
au_list = transform(au_dataset[i], num)
au_dataset_new[i].append(au_list)
@@ -203,4 +185,4 @@ au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)
# Change the structure
arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
# auDataTransform(au_dataset, num2)
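One detail worth flagging across the three transfer scripts: the EJDE and EJQTDE versions append each per-bucket author list to au_dataset_new[i] (nesting a list inside the output JSON), while the SpringerOpen version extends it (keeping the output flat). A minimal illustration of the difference:

nested, flat = [], []
batch = [{"author_id": "a1"}, {"author_id": "a2"}]

nested.append(batch)   # au_dataset_new[i].append(au_list) -> one nested list
flat.extend(batch)     # au_dataset_new[i].extend(au_list) -> flat list of dicts

print(nested)  # [[{'author_id': 'a1'}, {'author_id': 'a2'}]]
print(flat)    # [{'author_id': 'a1'}, {'author_id': 'a2'}]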

View File

@@ -26,12 +26,11 @@ payload = {
jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
# Aminer API
api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
api_paper_detail = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
api_author_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"
api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
def aminer_get_paper_id(title):
def aminer_get_id(title):
headers = {
"Authorization": f"Bearer {jwt_token}"
}
@@ -40,7 +39,7 @@ def aminer_get_paper_id(title):
"size": "",
"title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
}
response = requests.get(api_paper_id, headers=headers, params=params)
response = requests.get(api_get_id, headers=headers, params=params)
if response.status_code == 200:
data = response.json()
@@ -50,7 +49,7 @@ def aminer_get_paper_id(title):
not_on_aminer.append(title)
def aminer_post_paper_citation(aminer_id):
def aminer_post_citation(aminer_id):
headers = {
"Content-Type": "application/json;charset=utf-8",
"Authorization": f"Bearer {jwt_token}"
@@ -58,7 +57,7 @@ def aminer_post_paper_citation(aminer_id):
request_data = {
"ids": aminer_id
}
response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
if response.status_code == 200:
data = response.json()
@@ -73,31 +72,6 @@ def aminer_post_paper_citation(aminer_id):
aminer_paper_citation_retry.append(aminer_id)
def aminer_author_info(author_aminer_id, author_name, offset):
headers = {
"Content-Type": "application/json;charset=utf-8",
"Authorization": f"Bearer {jwt_token}"
}
request_data = {
"ids": author_aminer_id,
"query": author_name,
"offset": offset
}
response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
if response.status_code == 200:
data = response.json()
if data.get("success"):
for item in data.get('data', []):
if 'n_citation' in item:
n_citation = item['n_citation']
else:
n_citation = 0
aminer_paper_citation.append(n_citation)
else:
aminer_paper_citation_retry.append(author_aminer_id)
def scholarly_get_citation(title):
# # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
pg = ProxyGenerator()
@@ -118,7 +92,8 @@ aminer_paper_citation = []
aminer_paper_citation_retry = []
# scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
aminer_get_paper_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
if aminer_paper_id:
aminer_post_paper_citation(aminer_paper_id)
aminer_post_citation(aminer_paper_id)
print(aminer_paper_citation)
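The Aminer requests above authenticate with a PyJWT-signed bearer token. A minimal sketch of that request pattern; secret_key, payload, and head are placeholders here (the real values are defined earlier in the script and are not shown in this diff):

import re
import jwt        # PyJWT
import requests

secret_key = "placeholder-secret"       # placeholder, not the real key
payload = {"aud": "open_platform"}      # placeholder claims
head = {"alg": "HS256", "typ": "JWT"}   # placeholder header

jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"

def lookup_paper(title):
    headers = {"Authorization": f"Bearer {jwt_token}"}
    params = {
        "size": "",
        # Strip punctuation from the title, as aminer_get_id() does.
        "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip(),
    }
    response = requests.get(api_get_id, headers=headers, params=params)
    return response.json() if response.status_code == 200 else None

# Example: lookup_paper("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")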

View File

@@ -1,8 +1,7 @@
import re
import time
import uuid
import requests
import threading
import re
import ejde_save
from retrying import retry
@@ -14,8 +13,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
'''
Crawls the site 'ejde.math.txstate.edu'
Total number of papers: 2023/08/08 - 4785
Total Time via VPN w/100ms-delay: 96.30s
Total number of papers: 2023/08/08 - 4761
Total Time via VPN w/100ms-delay: 306.73s
========== Run order ==========
(1) ejde_main: get the journal links for each year -> crawl each paper's details and author info -> call ejde_save -> store temporarily in small JSON files
@@ -24,22 +23,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
'''
def append_data_thread_safe(from_list, to_list, data_lock):
with data_lock:
to_list.append(from_list)
def save_data_thread_safe(data, data_lock, data_type):
global articleNum, authorNum
with data_lock:
ejde_save.save_data(data, f"{data_type}_TS", str(uuid.uuid4()) + ".json")
if data_type == "Article":
articleNum += len(data)
else:
authorNum += len(data)
data.clear()
def datetime_transform(date):
month_typo = {
"Janaury": "January",
@@ -137,7 +120,7 @@ def process_html_article(baseweb, article):
# Get article title & url
try:
title = article.text.strip()
title = str(re.sub(r'\s+', ' ', title).strip())
title = re.sub(r'\s+', ' ', title).strip()
article_url = baseweb + article.find_next("a")["href"]
if "../../index.html" in article_url:
print("Redundant URL:", article_url)
@@ -165,6 +148,7 @@ def process_html_article(baseweb, article):
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
global articleNum, authorNum
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
article_response = requests.get(article_url, headers=headers)
@@ -178,11 +162,11 @@ def process_article(title, article_url):
# Extract title if title == None
if not title:
title_match = re.search(r"<h3>(.*?)<p>", article_text)
title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""
title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
# Extract issue
issue_match = re.search(r'No\. (\d+)', article_text)
issue = issue_match.group(1) if issue_match else ""
issue = issue_match.group(1) if issue_match else None
# Extract volume
volume_match = re.search(r'Vol\. (\d+)', article_text)
@@ -205,23 +189,21 @@ def process_article(title, article_url):
volume = str(volume)
issue = "Conference " + str(issue_number)
else:
volume = ""
volume = None
# Extract pp
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
pp = pp_match.group(1) if pp_match else ""
pp = pp_match.group(1) if pp_match else None
# Extract submission date
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
if not match:
match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
submitted_date = match.group(1) if match else ""
submitted_date = match.group(1) if match else None
if submitted_date:
submitted_date = datetime_transform(submitted_date)
# Extract publication date
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
publish_date = match.group(1) if match else ""
publish_date = match.group(1) if match else None
if publish_date:
publish_date = datetime_transform(publish_date)
@@ -252,25 +234,25 @@ def process_article(title, article_url):
doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
if not doi_match:
doi_match = re.search(r'DOI: (.+)', html)
doi = doi_match.group(1) if doi_match else ""
doi = doi_match.group(1) if doi_match else None
doi = doi.replace('https://doi.org/', '') # strip doi website header
# Article_id
article_id = str(uuid.uuid4())
# Author info
authors = []
author_names = []
table = article_soup.find('table')
if table:
for row in table.find_all('tr'):
cells = [cell.text.strip() for cell in row.find_all('td')]
for cell in cells:
if "email" in cell:
cell = cell.split("email")
if "email:" in cell:
cell = cell.split("email:")
email_list = str(cell[1]).split(',')
cell = cell[0]
elif "e-mail" in cell:
cell = cell.split("e-mail")
elif "e-mail:" in cell:
cell = cell.split("e-mail:")
email_list = str(cell[1]).split(',')
cell = cell[0]
else:
@@ -282,11 +264,8 @@ def process_article(title, article_url):
# Data processing
if cell[0]:
author_id = str(uuid.uuid4())
authors.append(author_id)
author_names.append(unidecode(cell[0]))
name = re.split(r'\s+', cell[0])
name = [item for item in name if item != '']
authors.append(unidecode(cell[0]))
name = re.split(r'[ .]', cell[0])
affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -297,21 +276,19 @@ def process_article(title, article_url):
emails.append(unidecode(email_match.group())) if email_match else None
author_data = {
"author_id": author_id,
"from_article": article_id,
"author_id": str(uuid.uuid4()),
"from_article": [article_id],
"first_name": unidecode(name[0]),
"last_name": unidecode(name[-1]),
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
"raw_name": unidecode(cell[0]),
"affiliation": [
{
"year": volume,
"affiliation": unidecode(affiliation),
"email": ", ".join(emails)
}
]
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
"affiliation": [{
"year": volume,
"affiliation": unidecode(affiliation),
"email": emails
}]
}
append_data_thread_safe(author_data, authorData, authorDataLock)
authorData.append(author_data)
authorNum += 1
# If no author table
else:
match_type = 0
@@ -331,12 +308,12 @@ def process_article(title, article_url):
matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
matches = matches.split("<p>")
for match in matches:
if "email" in match:
match = match.split("email")
if "email:" in match:
match = match.split("email:")
email_list = str(match[1]).split(',')
match = match[0]
elif "e-mail" in match:
match = match.split("e-mail")
elif "e-mail:" in match:
match = match.split("e-mail:")
email_list = str(match[1]).split(',')
match = match[0]
else:
@@ -353,11 +330,8 @@ def process_article(title, article_url):
# Data processing
if match[0]:
author_id = str(uuid.uuid4())
authors.append(author_id)
authors.append(unidecode(match[0]))
name = re.split(r'\s+', match[0])
name = [item for item in name if item != '']
name = re.split(r'[ .]', match[0])
affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -368,21 +342,19 @@ def process_article(title, article_url):
emails.append(unidecode(email_match.group())) if email_match else None
author_data = {
"author_id": author_id,
"from_article": article_id,
"author_id": str(uuid.uuid4()),
"from_article": [article_id],
"first_name": unidecode(name[0]),
"last_name": unidecode(name[-1]),
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
"raw_name": unidecode(match[0]),
"affiliation": [
{
"year": volume,
"affiliation": unidecode(affiliation),
"email": ", ".join(emails)
}
]
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
"affiliation": [{
"year": volume,
"affiliation": unidecode(affiliation),
"email": emails
}]
}
append_data_thread_safe(author_data, authorData, authorDataLock)
authorData.append(author_data)
authorNum += 1
else:
print("AUTHOR SEARCHING ERROR:", article_url)
fail = {
@@ -396,7 +368,7 @@ def process_article(title, article_url):
"article_id": article_id,
"title": unidecode(title),
"authors": authors,
"author_names": author_names,
"corresponding_authors": None,
"submit_datetime": submitted_date,
"publish_datetime": publish_date,
"keywords": keywords,
@@ -409,14 +381,17 @@ def process_article(title, article_url):
"issue": issue,
"page": pp
}
append_data_thread_safe(article_data, articleData, articleDataLock)
articleData.append(article_data)
articleNum += 1
# Save the data periodically based on batch size
if len(articleData) % batch_size == 0:
save_data_thread_safe(articleData, articleDataLock, "Article")
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
articleData.clear()
if len(authorData) % batch_size == 0:
save_data_thread_safe(authorData, authorDataLock, "Author")
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
authorData.clear()
start_time = time.time()
@@ -454,8 +429,6 @@ authorNum = 0
articleNum = 0
batch_size = 100 # Number of articles to process before saving
authorDataLock = threading.Lock()
articleDataLock = threading.Lock()
executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2)) # Set the number of worker threads
# Process each URL using multithreading
@@ -469,30 +442,29 @@ for future in as_completed(futures):
print("VOLUME PROCESSING ERROR:", str(vol_err))
# Retry failed processing paper
if len(failedData):
print("START RETRYING:", len(failedData))
while failedData:
fail_data = failedData.pop(0)
articleTitle = fail_data["title"]
articleUrl = fail_data["URL"]
try:
process_article(articleTitle, articleUrl)
except Exception as retry_err:
print("ARTICLE RETRYING FAILURE:", str(retry_err))
totally_fail = {
"title": articleTitle,
"URL": articleUrl
}
totallyFailedData.append(totally_fail)
print("START RETRYING:", len(failedData))
while failedData:
data = failedData.pop(0)
articleTitle = data["title"]
articleUrl = data["URL"]
try:
process_article(articleTitle, articleUrl)
except Exception as retry_err:
print("ARTICLE RETRYING FAILURE:", str(retry_err))
totally_fail = {
"title": articleTitle,
"URL": articleUrl
}
totallyFailedData.append(totally_fail)
# Save remaining data
if len(articleData) > 0:
save_data_thread_safe(articleData, articleDataLock, "Article")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
if len(authorData) > 0:
save_data_thread_safe(authorData, authorDataLock, "Author")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
# Save error record
if len(totallyFailedData) > 0:
@@ -513,5 +485,5 @@ print("Total fetched author:", authorNum)
print("time elapsed: {:.2f}s".format(time.time() - start_time))
# Transfer to large file and delete the temporary storage files
ejde_save.transform_data()
ejde_save.delete_data()
ejde_save.Transf()
ejde_save.delete()
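The helpers removed above (append_data_thread_safe / save_data_thread_safe) guarded the shared article and author buffers with a threading.Lock and flushed them to disk in batches. A minimal sketch of that pattern, with a stand-in save_batch() in place of ejde_save.save_data():

import json
import os
import uuid
import threading

data_lock = threading.Lock()
article_buffer = []
BATCH_SIZE = 100

def save_batch(records, folder="./ejde_buffer/Article_TS"):
    # Stand-in for ejde_save.save_data(): dump one batch to a uuid-named JSON file.
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, str(uuid.uuid4()) + ".json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4)

def append_article_thread_safe(article_data):
    # Only one worker thread mutates (and possibly flushes) the buffer at a time;
    # checking the batch size inside the same lock keeps two threads from both flushing.
    with data_lock:
        article_buffer.append(article_data)
        if len(article_buffer) >= BATCH_SIZE:
            save_batch(list(article_buffer))
            article_buffer.clear()

append_article_thread_safe({"article_id": str(uuid.uuid4()), "title": "example"})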

View File

@@ -14,8 +14,8 @@ def save_data(dataset, filetype, filename):
# Write into output files
def transform_data():
def read(folder_path, output_files):
def Transf():
def Read(folder_path, output_files):
# Create new folders
os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
@@ -24,8 +24,6 @@ def transform_data():
data_2010_2014 = []
data_2015_2020 = []
data_newest = []
data_no_date = []
data_integrate = []
for filename in os.listdir(folder_path):
if filename.endswith('.json'):
@@ -33,29 +31,24 @@ def transform_data():
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
for Dict in data:
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) != '':
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select data
if int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
data_oldest.append(Dict)
elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014:
data_2010_2014.append(Dict)
elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
data_2015_2020.append(Dict)
else:
data_newest.append(Dict)
else:
data_no_date.append(Dict)
data_integrate.append(data_oldest)
data_integrate.append(data_2010_2014)
data_integrate.append(data_2015_2020)
data_integrate.append(data_newest)
data_integrate.append(data_no_date)
# Transfer
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest, data_no_date, data_integrate]
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
for index in range(0, 6):
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)
@@ -68,30 +61,26 @@ def transform_data():
'./ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(newest).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(no date).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(integration).json'
'./ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
]
article_output_file = [
'./ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(newest).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(no date).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(integration).json'
'./ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
]
# Read and write into files
read(author_folder_path, author_output_file)
read(article_folder_path, article_output_file)
Read(author_folder_path, author_output_file)
Read(article_folder_path, article_output_file)
# End
print("\nData has been written into files.")
# Delete files in temporary storage area
def delete_data():
def delete():
folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
for folder_path in folder_paths:
file_names = os.listdir(folder_path)
@@ -100,4 +89,5 @@ def delete_data():
if os.path.isfile(file_path):
os.remove(file_path)
os.rmdir(folder_path)
print('\nAttention: The temporary storage files have been deleted!')
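Read() is shared between the article and author buffers, so the year used for bucketing comes either from an article's volume field or from the first affiliation entry of an author record. A small sketch of that fallback using the same .get() chain (the two sample dicts are illustrative):

def record_year(record):
    # Articles carry 'volume'; author records carry the year inside their first affiliation entry.
    return int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))

article = {"volume": "2016"}
author = {"affiliation": [{"year": "2012", "affiliation": "Dept. of Mathematics"}]}
print(record_year(article), record_year(author))  # 2016 2012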

View File

@@ -67,6 +67,7 @@ with ThreadPoolExecutor(max_workers=25) as executor:
futures = [executor.submit(extract_href, url) for url in url_list]
for future in as_completed(futures):
pass
wait(futures)
print('\nAll links have been got.\n')
@@ -110,4 +111,4 @@ print(count2, ' author_data has been stored.')
# Transfer to large file and delete the temporary storage files
ejqtde_save.Transf()
ejqtde_save.delete()
ejqtde_save.delete()
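The change above adds a wait(futures) call after draining as_completed. A minimal sketch of the submit / as_completed / wait pattern, with a placeholder extract_href and an illustrative URL list:

from concurrent.futures import ThreadPoolExecutor, as_completed, wait

def extract_href(url):
    # Placeholder for the real extract_href(), which fetches a volume page and collects article links.
    return url

url_list = ["https://example.org/ejqtde/volume/1"]  # illustrative

with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(extract_href, url) for url in url_list]
    for future in as_completed(futures):
        pass       # results are collected via shared state in the real script
    wait(futures)  # once as_completed has been drained, this returns immediately
print('\nAll links have been got.\n')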