Updated ejde parser format
Fixed duplicate data dumping problem. Pushed new "ejde_buffer.zip".
This commit is contained in:
parent 50e30e105b
commit ad63bcf6c4
ejde_buffer.zip: Binary file not shown.
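
The duplicate dumps came from several worker threads flushing the same shared batch list at the same time; this commit routes every flush through a lock-guarded helper (`save_data_thread_safe` in the diff below). A minimal self-contained sketch of that pattern, assuming the same save signature as `ejde_save.save_data(dataset, filetype, filename)` — `dump_json` here is a hypothetical stand-in so the example runs on its own:

import json
import os
import threading
import uuid

def dump_json(dataset, filetype, filename):
    # Stand-in for ejde_save.save_data: write one batch to a small JSON file.
    folder = f"./ejde_buffer/{filetype}"
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, filename), "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4)

def save_data_thread_safe(data, data_lock, data_type):
    # Holding the lock makes "dump then clear" atomic, so two threads that hit
    # the batch-size threshold together cannot write the same records twice.
    with data_lock:
        if data:  # another thread may already have flushed this batch
            dump_json(data, data_type, str(uuid.uuid4()) + ".json")
            data.clear()

articleData = []
articleDataLock = threading.Lock()
# Usage inside each worker, mirroring process_article in the diff:
# if len(articleData) % batch_size == 0:
#     save_data_thread_safe(articleData, articleDataLock, "Article_TS")
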
@@ -26,11 +26,12 @@ payload = {
 jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)

 # Aminer API
-api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
-api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
+api_paper_detail = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_author_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"


-def aminer_get_id(title):
+def aminer_get_paper_id(title):
     headers = {
         "Authorization": f"Bearer {jwt_token}"
     }
@@ -39,7 +40,7 @@ def aminer_get_id(title):
         "size": "",
         "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
     }
-    response = requests.get(api_get_id, headers=headers, params=params)
+    response = requests.get(api_paper_id, headers=headers, params=params)

     if response.status_code == 200:
         data = response.json()
@@ -49,7 +50,7 @@ def aminer_get_id(title):
         not_on_aminer.append(title)


-def aminer_post_citation(aminer_id):
+def aminer_post_paper_citation(aminer_id):
     headers = {
         "Content-Type": "application/json;charset=utf-8",
         "Authorization": f"Bearer {jwt_token}"
@@ -57,7 +58,7 @@ def aminer_post_citation(aminer_id):
     request_data = {
         "ids": aminer_id
     }
-    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))

     if response.status_code == 200:
         data = response.json()
@@ -72,6 +73,31 @@ def aminer_post_citation(aminer_id):
             aminer_paper_citation_retry.append(aminer_id)


+def aminer_author_info(author_aminer_id, author_name, offset):
+    headers = {
+        "Content-Type": "application/json;charset=utf-8",
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    request_data = {
+        "ids": author_aminer_id,
+        "query": author_name,
+        "offset": offset
+    }
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success"):
+            for item in data.get('data', []):
+                if 'n_citation' in item:
+                    n_citation = item['n_citation']
+                else:
+                    n_citation = 0
+                aminer_paper_citation.append(n_citation)
+        else:
+            aminer_paper_citation_retry.append(author_aminer_id)
+
+
 def scholarly_get_citation(title):
     # # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
     pg = ProxyGenerator()
@@ -92,8 +118,7 @@ aminer_paper_citation = []
 aminer_paper_citation_retry = []

 # scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
-
-aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
+aminer_get_paper_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
 if aminer_paper_id:
-    aminer_post_citation(aminer_paper_id)
+    aminer_post_paper_citation(aminer_paper_id)
     print(aminer_paper_citation)
@@ -1,7 +1,8 @@
-import re
 import time
 import uuid
 import requests
+import re
+import threading
 import ejde_save

 from retrying import retry
@@ -13,8 +14,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Crawled site: 'ejde.math.txstate.edu'

-Total number of papers: 2023/08/08 - 4761
-Total Time via VPN w/100ms-delay: 306.73s
+Total number of papers: 2023/08/08 - 4785
+Total Time via VPN w/100ms-delay: 48.04s

 ========== Run order ==========
 1. ejde_main fetches each year's journal links -> scrapes each paper's info and author info -> calls ejde_save -> temporarily stores it in small JSON files
@@ -23,6 +24,12 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''


+def save_data_thread_safe(data, data_lock, data_type):
+    with data_lock:
+        ejde_save.save_data(data, f"{data_type}", str(uuid.uuid4()) + ".json")
+        data.clear()
+
+
 def datetime_transform(date):
     month_typo = {
         "Janaury": "January",
@@ -120,7 +127,7 @@ def process_html_article(baseweb, article):
     # Get article title & url
     try:
         title = article.text.strip()
-        title = re.sub(r'\s+', ' ', title).strip()
+        title = str(re.sub(r'\s+', ' ', title).strip())
         article_url = baseweb + article.find_next("a")["href"]
         if "../../index.html" in article_url:
            print("Redundant URL:", article_url)
@@ -162,11 +169,11 @@ def process_article(title, article_url):
     # Extract title if title == None
     if not title:
         title_match = re.search(r"<h3>(.*?)<p>", article_text)
-        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""

     # Extract issue
     issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
+    issue = issue_match.group(1) if issue_match else ""

     # Extract volume
     volume_match = re.search(r'Vol\. (\d+)', article_text)
@@ -189,21 +196,23 @@ def process_article(title, article_url):
             volume = str(volume)
             issue = "Conference " + str(issue_number)
         else:
-            volume = None
+            volume = ""

     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
-    pp = pp_match.group(1) if pp_match else None
+    pp = pp_match.group(1) if pp_match else ""

     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    submitted_date = match.group(1) if match else None
+    if not match:
+        match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
+    submitted_date = match.group(1) if match else ""
     if submitted_date:
         submitted_date = datetime_transform(submitted_date)

     # Extract publication date
     match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    publish_date = match.group(1) if match else None
+    publish_date = match.group(1) if match else ""
     if publish_date:
         publish_date = datetime_transform(publish_date)
@@ -234,25 +243,25 @@ def process_article(title, article_url):
     doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
-    doi = doi_match.group(1) if doi_match else None
-    doi = doi.replace('https://doi.org/', '')  # strip doi website header
+    doi = doi_match.group(1) if doi_match else ""

     # Article_id
     article_id = str(uuid.uuid4())

     # Author info
     authors = []
+    author_names = []
     table = article_soup.find('table')
     if table:
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
-                if "email:" in cell:
-                    cell = cell.split("email:")
+                if "email" in cell:
+                    cell = cell.split("email")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
-                elif "e-mail:" in cell:
-                    cell = cell.split("e-mail:")
+                elif "e-mail" in cell:
+                    cell = cell.split("e-mail")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
                 else:
@@ -264,8 +273,11 @@ def process_article(title, article_url):

                 # Data processing
                 if cell[0]:
-                    authors.append(unidecode(cell[0]))
-                    name = re.split(r'[ .]', cell[0])
+                    author_id = str(uuid.uuid4())
+                    authors.append(author_id)
+                    author_names.append(unidecode(cell[0]))
+                    name = re.split(r'\s+', cell[0])
+                    name = [item for item in name if item != '']
                     affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
                     affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                     affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -276,16 +288,19 @@ def process_article(title, article_url):
                     emails.append(unidecode(email_match.group())) if email_match else None

                     author_data = {
-                        "author_id": str(uuid.uuid4()),
-                        "from_article": [article_id],
+                        "author_id": author_id,
+                        "from_article": article_id,
                         "first_name": unidecode(name[0]),
                         "last_name": unidecode(name[-1]),
-                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
-                        "affiliation": [{
+                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
+                        "raw_name": unidecode(cell[0]),
+                        "affiliation": [
+                            {
                                 "year": volume,
                                 "affiliation": unidecode(affiliation),
-                                "email": emails
-                        }]
+                                "email": ", ".join(emails)
+                            }
+                        ]
                     }
                     authorData.append(author_data)
                     authorNum += 1
@@ -308,12 +323,12 @@ def process_article(title, article_url):
         matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
         matches = matches.split("<p>")
         for match in matches:
-            if "email:" in match:
-                match = match.split("email:")
+            if "email" in match:
+                match = match.split("email")
                 email_list = str(match[1]).split(',')
                 match = match[0]
-            elif "e-mail:" in match:
-                match = match.split("e-mail:")
+            elif "e-mail" in match:
+                match = match.split("e-mail")
                 email_list = str(match[1]).split(',')
                 match = match[0]
             else:
@@ -330,8 +345,11 @@ def process_article(title, article_url):

             # Data processing
             if match[0]:
+                author_id = str(uuid.uuid4())
+                authors.append(author_id)
                 authors.append(unidecode(match[0]))
-                name = re.split(r'[ .]', match[0])
+                name = re.split(r'\s+', match[0])
+                name = [item for item in name if item != '']
                 affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
                 affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                 affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -342,16 +360,19 @@ def process_article(title, article_url):
                 emails.append(unidecode(email_match.group())) if email_match else None

                 author_data = {
-                    "author_id": str(uuid.uuid4()),
-                    "from_article": [article_id],
+                    "author_id": author_id,
+                    "from_article": article_id,
                     "first_name": unidecode(name[0]),
                     "last_name": unidecode(name[-1]),
-                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
-                    "affiliation": [{
+                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
+                    "raw_name": unidecode(match[0]),
+                    "affiliation": [
+                        {
                             "year": volume,
                             "affiliation": unidecode(affiliation),
-                            "email": emails
-                    }]
+                            "email": ", ".join(emails)
+                        }
+                    ]
                 }
                 authorData.append(author_data)
                 authorNum += 1
@@ -368,7 +389,7 @@ def process_article(title, article_url):
         "article_id": article_id,
         "title": unidecode(title),
         "authors": authors,
-        "corresponding_authors": None,
+        "author_names": author_names,
         "submit_datetime": submitted_date,
         "publish_datetime": publish_date,
         "keywords": keywords,
@@ -386,12 +407,10 @@ def process_article(title, article_url):

     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleData.clear()
+        save_data_thread_safe(articleData, articleDataLock, "Article_TS")

     if len(authorData) % batch_size == 0:
-        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorData.clear()
+        save_data_thread_safe(authorData, authorDataLock, "Author_TS")


 start_time = time.time()
@@ -429,6 +448,8 @@ authorNum = 0
 articleNum = 0

 batch_size = 100  # Number of articles to process before saving
+authorDataLock = threading.Lock()
+articleDataLock = threading.Lock()
 executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))  # Set the number of worker threads

 # Process each URL using multithreading
@@ -444,9 +465,9 @@ for future in as_completed(futures):
 # Retry failed processing paper
 print("START RETRYING:", len(failedData))
 while failedData:
-    data = failedData.pop(0)
-    articleTitle = data["title"]
-    articleUrl = data["URL"]
+    fail_data = failedData.pop(0)
+    articleTitle = fail_data["title"]
+    articleUrl = fail_data["URL"]
     try:
         process_article(articleTitle, articleUrl)
     except Exception as retry_err:
@@ -460,11 +481,11 @@ while failedData:
 # Save remaining data
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
+    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")

 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")

 # Save error record
 if len(totallyFailedData) > 0:
@@ -485,5 +506,5 @@ print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))

 # Transfer to large file and delete the temporary storage files
-ejde_save.Transf()
-ejde_save.delete()
+ejde_save.transform_data()
+# ejde_save.delete_data()
@@ -14,8 +14,8 @@ def save_data(dataset, filetype, filename):


 # Write into output files
-def Transf():
-    def Read(folder_path, output_files):
+def transform_data():
+    def read(folder_path, output_files):
         # Create new folders
         os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
         os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
@@ -24,6 +24,8 @@ def Transf():
         data_2010_2014 = []
         data_2015_2020 = []
         data_newest = []
+        data_no_date = []
+        data_integrate = []

         for filename in os.listdir(folder_path):
             if filename.endswith('.json'):
@@ -31,24 +33,29 @@ def Transf():
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)
                     for Dict in data:
-                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) != '':
                             # Select data
                             if int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
                                 data_oldest.append(Dict)

                             elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014:
                                 data_2010_2014.append(Dict)

                             elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
                                 data_2015_2020.append(Dict)

                             else:
                                 data_newest.append(Dict)
+                        else:
+                            data_no_date.append(Dict)

+        data_integrate.append(data_oldest)
+        data_integrate.append(data_2010_2014)
+        data_integrate.append(data_2015_2020)
+        data_integrate.append(data_newest)
+        data_integrate.append(data_no_date)

         # Transfer
-        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest, data_no_date, data_integrate]

-        for index in range(0, 4):
+        for index in range(0, 6):
             with open(output_files[index], 'w', encoding='utf-8') as file:
                 json.dump(Data[index], file, indent=4)
@@ -61,26 +68,30 @@ def Transf():
         './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
-        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
+        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(no date).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(integration).json'
     ]

     article_output_file = [
         './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
-        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
+        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(no date).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(integration).json'
     ]

     # Read and write into files
-    Read(author_folder_path, author_output_file)
-    Read(article_folder_path, article_output_file)
+    read(author_folder_path, author_output_file)
+    read(article_folder_path, article_output_file)

     # End
     print("\nData has been written into files.")


 # Delete files in temporary storage area
-def delete():
+def delete_data():
     folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
     for folder_path in folder_paths:
         file_names = os.listdir(folder_path)
@@ -89,5 +100,4 @@ def delete():
             if os.path.isfile(file_path):
                 os.remove(file_path)
-        os.rmdir(folder_path)

     print('\nAttention: The temporary storage files have been deleted!')