New features:
1. Use unidecode to normalize text in the EJDE parser.
2. Get paper citation counts via the AMiner API.
Known bug:
1. The function "scholarly_get_citation" does not work properly yet.
parent 2f6f86a48e
commit f45c63fa8c
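Feature 1 wraps scraped titles, author names, affiliations, keywords, and emails in unidecode() so that accented or otherwise non-ASCII characters are transliterated to plain ASCII before being stored. A minimal sketch of the effect (the sample strings are made up for illustration, not taken from EJDE data):

from unidecode import unidecode

# Non-ASCII characters are folded to their closest ASCII equivalents.
print(unidecode("Jesús Ángel Rodríguez"))   # -> Jesus Angel Rodriguez
print(unidecode("Hölder continuity"))       # -> Holder continuity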
.idea/misc.xml (generated) | 4 ----
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (cst-project)" project-jdk-type="Python SDK" />
-</project>
@@ -7,6 +7,7 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
+from unidecode import unidecode
 from concurrent.futures import ThreadPoolExecutor, as_completed

 '''
@@ -224,7 +225,8 @@ def process_article(title, article_url):
     if keywords_match:
         keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
-        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())]
+        keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()) for keyword in
+                    keywords if len(keyword.strip())]
     else:
         keywords = []

@@ -233,6 +235,7 @@ def process_article(title, article_url):
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
     doi = doi_match.group(1) if doi_match else None
+    doi = doi.replace('https://doi.org/', '') # strip doi website header

     # Article_id
     article_id = str(uuid.uuid4())
@@ -261,7 +264,7 @@ def process_article(title, article_url):

     # Data processing
     if cell[0]:
-        authors.append(cell[0])
+        authors.append(unidecode(cell[0]))
         name = re.split(r'[ .]', cell[0])
         affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
@@ -270,17 +273,17 @@ def process_article(title, article_url):
         if email_list:
             for email in email_list:
                 email_match = re.search(r'[\w.-]+@[\w.-]+', email)
-                emails.append(email_match.group()) if email_match else None
+                emails.append(unidecode(email_match.group())) if email_match else None

         author_data = {
             "author_id": str(uuid.uuid4()),
             "from_article": [article_id],
-            "first_name": name[0],
-            "last_name": name[-1],
-            "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None,
+            "first_name": unidecode(name[0]),
+            "last_name": unidecode(name[-1]),
+            "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
             "affiliation": [{
                 "year": volume,
-                "affiliation": affiliation,
+                "affiliation": unidecode(affiliation),
                 "email": emails
             }]
         }
@@ -327,7 +330,7 @@ def process_article(title, article_url):

     # Data processing
     if match[0]:
-        authors.append(match[0])
+        authors.append(unidecode(match[0]))
         name = re.split(r'[ .]', match[0])
         affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
@@ -336,17 +339,17 @@ def process_article(title, article_url):
         if email_list:
             for email in email_list:
                 email_match = re.search(r'[\w.-]+@[\w.-]+', email)
-                emails.append(email_match.group()) if email_match else None
+                emails.append(unidecode(email_match.group())) if email_match else None

         author_data = {
             "author_id": str(uuid.uuid4()),
             "from_article": [article_id],
-            "first_name": name[0],
-            "last_name": name[-1],
-            "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None,
+            "first_name": unidecode(name[0]),
+            "last_name": unidecode(name[-1]),
+            "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
             "affiliation": [{
                 "year": volume,
-                "affiliation": affiliation,
+                "affiliation": unidecode(affiliation),
                 "email": emails
             }]
         }
@@ -363,7 +366,7 @@ def process_article(title, article_url):
     # Article info
     article_data = {
         "article_id": article_id,
-        "title": title,
+        "title": unidecode(title),
         "authors": authors,
         "corresponding_authors": None,
         "submit_datetime": submitted_date,
@@ -426,7 +429,7 @@ authorNum = 0
 articleNum = 0

 batch_size = 100 # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2)) # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2)) # Set the number of worker threads

 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
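A side note on the last hunk above: ThreadPoolExecutor requires max_workers >= 1, so int(len(url_list) / 2) raises ValueError whenever fewer than two volume URLs are collected. A defensive variant (a sketch, not part of this commit; the url_list value below is a placeholder) would clamp the worker count:

from concurrent.futures import ThreadPoolExecutor

url_list = ["volume-url-1"]  # placeholder; the real list is built by the crawler
# Guarantee at least one worker thread even for very short URL lists.
executor = ThreadPoolExecutor(max_workers=max(1, len(url_list) // 2))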
01_EJDE_spider/get_paper_citation.py (new file) | 99 ++++
@@ -0,0 +1,99 @@
+import re
+import jwt
+import time
+import json
+import requests
+
+from scholarly import scholarly
+from scholarly import ProxyGenerator
+
+# Aminer secret key
+secret_key = "81hJKrNgKkMqow=="
+user_id = "650c31aa078ed986b5d526cc"
+expire_time = int(time.time()) + 60 # Expire in 1 min
+now_time = int(time.time())
+
+# Aminer JWT token generator
+head = {
+    "alg": "HS256",
+    "sign_type": "SIGN"
+}
+payload = {
+    "user_id": user_id,
+    "exp": expire_time,
+    "timestamp": now_time
+}
+jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
+
+# Aminer API
+api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
+api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+
+
+def aminer_get_id(title):
+    headers = {
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    params = {
+        "page": "",
+        "size": "",
+        "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
+    }
+    response = requests.get(api_get_id, headers=headers, params=params)
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success") and data['data'] is not None:
+            aminer_paper_id.append(data['data'][0]['id'])
+        else:
+            not_on_aminer.append(title)
+
+
+def aminer_post_citation(aminer_id):
+    headers = {
+        "Content-Type": "application/json;charset=utf-8",
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    request_data = {
+        "ids": aminer_id
+    }
+    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success"):
+            for item in data.get('data', []):
+                if 'n_citation' in item:
+                    n_citation = item['n_citation']
+                else:
+                    n_citation = 0
+                aminer_paper_citation.append(n_citation)
+        else:
+            aminer_paper_citation_retry.append(aminer_id)
+
+
+def scholarly_get_citation(title):
+    # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
+    pg = ProxyGenerator()
+    pg.FreeProxies()
+    scholarly.use_proxy(pg)
+
+    # Now search Google Scholar from behind a proxy
+    search_query = scholarly.search_pubs(title)
+    try:
+        scholarly.pprint(next(search_query, None))
+    except StopIteration:
+        return None
+
+
+not_on_aminer = []
+aminer_paper_id = []
+aminer_paper_citation = []
+aminer_paper_citation_retry = []
+
+# scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
+
+aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
+if aminer_paper_id:
+    aminer_post_citation(aminer_paper_id)
+    print(aminer_paper_citation)
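The commit message flags scholarly_get_citation as not working: scholarly.pprint(next(search_query, None)) only prints the first hit instead of returning a citation count, and the StopIteration branch is unreachable because next() is given a default of None. A minimal sketch of one possible fix, assuming the publication entries yielded by scholarly.search_pubs expose a num_citations field as in current scholarly releases; this is not part of the commit:

from scholarly import scholarly, ProxyGenerator

def scholarly_get_citation(title):
    # Route traffic through free proxies; Google Scholar quickly blocks direct scraping.
    pg = ProxyGenerator()
    pg.FreeProxies()
    scholarly.use_proxy(pg)

    # Take the first search hit, if any, and return its citation count.
    pub = next(scholarly.search_pubs(title), None)
    if pub is None:
        return None
    return pub.get('num_citations', 0)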