New features:

1. unidecode normalization for the EJDE parser (a short sketch of what this does follows the commit metadata below)
2. Fetch paper citation counts via the AMiner API

Known bug:

1. The "scholarly_get_citation" function does not work properly (a hedged workaround sketch follows the new file at the end of this diff)
ldy 2023-09-23 10:43:46 +08:00
parent 2f6f86a48e
commit f45c63fa8c
3 changed files with 117 additions and 19 deletions
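For feature 1, the diff below wraps every scraped string (title, author names, affiliations, keywords, emails) in unidecode so that accented and other non-ASCII characters are transliterated to plain ASCII before being stored. A minimal sketch of that normalization; the sample strings are made up for illustration and are not taken from EJDE data:

    from unidecode import unidecode  # pip install Unidecode

    print(unidecode("José García-Melián"))    # -> "Jose Garcia-Melian"
    print(unidecode("Universität Zürich"))    # -> "Universitat Zurich"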

.idea/misc.xml (generated): 4 lines deleted

@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (cst-project)" project-jdk-type="Python SDK" />
-</project>

EJDE parser (filename not shown in this view)

@@ -7,6 +7,7 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
+from unidecode import unidecode
 from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
@@ -224,7 +225,8 @@ def process_article(title, article_url):
     if keywords_match:
         keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
-        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())]
+        keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()) for keyword in
+                    keywords if len(keyword.strip())]
     else:
         keywords = []
@@ -233,6 +235,7 @@ def process_article(title, article_url):
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
     doi = doi_match.group(1) if doi_match else None
+    doi = doi.replace('https://doi.org/', '') if doi else None  # strip the doi.org prefix
     # Article_id
     article_id = str(uuid.uuid4())
@@ -261,7 +264,7 @@ def process_article(title, article_url):
     # Data processing
     if cell[0]:
-        authors.append(cell[0])
+        authors.append(unidecode(cell[0]))
         name = re.split(r'[ .]', cell[0])
         affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
@@ -270,17 +273,17 @@ def process_article(title, article_url):
     if email_list:
         for email in email_list:
             email_match = re.search(r'[\w.-]+@[\w.-]+', email)
-            emails.append(email_match.group()) if email_match else None
+            emails.append(unidecode(email_match.group())) if email_match else None
     author_data = {
         "author_id": str(uuid.uuid4()),
         "from_article": [article_id],
-        "first_name": name[0],
-        "last_name": name[-1],
-        "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None,
+        "first_name": unidecode(name[0]),
+        "last_name": unidecode(name[-1]),
+        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
         "affiliation": [{
             "year": volume,
-            "affiliation": affiliation,
+            "affiliation": unidecode(affiliation),
             "email": emails
         }]
     }
@@ -327,7 +330,7 @@ def process_article(title, article_url):
     # Data processing
     if match[0]:
-        authors.append(match[0])
+        authors.append(unidecode(match[0]))
         name = re.split(r'[ .]', match[0])
         affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
@@ -336,17 +339,17 @@ def process_article(title, article_url):
     if email_list:
         for email in email_list:
             email_match = re.search(r'[\w.-]+@[\w.-]+', email)
-            emails.append(email_match.group()) if email_match else None
+            emails.append(unidecode(email_match.group())) if email_match else None
     author_data = {
         "author_id": str(uuid.uuid4()),
         "from_article": [article_id],
-        "first_name": name[0],
-        "last_name": name[-1],
-        "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None,
+        "first_name": unidecode(name[0]),
+        "last_name": unidecode(name[-1]),
+        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
         "affiliation": [{
             "year": volume,
-            "affiliation": affiliation,
+            "affiliation": unidecode(affiliation),
             "email": emails
         }]
     }
@@ -363,7 +366,7 @@ def process_article(title, article_url):
     # Article info
     article_data = {
         "article_id": article_id,
-        "title": title,
+        "title": unidecode(title),
         "authors": authors,
         "corresponding_authors": None,
         "submit_datetime": submitted_date,
@@ -426,7 +429,7 @@ authorNum = 0
 articleNum = 0
 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2))  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))  # Set the number of worker threads
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]

New file: citation fetcher using the AMiner API and scholarly (filename not shown in this view)

@@ -0,0 +1,99 @@
import re
import jwt
import time
import json
import requests
from scholarly import scholarly
from scholarly import ProxyGenerator

# Aminer secret key
secret_key = "81hJKrNgKkMqow=="
user_id = "650c31aa078ed986b5d526cc"
expire_time = int(time.time()) + 60  # Expire in 1 min
now_time = int(time.time())

# Aminer JWT token generator
head = {
    "alg": "HS256",
    "sign_type": "SIGN"
}
payload = {
    "user_id": user_id,
    "exp": expire_time,
    "timestamp": now_time
}
jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)

# Aminer API
api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"

def aminer_get_id(title):
    headers = {
        "Authorization": f"Bearer {jwt_token}"
    }
    params = {
        "page": "",
        "size": "",
        "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
    }
    response = requests.get(api_get_id, headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()
        if data.get("success") and data['data'] is not None:
            aminer_paper_id.append(data['data'][0]['id'])
        else:
            not_on_aminer.append(title)

def aminer_post_citation(aminer_id):
    headers = {
        "Content-Type": "application/json;charset=utf-8",
        "Authorization": f"Bearer {jwt_token}"
    }
    request_data = {
        "ids": aminer_id
    }
    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
    if response.status_code == 200:
        data = response.json()
        if data.get("success"):
            for item in data.get('data', []):
                if 'n_citation' in item:
                    n_citation = item['n_citation']
                else:
                    n_citation = 0
                aminer_paper_citation.append(n_citation)
        else:
            aminer_paper_citation_retry.append(aminer_id)

def scholarly_get_citation(title):
    # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
    pg = ProxyGenerator()
    pg.FreeProxies()
    scholarly.use_proxy(pg)
    # Now search Google Scholar from behind a proxy
    search_query = scholarly.search_pubs(title)
    try:
        # next(..., None) returns None when there is no result, so StopIteration is
        # never raised here and the function only prints the first hit
        scholarly.pprint(next(search_query, None))
    except StopIteration:
        return None

not_on_aminer = []
aminer_paper_id = []
aminer_paper_citation = []
aminer_paper_citation_retry = []
# scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
if aminer_paper_id:
aminer_post_citation(aminer_paper_id)
print(aminer_paper_citation)
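As noted in the commit message, scholarly_get_citation is the known-broken part: it only pretty-prints the first Google Scholar hit through free proxies and never returns a citation count. A rough sketch of what a returning version could look like, assuming the free-proxy setup succeeds and that scholarly's search_pubs results expose a num_citations field; the helper name scholarly_citation_count is made up here and is not part of the commit:

    from scholarly import scholarly, ProxyGenerator

    def scholarly_citation_count(title):
        # Sketch only: free proxies are often blocked, so treat setup failure as "no data"
        pg = ProxyGenerator()
        if not pg.FreeProxies():
            return None
        scholarly.use_proxy(pg)
        pub = next(scholarly.search_pubs(title), None)
        # Assumes each search result is a dict-like object with a 'num_citations' key
        return pub.get('num_citations') if pub else None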