From f45c63fa8cb043c80c3009d916e7a16158d301a5 Mon Sep 17 00:00:00 2001 From: ldy Date: Sat, 23 Sep 2023 10:43:46 +0800 Subject: [PATCH] New feature: 1. unidecode for EJDE parser 2. get paper citation by aminer api Bug contains: 1. function "scholarly_get_citation" cannot work properly --- .idea/misc.xml | 4 -- 01_EJDE_spider/ejde_main.py | 33 +++++----- 01_EJDE_spider/get_paper_citation.py | 99 ++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 19 deletions(-) delete mode 100644 .idea/misc.xml create mode 100644 01_EJDE_spider/get_paper_citation.py diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index c2238d2..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 078572f..d4678c0 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -7,6 +7,7 @@ import ejde_save from retrying import retry from datetime import datetime from bs4 import BeautifulSoup +from unidecode import unidecode from concurrent.futures import ThreadPoolExecutor, as_completed ''' @@ -224,7 +225,8 @@ def process_article(title, article_url): if keywords_match: keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '') keywords = re.split(r', |;', keywords) - keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())] + keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()) for keyword in + keywords if len(keyword.strip())] else: keywords = [] @@ -233,6 +235,7 @@ def process_article(title, article_url): if not doi_match: doi_match = re.search(r'DOI: (.+)', html) doi = doi_match.group(1) if doi_match else None + doi = doi.replace('https://doi.org/', '') # strip doi website header # Article_id article_id = str(uuid.uuid4()) @@ -261,7 +264,7 @@ def process_article(title, article_url): # Data processing if cell[0]: - authors.append(cell[0]) + authors.append(unidecode(cell[0])) name = re.split(r'[ .]', cell[0]) affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip() affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))) @@ -270,17 +273,17 @@ def process_article(title, article_url): if email_list: for email in email_list: email_match = re.search(r'[\w.-]+@[\w.-]+', email) - emails.append(email_match.group()) if email_match else None + emails.append(unidecode(email_match.group())) if email_match else None author_data = { "author_id": str(uuid.uuid4()), "from_article": [article_id], - "first_name": name[0], - "last_name": name[-1], - "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None, + "first_name": unidecode(name[0]), + "last_name": unidecode(name[-1]), + "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None, "affiliation": [{ "year": volume, - "affiliation": affiliation, + "affiliation": unidecode(affiliation), "email": emails }] } @@ -327,7 +330,7 @@ def process_article(title, article_url): # Data processing if match[0]: - authors.append(match[0]) + authors.append(unidecode(match[0])) name = re.split(r'[ .]', match[0]) affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip() affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))) @@ -336,17 +339,17 @@ def process_article(title, article_url): if email_list: for email in email_list: email_match = re.search(r'[\w.-]+@[\w.-]+', email) - 
emails.append(email_match.group()) if email_match else None + emails.append(unidecode(email_match.group())) if email_match else None author_data = { "author_id": str(uuid.uuid4()), "from_article": [article_id], - "first_name": name[0], - "last_name": name[-1], - "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None, + "first_name": unidecode(name[0]), + "last_name": unidecode(name[-1]), + "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None, "affiliation": [{ "year": volume, - "affiliation": affiliation, + "affiliation": unidecode(affiliation), "email": emails }] } @@ -363,7 +366,7 @@ def process_article(title, article_url): # Article info article_data = { "article_id": article_id, - "title": title, + "title": unidecode(title), "authors": authors, "corresponding_authors": None, "submit_datetime": submitted_date, @@ -426,7 +429,7 @@ authorNum = 0 articleNum = 0 batch_size = 100 # Number of articles to process before saving -executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2)) # Set the number of worker threads +executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2)) # Set the number of worker threads # Process each URL using multithreading futures = [executor.submit(process_volume, url) for url in url_list] diff --git a/01_EJDE_spider/get_paper_citation.py b/01_EJDE_spider/get_paper_citation.py new file mode 100644 index 0000000..133e729 --- /dev/null +++ b/01_EJDE_spider/get_paper_citation.py @@ -0,0 +1,99 @@ +import re +import jwt +import time +import json +import requests + +from scholarly import scholarly +from scholarly import ProxyGenerator + +# Aminer secret key +secret_key = "81hJKrNgKkMqow==" +user_id = "650c31aa078ed986b5d526cc" +expire_time = int(time.time()) + 60 # Expire in 1 min +now_time = int(time.time()) + +# Aminer JWT token generator +head = { + "alg": "HS256", + "sign_type": "SIGN" +} +payload = { + "user_id": user_id, + "exp": expire_time, + "timestamp": now_time +} +jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head) + +# Aminer API +api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish" +api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list" + + +def aminer_get_id(title): + headers = { + "Authorization": f"Bearer {jwt_token}" + } + params = { + "page": "", + "size": "", + "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip() + } + response = requests.get(api_get_id, headers=headers, params=params) + + if response.status_code == 200: + data = response.json() + if data.get("success") and data['data'] is not None: + aminer_paper_id.append(data['data'][0]['id']) + else: + not_on_aminer.append(title) + + +def aminer_post_citation(aminer_id): + headers = { + "Content-Type": "application/json;charset=utf-8", + "Authorization": f"Bearer {jwt_token}" + } + request_data = { + "ids": aminer_id + } + response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data)) + + if response.status_code == 200: + data = response.json() + if data.get("success"): + for item in data.get('data', []): + if 'n_citation' in item: + n_citation = item['n_citation'] + else: + n_citation = 0 + aminer_paper_citation.append(n_citation) + else: + aminer_paper_citation_retry.append(aminer_id) + + +def scholarly_get_citation(title): + # # Set up a ProxyGenerator object to use free proxies. 
This needs to be done only once per session
+    pg = ProxyGenerator()
+    pg.FreeProxies()
+    scholarly.use_proxy(pg)
+
+    # Now search Google Scholar from behind a proxy
+    search_query = scholarly.search_pubs(title)
+    try:
+        scholarly.pprint(next(search_query))  # no default value, so an empty result raises StopIteration below
+    except StopIteration:
+        return None
+
+
+not_on_aminer = []
+aminer_paper_id = []
+aminer_paper_citation = []
+aminer_paper_citation_retry = []
+
+# scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
+
+aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
+if aminer_paper_id:
+    aminer_post_citation(aminer_paper_id)
+print(aminer_paper_citation)
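
Note on the ejde_main.py DOI hunk: the added line doi = doi.replace('https://doi.org/', '') runs unconditionally, so an article page with no DOI line leaves doi as None and the call raises AttributeError. A None-safe variant of that step, as a sketch rather than what this patch commits:

doi = doi_match.group(1).strip() if doi_match else None
if doi:
    doi = doi.replace('https://doi.org/', '')  # strip the resolver prefix only when a DOI was found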
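
The AMiner JWT in get_paper_citation.py is built once at import time with a 60-second expiry, so any crawl longer than a minute would start sending an expired token. A per-request refresh helper, assuming the gateway checks the exp claim; the name fresh_jwt_token is hypothetical, and the sketch reuses the same PyJWT call and module-level names as the patch:

def fresh_jwt_token():
    # Regenerate the short-lived token right before each request
    now = int(time.time())
    payload = {"user_id": user_id, "exp": now + 60, "timestamp": now}
    return jwt.encode(payload, secret_key, algorithm="HS256", headers=head)

# e.g. inside aminer_get_id / aminer_post_citation:
# headers = {"Authorization": f"Bearer {fresh_jwt_token()}"}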
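
For reference, a minimal driver showing how the two AMiner helpers are meant to be combined for several titles. This is only a sketch: it assumes it replaces the demo calls at the bottom of get_paper_citation.py, and the single retry pass is an assumption, not part of the committed code.

titles = [
    "Traveling waves for unbalanced bistable equations with density dependent diffusion",
    "Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols",
]

for t in titles:
    aminer_get_id(t)  # unresolved titles accumulate in not_on_aminer

if aminer_paper_id:
    aminer_post_citation(aminer_paper_id)

if aminer_paper_citation_retry:
    # One retry pass for a batch that failed with a non-200 or error response
    aminer_post_citation(aminer_paper_citation_retry.pop())

print(dict(zip(aminer_paper_id, aminer_paper_citation)))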