New features:

1. unidecode normalization for the EJDE parser (a short sketch of what this does follows the commit metadata below)
2. Fetch paper citation counts via the AMiner API

Known bug:

1. The "scholarly_get_citation" function does not work properly (a hedged workaround sketch follows the new file at the end of this diff)
ldy 2023-09-23 10:43:46 +08:00
parent 2f6f86a48e
commit f45c63fa8c
3 changed files with 117 additions and 19 deletions
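For feature 1, the diff below wraps every scraped string (title, author names, affiliations, keywords, emails) in unidecode so that accented and other non-ASCII characters are transliterated to plain ASCII before being stored. A minimal sketch of that normalization; the sample strings are made up for illustration and are not taken from EJDE data:

    from unidecode import unidecode  # pip install Unidecode

    print(unidecode("José García-Melián"))    # -> "Jose Garcia-Melian"
    print(unidecode("Universität Zürich"))    # -> "Universitat Zurich"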

.idea/misc.xml (generated): 4 lines deleted

@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (cst-project)" project-jdk-type="Python SDK" />
-</project>

EJDE parser (filename not shown in this view)

@@ -7,6 +7,7 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
+from unidecode import unidecode
 from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
@@ -224,7 +225,8 @@ def process_article(title, article_url):
     if keywords_match:
         keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
-        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())]
+        keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()) for keyword in
+                    keywords if len(keyword.strip())]
     else:
         keywords = []
@@ -233,6 +235,7 @@ def process_article(title, article_url):
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
     doi = doi_match.group(1) if doi_match else None
+    doi = doi.replace('https://doi.org/', '') if doi else None  # strip the doi.org prefix
     # Article_id
     article_id = str(uuid.uuid4())
@@ -261,7 +264,7 @@ def process_article(title, article_url):
     # Data processing
     if cell[0]:
-        authors.append(cell[0])
+        authors.append(unidecode(cell[0]))
         name = re.split(r'[ .]', cell[0])
         affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
@@ -270,17 +273,17 @@ def process_article(title, article_url):
     if email_list:
         for email in email_list:
             email_match = re.search(r'[\w.-]+@[\w.-]+', email)
-            emails.append(email_match.group()) if email_match else None
+            emails.append(unidecode(email_match.group())) if email_match else None
     author_data = {
         "author_id": str(uuid.uuid4()),
         "from_article": [article_id],
-        "first_name": name[0],
-        "last_name": name[-1],
-        "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None,
+        "first_name": unidecode(name[0]),
+        "last_name": unidecode(name[-1]),
+        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
         "affiliation": [{
             "year": volume,
-            "affiliation": affiliation,
+            "affiliation": unidecode(affiliation),
             "email": emails
         }]
     }
@@ -327,7 +330,7 @@ def process_article(title, article_url):
     # Data processing
     if match[0]:
-        authors.append(match[0])
+        authors.append(unidecode(match[0]))
         name = re.split(r'[ .]', match[0])
         affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
         affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
@@ -336,17 +339,17 @@ def process_article(title, article_url):
     if email_list:
         for email in email_list:
             email_match = re.search(r'[\w.-]+@[\w.-]+', email)
-            emails.append(email_match.group()) if email_match else None
+            emails.append(unidecode(email_match.group())) if email_match else None
     author_data = {
         "author_id": str(uuid.uuid4()),
         "from_article": [article_id],
-        "first_name": name[0],
-        "last_name": name[-1],
-        "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None,
+        "first_name": unidecode(name[0]),
+        "last_name": unidecode(name[-1]),
+        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
         "affiliation": [{
             "year": volume,
-            "affiliation": affiliation,
+            "affiliation": unidecode(affiliation),
             "email": emails
         }]
     }
@@ -363,7 +366,7 @@ def process_article(title, article_url):
     # Article info
     article_data = {
         "article_id": article_id,
-        "title": title,
+        "title": unidecode(title),
         "authors": authors,
         "corresponding_authors": None,
         "submit_datetime": submitted_date,
@@ -426,7 +429,7 @@ authorNum = 0
 articleNum = 0
 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2))  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))  # Set the number of worker threads
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]

New file: citation fetcher using the AMiner API and scholarly (filename not shown in this view)

@@ -0,0 +1,99 @@
import re
import jwt
import time
import json
import requests
from scholarly import scholarly
from scholarly import ProxyGenerator

# Aminer secret key
secret_key = "81hJKrNgKkMqow=="
user_id = "650c31aa078ed986b5d526cc"
expire_time = int(time.time()) + 60  # Expire in 1 min
now_time = int(time.time())

# Aminer JWT token generator
head = {
    "alg": "HS256",
    "sign_type": "SIGN"
}
payload = {
    "user_id": user_id,
    "exp": expire_time,
    "timestamp": now_time
}
jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)

# Aminer API
api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"

def aminer_get_id(title):
    headers = {
        "Authorization": f"Bearer {jwt_token}"
    }
    params = {
        "page": "",
        "size": "",
        "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
    }
    response = requests.get(api_get_id, headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()
        if data.get("success") and data['data'] is not None:
            aminer_paper_id.append(data['data'][0]['id'])
        else:
            not_on_aminer.append(title)

def aminer_post_citation(aminer_id):
    headers = {
        "Content-Type": "application/json;charset=utf-8",
        "Authorization": f"Bearer {jwt_token}"
    }
    request_data = {
        "ids": aminer_id
    }
    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
    if response.status_code == 200:
        data = response.json()
        if data.get("success"):
            for item in data.get('data', []):
                if 'n_citation' in item:
                    n_citation = item['n_citation']
                else:
                    n_citation = 0
                aminer_paper_citation.append(n_citation)
        else:
            aminer_paper_citation_retry.append(aminer_id)

def scholarly_get_citation(title):
    # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
    pg = ProxyGenerator()
    pg.FreeProxies()
    scholarly.use_proxy(pg)
    # Now search Google Scholar from behind a proxy
    search_query = scholarly.search_pubs(title)
    try:
        # next(..., None) returns None when there is no result, so StopIteration is
        # never raised here and the function only prints the first hit
        scholarly.pprint(next(search_query, None))
    except StopIteration:
        return None

not_on_aminer = []
aminer_paper_id = []
aminer_paper_citation = []
aminer_paper_citation_retry = []
# scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
if aminer_paper_id:
aminer_post_citation(aminer_paper_id)
print(aminer_paper_citation)
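As noted in the commit message, scholarly_get_citation is the known-broken part: it only pretty-prints the first Google Scholar hit through free proxies and never returns a citation count. A rough sketch of what a returning version could look like, assuming the free-proxy setup succeeds and that scholarly's search_pubs results expose a num_citations field; the helper name scholarly_citation_count is made up here and is not part of the commit:

    from scholarly import scholarly, ProxyGenerator

    def scholarly_citation_count(title):
        # Sketch only: free proxies are often blocked, so treat setup failure as "no data"
        pg = ProxyGenerator()
        if not pg.FreeProxies():
            return None
        scholarly.use_proxy(pg)
        pub = next(scholarly.search_pubs(title), None)
        # Assumes each search result is a dict-like object with a 'num_citations' key
        return pub.get('num_citations') if pub else None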