New features:
1. unidecode for the EJDE parser
2. Get paper citations via the AMiner API

Known bug:
1. The function "scholarly_get_citation" does not work properly
parent 2f6f86a48e
commit f45c63fa8c
.idea/misc.xml (generated, 4 lines removed)
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (cst-project)" project-jdk-type="Python SDK" />
-</project>
@@ -7,6 +7,7 @@ import ejde_save
 from retrying import retry
 from datetime import datetime
 from bs4 import BeautifulSoup
+from unidecode import unidecode
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 '''
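For reference, unidecode transliterates accented and other non-ASCII characters to plain ASCII, which is what the changes below rely on for author names, keywords, affiliations, and titles. A quick illustration (hypothetical strings, not taken from the parser):

    from unidecode import unidecode

    print(unidecode('Gâteaux dérivative'))  # -> 'Gateaux derivative'
    print(unidecode('Erdős Pál'))           # -> 'Erdos Pal'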
@@ -224,7 +225,8 @@ def process_article(title, article_url):
     if keywords_match:
         keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '')
         keywords = re.split(r', |;', keywords)
-        keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())]
+        keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()) for keyword in
+                    keywords if len(keyword.strip())]
     else:
         keywords = []
@@ -233,6 +235,7 @@ def process_article(title, article_url):
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
     doi = doi_match.group(1) if doi_match else None
+    doi = doi.replace('https://doi.org/', '')  # strip doi website header
 
     # Article_id
     article_id = str(uuid.uuid4())
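One caveat with the added line: doi is None when neither DOI pattern matches, and calling .replace on None raises AttributeError. A guarded variant, as a sketch rather than what this commit does:

    doi = doi_match.group(1) if doi_match else None
    if doi is not None:
        doi = doi.replace('https://doi.org/', '')  # strip doi website header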
@@ -261,7 +264,7 @@ def process_article(title, article_url):
 
         # Data processing
         if cell[0]:
-            authors.append(cell[0])
+            authors.append(unidecode(cell[0]))
            name = re.split(r'[ .]', cell[0])
            affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
            affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
@@ -270,17 +273,17 @@ def process_article(title, article_url):
             if email_list:
                 for email in email_list:
                     email_match = re.search(r'[\w.-]+@[\w.-]+', email)
-                    emails.append(email_match.group()) if email_match else None
+                    emails.append(unidecode(email_match.group())) if email_match else None
 
             author_data = {
                 "author_id": str(uuid.uuid4()),
                 "from_article": [article_id],
-                "first_name": name[0],
-                "last_name": name[-1],
-                "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None,
+                "first_name": unidecode(name[0]),
+                "last_name": unidecode(name[-1]),
+                "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
                 "affiliation": [{
                     "year": volume,
-                    "affiliation": affiliation,
+                    "affiliation": unidecode(affiliation),
                     "email": emails
                 }]
             }
@@ -327,7 +330,7 @@ def process_article(title, article_url):
 
         # Data processing
         if match[0]:
-            authors.append(match[0])
+            authors.append(unidecode(match[0]))
            name = re.split(r'[ .]', match[0])
            affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
            affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
@@ -336,17 +339,17 @@ def process_article(title, article_url):
             if email_list:
                 for email in email_list:
                     email_match = re.search(r'[\w.-]+@[\w.-]+', email)
-                    emails.append(email_match.group()) if email_match else None
+                    emails.append(unidecode(email_match.group())) if email_match else None
 
             author_data = {
                 "author_id": str(uuid.uuid4()),
                 "from_article": [article_id],
-                "first_name": name[0],
-                "last_name": name[-1],
-                "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None,
+                "first_name": unidecode(name[0]),
+                "last_name": unidecode(name[-1]),
+                "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
                 "affiliation": [{
                     "year": volume,
-                    "affiliation": affiliation,
+                    "affiliation": unidecode(affiliation),
                     "email": emails
                 }]
             }
@@ -363,7 +366,7 @@ def process_article(title, article_url):
     # Article info
     article_data = {
         "article_id": article_id,
-        "title": title,
+        "title": unidecode(title),
         "authors": authors,
         "corresponding_authors": None,
         "submit_datetime": submitted_date,
@@ -426,7 +429,7 @@ authorNum = 0
 articleNum = 0
 
 batch_size = 100  # Number of articles to process before saving
-executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2))  # Set the number of worker threads
+executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))  # Set the number of worker threads
 
 # Process each URL using multithreading
 futures = [executor.submit(process_volume, url) for url in url_list]
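Note that int(len(url_list) / 2) evaluates to 0 when url_list has fewer than two entries, and ThreadPoolExecutor raises ValueError for max_workers <= 0. A defensive sketch:

    max_workers = max(1, len(url_list) // 2)  # never request zero workers
    executor = ThreadPoolExecutor(max_workers=max_workers)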
01_EJDE_spider/get_paper_citation.py (new file, 99 lines)
@@ -0,0 +1,99 @@
+import re
+import jwt
+import time
+import json
+import requests
+
+from scholarly import scholarly
+from scholarly import ProxyGenerator
+
+# Aminer secret key
+secret_key = "81hJKrNgKkMqow=="
+user_id = "650c31aa078ed986b5d526cc"
+expire_time = int(time.time()) + 60  # Expire in 1 min
+now_time = int(time.time())
+
+# Aminer JWT token generator
+head = {
+    "alg": "HS256",
+    "sign_type": "SIGN"
+}
+payload = {
+    "user_id": user_id,
+    "exp": expire_time,
+    "timestamp": now_time
+}
+jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
+
+# Aminer API
+api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
+api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+
+
+def aminer_get_id(title):
+    headers = {
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    params = {
+        "page": "",
+        "size": "",
+        "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
+    }
+    response = requests.get(api_get_id, headers=headers, params=params)
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success") and data['data'] is not None:
+            aminer_paper_id.append(data['data'][0]['id'])
+        else:
+            not_on_aminer.append(title)
+
+
+def aminer_post_citation(aminer_id):
+    headers = {
+        "Content-Type": "application/json;charset=utf-8",
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    request_data = {
+        "ids": aminer_id
+    }
+    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success"):
+            for item in data.get('data', []):
+                if 'n_citation' in item:
+                    n_citation = item['n_citation']
+                else:
+                    n_citation = 0
+                aminer_paper_citation.append(n_citation)
+        else:
+            aminer_paper_citation_retry.append(aminer_id)
+
+
+def scholarly_get_citation(title):
+    # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
+    pg = ProxyGenerator()
+    pg.FreeProxies()
+    scholarly.use_proxy(pg)
+
+    # Now search Google Scholar from behind a proxy
+    search_query = scholarly.search_pubs(title)
+    try:
+        scholarly.pprint(next(search_query, None))
+    except StopIteration:
+        return None
+
+
+not_on_aminer = []
+aminer_paper_id = []
+aminer_paper_citation = []
+aminer_paper_citation_retry = []
+
+# scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
+
+aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
+if aminer_paper_id:
+    aminer_post_citation(aminer_paper_id)
+    print(aminer_paper_citation)
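The commit message flags scholarly_get_citation as broken. In the listing above, next(search_query, None) never raises StopIteration (the None default suppresses it), so the except branch is dead code and scholarly.pprint(None) fails whenever there is no hit. A minimal sketch of a fix, assuming the installed scholarly exposes a num_citations field on each search result:

    def scholarly_get_citation(title):
        # Route requests through free proxies; done once per session
        pg = ProxyGenerator()
        pg.FreeProxies()
        scholarly.use_proxy(pg)

        # Take the first Google Scholar hit, if any
        pub = next(scholarly.search_pubs(title), None)  # returns None instead of raising
        if pub is None:
            return None  # no result for this title
        return pub.get('num_citations', 0)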
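Separately, the AMiner JWT is minted once at import time with a 60-second lifetime, so a long crawl will eventually send expired tokens. One way around this, sketched here rather than taken from AMiner's documentation, is to mint a fresh token per request:

    def fresh_jwt_token(ttl_seconds=60):
        # Mint a new short-lived AMiner JWT (hypothetical helper)
        now = int(time.time())
        payload = {"user_id": user_id, "exp": now + ttl_seconds, "timestamp": now}
        return jwt.encode(payload, secret_key, algorithm="HS256", headers=head)

Each request would then build its Authorization header as f"Bearer {fresh_jwt_token()}".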