From f45c63fa8cb043c80c3009d916e7a16158d301a5 Mon Sep 17 00:00:00 2001 From: ldy Date: Sat, 23 Sep 2023 10:43:46 +0800 Subject: [PATCH] New feature: 1. unidecode for EJDE parser 2. get paper citation by aminer api Bug contains: 1. function "scholarly_get_citation" cannot work properly --- .idea/misc.xml | 4 -- 01_EJDE_spider/ejde_main.py | 33 +++++----- 01_EJDE_spider/get_paper_citation.py | 99 ++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 19 deletions(-) delete mode 100644 .idea/misc.xml create mode 100644 01_EJDE_spider/get_paper_citation.py diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index c2238d2..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/01_EJDE_spider/ejde_main.py b/01_EJDE_spider/ejde_main.py index 078572f..d4678c0 100644 --- a/01_EJDE_spider/ejde_main.py +++ b/01_EJDE_spider/ejde_main.py @@ -7,6 +7,7 @@ import ejde_save from retrying import retry from datetime import datetime from bs4 import BeautifulSoup +from unidecode import unidecode from concurrent.futures import ThreadPoolExecutor, as_completed ''' @@ -224,7 +225,8 @@ def process_article(title, article_url): if keywords_match: keywords = re.sub(r'<[^>]+>', '', keywords_match.group(1).strip()).replace('\n', '') keywords = re.split(r', |;', keywords) - keywords = [re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip() for keyword in keywords if len(keyword.strip())] + keywords = [unidecode(re.sub(r'\s+', ' ', keyword.strip().strip('.')).strip()) for keyword in + keywords if len(keyword.strip())] else: keywords = [] @@ -233,6 +235,7 @@ def process_article(title, article_url): if not doi_match: doi_match = re.search(r'DOI: (.+)', html) doi = doi_match.group(1) if doi_match else None + doi = doi.replace('https://doi.org/', '') # strip doi website header # Article_id article_id = str(uuid.uuid4()) @@ -261,7 +264,7 @@ def process_article(title, article_url): # Data processing if cell[0]: - authors.append(cell[0]) + authors.append(unidecode(cell[0])) name = re.split(r'[ .]', cell[0]) affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip() affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))) @@ -270,17 +273,17 @@ def process_article(title, article_url): if email_list: for email in email_list: email_match = re.search(r'[\w.-]+@[\w.-]+', email) - emails.append(email_match.group()) if email_match else None + emails.append(unidecode(email_match.group())) if email_match else None author_data = { "author_id": str(uuid.uuid4()), "from_article": [article_id], - "first_name": name[0], - "last_name": name[-1], - "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None, + "first_name": unidecode(name[0]), + "last_name": unidecode(name[-1]), + "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None, "affiliation": [{ "year": volume, - "affiliation": affiliation, + "affiliation": unidecode(affiliation), "email": emails }] } @@ -327,7 +330,7 @@ def process_article(title, article_url): # Data processing if match[0]: - authors.append(match[0]) + authors.append(unidecode(match[0])) name = re.split(r'[ .]', match[0]) affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip() affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation))) @@ -336,17 +339,17 @@ def process_article(title, article_url): if email_list: for email in email_list: email_match = re.search(r'[\w.-]+@[\w.-]+', email) - 
emails.append(email_match.group()) if email_match else None + emails.append(unidecode(email_match.group())) if email_match else None author_data = { "author_id": str(uuid.uuid4()), "from_article": [article_id], - "first_name": name[0], - "last_name": name[-1], - "middle_name": ''.join(name[1:-1]) if len(name[1:-1]) > 0 else None, + "first_name": unidecode(name[0]), + "last_name": unidecode(name[-1]), + "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None, "affiliation": [{ "year": volume, - "affiliation": affiliation, + "affiliation": unidecode(affiliation), "email": emails }] } @@ -363,7 +366,7 @@ def process_article(title, article_url): # Article info article_data = { "article_id": article_id, - "title": title, + "title": unidecode(title), "authors": authors, "corresponding_authors": None, "submit_datetime": submitted_date, @@ -426,7 +429,7 @@ authorNum = 0 articleNum = 0 batch_size = 100 # Number of articles to process before saving -executor = ThreadPoolExecutor(max_workers=int(len(url_list)/2)) # Set the number of worker threads +executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2)) # Set the number of worker threads # Process each URL using multithreading futures = [executor.submit(process_volume, url) for url in url_list] diff --git a/01_EJDE_spider/get_paper_citation.py b/01_EJDE_spider/get_paper_citation.py new file mode 100644 index 0000000..133e729 --- /dev/null +++ b/01_EJDE_spider/get_paper_citation.py @@ -0,0 +1,99 @@ +import re +import jwt +import time +import json +import requests + +from scholarly import scholarly +from scholarly import ProxyGenerator + +# Aminer secret key +secret_key = "81hJKrNgKkMqow==" +user_id = "650c31aa078ed986b5d526cc" +expire_time = int(time.time()) + 60 # Expire in 1 min +now_time = int(time.time()) + +# Aminer JWT token generator +head = { + "alg": "HS256", + "sign_type": "SIGN" +} +payload = { + "user_id": user_id, + "exp": expire_time, + "timestamp": now_time +} +jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head) + +# Aminer API +api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish" +api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list" + + +def aminer_get_id(title): + headers = { + "Authorization": f"Bearer {jwt_token}" + } + params = { + "page": "", + "size": "", + "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip() + } + response = requests.get(api_get_id, headers=headers, params=params) + + if response.status_code == 200: + data = response.json() + if data.get("success") and data['data'] is not None: + aminer_paper_id.append(data['data'][0]['id']) + else: + not_on_aminer.append(title) + + +def aminer_post_citation(aminer_id): + headers = { + "Content-Type": "application/json;charset=utf-8", + "Authorization": f"Bearer {jwt_token}" + } + request_data = { + "ids": aminer_id + } + response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data)) + + if response.status_code == 200: + data = response.json() + if data.get("success"): + for item in data.get('data', []): + if 'n_citation' in item: + n_citation = item['n_citation'] + else: + n_citation = 0 + aminer_paper_citation.append(n_citation) + else: + aminer_paper_citation_retry.append(aminer_id) + + +def scholarly_get_citation(title): + # # Set up a ProxyGenerator object to use free proxies. 
This needs to be done only once per session
+    pg = ProxyGenerator()
+    pg.FreeProxies()
+    scholarly.use_proxy(pg)
+
+    # Now search Google Scholar from behind a proxy
+    search_query = scholarly.search_pubs(title)
+    try:
+        scholarly.pprint(next(search_query))  # no default value, so an empty result raises StopIteration below
+    except StopIteration:
+        return None
+
+
+not_on_aminer = []
+aminer_paper_id = []
+aminer_paper_citation = []
+aminer_paper_citation_retry = []
+
+# scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
+
+aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
+if aminer_paper_id:
+    aminer_post_citation(aminer_paper_id)
+print(aminer_paper_citation)
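
Note on the ejde_main.py DOI hunk: the added line doi = doi.replace('https://doi.org/', '') runs unconditionally, so an article page with no DOI line leaves doi as None and the call raises AttributeError. A None-safe variant of that step, as a sketch rather than what this patch commits:

doi = doi_match.group(1).strip() if doi_match else None
if doi:
    doi = doi.replace('https://doi.org/', '')  # strip the resolver prefix only when a DOI was found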
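
The AMiner JWT in get_paper_citation.py is built once at import time with a 60-second expiry, so any crawl longer than a minute would start sending an expired token. A per-request refresh helper, assuming the gateway checks the exp claim; the name fresh_jwt_token is hypothetical, and the sketch reuses the same PyJWT call and module-level names as the patch:

def fresh_jwt_token():
    # Regenerate the short-lived token right before each request
    now = int(time.time())
    payload = {"user_id": user_id, "exp": now + 60, "timestamp": now}
    return jwt.encode(payload, secret_key, algorithm="HS256", headers=head)

# e.g. inside aminer_get_id / aminer_post_citation:
# headers = {"Authorization": f"Bearer {fresh_jwt_token()}"}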
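
For reference, a minimal driver showing how the two AMiner helpers are meant to be combined for several titles. This is only a sketch: it assumes it replaces the demo calls at the bottom of get_paper_citation.py, and the single retry pass is an assumption, not part of the committed code.

titles = [
    "Traveling waves for unbalanced bistable equations with density dependent diffusion",
    "Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols",
]

for t in titles:
    aminer_get_id(t)  # unresolved titles accumulate in not_on_aminer

if aminer_paper_id:
    aminer_post_citation(aminer_paper_id)

if aminer_paper_citation_retry:
    # One retry pass for a batch that failed with a non-200 or error response
    aminer_post_citation(aminer_paper_citation_retry.pop())

print(dict(zip(aminer_paper_id, aminer_paper_citation)))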