ScholarDataMining/FileMerger/webSearch_merge.py
2023-10-06 13:55:50 +08:00

142 lines
4.3 KiB
Python

import jwt
import json
import requests
import time
import re
from pprint import pprint
# ========== Aminer pre-set ==========
# SECURITY NOTE(review): the API secret key and user id are hard-coded in
# source; move them to environment variables or a config file before sharing.
secret_key = "81hJKrNgKkMqow=="
user_id = "650c31aa078ed986b5d526cc"
# Take a single timestamp so "exp" is exactly now + 60 s.
# (The original called time.time() twice, so the two values could land on
# different seconds and the expiry window would not be exactly one minute.)
now_time = int(time.time())
expire_time = now_time + 60  # Expire in 1 min
# Aminer JWT token generator
head = {
    "alg": "HS256",
    "sign_type": "SIGN"
}
payload = {
    "user_id": user_id,
    "exp": expire_time,
    "timestamp": now_time
}
jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
# ========== Aminer API link ==========
api_author_info = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"
api_article_info = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
api_articleDetail_info = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
# ========== Function ==========
# ---------- Get the article web-ID ----------
def aminer_article_webID(title):
    """Search the Aminer paper API for *title* and record its web-ID.

    On a unique match, appends the found ID to the module-level
    ``article_aminerID`` list. On any failure (HTTP error, zero matches,
    or an ambiguous multi-match result) appends *title* to the
    ``Retry_article_title`` retry list instead. Always returns None.
    """
    headers = {
        "Authorization": f"Bearer {jwt_token}"
    }
    params = {
        "page": "",
        "size": "",
        # Replace punctuation with spaces so the search query stays clean.
        "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
    }
    response = requests.post(api_article_info, headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()
        results = data.get('data') or []
        # BUG FIX: the original only rejected MORE than one match; an empty
        # result list fell through and crashed with IndexError on [0] (and a
        # missing 'data' key crashed len(None)). Require exactly one match.
        if len(results) != 1:
            print("\nERROR: Cannot find the web-ID of \'" + title + "\'")
            Retry_article_title.append(title)  # Add the title into retry list
            return None
        print('\n========== The web-ID of the article ==========')
        pprint(data)
        article_aminerID.append(results[0].get('id'))  # Store article_ID from website
    else:
        print("\nERROR: Cannot find the web-ID of \'" + title + "\'")
        Retry_article_title.append(title)  # Add the title into retry list
# ---------- Get the article details ----------
def aminer_article_Detail(article_id):
    """Fetch the detail record for one paper ID and collect its authors' IDs.

    Appends each author's Aminer ID to the module-level ``author_aminerID``
    list; on an HTTP error appends *article_id* to the
    ``Retry_article_aminerID`` retry list. Always returns None.
    """
    headers = {
        # BUG FIX: the header name was misspelled "Content_Type" (underscore);
        # the HTTP header is "Content-Type" — now consistent with
        # aminer_author_info below.
        "Content-Type": "application/json;charset=utf-8",
        "Authorization": f"Bearer {jwt_token}"
    }
    request_data = {
        "ids": [article_id]
    }
    response = requests.post(api_articleDetail_info, headers=headers, data=json.dumps(request_data))
    if response.status_code == 200:
        data = response.json()
        print('\n========== The detail of the article ==========')
        pprint(data)
        # Guard against a missing or null "authors" field instead of crashing
        # when iterating None.
        authors = data.get('data')[0].get('authors') or []
        for author in authors:
            author_id = author.get('id')
            if author_id is not None:
                author_aminerID.append(author_id)
    else:
        print("\nERROR: Cannot find the detail of the article \'" + article_id + "\'")
        Retry_article_aminerID.append(article_id)  # Add the article ID into retry list
# ---------- Get the author's article list ----------
def aminer_author_info(author_id):
    """Query the Aminer person API for *author_id* (one ID or a list of IDs).

    Prints the response on success; on an HTTP error appends *author_id* to
    the module-level ``Retry_author_aminerID`` retry list. Returns None.
    """
    headers = {
        "Content-Type": "application/json;charset=utf-8",
        "Authorization": f"Bearer {jwt_token}"
    }
    request_data = {
        "ids": author_id
    }
    response = requests.post(api_author_info, headers=headers, data=json.dumps(request_data))
    if response.status_code == 200:
        data = response.json()
        print("\n========== The author's information ==========")
        pprint(data)
    else:
        # BUG FIX: the caller passes a LIST of IDs, so the original
        # str + list concatenation raised TypeError on the error path.
        # f-string formatting handles both a single ID and a list.
        print(f"\nERROR: Cannot find the article-list's ID of '{author_id}'")
        Retry_author_aminerID.append(author_id)  # Add the author ID into the retry list
# ========== Main code ==========
# ---------- Lists to store the data needed to be searched in API ----------
article_title = ['Entire solutions of the spruce budworm model']  # Test title
article_aminerID = []
author_aminerID = []
# ---------- Lists to store the data needed to be retried ----------
Retry_article_title = []
Retry_article_aminerID = []
Retry_author_aminerID = []
# ---------- Call the API ----------
# Step 1: resolve every title to its Aminer web-ID.
for paper_title in article_title:
    aminer_article_webID(paper_title)
# Step 2: fetch the detail record for each resolved article ID.
if article_aminerID:
    for paper_id in article_aminerID:
        aminer_article_Detail(paper_id)
# Step 3: look up all collected authors in a single batched call.
if author_aminerID:
    aminer_author_info(author_aminerID)