Updated ejde parser format
Fixed duplicate data dumping problem. Pushed new "ejde_buffer.zip".
This commit is contained in:
parent 50e30e105b
commit ad63bcf6c4
ejde_buffer.zip: Binary file not shown.
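
The duplicate dumps came from several worker threads flushing the same shared batch list at the same time; this commit routes every flush through a lock-guarded helper (`save_data_thread_safe` in the diff below). A minimal self-contained sketch of that pattern, assuming the same save signature as `ejde_save.save_data(dataset, filetype, filename)` — `dump_json` here is a hypothetical stand-in so the example runs on its own:

import json
import os
import threading
import uuid

def dump_json(dataset, filetype, filename):
    # Stand-in for ejde_save.save_data: write one batch to a small JSON file.
    folder = f"./ejde_buffer/{filetype}"
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, filename), "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4)

def save_data_thread_safe(data, data_lock, data_type):
    # Holding the lock makes "dump then clear" atomic, so two threads that hit
    # the batch-size threshold together cannot write the same records twice.
    with data_lock:
        if data:  # another thread may already have flushed this batch
            dump_json(data, data_type, str(uuid.uuid4()) + ".json")
            data.clear()

articleData = []
articleDataLock = threading.Lock()
# Usage inside each worker, mirroring process_article in the diff:
# if len(articleData) % batch_size == 0:
#     save_data_thread_safe(articleData, articleDataLock, "Article_TS")
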
@@ -26,11 +26,12 @@ payload = {
 jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)

 # Aminer API
-api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
-api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
+api_paper_detail = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_author_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"


-def aminer_get_id(title):
+def aminer_get_paper_id(title):
     headers = {
         "Authorization": f"Bearer {jwt_token}"
     }
@@ -39,7 +40,7 @@ def aminer_get_id(title):
         "size": "",
         "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
     }
-    response = requests.get(api_get_id, headers=headers, params=params)
+    response = requests.get(api_paper_id, headers=headers, params=params)

     if response.status_code == 200:
         data = response.json()
@@ -49,7 +50,7 @@ def aminer_get_id(title):
         not_on_aminer.append(title)


-def aminer_post_citation(aminer_id):
+def aminer_post_paper_citation(aminer_id):
     headers = {
         "Content-Type": "application/json;charset=utf-8",
         "Authorization": f"Bearer {jwt_token}"
@@ -57,7 +58,7 @@ def aminer_post_citation(aminer_id):
     request_data = {
         "ids": aminer_id
     }
-    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))

     if response.status_code == 200:
         data = response.json()
@@ -72,6 +73,31 @@ def aminer_post_citation(aminer_id):
             aminer_paper_citation_retry.append(aminer_id)


+def aminer_author_info(author_aminer_id, author_name, offset):
+    headers = {
+        "Content-Type": "application/json;charset=utf-8",
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    request_data = {
+        "ids": author_aminer_id,
+        "query": author_name,
+        "offset": offset
+    }
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success"):
+            for item in data.get('data', []):
+                if 'n_citation' in item:
+                    n_citation = item['n_citation']
+                else:
+                    n_citation = 0
+                aminer_paper_citation.append(n_citation)
+        else:
+            aminer_paper_citation_retry.append(author_aminer_id)
+
+
 def scholarly_get_citation(title):
     # # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
     pg = ProxyGenerator()
@@ -92,8 +118,7 @@ aminer_paper_citation = []
 aminer_paper_citation_retry = []

 # scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
-
-aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
+aminer_get_paper_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
 if aminer_paper_id:
-    aminer_post_citation(aminer_paper_id)
+    aminer_post_paper_citation(aminer_paper_id)
     print(aminer_paper_citation)
@@ -1,7 +1,8 @@
-import re
 import time
 import uuid
 import requests
+import re
+import threading
 import ejde_save

 from retrying import retry
@@ -13,8 +14,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Crawled site: 'ejde.math.txstate.edu'

-Total number of papers: 2023/08/08 - 4761
-Total Time via VPN w/100ms-delay: 306.73s
+Total number of papers: 2023/08/08 - 4785
+Total Time via VPN w/100ms-delay: 48.04s

 ========== Run order ==========
 1. ejde_main fetches each year's journal links -> scrapes each paper's info and author info -> calls ejde_save -> temporarily stores it in small JSON files
@@ -23,6 +24,12 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''


+def save_data_thread_safe(data, data_lock, data_type):
+    with data_lock:
+        ejde_save.save_data(data, f"{data_type}", str(uuid.uuid4()) + ".json")
+        data.clear()
+
+
 def datetime_transform(date):
     month_typo = {
         "Janaury": "January",
@@ -120,7 +127,7 @@ def process_html_article(baseweb, article):
     # Get article title & url
     try:
         title = article.text.strip()
-        title = re.sub(r'\s+', ' ', title).strip()
+        title = str(re.sub(r'\s+', ' ', title).strip())
         article_url = baseweb + article.find_next("a")["href"]
         if "../../index.html" in article_url:
            print("Redundant URL:", article_url)
@@ -162,11 +169,11 @@ def process_article(title, article_url):
     # Extract title if title == None
     if not title:
         title_match = re.search(r"<h3>(.*?)<p>", article_text)
-        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""

     # Extract issue
     issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
+    issue = issue_match.group(1) if issue_match else ""

     # Extract volume
     volume_match = re.search(r'Vol\. (\d+)', article_text)
@@ -189,21 +196,23 @@ def process_article(title, article_url):
             volume = str(volume)
             issue = "Conference " + str(issue_number)
         else:
-            volume = None
+            volume = ""

     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
-    pp = pp_match.group(1) if pp_match else None
+    pp = pp_match.group(1) if pp_match else ""

     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    submitted_date = match.group(1) if match else None
+    if not match:
+        match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
+    submitted_date = match.group(1) if match else ""
     if submitted_date:
         submitted_date = datetime_transform(submitted_date)

     # Extract publication date
     match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    publish_date = match.group(1) if match else None
+    publish_date = match.group(1) if match else ""
     if publish_date:
         publish_date = datetime_transform(publish_date)
@@ -234,25 +243,25 @@ def process_article(title, article_url):
     doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
-    doi = doi_match.group(1) if doi_match else None
-    doi = doi.replace('https://doi.org/', '')  # strip doi website header
+    doi = doi_match.group(1) if doi_match else ""

     # Article_id
     article_id = str(uuid.uuid4())

     # Author info
     authors = []
+    author_names = []
     table = article_soup.find('table')
     if table:
         for row in table.find_all('tr'):
             cells = [cell.text.strip() for cell in row.find_all('td')]
             for cell in cells:
-                if "email:" in cell:
-                    cell = cell.split("email:")
+                if "email" in cell:
+                    cell = cell.split("email")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
-                elif "e-mail:" in cell:
-                    cell = cell.split("e-mail:")
+                elif "e-mail" in cell:
+                    cell = cell.split("e-mail")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
                 else:
@@ -264,8 +273,11 @@ def process_article(title, article_url):

                 # Data processing
                 if cell[0]:
-                    authors.append(unidecode(cell[0]))
-                    name = re.split(r'[ .]', cell[0])
+                    author_id = str(uuid.uuid4())
+                    authors.append(author_id)
+                    author_names.append(unidecode(cell[0]))
+                    name = re.split(r'\s+', cell[0])
+                    name = [item for item in name if item != '']
                     affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
                     affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                     affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -276,16 +288,19 @@ def process_article(title, article_url):
                     emails.append(unidecode(email_match.group())) if email_match else None

                     author_data = {
-                        "author_id": str(uuid.uuid4()),
-                        "from_article": [article_id],
+                        "author_id": author_id,
+                        "from_article": article_id,
                         "first_name": unidecode(name[0]),
                         "last_name": unidecode(name[-1]),
-                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
-                        "affiliation": [{
+                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
+                        "raw_name": unidecode(cell[0]),
+                        "affiliation": [
+                            {
                                 "year": volume,
                                 "affiliation": unidecode(affiliation),
-                                "email": emails
-                        }]
+                                "email": ", ".join(emails)
+                            }
+                        ]
                     }
                     authorData.append(author_data)
                     authorNum += 1
@@ -308,12 +323,12 @@ def process_article(title, article_url):
         matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
         matches = matches.split("<p>")
         for match in matches:
-            if "email:" in match:
-                match = match.split("email:")
+            if "email" in match:
+                match = match.split("email")
                 email_list = str(match[1]).split(',')
                 match = match[0]
-            elif "e-mail:" in match:
-                match = match.split("e-mail:")
+            elif "e-mail" in match:
+                match = match.split("e-mail")
                 email_list = str(match[1]).split(',')
                 match = match[0]
             else:
@@ -330,8 +345,11 @@ def process_article(title, article_url):

             # Data processing
             if match[0]:
+                author_id = str(uuid.uuid4())
+                authors.append(author_id)
                 authors.append(unidecode(match[0]))
-                name = re.split(r'[ .]', match[0])
+                name = re.split(r'\s+', match[0])
+                name = [item for item in name if item != '']
                 affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
                 affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                 affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -342,16 +360,19 @@ def process_article(title, article_url):
                 emails.append(unidecode(email_match.group())) if email_match else None

                 author_data = {
-                    "author_id": str(uuid.uuid4()),
-                    "from_article": [article_id],
+                    "author_id": author_id,
+                    "from_article": article_id,
                     "first_name": unidecode(name[0]),
                     "last_name": unidecode(name[-1]),
-                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
-                    "affiliation": [{
+                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
+                    "raw_name": unidecode(match[0]),
+                    "affiliation": [
+                        {
                             "year": volume,
                             "affiliation": unidecode(affiliation),
-                            "email": emails
-                    }]
+                            "email": ", ".join(emails)
+                        }
+                    ]
                 }
                 authorData.append(author_data)
                 authorNum += 1
@@ -368,7 +389,7 @@ def process_article(title, article_url):
         "article_id": article_id,
         "title": unidecode(title),
         "authors": authors,
-        "corresponding_authors": None,
+        "author_names": author_names,
         "submit_datetime": submitted_date,
         "publish_datetime": publish_date,
         "keywords": keywords,
@@ -386,12 +407,10 @@ def process_article(title, article_url):

     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleData.clear()
+        save_data_thread_safe(articleData, articleDataLock, "Article_TS")

     if len(authorData) % batch_size == 0:
-        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorData.clear()
+        save_data_thread_safe(authorData, authorDataLock, "Author_TS")


 start_time = time.time()
@@ -429,6 +448,8 @@ authorNum = 0
 articleNum = 0

 batch_size = 100  # Number of articles to process before saving
+authorDataLock = threading.Lock()
+articleDataLock = threading.Lock()
 executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))  # Set the number of worker threads

 # Process each URL using multithreading
@@ -444,9 +465,9 @@ for future in as_completed(futures):
 # Retry failed processing paper
 print("START RETRYING:", len(failedData))
 while failedData:
-    data = failedData.pop(0)
-    articleTitle = data["title"]
-    articleUrl = data["URL"]
+    fail_data = failedData.pop(0)
+    articleTitle = fail_data["title"]
+    articleUrl = fail_data["URL"]
     try:
         process_article(articleTitle, articleUrl)
     except Exception as retry_err:
@@ -460,11 +481,11 @@ while failedData:
 # Save remaining data
 if len(articleData) > 0:
     ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
+    print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")

 if len(authorData) > 0:
     ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
+    print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")

 # Save error record
 if len(totallyFailedData) > 0:
@@ -485,5 +506,5 @@ print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))

 # Transfer to large file and delete the temporary storage files
-ejde_save.Transf()
-ejde_save.delete()
+ejde_save.transform_data()
+# ejde_save.delete_data()
@@ -14,8 +14,8 @@ def save_data(dataset, filetype, filename):


 # Write into output files
-def Transf():
-    def Read(folder_path, output_files):
+def transform_data():
+    def read(folder_path, output_files):
         # Create new folders
         os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
         os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
@@ -24,6 +24,8 @@ def Transf():
         data_2010_2014 = []
         data_2015_2020 = []
         data_newest = []
+        data_no_date = []
+        data_integrate = []

         for filename in os.listdir(folder_path):
             if filename.endswith('.json'):
@@ -31,24 +33,29 @@ def Transf():
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)
                     for Dict in data:
-                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                        if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) != '':
                             # Select data
                             if int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
                                 data_oldest.append(Dict)

                             elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014:
                                 data_2010_2014.append(Dict)

                             elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
                                 data_2015_2020.append(Dict)

                             else:
                                 data_newest.append(Dict)
+                        else:
+                            data_no_date.append(Dict)

+        data_integrate.append(data_oldest)
+        data_integrate.append(data_2010_2014)
+        data_integrate.append(data_2015_2020)
+        data_integrate.append(data_newest)
+        data_integrate.append(data_no_date)

         # Transfer
-        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest, data_no_date, data_integrate]

-        for index in range(0, 4):
+        for index in range(0, 6):
             with open(output_files[index], 'w', encoding='utf-8') as file:
                 json.dump(Data[index], file, indent=4)
@@ -61,26 +68,30 @@ def Transf():
         './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
-        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
+        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(no date).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(integration).json'
     ]

     article_output_file = [
         './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
-        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
+        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(no date).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(integration).json'
     ]

     # Read and write into files
-    Read(author_folder_path, author_output_file)
-    Read(article_folder_path, article_output_file)
+    read(author_folder_path, author_output_file)
+    read(article_folder_path, article_output_file)

     # End
     print("\nData has been written into files.")


 # Delete files in temporary storage area
-def delete():
+def delete_data():
     folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
     for folder_path in folder_paths:
         file_names = os.listdir(folder_path)
@@ -89,5 +100,4 @@ def delete():
             if os.path.isfile(file_path):
                 os.remove(file_path)
-        os.rmdir(folder_path)

     print('\nAttention: The temporary storage files have been deleted!')