Compare commits


No commits in common. "ad427c24dce6a8d74b9e2b1c03c79b8ffa890341" and "24aa62c8db13b93444777cf261b2681809b16675" have entirely different histories.

10 changed files with 114 additions and 582 deletions

Binary file not shown.

View File

@@ -1,199 +0,0 @@
import json
import os
import unicodedata
from collections import OrderedDict
from pprint import pprint
'''
========== FileStructureTransfer ==========
1. This program restructures the crawled data.
2. Based on each paper's publication year, the final data are stored in four JSON files:
(1) newest: published after 2020
(2) oldest: published before 2010
(3) 2010-2014: published from 2010 to 2014
(4) 2015-2020: published from 2015 to 2020
3. Because the total amount of data for some sites is too large, the output is split into multiple files.
4. The program runs in the following order:
(1) fileReader() reads the locally crawled data into a list for processing
(2) arDataTransform() transforms the article data structure
(3) auDataTransform() transforms the author data structure
(4) the transformed data are written to the output folder
'''
# Read the data
def fileReader(folder, dataset):
files = os.listdir(folder)
for file in files:
file_path = os.path.join(folder, file)
with open(file_path, 'r', encoding='utf-8') as json_file:
Data = json.load(json_file)
dataset.append(Data)
return dataset
# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
def auInfoFind(path, file_name, ar_data, num):
authors = ar_data.get('authors')
authors.append(ar_data.get('corresponding_authors'))
file_path = os.path.join(path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
Data = json.load(file)
au_ID = [] # A new list to store author_id
# Find the author_id
for author in authors:
if author is None:
continue
author = author.replace(" ", "")
for Dict in Data:
Dict_name = Dict.get('first_name') + "," + Dict.get('last_name')
Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
unicodedata.category(char) != 'Mn')
if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
au_ID.append(Dict.get('author_id'))
# Change the structure
ar_data_transform = {
"article_id": ar_data['article_id'],
"title": ar_data['title'],
"authors": au_ID,
"authors_name": ar_data['authors'],
"submit_datetime": ar_data['submit_datetime'],
"publish_datetime": ar_data['publish_datetime'],
"keywords": ar_data['keywords'],
"MSC": ar_data['MSC'],
"URL": ar_data['URL'],
"DOI": ar_data['DOI'],
"publisher": ar_data['publisher'],
"journal": ar_data['journal'],
"volume": ar_data['volume'],
"issue": ar_data['issue'],
"page": ar_data['page']
}
num[0] += 1 # Update the counter
return ar_data_transform
# ====== Main code for function =====
ar_names = os.listdir(au_folder) # Read the folder
for ar_list in ar_dataset:
for Dict in ar_list:
year = Dict.get('publish_datetime')
if year is None:
continue
year = year.split('-')
if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data
print(str(num[0]) + " copies of article data structure have been transformed.")
if int(year[0]) <= 2009:
Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
ar_dataset_new[3].append(Dict)
elif 2010 <= int(year[0]) <= 2014:
Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
ar_dataset_new[0].append(Dict)
elif 2015 <= int(year[0]) <= 2020:
Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
ar_dataset_new[1].append(Dict)
else:
Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
ar_dataset_new[2].append(Dict)
# Store into the new file
filepaths = [
"./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2010-2014).json",
"./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2015-2020).json",
"./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(newest).json",
"./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(oldest).json",
]
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(ar_dataset_new[i], json_file, indent=4)
print("\nComplete: All of the article data structure have been transformed.")
# Author data structure transfer
def auDataTransform(au_dataset, num):
def transform(list, num):
new_list = [] # New list to store transformed data
for au_data in list:
if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data
print(str(num[0]) + " copies of author data structure have been transformed.\n")
if au_data['middle_name'] is not None:
raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
else:
raw_name = au_data['first_name'] + ' ' + au_data['last_name']
au_data_transform = {
"author_id": au_data['author_id'],
"from_article": au_data['from_article'][0],
"first_name": au_data['last_name'],
"last_name": au_data['first_name'],
"middle_name": au_data['middle_name'],
"raw_name": raw_name,
"affiliation": au_data['affiliation']
}
new_list.append(au_data_transform)
num[0] += 1 # Update the counter
return new_list
for i in range(4):
au_list = transform(au_dataset[i], num)
au_dataset_new[i].append(au_list)
# Store into the new file
filepaths = [
"./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2010-2014).json",
"./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2015-2020).json",
"./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(newest).json",
"./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(oldest).json",
]
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(au_dataset_new[i], json_file, indent=4)
print("\nComplete: All of the author data structure have been transformed.")
# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []
ar_dataset_new = [[] for _ in range(4)] # New list for transformed data
au_dataset_new = [[] for _ in range(4)] # New list to store transformed data
num1 = [0] # Counter for processed ar_data
num2 = [0] # Counter for processed au_data
os.makedirs('./EJDE_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./EJDE_buffer_transform/Author_output/', exist_ok=True)
# Read the data
ar_dataset = fileReader('./EJDE_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./EJDE_buffer/Author_output', au_dataset)
# Change the structure
arDataTransform('./EJDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
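A note on the name matching above: auInfoFind() compares crawled author names against the author records only after stripping diacritics via Unicode NFKD normalization and removing spaces. A minimal, self-contained sketch of that comparison (the helper name strip_accents is illustrative, not part of the repo):

import unicodedata

def strip_accents(name: str) -> str:
    # Decompose accented characters (NFKD), then drop the combining marks (category 'Mn').
    decomposed = unicodedata.normalize('NFKD', name)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

# "José,Muñoz" matches "Jose, Munoz" once accents and spaces are removed.
record_name = strip_accents("José" + "," + "Muñoz")
crawled_name = "Jose, Munoz".replace(" ", "")
print(record_name == crawled_name)  # True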

View File

@@ -1,189 +0,0 @@
import json
import os
import unicodedata
from collections import OrderedDict
from pprint import pprint
'''
========== FileStructureTransfer ==========
1. This program restructures the crawled data.
2. Based on each paper's publication year, the final data are stored in four JSON files:
(1) newest: published after 2020
(2) oldest: published before 2010
(3) 2010-2014: published from 2010 to 2014
(4) 2015-2020: published from 2015 to 2020
3. Because the total amount of data for some sites is too large, the output is split into multiple files.
4. The program runs in the following order:
(1) fileReader() reads the locally crawled data into a list for processing
(2) arDataTransform() transforms the article data structure
(3) auDataTransform() transforms the author data structure
(4) the transformed data are written to the output folder
'''
# Read the data
def fileReader(folder, dataset):
files = os.listdir(folder)
for file in files:
file_path = os.path.join(folder, file)
with open(file_path, 'r', encoding='utf-8') as json_file:
Data = json.load(json_file)
dataset.append(Data)
return dataset
# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
def auInfoFind(path, file_name, ar_data, num):
authors = ar_data.get('authors')
authors.extend(ar_data.get('corresponding_authors'))
file_path = os.path.join(path, file_name)
with open(file_path, 'r', encoding='utf-8') as file:
Data = json.load(file)
au_ID = [] # A new list to store author_id
# Find the author_id
for author in authors:
for Dict in Data:
Dict_name = Dict.get('first_name') + ' ' + Dict.get('last_name')
Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
unicodedata.category(char) != 'Mn')
if Dict.get('from_article') == ar_data.get('article_id') and Dict_name == author:
au_ID.append(Dict.get('author_id'))
# Change the structure
ar_data_transform = {
"article_id": ar_data['article_id'],
"title": ar_data['title'],
"authors": au_ID,
"authors_name": authors,
"submit_datetime": ar_data['submit_datetime'],
"publish_datetime": ar_data['publish_datetime'],
"keywords": ar_data['keywords'],
"MSC": ar_data['MSC'],
"URL": ar_data['URL'],
"DOI": ar_data['DOI'],
"publisher": ar_data['publisher'],
"journal": ar_data['journal'],
"volume": ar_data['volume'],
"issue": ar_data['issue'],
"page": ar_data['page']
}
num[0] += 1 # Update the counter
return ar_data_transform
# ====== Main code for function =====
ar_names = os.listdir(au_folder) # Read the folder
for ar_list in ar_dataset:
for Dict in ar_list:
if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data
print(str(num[0]) + " copies of article data structure have been transformed.")
if int(Dict.get('volume')) <= 2009:
Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
ar_dataset_new[3].append(Dict)
elif 2010 <= int(Dict.get('volume')) <= 2014:
Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
ar_dataset_new[0].append(Dict)
elif 2015 <= int(Dict.get('volume')) <= 2020:
Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
ar_dataset_new[1].append(Dict)
else:
Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
ar_dataset_new[2].append(Dict)
# Store into the new file
filepaths = [
"./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2010-2014).json",
"./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2015-2020).json",
"./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(newest).json",
"./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(oldest).json",
]
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(ar_dataset_new[i], json_file, indent=4)
print("\nComplete: All of the article data structure have been transformed.")
# Author data structure transfer
def auDataTransform(au_dataset, num):
def transform(list, num):
new_list = [] # New list to store transformed data
for au_data in list:
if num[0] % 100 == 0 and num[0] != 0: # Alert for complete data
print(str(num[0]) + " copies of author data structure have been transformed.\n")
if au_data['middle_name'] is not None:
raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
else:
raw_name = au_data['first_name'] + ' ' + au_data['last_name']
au_data_transform = {
"author_id": au_data['author_id'],
"from_article": au_data['from_article'],
"first_name": au_data['first_name'],
"last_name": au_data['last_name'],
"middle_name": au_data['middle_name'],
"raw_name": raw_name,
"affiliation": au_data['affiliation']
}
new_list.append(au_data_transform)
num[0] += 1 # Update the counter
return new_list
for i in range(4):
au_list = transform(au_dataset[i], num)
au_dataset_new[i].extend(au_list)
# Store into the new file
filepaths = [
"./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2010-2014).json",
"./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2015-2020).json",
"./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(newest).json",
"./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(oldest).json",
]
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(au_dataset_new[i], json_file, indent=4)
print("\nComplete: All of the author data structure have been transformed.")
# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []
ar_dataset_new = [[] for _ in range(4)] # New list for transformed data
au_dataset_new = [[] for _ in range(4)] # New list to store transformed data
num1 = [0] # Counter for processed ar_data
num2 = [0] # Counter for processed au_data
os.makedirs('./SpringerOpen_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./SpringerOpen_buffer_transform/Author_output/', exist_ok=True)
# Read the data
ar_dataset = fileReader('./SpringerOpen_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./SpringerOpen_buffer/Author_output', au_dataset)
# Change the structure
# arDataTransform('./SpringerOpen_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
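Both of the transfer scripts above split records into four buckets by year (the EJDE variant branches on publish_datetime, the SpringerOpen variant on volume). A minimal sketch of that four-way split as a standalone helper (the function name year_bucket is illustrative):

def year_bucket(year: int) -> str:
    # Mirrors the branching in arDataTransform(): oldest (<= 2009), 2010-2014, 2015-2020, newest (> 2020).
    if year <= 2009:
        return "oldest"
    elif year <= 2014:
        return "2010-2014"
    elif year <= 2020:
        return "2015-2020"
    return "newest"

print(year_bucket(2008), year_bucket(2012), year_bucket(2019), year_bucket(2023))
# oldest 2010-2014 2015-2020 newest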

View File

@@ -5,22 +5,6 @@ import unicodedata
from collections import OrderedDict
from pprint import pprint
'''
========== FileStructureTransfer ==========
1. This program restructures the crawled data.
2. Based on each paper's publication year, the final data are stored in four JSON files:
(1) newest: published after 2020
(2) oldest: published before 2010
(3) 2010-2014: published from 2010 to 2014
(4) 2015-2020: published from 2015 to 2020
3. Because the total amount of data for some sites is too large, the output is split into multiple files.
4. The program runs in the following order:
(1) fileReader() reads the locally crawled data into a list for processing
(2) arDataTransform() transforms the article data structure
(3) auDataTransform() transforms the author data structure
(4) the transformed data are written to the output folder
'''
# Read the data
def fileReader(folder, dataset):
@@ -58,28 +42,12 @@ def arDataTransform(au_folder, ar_dataset, num):
if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
au_ID.append(Dict.get('author_id'))
author_names_new = []
author_names = ar_data['authors']
for author_name in author_names:
author_name_new = ''
author_name = author_name.split(", ")
for i in range(len(author_name)-1, 0, -1):
# print(author_name[i])
author_name_new += author_name[i]
if i != 0:
author_name_new += ', '
print(author_name_new)
author_names_new.append(author_name_new)
# Change the structure
ar_data_transform = {
"article_id": ar_data['article_id'],
"title": ar_data['title'],
"authors": au_ID,
"authors_name": author_names_new,
"authors_name": ar_data['authors'],
"submit_datetime": ar_data['submit_datetime'],
"publish_datetime": ar_data['publish_datetime'],
"keywords": ar_data['keywords'],
@@ -128,6 +96,13 @@ def arDataTransform(au_folder, ar_dataset, num):
"./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
]
# for filepath in filepaths:
# for list in ar_dataset_new:
# with open(filepath, "w", encoding='utf-8') as json_file:
# json.dump(list, json_file, indent=4)
#
# break
for i in range(4):
with open(filepaths[i], 'w', encoding='utf-8') as json_file:
json.dump(ar_dataset_new[i], json_file, indent=4)
@@ -152,8 +127,8 @@ def auDataTransform(au_dataset, num):
au_data_transform = {
"author_id": au_data['author_id'],
"from_article": au_data['from_article'][0],
"first_name": au_data['last_name'],
"last_name": au_data['first_name'],
"first_name": au_data['first_name'],
"last_name": au_data['last_name'],
"middle_name": au_data['middle_name'],
"raw_name": raw_name,
"affiliation": au_data['affiliation']
@@ -164,6 +139,13 @@ def auDataTransform(au_dataset, num):
return new_list
# # Transform the author data structure
# au_dataset_new = [] # New list to store transformed data
# for au_list in au_dataset:
# au_list_new = transform(au_list, num)
# au_dataset_new.append(au_list_new)
for i in range(4):
au_list = transform(au_dataset[i], num)
au_dataset_new[i].append(au_list)
@@ -203,4 +185,4 @@ au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)
# Change the structure
arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
# auDataTransform(au_dataset, num2)
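One detail worth flagging across the three transfer scripts: the EJDE and EJQTDE versions append each per-bucket author list to au_dataset_new[i] (nesting a list inside the output JSON), while the SpringerOpen version extends it (keeping the output flat). A minimal illustration of the difference:

nested, flat = [], []
batch = [{"author_id": "a1"}, {"author_id": "a2"}]

nested.append(batch)   # au_dataset_new[i].append(au_list) -> one nested list
flat.extend(batch)     # au_dataset_new[i].extend(au_list) -> flat list of dicts

print(nested)  # [[{'author_id': 'a1'}, {'author_id': 'a2'}]]
print(flat)    # [{'author_id': 'a1'}, {'author_id': 'a2'}]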

View File

@@ -26,12 +26,11 @@ payload = {
jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
# Aminer API
api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
api_paper_detail = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
api_author_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"
api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
def aminer_get_paper_id(title):
def aminer_get_id(title):
headers = {
"Authorization": f"Bearer {jwt_token}"
}
@@ -40,7 +39,7 @@ def aminer_get_paper_id(title):
"size": "",
"title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
}
response = requests.get(api_paper_id, headers=headers, params=params)
response = requests.get(api_get_id, headers=headers, params=params)
if response.status_code == 200:
data = response.json()
@@ -50,7 +49,7 @@ def aminer_get_paper_id(title):
not_on_aminer.append(title)
def aminer_post_paper_citation(aminer_id):
def aminer_post_citation(aminer_id):
headers = {
"Content-Type": "application/json;charset=utf-8",
"Authorization": f"Bearer {jwt_token}"
@@ -58,7 +57,7 @@ def aminer_post_paper_citation(aminer_id):
request_data = {
"ids": aminer_id
}
response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
if response.status_code == 200:
data = response.json()
@@ -73,31 +72,6 @@ def aminer_post_paper_citation(aminer_id):
aminer_paper_citation_retry.append(aminer_id)
def aminer_author_info(author_aminer_id, author_name, offset):
headers = {
"Content-Type": "application/json;charset=utf-8",
"Authorization": f"Bearer {jwt_token}"
}
request_data = {
"ids": author_aminer_id,
"query": author_name,
"offset": offset
}
response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
if response.status_code == 200:
data = response.json()
if data.get("success"):
for item in data.get('data', []):
if 'n_citation' in item:
n_citation = item['n_citation']
else:
n_citation = 0
aminer_paper_citation.append(n_citation)
else:
aminer_paper_citation_retry.append(author_aminer_id)
def scholarly_get_citation(title):
# # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
pg = ProxyGenerator()
@@ -118,7 +92,8 @@ aminer_paper_citation = []
aminer_paper_citation_retry = []
# scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
aminer_get_paper_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
if aminer_paper_id:
aminer_post_paper_citation(aminer_paper_id)
aminer_post_citation(aminer_paper_id)
print(aminer_paper_citation)
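The Aminer requests above authenticate with a PyJWT-signed bearer token. A minimal sketch of that request pattern; secret_key, payload, and head are placeholders here (the real values are defined earlier in the script and are not shown in this diff):

import re
import jwt        # PyJWT
import requests

secret_key = "placeholder-secret"       # placeholder, not the real key
payload = {"aud": "open_platform"}      # placeholder claims
head = {"alg": "HS256", "typ": "JWT"}   # placeholder header

jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"

def lookup_paper(title):
    headers = {"Authorization": f"Bearer {jwt_token}"}
    params = {
        "size": "",
        # Strip punctuation from the title, as aminer_get_id() does.
        "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip(),
    }
    response = requests.get(api_get_id, headers=headers, params=params)
    return response.json() if response.status_code == 200 else None

# Example: lookup_paper("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")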

View File

@@ -1,8 +1,7 @@
import re
import time
import uuid
import requests
import threading
import re
import ejde_save
from retrying import retry
@@ -14,8 +13,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
'''
Crawls the site 'ejde.math.txstate.edu'
Total number of papers: 2023/08/08 - 4785
Total Time via VPN w/100ms-delay: 96.30s
Total number of papers: 2023/08/08 - 4761
Total Time via VPN w/100ms-delay: 306.73s
========== Run order ==========
(1) ejde_main: get the journal links for each year -> crawl each paper's details and author info -> call ejde_save -> store temporarily in small JSON files
@@ -24,22 +23,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
'''
def append_data_thread_safe(from_list, to_list, data_lock):
with data_lock:
to_list.append(from_list)
def save_data_thread_safe(data, data_lock, data_type):
global articleNum, authorNum
with data_lock:
ejde_save.save_data(data, f"{data_type}_TS", str(uuid.uuid4()) + ".json")
if data_type == "Article":
articleNum += len(data)
else:
authorNum += len(data)
data.clear()
def datetime_transform(date):
month_typo = {
"Janaury": "January",
@@ -137,7 +120,7 @@ def process_html_article(baseweb, article):
# Get article title & url
try:
title = article.text.strip()
title = str(re.sub(r'\s+', ' ', title).strip())
title = re.sub(r'\s+', ' ', title).strip()
article_url = baseweb + article.find_next("a")["href"]
if "../../index.html" in article_url:
print("Redundant URL:", article_url)
@@ -165,6 +148,7 @@ def process_html_article(baseweb, article):
@retry(wait_fixed=5000, stop_max_attempt_number=5)
def process_article(title, article_url):
global articleNum, authorNum
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
article_response = requests.get(article_url, headers=headers)
@@ -178,11 +162,11 @@ def process_article(title, article_url):
# Extract title if title == None
if not title:
title_match = re.search(r"<h3>(.*?)<p>", article_text)
title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""
title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
# Extract issue
issue_match = re.search(r'No\. (\d+)', article_text)
issue = issue_match.group(1) if issue_match else ""
issue = issue_match.group(1) if issue_match else None
# Extract volume
volume_match = re.search(r'Vol\. (\d+)', article_text)
@@ -205,23 +189,21 @@ def process_article(title, article_url):
volume = str(volume)
issue = "Conference " + str(issue_number)
else:
volume = ""
volume = None
# Extract pp
pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
pp = pp_match.group(1) if pp_match else ""
pp = pp_match.group(1) if pp_match else None
# Extract submission date
match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
if not match:
match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
submitted_date = match.group(1) if match else ""
submitted_date = match.group(1) if match else None
if submitted_date:
submitted_date = datetime_transform(submitted_date)
# Extract publication date
match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
publish_date = match.group(1) if match else ""
publish_date = match.group(1) if match else None
if publish_date:
publish_date = datetime_transform(publish_date)
@@ -252,25 +234,25 @@ def process_article(title, article_url):
doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
if not doi_match:
doi_match = re.search(r'DOI: (.+)', html)
doi = doi_match.group(1) if doi_match else ""
doi = doi_match.group(1) if doi_match else None
doi = doi.replace('https://doi.org/', '') # strip doi website header
# Article_id
article_id = str(uuid.uuid4())
# Author info
authors = []
author_names = []
table = article_soup.find('table')
if table:
for row in table.find_all('tr'):
cells = [cell.text.strip() for cell in row.find_all('td')]
for cell in cells:
if "email" in cell:
cell = cell.split("email")
if "email:" in cell:
cell = cell.split("email:")
email_list = str(cell[1]).split(',')
cell = cell[0]
elif "e-mail" in cell:
cell = cell.split("e-mail")
elif "e-mail:" in cell:
cell = cell.split("e-mail:")
email_list = str(cell[1]).split(',')
cell = cell[0]
else:
@@ -282,11 +264,8 @@ def process_article(title, article_url):
# Data processing
if cell[0]:
author_id = str(uuid.uuid4())
authors.append(author_id)
author_names.append(unidecode(cell[0]))
name = re.split(r'\s+', cell[0])
name = [item for item in name if item != '']
authors.append(unidecode(cell[0]))
name = re.split(r'[ .]', cell[0])
affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -297,21 +276,19 @@ def process_article(title, article_url):
emails.append(unidecode(email_match.group())) if email_match else None
author_data = {
"author_id": author_id,
"from_article": article_id,
"author_id": str(uuid.uuid4()),
"from_article": [article_id],
"first_name": unidecode(name[0]),
"last_name": unidecode(name[-1]),
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
"raw_name": unidecode(cell[0]),
"affiliation": [
{
"year": volume,
"affiliation": unidecode(affiliation),
"email": ", ".join(emails)
}
]
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
"affiliation": [{
"year": volume,
"affiliation": unidecode(affiliation),
"email": emails
}]
}
append_data_thread_safe(author_data, authorData, authorDataLock)
authorData.append(author_data)
authorNum += 1
# If no author table
else:
match_type = 0
@@ -331,12 +308,12 @@ def process_article(title, article_url):
matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
matches = matches.split("<p>")
for match in matches:
if "email" in match:
match = match.split("email")
if "email:" in match:
match = match.split("email:")
email_list = str(match[1]).split(',')
match = match[0]
elif "e-mail" in match:
match = match.split("e-mail")
elif "e-mail:" in match:
match = match.split("e-mail:")
email_list = str(match[1]).split(',')
match = match[0]
else:
@@ -353,11 +330,8 @@ def process_article(title, article_url):
# Data processing
if match[0]:
author_id = str(uuid.uuid4())
authors.append(author_id)
authors.append(unidecode(match[0]))
name = re.split(r'\s+', match[0])
name = [item for item in name if item != '']
name = re.split(r'[ .]', match[0])
affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -368,21 +342,19 @@ def process_article(title, article_url):
emails.append(unidecode(email_match.group())) if email_match else None
author_data = {
"author_id": author_id,
"from_article": article_id,
"author_id": str(uuid.uuid4()),
"from_article": [article_id],
"first_name": unidecode(name[0]),
"last_name": unidecode(name[-1]),
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
"raw_name": unidecode(match[0]),
"affiliation": [
{
"year": volume,
"affiliation": unidecode(affiliation),
"email": ", ".join(emails)
}
]
"middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
"affiliation": [{
"year": volume,
"affiliation": unidecode(affiliation),
"email": emails
}]
}
append_data_thread_safe(author_data, authorData, authorDataLock)
authorData.append(author_data)
authorNum += 1
else:
print("AUTHOR SEARCHING ERROR:", article_url)
fail = {
@@ -396,7 +368,7 @@ def process_article(title, article_url):
"article_id": article_id,
"title": unidecode(title),
"authors": authors,
"author_names": author_names,
"corresponding_authors": None,
"submit_datetime": submitted_date,
"publish_datetime": publish_date,
"keywords": keywords,
@@ -409,14 +381,17 @@ def process_article(title, article_url):
"issue": issue,
"page": pp
}
append_data_thread_safe(article_data, articleData, articleDataLock)
articleData.append(article_data)
articleNum += 1
# Save the data periodically based on batch size
if len(articleData) % batch_size == 0:
save_data_thread_safe(articleData, articleDataLock, "Article")
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
articleData.clear()
if len(authorData) % batch_size == 0:
save_data_thread_safe(authorData, authorDataLock, "Author")
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
authorData.clear()
start_time = time.time()
@@ -454,8 +429,6 @@ authorNum = 0
articleNum = 0
batch_size = 100 # Number of articles to process before saving
authorDataLock = threading.Lock()
articleDataLock = threading.Lock()
executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2)) # Set the number of worker threads
# Process each URL using multithreading
@@ -469,30 +442,29 @@ for future in as_completed(futures):
print("VOLUME PROCESSING ERROR:", str(vol_err))
# Retry failed processing paper
if len(failedData):
print("START RETRYING:", len(failedData))
while failedData:
fail_data = failedData.pop(0)
articleTitle = fail_data["title"]
articleUrl = fail_data["URL"]
try:
process_article(articleTitle, articleUrl)
except Exception as retry_err:
print("ARTICLE RETRYING FAILURE:", str(retry_err))
totally_fail = {
"title": articleTitle,
"URL": articleUrl
}
totallyFailedData.append(totally_fail)
print("START RETRYING:", len(failedData))
while failedData:
data = failedData.pop(0)
articleTitle = data["title"]
articleUrl = data["URL"]
try:
process_article(articleTitle, articleUrl)
except Exception as retry_err:
print("ARTICLE RETRYING FAILURE:", str(retry_err))
totally_fail = {
"title": articleTitle,
"URL": articleUrl
}
totallyFailedData.append(totally_fail)
# Save remaining data
if len(articleData) > 0:
save_data_thread_safe(articleData, articleDataLock, "Article")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
if len(authorData) > 0:
save_data_thread_safe(authorData, authorDataLock, "Author")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
# Save error record
if len(totallyFailedData) > 0:
@@ -513,5 +485,5 @@ print("Total fetched author:", authorNum)
print("time elapsed: {:.2f}s".format(time.time() - start_time))
# Transfer to large file and delete the temporary storage files
ejde_save.transform_data()
ejde_save.delete_data()
ejde_save.Transf()
ejde_save.delete()
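The helpers removed above (append_data_thread_safe / save_data_thread_safe) guarded the shared article and author buffers with a threading.Lock and flushed them to disk in batches. A minimal sketch of that pattern, with a stand-in save_batch() in place of ejde_save.save_data():

import json
import os
import uuid
import threading

data_lock = threading.Lock()
article_buffer = []
BATCH_SIZE = 100

def save_batch(records, folder="./ejde_buffer/Article_TS"):
    # Stand-in for ejde_save.save_data(): dump one batch to a uuid-named JSON file.
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, str(uuid.uuid4()) + ".json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4)

def append_article_thread_safe(article_data):
    # Only one worker thread mutates (and possibly flushes) the buffer at a time;
    # checking the batch size inside the same lock keeps two threads from both flushing.
    with data_lock:
        article_buffer.append(article_data)
        if len(article_buffer) >= BATCH_SIZE:
            save_batch(list(article_buffer))
            article_buffer.clear()

append_article_thread_safe({"article_id": str(uuid.uuid4()), "title": "example"})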

View File

@@ -14,8 +14,8 @@ def save_data(dataset, filetype, filename):
# Write into output files
def transform_data():
def read(folder_path, output_files):
def Transf():
def Read(folder_path, output_files):
# Create new folders
os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
@@ -24,8 +24,6 @@ def transform_data():
data_2010_2014 = []
data_2015_2020 = []
data_newest = []
data_no_date = []
data_integrate = []
for filename in os.listdir(folder_path):
if filename.endswith('.json'):
@@ -33,29 +31,24 @@ def transform_data():
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
for Dict in data:
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) != '':
if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
# Select data
if int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
data_oldest.append(Dict)
elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014:
data_2010_2014.append(Dict)
elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
data_2015_2020.append(Dict)
else:
data_newest.append(Dict)
else:
data_no_date.append(Dict)
data_integrate.append(data_oldest)
data_integrate.append(data_2010_2014)
data_integrate.append(data_2015_2020)
data_integrate.append(data_newest)
data_integrate.append(data_no_date)
# Transfer
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest, data_no_date, data_integrate]
Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
for index in range(0, 6):
for index in range(0, 4):
with open(output_files[index], 'w', encoding='utf-8') as file:
json.dump(Data[index], file, indent=4)
@@ -68,30 +61,26 @@ def transform_data():
'./ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(newest).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(no date).json',
'./ejde_buffer/Author_output/EJDE_Author_output_file(integration).json'
'./ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
]
article_output_file = [
'./ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(newest).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(no date).json',
'./ejde_buffer/Article_output/EJDE_Article_output_file(integration).json'
'./ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
]
# Read and write into files
read(author_folder_path, author_output_file)
read(article_folder_path, article_output_file)
Read(author_folder_path, author_output_file)
Read(article_folder_path, article_output_file)
# End
print("\nData has been written into files.")
# Delete files in temporary storage area
def delete_data():
def delete():
folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
for folder_path in folder_paths:
file_names = os.listdir(folder_path)
@@ -100,4 +89,5 @@ def delete_data():
if os.path.isfile(file_path):
os.remove(file_path)
os.rmdir(folder_path)
print('\nAttention: The temporary storage files have been deleted!')
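Read() is shared between the article and author buffers, so the year used for bucketing comes either from an article's volume field or from the first affiliation entry of an author record. A small sketch of that fallback using the same .get() chain (the two sample dicts are illustrative):

def record_year(record):
    # Articles carry 'volume'; author records carry the year inside their first affiliation entry.
    return int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))

article = {"volume": "2016"}
author = {"affiliation": [{"year": "2012", "affiliation": "Dept. of Mathematics"}]}
print(record_year(article), record_year(author))  # 2016 2012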

View File

@@ -67,6 +67,7 @@ with ThreadPoolExecutor(max_workers=25) as executor:
futures = [executor.submit(extract_href, url) for url in url_list]
for future in as_completed(futures):
pass
wait(futures)
print('\nAll links have been got.\n')
@@ -110,4 +111,4 @@ print(count2, ' author_data has been stored.')
# Transfer to large file and delete the temporary storage files
ejqtde_save.Transf()
ejqtde_save.delete()
ejqtde_save.delete()
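The change above adds a wait(futures) call after draining as_completed. A minimal sketch of the submit / as_completed / wait pattern, with a placeholder extract_href and an illustrative URL list:

from concurrent.futures import ThreadPoolExecutor, as_completed, wait

def extract_href(url):
    # Placeholder for the real extract_href(), which fetches a volume page and collects article links.
    return url

url_list = ["https://example.org/ejqtde/volume/1"]  # illustrative

with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(extract_href, url) for url in url_list]
    for future in as_completed(futures):
        pass       # results are collected via shared state in the real script
    wait(futures)  # once as_completed has been drained, this returns immediately
print('\nAll links have been got.\n')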