Compare commits: 24aa62c8db ... ad427c24dc

10 Commits

| SHA1 |
|---|
| ad427c24dc |
| 61ef0081d8 |
| ad63bcf6c4 |
| 50e30e105b |
| 7f9ab94adc |
| 35ea1dd424 |
| ad6ba8832a |
| dd0c4379da |
| c3c460a4dc |
| 4c2c68feca |
BIN  Data/Transform/EJDE_buffer_transform.zip  Normal file (binary file not shown)
BIN  Data/Transform/SpringerOpen_buffer_transform.zip  Normal file (binary file not shown)
199  DataTransformer/FileStructureTansfer(EJDE).py  Normal file
@@ -0,0 +1,199 @@
import json
import os
import unicodedata

from collections import OrderedDict
from pprint import pprint

'''
========== FileStructureTransfer ==========
1. This program restructures the scraped data.
2. Based on the publication year, the final data are stored in four json files:
   (1) newest: published after 2020
   (2) oldest: published before 2010
   (3) 2010-2014: published between 2010 and 2014
   (4) 2015-2020: published between 2015 and 2020
3. Because the total amount of data from some sites is too large, it is split into several parts.
4. Execution order:
   (1) fileReader() reads the locally scraped data into a list for processing
   (2) arDataTransform() converts the article data format
   (3) auDataTransform() converts the author data format
   (4) The converted data are written to the output folder
'''


# Read the data
def fileReader(folder, dataset):
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            Data = json.load(json_file)
        dataset.append(Data)

    return dataset


# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    def auInfoFind(path, file_name, ar_data, num):
        authors = ar_data.get('authors')
        authors.append(ar_data.get('corresponding_authors'))

        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            Data = json.load(file)

        au_ID = []  # A new list to store author_id

        # Find the author_id
        for author in authors:
            if author is None:
                continue

            author = author.replace(" ", "")

            for Dict in Data:
                Dict_name = Dict.get('first_name') + "," + Dict.get('last_name')
                Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
                                    unicodedata.category(char) != 'Mn')

                if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))

        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": ar_data['authors'],
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }

        num[0] += 1  # Update the counter
        return ar_data_transform

    # ====== Main code for function =====
    ar_names = os.listdir(au_folder)  # Read the folder

    for ar_list in ar_dataset:
        for Dict in ar_list:
            year = Dict.get('publish_datetime')
            if year is None:
                continue

            year = year.split('-')

            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of article data structure have been transformed.")

            if int(year[0]) <= 2009:
                Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
                ar_dataset_new[3].append(Dict)

            elif 2010 <= int(year[0]) <= 2014:
                Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
                ar_dataset_new[0].append(Dict)

            elif 2015 <= int(year[0]) <= 2020:
                Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
                ar_dataset_new[1].append(Dict)

            else:
                Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
                ar_dataset_new[2].append(Dict)

    # Store into the new file
    filepaths = [
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2010-2014).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(2015-2020).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(newest).json",
        "./EJDE_buffer_transform/Article_output/EJDE_Article_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the article data structure have been transformed.")


# Author data structure transfer
def auDataTransform(au_dataset, num):
    def transform(list, num):
        new_list = []  # New list to store transformed data

        for au_data in list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of author data structure have been transformed.\n")

            if au_data['middle_name'] is not None:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']

            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'][0],
                "first_name": au_data['last_name'],
                "last_name": au_data['first_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            }

            new_list.append(au_data_transform)
            num[0] += 1  # Update the counter

        return new_list

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].append(au_list)

    # Store into the new file
    filepaths = [
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2010-2014).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(2015-2020).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(newest).json",
        "./EJDE_buffer_transform/Author_output/EJDE_Author_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(au_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the author data structure have been transformed.")


# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []

ar_dataset_new = [[] for _ in range(4)]  # New list for transformed data
au_dataset_new = [[] for _ in range(4)]  # New list to store transformed data

num1 = [0]  # Counter for complete ar_data
num2 = [0]  # Counter for complete au_data

os.makedirs('./EJDE_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./EJDE_buffer_transform/Author_output/', exist_ok=True)

# Read the data
ar_dataset = fileReader('./EJDE_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./EJDE_buffer/Author_output', au_dataset)

# Change the structure
arDataTransform('./EJDE_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
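The script above routes each article into one of four buckets from the year prefix of publish_datetime. A minimal sketch of that bucketing rule, with a hypothetical helper name (year_bucket) and labels taken from the output file names; it is an illustration, not part of the committed script:

def year_bucket(publish_datetime):
    # Map a 'YYYY-MM-DD' string to the same four groups used above.
    if publish_datetime is None:
        return None
    year = int(publish_datetime.split('-')[0])
    if year <= 2009:
        return "oldest"
    elif year <= 2014:
        return "2010-2014"
    elif year <= 2020:
        return "2015-2020"
    return "newest"

print(year_bucket("2016-03-01"))  # -> "2015-2020"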
@@ -5,6 +5,22 @@ import unicodedata
 from collections import OrderedDict
 from pprint import pprint
 
+'''
+========== FileStructureTransfer ==========
+1. This program restructures the scraped data.
+2. Based on the publication year, the final data are stored in four json files:
+   (1) newest: published after 2020
+   (2) oldest: published before 2010
+   (3) 2010-2014: published between 2010 and 2014
+   (4) 2015-2020: published between 2015 and 2020
+3. Because the total amount of data from some sites is too large, it is split into several parts.
+4. Execution order:
+   (1) fileReader() reads the locally scraped data into a list for processing
+   (2) arDataTransform() converts the article data format
+   (3) auDataTransform() converts the author data format
+   (4) The converted data are written to the output folder
+'''
+
 
 # Read the data
 def fileReader(folder, dataset):
@@ -42,12 +58,28 @@ def arDataTransform(au_folder, ar_dataset, num):
                 if Dict.get('from_article')[0] == ar_data.get('article_id') and Dict_name == author:
                     au_ID.append(Dict.get('author_id'))
 
+        author_names_new = []
+        author_names = ar_data['authors']
+
+        for author_name in author_names:
+            author_name_new = ''
+            author_name = author_name.split(", ")
+
+            for i in range(len(author_name)-1, 0, -1):
+                # print(author_name[i])
+                author_name_new += author_name[i]
+                if i != 0:
+                    author_name_new += ', '
+
+            print(author_name_new)
+            author_names_new.append(author_name_new)
+
         # Change the structure
         ar_data_transform = {
             "article_id": ar_data['article_id'],
             "title": ar_data['title'],
             "authors": au_ID,
-            "authors_name": ar_data['authors'],
+            "authors_name": author_names_new,
             "submit_datetime": ar_data['submit_datetime'],
             "publish_datetime": ar_data['publish_datetime'],
             "keywords": ar_data['keywords'],
@@ -96,13 +128,6 @@ def arDataTransform(au_folder, ar_dataset, num):
         "./EJQTDE_buffer_transform/Article_output/EJQTDE_Article_output_file(oldest).json",
     ]
 
-    # for filepath in filepaths:
-    #     for list in ar_dataset_new:
-    #         with open(filepath, "w", encoding='utf-8') as json_file:
-    #             json.dump(list, json_file, indent=4)
-    #
-    #     break
-
     for i in range(4):
         with open(filepaths[i], 'w', encoding='utf-8') as json_file:
             json.dump(ar_dataset_new[i], json_file, indent=4)
@@ -127,8 +152,8 @@ def auDataTransform(au_dataset, num):
             au_data_transform = {
                 "author_id": au_data['author_id'],
                 "from_article": au_data['from_article'][0],
-                "first_name": au_data['first_name'],
-                "last_name": au_data['last_name'],
+                "first_name": au_data['last_name'],
+                "last_name": au_data['first_name'],
                 "middle_name": au_data['middle_name'],
                 "raw_name": raw_name,
                 "affiliation": au_data['affiliation']
@@ -139,13 +164,6 @@ def auDataTransform(au_dataset, num):
 
         return new_list
 
-    # # Transform the author data structure
-    # au_dataset_new = []  # New list to store transformed data
-
-    # for au_list in au_dataset:
-    #     au_list_new = transform(au_list, num)
-    #     au_dataset_new.append(au_list_new)
-
     for i in range(4):
         au_list = transform(au_dataset[i], num)
         au_dataset_new[i].append(au_list)
@@ -185,4 +203,4 @@ au_dataset = fileReader('./EJQTDE_buffer/Author_output', au_dataset)
 
 # Change the structure
 arDataTransform('./EJQTDE_buffer/Author_output', ar_dataset, num1)
-# auDataTransform(au_dataset, num2)
+auDataTransform(au_dataset, num2)
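The author_names_new loop added in the hunk above rebuilds each "Last, First" style name by walking its comma-separated parts in reverse. A minimal sketch of that idea (illustrative only; reorder_name is a hypothetical helper, and unlike the committed loop it also keeps the first part and avoids a trailing separator):

def reorder_name(name):
    # "Doe, Jane" -> "Jane, Doe"; single-part names pass through unchanged.
    parts = name.split(", ")
    return ", ".join(reversed(parts))

print(reorder_name("Doe, Jane"))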
189  DataTransformer/FileStructureTansfer(SprigerOpen).py  Normal file
@@ -0,0 +1,189 @@
import json
import os
import unicodedata

from collections import OrderedDict
from pprint import pprint

'''
========== FileStructureTransfer ==========
1. This program restructures the scraped data.
2. Based on the publication year, the final data are stored in four json files:
   (1) newest: published after 2020
   (2) oldest: published before 2010
   (3) 2010-2014: published between 2010 and 2014
   (4) 2015-2020: published between 2015 and 2020
3. Because the total amount of data from some sites is too large, it is split into several parts.
4. Execution order:
   (1) fileReader() reads the locally scraped data into a list for processing
   (2) arDataTransform() converts the article data format
   (3) auDataTransform() converts the author data format
   (4) The converted data are written to the output folder
'''


# Read the data
def fileReader(folder, dataset):
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            Data = json.load(json_file)
        dataset.append(Data)

    return dataset


# Article data structure transfer
def arDataTransform(au_folder, ar_dataset, num):
    def auInfoFind(path, file_name, ar_data, num):
        authors = ar_data.get('authors')
        authors.extend(ar_data.get('corresponding_authors'))

        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            Data = json.load(file)

        au_ID = []  # A new list to store author_id

        # Find the author_id
        for author in authors:

            for Dict in Data:
                Dict_name = Dict.get('first_name') + ' ' + Dict.get('last_name')
                Dict_name = ''.join(char for char in unicodedata.normalize('NFKD', Dict_name) if
                                    unicodedata.category(char) != 'Mn')

                if Dict.get('from_article') == ar_data.get('article_id') and Dict_name == author:
                    au_ID.append(Dict.get('author_id'))

        # Change the structure
        ar_data_transform = {
            "article_id": ar_data['article_id'],
            "title": ar_data['title'],
            "authors": au_ID,
            "authors_name": authors,
            "submit_datetime": ar_data['submit_datetime'],
            "publish_datetime": ar_data['publish_datetime'],
            "keywords": ar_data['keywords'],
            "MSC": ar_data['MSC'],
            "URL": ar_data['URL'],
            "DOI": ar_data['DOI'],
            "publisher": ar_data['publisher'],
            "journal": ar_data['journal'],
            "volume": ar_data['volume'],
            "issue": ar_data['issue'],
            "page": ar_data['page']
        }

        num[0] += 1  # Update the counter
        return ar_data_transform

    # ====== Main code for function =====
    ar_names = os.listdir(au_folder)  # Read the folder

    for ar_list in ar_dataset:
        for Dict in ar_list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of article data structure have been transformed.")

            if int(Dict.get('volume')) <= 2009:
                Dict = auInfoFind(au_folder, ar_names[3], Dict, num)
                ar_dataset_new[3].append(Dict)

            elif 2010 <= int(Dict.get('volume')) <= 2014:
                Dict = auInfoFind(au_folder, ar_names[0], Dict, num)
                ar_dataset_new[0].append(Dict)

            elif 2015 <= int(Dict.get('volume')) <= 2020:
                Dict = auInfoFind(au_folder, ar_names[1], Dict, num)
                ar_dataset_new[1].append(Dict)

            else:
                Dict = auInfoFind(au_folder, ar_names[2], Dict, num)
                ar_dataset_new[2].append(Dict)

    # Store into the new file
    filepaths = [
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2010-2014).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(2015-2020).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(newest).json",
        "./SpringerOpen_buffer_transform/Article_output/SpringerOpen_Article_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(ar_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the article data structure have been transformed.")


# Author data structure transfer
def auDataTransform(au_dataset, num):
    def transform(list, num):
        new_list = []  # New list to store transformed data

        for au_data in list:
            if num[0] % 100 == 0 and num[0] != 0:  # Alert for complete data
                print(str(num[0]) + " copies of author data structure have been transformed.\n")

            if au_data['middle_name'] is not None:
                raw_name = au_data['first_name'] + ' ' + au_data['middle_name'] + ' ' + au_data['last_name']
            else:
                raw_name = au_data['first_name'] + ' ' + au_data['last_name']

            au_data_transform = {
                "author_id": au_data['author_id'],
                "from_article": au_data['from_article'],
                "first_name": au_data['first_name'],
                "last_name": au_data['last_name'],
                "middle_name": au_data['middle_name'],
                "raw_name": raw_name,
                "affiliation": au_data['affiliation']
            }

            new_list.append(au_data_transform)
            num[0] += 1  # Update the counter

        return new_list

    for i in range(4):
        au_list = transform(au_dataset[i], num)
        au_dataset_new[i].extend(au_list)

    # Store into the new file
    filepaths = [
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2010-2014).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(2015-2020).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(newest).json",
        "./SpringerOpen_buffer_transform/Author_output/SpringerOpen_Author_output_file(oldest).json",
    ]

    for i in range(4):
        with open(filepaths[i], 'w', encoding='utf-8') as json_file:
            json.dump(au_dataset_new[i], json_file, indent=4)

    print("\nComplete: All of the author data structure have been transformed.")


# ========== Main code ========== #
# New list for storing data
ar_dataset = []
au_dataset = []

ar_dataset_new = [[] for _ in range(4)]  # New list for transformed data
au_dataset_new = [[] for _ in range(4)]  # New list to store transformed data

num1 = [0]  # Counter for complete ar_data
num2 = [0]  # Counter for complete au_data

os.makedirs('./SpringerOpen_buffer_transform/Article_output/', exist_ok=True)
os.makedirs('./SpringerOpen_buffer_transform/Author_output/', exist_ok=True)

# Read the data
ar_dataset = fileReader('./SpringerOpen_buffer/Article_output', ar_dataset)
au_dataset = fileReader('./SpringerOpen_buffer/Author_output', au_dataset)

# Change the structure
# arDataTransform('./SpringerOpen_buffer/Author_output', ar_dataset, num1)
auDataTransform(au_dataset, num2)
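Both transformers match scraped author names against the stored author records after stripping diacritics: the name is decomposed with NFKD and the combining marks (Unicode category 'Mn') are dropped. A small self-contained sketch of that normalization step:

import unicodedata

def strip_accents(text):
    # Decompose accented characters, then drop the combining marks.
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(char for char in decomposed if unicodedata.category(char) != 'Mn')

print(strip_accents("Rene Hernández"))  # -> "Rene Hernandez"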
@@ -26,11 +26,12 @@ payload = {
 jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
 
 # Aminer API
-api_get_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
-api_get_citation = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
+api_paper_detail = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/detail/list"
+api_author_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v2/person/search"
 
 
-def aminer_get_id(title):
+def aminer_get_paper_id(title):
     headers = {
         "Authorization": f"Bearer {jwt_token}"
     }
@@ -39,7 +40,7 @@ def aminer_get_id(title):
         "size": "",
         "title": re.sub(r'[^a-zA-Z0-9\s]+', ' ', title).strip()
     }
-    response = requests.get(api_get_id, headers=headers, params=params)
+    response = requests.get(api_paper_id, headers=headers, params=params)
 
     if response.status_code == 200:
         data = response.json()
@@ -49,7 +50,7 @@ def aminer_get_id(title):
         not_on_aminer.append(title)
 
 
-def aminer_post_citation(aminer_id):
+def aminer_post_paper_citation(aminer_id):
     headers = {
         "Content-Type": "application/json;charset=utf-8",
         "Authorization": f"Bearer {jwt_token}"
@@ -57,7 +58,7 @@ def aminer_post_citation(aminer_id):
     request_data = {
         "ids": aminer_id
     }
-    response = requests.post(api_get_citation, headers=headers, data=json.dumps(request_data))
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
 
     if response.status_code == 200:
         data = response.json()
@@ -72,6 +73,31 @@ def aminer_post_citation(aminer_id):
         aminer_paper_citation_retry.append(aminer_id)
 
 
+def aminer_author_info(author_aminer_id, author_name, offset):
+    headers = {
+        "Content-Type": "application/json;charset=utf-8",
+        "Authorization": f"Bearer {jwt_token}"
+    }
+    request_data = {
+        "ids": author_aminer_id,
+        "query": author_name,
+        "offset": offset
+    }
+    response = requests.post(api_paper_detail, headers=headers, data=json.dumps(request_data))
+
+    if response.status_code == 200:
+        data = response.json()
+        if data.get("success"):
+            for item in data.get('data', []):
+                if 'n_citation' in item:
+                    n_citation = item['n_citation']
+                else:
+                    n_citation = 0
+                aminer_paper_citation.append(n_citation)
+        else:
+            aminer_paper_citation_retry.append(author_aminer_id)
+
+
 def scholarly_get_citation(title):
     # # Set up a ProxyGenerator object to use free proxies. This needs to be done only once per session
     pg = ProxyGenerator()
@@ -92,8 +118,7 @@ aminer_paper_citation = []
 aminer_paper_citation_retry = []
 
 # scholarly_get_citation("Traveling waves for unbalanced bistable equations with density dependent diffusion")
-aminer_get_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
-
+aminer_get_paper_id("Heat kernel estimates for fourth-order non-uniformly elliptic operators with non-strongly convex symbols")
 if aminer_paper_id:
-    aminer_post_citation(aminer_paper_id)
+    aminer_post_paper_citation(aminer_paper_id)
     print(aminer_paper_citation)
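The citation script signs a payload with PyJWT and sends it as a Bearer token on every Aminer request. A minimal sketch of that pattern, assuming PyJWT and requests are installed; the secret, claim fields, and query values below are placeholders, not the project's real credentials:

import time

import jwt  # PyJWT
import requests

secret_key = "YOUR_SECRET"                   # placeholder secret
payload = {"exp": int(time.time()) + 3600}   # placeholder claims
head = {"alg": "HS256", "typ": "JWT"}

jwt_token = jwt.encode(payload, secret_key, algorithm="HS256", headers=head)
headers = {"Authorization": f"Bearer {jwt_token}"}

# Example query against the paper-id endpoint named above.
api_paper_id = "https://datacenter.aminer.cn/gateway/open_platform/api/v3/paper/list/by/publish"
response = requests.get(api_paper_id, headers=headers, params={"page": "", "size": "", "title": "example title"})
print(response.status_code)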
@@ -1,7 +1,8 @@
+import re
 import time
 import uuid
 import requests
-import re
+import threading
 import ejde_save
 
 from retrying import retry
@@ -13,8 +14,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 Crawled site: 'ejde.math.txstate.edu'
 
-Total number of papers: 2023/08/08 - 4761
-Total Time via VPN w/100ms-delay: 306.73s
+Total number of papers: 2023/08/08 - 4785
+Total Time via VPN w/100ms-delay: 96.30s
 
 ========== Execution order ==========
 1. ejde_main gets the journal links for each year -> scrapes each article's information and author information -> calls ejde_save -> temporarily stores the data in small json files
@@ -23,6 +24,22 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 '''
 
 
+def append_data_thread_safe(from_list, to_list, data_lock):
+    with data_lock:
+        to_list.append(from_list)
+
+
+def save_data_thread_safe(data, data_lock, data_type):
+    global articleNum, authorNum
+    with data_lock:
+        ejde_save.save_data(data, f"{data_type}_TS", str(uuid.uuid4()) + ".json")
+        if data_type == "Article":
+            articleNum += len(data)
+        else:
+            authorNum += len(data)
+        data.clear()
+
+
 def datetime_transform(date):
     month_typo = {
         "Janaury": "January",
@@ -120,7 +137,7 @@ def process_html_article(baseweb, article):
     # Get article title & url
     try:
         title = article.text.strip()
-        title = re.sub(r'\s+', ' ', title).strip()
+        title = str(re.sub(r'\s+', ' ', title).strip())
         article_url = baseweb + article.find_next("a")["href"]
         if "../../index.html" in article_url:
             print("Redundant URL:", article_url)
@@ -148,7 +165,6 @@ def process_html_article(baseweb, article):
 
 @retry(wait_fixed=5000, stop_max_attempt_number=5)
 def process_article(title, article_url):
-    global articleNum, authorNum
     headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
     article_response = requests.get(article_url, headers=headers)
@@ -162,11 +178,11 @@ def process_article(title, article_url):
     # Extract title if title == None
     if not title:
         title_match = re.search(r"<h3>(.*?)<p>", article_text)
-        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else None
+        title = str(re.sub(r'<[^>]+>', '', title_match.group(1)).strip()) if title_match else ""
 
     # Extract issue
     issue_match = re.search(r'No\. (\d+)', article_text)
-    issue = issue_match.group(1) if issue_match else None
+    issue = issue_match.group(1) if issue_match else ""
 
     # Extract volume
     volume_match = re.search(r'Vol\. (\d+)', article_text)
@@ -189,21 +205,23 @@ def process_article(title, article_url):
         volume = str(volume)
         issue = "Conference " + str(issue_number)
     else:
-        volume = None
+        volume = ""
 
     # Extract pp
     pp_match = re.search(r'pp\. (\d+-\d+)', article_text)
-    pp = pp_match.group(1) if pp_match else None
+    pp = pp_match.group(1) if pp_match else ""
 
     # Extract submission date
     match = re.search(r"Submitted ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    submitted_date = match.group(1) if match else None
+    if not match:
+        match = re.search(r"Submitted\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})", html)
+    submitted_date = match.group(1) if match else ""
     if submitted_date:
         submitted_date = datetime_transform(submitted_date)
 
     # Extract publication date
     match = re.search(r"Published ([A-Za-z]+\s+\d{1,2}, \d{4})", html)
-    publish_date = match.group(1) if match else None
+    publish_date = match.group(1) if match else ""
     if publish_date:
         publish_date = datetime_transform(publish_date)
 
@@ -234,25 +252,25 @@ def process_article(title, article_url):
     doi_match = re.search(r'DOI: ([^\t\n<]+)', html)
     if not doi_match:
         doi_match = re.search(r'DOI: (.+)', html)
-    doi = doi_match.group(1) if doi_match else None
-    doi = doi.replace('https://doi.org/', '')  # strip doi website header
+    doi = doi_match.group(1) if doi_match else ""
 
     # Article_id
     article_id = str(uuid.uuid4())
 
     # Author info
     authors = []
+    author_names = []
     table = article_soup.find('table')
    if table:
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            for cell in cells:
-                if "email:" in cell:
-                    cell = cell.split("email:")
+                if "email" in cell:
+                    cell = cell.split("email")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
-                elif "e-mail:" in cell:
-                    cell = cell.split("e-mail:")
+                elif "e-mail" in cell:
+                    cell = cell.split("e-mail")
                     email_list = str(cell[1]).split(',')
                     cell = cell[0]
                 else:
@@ -264,8 +282,11 @@ def process_article(title, article_url):
 
             # Data processing
             if cell[0]:
-                authors.append(unidecode(cell[0]))
-                name = re.split(r'[ .]', cell[0])
+                author_id = str(uuid.uuid4())
+                authors.append(author_id)
+                author_names.append(unidecode(cell[0]))
+                name = re.split(r'\s+', cell[0])
+                name = [item for item in name if item != '']
                 affiliation = ', '.join(cell[1:]).lstrip(",").rstrip(",").strip()
                 affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                 affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -276,19 +297,21 @@ def process_article(title, article_url):
                 emails.append(unidecode(email_match.group())) if email_match else None
 
                 author_data = {
-                    "author_id": str(uuid.uuid4()),
-                    "from_article": [article_id],
+                    "author_id": author_id,
+                    "from_article": article_id,
                     "first_name": unidecode(name[0]),
                     "last_name": unidecode(name[-1]),
-                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
-                    "affiliation": [{
+                    "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
+                    "raw_name": unidecode(cell[0]),
+                    "affiliation": [
+                        {
                         "year": volume,
                         "affiliation": unidecode(affiliation),
-                        "email": emails
-                    }]
+                            "email": ", ".join(emails)
+                        }
+                    ]
                 }
-                authorData.append(author_data)
-                authorNum += 1
+                append_data_thread_safe(author_data, authorData, authorDataLock)
     # If no author table
     else:
         match_type = 0
@@ -308,12 +331,12 @@ def process_article(title, article_url):
             matches = matches.replace('["', '').replace('"]', '').replace('[\'', '').replace('\']', '')
             matches = matches.split("<p>")
             for match in matches:
-                if "email:" in match:
-                    match = match.split("email:")
+                if "email" in match:
+                    match = match.split("email")
                     email_list = str(match[1]).split(',')
                     match = match[0]
-                elif "e-mail:" in match:
-                    match = match.split("e-mail:")
+                elif "e-mail" in match:
+                    match = match.split("e-mail")
                     email_list = str(match[1]).split(',')
                     match = match[0]
                 else:
@@ -330,8 +353,11 @@ def process_article(title, article_url):
 
                 # Data processing
                 if match[0]:
+                    author_id = str(uuid.uuid4())
+                    authors.append(author_id)
                     authors.append(unidecode(match[0]))
-                    name = re.split(r'[ .]', match[0])
+                    name = re.split(r'\s+', match[0])
+                    name = [item for item in name if item != '']
                     affiliation = ''.join(match[1:]).lstrip(",").rstrip(",").strip()
                     affiliation = re.sub(r'^[^a-zA-Z]*', '', re.sub(r'\s+', ' ', re.sub(r',+', ',', affiliation)))
                     affiliation = affiliation.lstrip(",").rstrip(",").strip()
@@ -342,19 +368,21 @@ def process_article(title, article_url):
                     emails.append(unidecode(email_match.group())) if email_match else None
 
                     author_data = {
-                        "author_id": str(uuid.uuid4()),
-                        "from_article": [article_id],
+                        "author_id": author_id,
+                        "from_article": article_id,
                         "first_name": unidecode(name[0]),
                         "last_name": unidecode(name[-1]),
-                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else None,
-                        "affiliation": [{
+                        "middle_name": unidecode(''.join(name[1:-1])) if len(name[1:-1]) > 0 else "",
+                        "raw_name": unidecode(match[0]),
+                        "affiliation": [
+                            {
                             "year": volume,
                             "affiliation": unidecode(affiliation),
-                            "email": emails
-                        }]
+                                "email": ", ".join(emails)
+                            }
+                        ]
                     }
-                    authorData.append(author_data)
-                    authorNum += 1
+                    append_data_thread_safe(author_data, authorData, authorDataLock)
         else:
             print("AUTHOR SEARCHING ERROR:", article_url)
             fail = {
@@ -368,7 +396,7 @@ def process_article(title, article_url):
         "article_id": article_id,
         "title": unidecode(title),
         "authors": authors,
-        "corresponding_authors": None,
+        "author_names": author_names,
         "submit_datetime": submitted_date,
         "publish_datetime": publish_date,
         "keywords": keywords,
@@ -381,17 +409,14 @@ def process_article(title, article_url):
         "issue": issue,
         "page": pp
     }
-    articleData.append(article_data)
-    articleNum += 1
+    append_data_thread_safe(article_data, articleData, articleDataLock)
 
     # Save the data periodically based on batch size
     if len(articleData) % batch_size == 0:
-        ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
-        articleData.clear()
+        save_data_thread_safe(articleData, articleDataLock, "Article")
 
     if len(authorData) % batch_size == 0:
-        ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
-        authorData.clear()
+        save_data_thread_safe(authorData, authorDataLock, "Author")
 
 
 start_time = time.time()
@@ -429,6 +454,8 @@ authorNum = 0
 articleNum = 0
 
 batch_size = 100  # Number of articles to process before saving
+authorDataLock = threading.Lock()
+articleDataLock = threading.Lock()
 executor = ThreadPoolExecutor(max_workers=int(len(url_list) / 2))  # Set the number of worker threads
 
 # Process each URL using multithreading
@@ -442,11 +469,12 @@ for future in as_completed(futures):
         print("VOLUME PROCESSING ERROR:", str(vol_err))
 
 # Retry failed processing paper
+if len(failedData):
     print("START RETRYING:", len(failedData))
 while failedData:
-    data = failedData.pop(0)
-    articleTitle = data["title"]
-    articleUrl = data["URL"]
+    fail_data = failedData.pop(0)
+    articleTitle = fail_data["title"]
+    articleUrl = fail_data["URL"]
     try:
         process_article(articleTitle, articleUrl)
     except Exception as retry_err:
@@ -459,11 +487,11 @@ while failedData:
 
 # Save remaining data
 if len(articleData) > 0:
-    ejde_save.save_data(articleData, "Article_TS", str(uuid.uuid4()) + ".json")
+    save_data_thread_safe(articleData, articleDataLock, "Article")
     print("COMPLETE: All EJDE article data has been saved to ./ejde_buffer/Article_TS/")
 
 if len(authorData) > 0:
-    ejde_save.save_data(authorData, "Author_TS", str(uuid.uuid4()) + ".json")
+    save_data_thread_safe(authorData, authorDataLock, "Author")
     print("COMPLETE: All EJDE author data has been saved to ./ejde_buffer/Author_TS/")
 
 # Save error record
@@ -485,5 +513,5 @@ print("Total fetched author:", authorNum)
 print("time elapsed: {:.2f}s".format(time.time() - start_time))
 
 # Transfer to large file and delete the temporary storage files
-ejde_save.Transf()
-ejde_save.delete()
+ejde_save.transform_data()
+ejde_save.delete_data()
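The new append_data_thread_safe / save_data_thread_safe helpers serialize all access to the shared buffers behind a threading.Lock, so worker threads never append to and flush the same batch at the same time. A stripped-down sketch of the same pattern (the flush step here simply writes a uuid-named JSON file; ejde_save itself is not imported):

import json
import threading
import uuid

buffer = []
buffer_lock = threading.Lock()

def append_thread_safe(item, shared_list, lock):
    # Only one thread may touch the shared list at a time.
    with lock:
        shared_list.append(item)

def flush_thread_safe(shared_list, lock, prefix):
    # Write the current batch to its own file, then clear it under the same lock.
    with lock:
        with open(prefix + "_" + str(uuid.uuid4()) + ".json", "w", encoding="utf-8") as fh:
            json.dump(shared_list, fh, indent=4)
        shared_list.clear()

append_thread_safe({"title": "example"}, buffer, buffer_lock)
flush_thread_safe(buffer, buffer_lock, "Article_TS")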
@@ -14,8 +14,8 @@ def save_data(dataset, filetype, filename):
 
 
 # Write into output files
-def Transf():
-    def Read(folder_path, output_files):
+def transform_data():
+    def read(folder_path, output_files):
         # Create new folders
         os.makedirs('./ejde_buffer/Article_output/', exist_ok=True)
         os.makedirs('./ejde_buffer/Author_output/', exist_ok=True)
@@ -24,6 +24,8 @@ def Transf():
         data_2010_2014 = []
         data_2015_2020 = []
         data_newest = []
+        data_no_date = []
+        data_integrate = []
 
         for filename in os.listdir(folder_path):
             if filename.endswith('.json'):
@@ -31,24 +33,29 @@ def Transf():
                 with open(file_path, 'r', encoding='utf-8') as file:
                     data = json.load(file)
                 for Dict in data:
-                    if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
+                    if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) != '':
                         # Select data
                         if int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
                             data_oldest.append(Dict)
                         elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2014:
                             data_2010_2014.append(Dict)
                         elif int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
                             data_2015_2020.append(Dict)
                         else:
                             data_newest.append(Dict)
+                    else:
+                        data_no_date.append(Dict)
+
+        data_integrate.append(data_oldest)
+        data_integrate.append(data_2010_2014)
+        data_integrate.append(data_2015_2020)
+        data_integrate.append(data_newest)
+        data_integrate.append(data_no_date)
 
         # Transfer
-        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest]
+        Data = [data_oldest, data_2010_2014, data_2015_2020, data_newest, data_no_date, data_integrate]
 
-        for index in range(0, 4):
+        for index in range(0, 6):
             with open(output_files[index], 'w', encoding='utf-8') as file:
                 json.dump(Data[index], file, indent=4)
 
@@ -61,26 +68,30 @@ def Transf():
         './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
         './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
-        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
+        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(no date).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(integration).json'
     ]
 
     article_output_file = [
         './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
         './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
-        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
+        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(no date).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(integration).json'
     ]
 
     # Read and write into files
-    Read(author_folder_path, author_output_file)
-    Read(article_folder_path, article_output_file)
+    read(author_folder_path, author_output_file)
+    read(article_folder_path, article_output_file)
 
     # End
     print("\nData has been written into files.")
 
 
 # Delete files in temporary storage area
-def delete():
+def delete_data():
     folder_paths = ['./ejde_buffer/Author_TS', './ejde_buffer/Article_TS']
     for folder_path in folder_paths:
         file_names = os.listdir(folder_path)
@@ -89,5 +100,4 @@ def delete():
         if os.path.isfile(file_path):
             os.remove(file_path)
         os.rmdir(folder_path)
-
     print('\nAttention: The temporary storage files have been deleted!')
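transform_data() above merges the uuid-named chunk files from the temporary *_TS folders into a handful of per-period output files, and delete_data() then removes the chunks. A minimal sketch of that merge step, assuming a folder of JSON lists (folder and output names below are illustrative):

import json
import os

def merge_json_chunks(folder, output_path):
    # Concatenate every JSON list found in `folder` into a single output file.
    merged = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            with open(os.path.join(folder, filename), 'r', encoding='utf-8') as fh:
                merged.extend(json.load(fh))
    with open(output_path, 'w', encoding='utf-8') as fh:
        json.dump(merged, fh, indent=4)
    return len(merged)

# merge_json_chunks('./ejde_buffer/Article_TS', './ejde_buffer/Article_output/merged.json')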
@@ -67,7 +67,6 @@ with ThreadPoolExecutor(max_workers=25) as executor:
     futures = [executor.submit(extract_href, url) for url in url_list]
     for future in as_completed(futures):
         pass
-
     wait(futures)
 print('\nAll links have been got.\n')
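The link-collection step shown above submits one extract_href job per volume URL, drains as_completed, and then calls wait before moving on. A small self-contained sketch of that submit-and-collect pattern (fetch_links stands in for the real extract_href worker):

from concurrent.futures import ThreadPoolExecutor, as_completed, wait

def fetch_links(url):
    # Stand-in for extract_href; a real worker would request and parse the page.
    return [url + "/paper-1", url + "/paper-2"]

url_list = ["https://example.org/vol-2019", "https://example.org/vol-2020"]

with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(fetch_links, url) for url in url_list]
    for future in as_completed(futures):
        print(len(future.result()), "links collected")
    wait(futures)

print('\nAll links have been got.\n')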