Merge remote-tracking branch 'origin/main'
commit 69b10a9f72
@@ -1,8 +1,7 @@
 import json
 import os
-from pprint import pprint

 import unicodedata
+from pprint import pprint

 '''
 ========== SameWeb_merge(folder_path) execution order ==========
@@ -53,13 +52,16 @@ def SameWeb_merge(folder_path):

                 # Uniform characters in English
                 fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
-                faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
+                faa = unicodedata.normalize('NFKD', aa['affiliation'][0]["affiliation"]).encode('ascii', 'ignore')

                 if fa != faa:
                     a['affiliation'].extend(aa['affiliation'])
-                elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
+                elif fa == faa and a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
                     a['affiliation'].extend(aa['affiliation'])

+                if len(a['affiliation']) > 1 and a['affiliation'][0] == a['affiliation'][1]:
+                    a['affiliation'].remove(a['affiliation'][0])
+
                 # Delete extra elements
                 Data.remove(aa)

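Note on the hunk above: the NFKD normalize + ASCII encode pair is what folds accented Latin characters to plain English letters before the affiliation strings are compared. A minimal standalone sketch of the idea (the sample strings are invented):

import unicodedata

def to_ascii(s):
    # NFKD splits accented characters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then drops the marks.
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

print(to_ascii('Université de Montréal') == to_ascii('Universite de Montreal'))  # True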
@@ -82,7 +84,7 @@ def SameWeb_merge(folder_path):
                 if len(data) > 0:
                     Data.extend(data)

-    Database = len(Data)
+    Database = len(Data)  # The length of the original data
     Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])

     # Same website data merge
@@ -96,7 +98,7 @@ def SameWeb_merge(folder_path):
     print(str(len(Data)) + ' copies of data in total, now.')

     # Save into file
-    path = os.path.dirname(folder_path)
+    path = os.path.dirname(folder_path)  # parent path
     path = os.path.join(path, "Author_data(merged)")
     os.makedirs(path, exist_ok=True)
     path = os.path.join(path, "Author_data(merged).json")
@@ -106,10 +108,10 @@ def SameWeb_merge(folder_path):

     print('\nData has been added to ' + path + '\Author_data(merged).json')


 # =========== input the file path here ==========
 # SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
 # SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
+# SameWeb_merge('.\ejde\ejde_buffer\Author_output')

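Side note on the print in this hunk: after the second os.path.join, path already ends with the file name, so appending '\Author_data(merged).json' duplicates it, and the string only works because Python happens to leave the unrecognized escape '\A' in place. A sketch of a less fragile equivalent, assuming folder_path is the function's argument:

import os

path = os.path.join(os.path.dirname(folder_path), "Author_data(merged)")
os.makedirs(path, exist_ok=True)
path = os.path.join(path, "Author_data(merged).json")
print('\nData has been added to ' + path)  # path already names the file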
@@ -1,97 +0,0 @@
-import json
-import os
-import unicodedata
-
-'''
-========== SameWeb_merge(folder_path) execution order ==========
-1. Create a new list, read the json files in the folder into it one by one, and sort the list by year
-2. Traverse with the structure
-for a in Data:
-    Merge(a, count, Data):
-        for aa in Data:
-   Each time an element aa meeting the merge condition is found, it is added into element a; aa is then
-   deleted and Merge is called again to re-traverse, until Data no longer contains a qualifying aa
-3. In the end every element in Data has had its same-author records merged
-
-*Tips: Merging author information from the same website works by comparing key values for exact
-   equality. It performs well on websites with a fairly regular data format and poorly on websites
-   without one. Concretely, words of identical meaning that mix Latin script with English are not
-   distinguishable, and synonyms padded with arbitrary punctuation or extra information make the
-   strict comparison unable to decide reliably whether two key values are the same.
-
-Suggestions:
-1. In the merge step, first unify Latin and other non-English characters to English before comparing (solved)
-2. Locally clean data padded with arbitrary punctuation and extra information before comparing (fairly
-   complex; no good approach yet)
-'''
-
-
-def SameWeb_merge(folder_path):
-    # Function
-    def Merge(a, count, Data):
-        for aa in Data:
-            if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and a.get(
-                    "lastname") == aa.get("lastname") and a != aa:
-
-                # Add different key-elements of "from_article" into the first element
-                if isinstance(a["from_article"], list):
-                    if isinstance(aa["from_article"], list):
-                        a["from_article"] += aa["from_article"]
-                    else:
-                        a["from_article"].append(aa["from_article"])
-                elif isinstance(a["from_article"], str):
-                    if isinstance(aa["from_article"], list):
-                        a["from_article"] = [a["from_article"]] + aa["from_article"]
-                    else:
-                        a["from_article"] = [a["from_article"], aa["from_article"]]
-
-                # Add different key-elements of "affiliation" into the first element
-                a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year'])  # Sort by year
-
-                # Uniform characters in English
-                fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
-                faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
-
-                if fa != faa:
-                    a['affiliation'].extend(aa['affiliation'])
-                elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
-                    a['affiliation'].extend(aa['affiliation'])
-
-                # Delete extra elements
-                Data.remove(aa)
-
-                # Update the counter
-                count[0] += 1
-                return Merge(a, count, Data)
-
-    # ========== Main code ==========
-    Data = []  # Empty list
-    count = [0]  # counter
-
-    # Add data into list
-    for filename in os.listdir(folder_path):
-        if filename.endswith('.json'):
-            file_path = os.path.join(folder_path, filename)
-            with open(file_path, 'r', encoding='utf-8') as file:
-                data = json.load(file)
-                if len(data) > 0:
-                    Data.extend(data)
-
-    Database = len(Data)
-    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
-
-    # Same website data merge
-    for a in Data:
-        Merge(a, count, Data)
-
-    # Information
-    print(str(count[0]) + ' copies of data have been merged.')
-    print(str(Database) + ' copies of data in total, before')
-    print(str(len(Data)) + ' copies of data in total, now.')
-
-
-# =========== input the file path here ==========
-SameWeb_merge('./EJDE_buffer/Author_output')
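The deleted file above is the pre-merge version of the same script. Its docstring describes a restart-on-mutation traversal: Merge absorbs a matching element, removes it from Data, then calls itself so iteration starts over on the shrunk list. A toy sketch of that idiom (names and record structure simplified from the original):

def merge_all(items):
    def merge(a, items):
        for b in items:
            if b is not a and b['name'] == a['name']:
                a['years'].extend(b['years'])
                items.remove(b)
                return merge(a, items)  # list changed: re-scan from the start

    for a in items:
        merge(a, items)
    return items

data = [{'name': 'Li', 'years': [2008]},
        {'name': 'Li', 'years': [2012]},
        {'name': 'Ng', 'years': [2010]}]
print(merge_all(data))
# [{'name': 'Li', 'years': [2008, 2012]}, {'name': 'Ng', 'years': [2010]}]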
@@ -63,17 +63,17 @@ def Transf():

     # The path of storage
     author_output_file = [
-        './ejde_buffer/Author_output/Author_output_file(oldest).json',
-        './ejde_buffer/Author_output/Author_output_file(2010-2014).json',
-        './ejde_buffer/Author_output/Author_output_file(2015-2020).json',
-        './ejde_buffer/Author_output/Author_output_file(newest).json'
+        './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
     ]

     article_output_file = [
-        './ejde_buffer/Article_output/Article_output_file(oldest).json',
-        './ejde_buffer/Article_output/Article_output_file(2010-2014).json',
-        './ejde_buffer/Article_output/Article_output_file(2015-2020).json',
-        './ejde_buffer/Article_output/Article_output_file(newest).json'
+        './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
     ]

     # Read and write into files
@@ -35,7 +35,7 @@ def Transf():

     for Dict in data:
         if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
-            # 筛选文章
+            # Select data
             if (isinstance(Dict, dict) and int(
                     Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
                 data_oldest.append(Dict)
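The unchanged condition in this hunk reads the year from whichever field a record carries: article records store it in 'volume', author records inside the first 'affiliation' entry, and the `or` falls through from one to the other. A small sketch with invented records:

def record_year(Dict):
    # 'volume' (articles) wins when present; otherwise fall back to the
    # year of the first affiliation entry (authors), defaulting to 0.
    return int(Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0))

print(record_year({'volume': '2007'}) <= 2009)                 # True  -> data_oldest
print(record_year({'affiliation': [{'year': 2013}]}) <= 2009)  # False -> a later bucket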
@@ -65,17 +65,17 @@ def Transf():

     # The path of storage
     author_output_file = [
-        './EJQTDE_buffer/Author_output/Author_output_file(oldest).json',
-        './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json',
-        './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json',
-        './EJQTDE_buffer/Author_output/Author_output_file(newest).json'
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(oldest).json',
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(2010-2014).json',
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(2015-2020).json',
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(newest).json'
     ]

     article_output_file = [
-        './EJQTDE_buffer/Article_output/Article_output_file(oldest).json',
-        './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json',
-        './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json',
-        './EJQTDE_buffer/Article_output/Article_output_file(newest).json'
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(oldest).json',
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(2010-2014).json',
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(2015-2020).json',
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(newest).json'
     ]

     # Read and write into files
@@ -95,6 +95,7 @@ def delete():
             file_path = os.path.join(folder_path, file_name)
             if os.path.isfile(file_path):
                 os.remove(file_path)
+        os.rmdir(folder_path)

     print('\nAttention: The temporary storage files have been deleted!')
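On the added os.rmdir call: os.rmdir removes only an empty directory, which is why the files are deleted first. A sketch of the same two-step cleanup as a standalone helper (shutil.rmtree would do both steps in one call, but it also deletes nested subdirectories, which the loop here leaves alone):

import os

def remove_dir(folder_path):
    # Delete the files, then the now-empty directory;
    # os.rmdir raises OSError if anything is left inside.
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            os.remove(file_path)
    os.rmdir(folder_path)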
@@ -63,17 +63,17 @@ def Transf():

     # Storage path
     author_output_file = [
-        './SpringerOpen_buffer/Author_output/Author_output_file(oldest).json',
-        './SpringerOpen_buffer/Author_output/Author_output_file(2010-2014).json',
-        './SpringerOpen_buffer/Author_output/Author_output_file(2015-2020).json',
-        './SpringerOpen_buffer/Author_output/Author_output_file(newest).json'
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(oldest).json',
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(newest).json'
     ]

     article_output_file = [
-        './SpringerOpen_buffer/Article_output/Article_output_file(oldest).json',
-        './SpringerOpen_buffer/Article_output/Article_output_file(2010-2014).json',
-        './SpringerOpen_buffer/Article_output/Article_output_file(2015-2020).json',
-        './SpringerOpen_buffer/Article_output/Article_output_file(newest).json'
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(oldest).json',
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(newest).json'
     ]

     # Read and write into files
@@ -85,12 +85,14 @@ def Transf():


 # Delete the temporary storage files
-def delete(folder_path):
-    file_names = os.listdir(folder_path)
-
-    for file_name in file_names:
-        file_path = os.path.join(folder_path, file_name)
-        if os.path.isfile(file_path):
-            os.remove(file_path)
+def delete():
+    folder_paths = ['./SpringerOpen_buffer/Author_TS', './SpringerOpen_buffer/Article_TS']
+    for folder_path in folder_paths:
+        file_names = os.listdir(folder_path)
+        for file_name in file_names:
+            file_path = os.path.join(folder_path, file_name)
+            if os.path.isfile(file_path):
+                os.remove(file_path)
+        os.rmdir(folder_path)

     print('\nAttention: The temporary storage files have been deleted!')