Replace the code for merging data

This commit is contained in:
XCX 2023-08-08 22:57:29 +08:00
parent 73cf15980f
commit 9ee9bc4462

@@ -1,105 +1,97 @@
New file:

import json
import os
import unicodedata

'''
========== SameWeb_merge(folder_path) execution order ==========
1. Create a new list, read each JSON file in the folder into it, and sort the
   list by year.
2. Traverse with the structure
       for a in Data:
           Merge(a, count, Data):
               for aa in Data:
   Each time an element aa that matches a is found, it is merged into a and
   then deleted, and Merge is called again to rescan, until Data no longer
   contains a matching aa.
3. When this finishes, all records in Data that belong to the same author
   have been merged.

*Tips: merging author records from the same website relies on comparing key
values for exact equality. This works well for websites with a consistent
data format and poorly for websites without one. Typical failures: words with
the same meaning written in a mixture of Latin-script variants and plain
English are not told apart, and synonyms carrying arbitrary punctuation or
extra information defeat the strict comparison, so equality of key values
cannot be judged reliably.
Suggestions:
1. In the merge step, first fold Latin-script and other non-English
   characters to English, then compare (solved).
2. Locally clean records that carry arbitrary punctuation and extra
   information before comparing (fairly involved; no good approach yet).
'''


def SameWeb_merge(folder_path):
    # Recursively merge into `a` every other record with the same author name
    def Merge(a, count, Data):
        for aa in Data:
            if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and a.get(
                    "lastname") == aa.get("lastname") and a != aa:
                # Add different key-elements of "from_article" into the first element
                if isinstance(a["from_article"], list):
                    if isinstance(aa["from_article"], list):
                        a["from_article"] += aa["from_article"]
                    else:
                        a["from_article"].append(aa["from_article"])
                elif isinstance(a["from_article"], str):
                    if isinstance(aa["from_article"], list):
                        a["from_article"] = [a["from_article"]] + aa["from_article"]
                    else:
                        a["from_article"] = [a["from_article"], aa["from_article"]]
                # Add different key-elements of "affiliation" into the first element
                a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year'])  # Sort by year
                # Uniform characters in English
                fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
                faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
                if fa != faa:
                    a['affiliation'].extend(aa['affiliation'])
                elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
                    a['affiliation'].extend(aa['affiliation'])
                # Delete extra elements
                Data.remove(aa)
                # Update the counter
                count[0] += 1
                return Merge(a, count, Data)

    # ========== Main code ==========
    Data = []  # Empty list
    count = [0]  # counter
    # Add data into list
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if len(data) > 0:
                    Data.extend(data)
    Database = len(Data)  # record count before merging
    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
    # Same website data merge
    for a in Data:
        Merge(a, count, Data)
    # Information
    print(str(count[0]) + ' copies of data have been merged.')
    print(str(Database) + ' copies of data in total, before.')
    print(str(len(Data)) + ' copies of data in total, now.')


# ========== input the file path here ==========
SameWeb_merge('./EJDE_buffer/Author_output')
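
For reference, the docstring's first suggestion (fold Latin-script characters to English before comparing) is what the normalize(...).encode('ascii', 'ignore') calls above implement. A minimal sketch of that fold, not part of the commit, with made-up affiliation strings for illustration:

import unicodedata

def ascii_fold(s):
    # NFKD splits accented letters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then drops the marks.
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

# Two spellings that a strict string comparison would treat as different:
print(ascii_fold('Universidade de São Paulo'))  # b'Universidade de Sao Paulo'
print(ascii_fold('Universidade de São Paulo') == ascii_fold('Universidade de Sao Paulo'))  # True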

Old file (replaced):

import json
import os
from pprint import pprint
import unicodedata


def Merge(folder_path):
    Data = []  # Empty list
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                Data.append(data)
    count = 0  # counter
    # Same file merge
    for data in Data:
        if len(data) > 0:
            data = sorted(data, key=lambda x: x['affiliation'][0]['year'])
            for a in data:
                for aa in data:
                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
                            a.get("lastname") == aa.get("lastname"):
                        # Add different key-elements of "affiliation" into the first element
                        if a.get('affiliation') != aa.get('affiliation'):
                            # Uniform text formatting
                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
                            if ch_1 != ch_2:
                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
                                if hash_1 != hash_2:
                                    a['affiliation'] += aa['affiliation']
                        # Add different key-elements of "from_article" into the first element
                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
                            aa.get("from_article"))
                        pprint(a)
                        print('//////////////////////////////////////\n')
                        # Delete extra elements
                        data.remove(aa)
                        # Update the counter
                        count += 1
    # Different files merge
    A = Data[2]  # newest
    B = Data[1]  # (2015-2020)
    C = Data[0]  # (2010-2014)
    D = Data[3]  # oldest
    Data.clear()
    Data = [B, C, D]
    for data in Data:
        if len(data) > 0:
            for a in A:
                for aa in data:
                    if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and \
                            a.get("lastname") == aa.get("lastname"):
                        # Add different key-elements of "affiliation" into the first element
                        if a.get('affiliation') != aa.get('affiliation'):
                            # Uniform text formatting
                            ch_1 = unicodedata.normalize('NFKD', str(a.get('affiliation')[-1])).encode('ascii', 'ignore')
                            ch_2 = unicodedata.normalize('NFKD', str(aa.get('affiliation')[0])).encode('ascii', 'ignore')
                            if ch_1 != ch_2:
                                hash_1 = hash(('affiliation', tuple(a.get('affiliation')[-1].values())))
                                hash_2 = hash(('affiliation', tuple(aa.get('affiliation')[0].values())))
                                if hash_1 != hash_2:
                                    a['affiliation'] += aa['affiliation']
                        # Add different key-elements of "from_article" into the first element
                        a["from_article"] = [a.get("from_article"), aa.get("from_article")] if \
                            isinstance(a.get("from_article"), str) else a.get("from_article") + (
                            [aa.get("from_article")] if isinstance(aa.get("from_article"), str) else
                            aa.get("from_article"))
                        pprint(a)
                        print('================================\n')
                        # Delete extra elements
                        data.remove(aa)
                        # Update the counter
                        count += 1
        # Combined in one list
        A += data
    # Tips
    print(str(count) + ' file(s) have been merged.')
    print('There are ' + str(len(A)) + ' file(s) in total, now.')


# input the file path
Merge('./ejde_buffer/Author_output')
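
The docstring's second suggestion (records that differ only by stray punctuation or appended extras) remains open in this commit. One possible direction, sketched here as an assumption rather than anything the committed code does: strip punctuation and collapse whitespace after the ASCII fold, then compare the cleaned keys.

import re
import string
import unicodedata

_PUNCT = str.maketrans('', '', string.punctuation)

def normalize_key(s):
    # ASCII-fold first (as the committed code already does), then drop
    # punctuation and collapse whitespace runs before comparing.
    folded = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
    return re.sub(r'\s+', ' ', folded.translate(_PUNCT)).strip().lower()

print(normalize_key('Dept. of Mathematics, MIT') == normalize_key('Dept of Mathematics MIT'))  # True

This would not help when one record carries extra appended information the other lacks; that case likely needs token-level or fuzzy matching, which is why the docstring leaves it unsolved.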