Update code for merging author data by API
parent 9b33cbabe7
commit 193581cd6a
@@ -27,12 +27,14 @@ def Division(folder_path):
     # Save into different files
     def Transf(data):
         os.makedirs("./nameDivision/", exist_ok=True)  # Create a new folder
         list = [[] for _ in range(27)]  # list of lists to stored data
 
         # Division into 27 files according to the first alpha,
         for Dict in data:
-            if Dict.get('last_name') is not None and len(Dict.get('last_name')[0].lower()) < 2 and \
-                    97 <= ord(Dict.get('last_name')[0].lower()) <= 122:
+            # print(Dict.get('last_name'))
+            # print(len(Dict.get('last_name')[0].lower()))
+            if Dict.get('last_name') is not None and Dict.get('last_name') != "" and \
+                    len(Dict.get('last_name')[0].lower()) < 2 and 97 <= ord(Dict.get('last_name')[0].lower()) <= 122:
 
                 num = ord(Dict.get('last_name')[0].lower()) - 96
                 list[num].append(Dict)
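The reworked guard above first rejects missing or empty last names, then keeps only entries whose first character lowercases to a single a-z letter, and maps that letter to a bucket index with ord(...) - 96. A minimal, self-contained illustration of the same check; the sample last names are made up:

names = ['alvarez', 'Brown', '', '张三']   # made-up sample last names

for last_name in names:
    if last_name is not None and last_name != "" and \
            len(last_name[0].lower()) < 2 and 97 <= ord(last_name[0].lower()) <= 122:
        print(last_name, '-> bucket', ord(last_name[0].lower()) - 96)   # 1..26 for a..z
    else:
        print(last_name, '-> skipped by the guard')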
@@ -55,7 +57,7 @@ def Division(folder_path):
     Transf(Read(folder_path))
 
 # ========== Test code ==========
-# Division('./test_buffer/Author_output')
+Division('./test_buffer/Author_output')
 
 
 
@@ -250,17 +250,18 @@ def SameWeb_merge(folder_path):
 
     print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
 
-    st = time.time()  # Start time
-    if len(temp_list) > 1:
-        executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
-        futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
-        for future in as_completed(futures):
-            pass
-        wait(futures)
-
-    et = time.time()  # End time
-    print('\nThread pool has been run for ' + str(et-st) + 's')
+    # Bert model merge
+    # st = time.time()  # Start time
+    #
+    # if len(temp_list) > 1:
+    #     executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
+    #     futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
+    #     for future in as_completed(futures):
+    #         pass
+    #     wait(futures)
+    #
+    #     et = time.time()  # End time
+    #     print('\nThread pool has been run for ' + str(et-st) + 's')
 
     # Combine Data with temp_list
     for i in temp_list:
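The disabled block above used concurrent.futures to run Similarity_merge over the temporary list in parallel. For reference, a minimal runnable sketch of that thread-pool pattern; merge_task and the sample groups are stand-ins, not the project's Similarity_merge:

import time
from concurrent.futures import ThreadPoolExecutor, as_completed, wait

def merge_task(group):
    # Stand-in for Similarity_merge: pretend to merge one group of author records
    return len(group)

temp_list = [['rec1', 'rec2'], ['rec3'], ['rec4', 'rec5', 'rec6']]   # made-up payloads

st = time.time()                                   # Start time
if len(temp_list) > 1:
    executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
    futures = [executor.submit(merge_task, temp_list[i]) for i in range(len(temp_list))]
    for future in as_completed(futures):
        pass                                       # results available via future.result()
    wait(futures)
    executor.shutdown()
et = time.time()                                   # End time
print('Thread pool has been run for ' + str(et - st) + 's')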
@@ -308,7 +309,7 @@ def SameWeb_merge(folder_path):
 # =========== input the file path here ==========
 # SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
 # SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
-SameWeb_merge('.\ejde\ejde_buffer\Author_output')
-
+# SameWeb_merge('.\ejde\ejde_buffer\Author_output')
+SameWeb_merge('.\\nameDivision\\1')
 
 
FileMerger/Merge_byNameAndEmail.py (new file, 132 lines)
@@ -0,0 +1,132 @@
+import json
+import os
+
+
+def SameWeb_merge(folder_path):
+    # Function
+    def SameName_merge(i, count1, count2, Data, ml):
+        # Same name merge
+        for j in range(i + 1, len(Data)):
+            if j < len(Data):
+                a = Data[i]
+                aa = Data[j]
+
+                if isinstance(a, dict) and isinstance(aa, dict):
+                    if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
+                        if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
+                                "middle_name") and a.get("last_name") == aa.get("last_name"):
+
+                            if a['from_article'] == aa['from_article']:  # Remove same data
+                                Data.remove(Data[j])
+                                count2[0] += 1
+                                return SameName_merge(i, count1, count2, Data, ml)
+
+                            else:
+                                ml.append(aa)
+
+                                # Update the counter
+                                count1[0] += 1
+                                if count1[0] % 100 == 0 and count1[0] != 0:
+                                    print(str(count1[0]) + ' copies of data have been merged by same name.')
+
+                                # Delete extra elements
+                                Data.remove(Data[j])
+
+                                return SameName_merge(i, count1, count2, Data, ml)
+
+        if len(ml) > 0:
+            ml.append(Data[i])  # Add first element
+            Data.remove(Data[i])
+            ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year'))  # Sorted by year
+
+            # Add into Data list
+            if len(ml) == 1:
+                Data.insert(-1, ml[0])
+            else:
+                Data.insert(-1, ml)
+
+    # ========== Main code ==========
+    Data = []  # List of all data
+
+    count1 = [0]  # Same name merged data counter
+    count2 = [0]  # Duplicate data counter
+
+    num1 = 0  # Unique data counter
+    num2 = 0  # Complete merged data counter
+    num3 = 0  # Incomplete merged data counter
+
+    # Add data into list
+    for num_folder in os.listdir(folder_path):
+        num_folder_path = os.path.join(folder_path, num_folder)
+        for filename in os.listdir(num_folder_path):
+            if filename.endswith('.json'):
+                file_path = os.path.join(num_folder_path, filename)
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    data = json.load(file)
+                    if len(data) > 0:
+                        Data.extend(data)
+
+    Database = len(Data)  # The length of the original data
+    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
+
+    # ========== Merge ==========
+    # ----- Same name data merge -----
+    ml = []
+    if len(Data) > 1:
+        for i in range(len(Data)):
+            ml.clear()
+            SameName_merge(i, count1, count2, Data, ml)
+
+    print('\n----- Same name data merge complete -----\n')
+
+    # ----- Similarity algorithm merge -----
+    # Change the index of incomplete data before other data
+    temp_list = []  # Temp list for incomplete merged data
+
+    if len(Data) > 1:
+        for i in range(len(Data) - 1, -1, -1):
+            if isinstance(Data[i], list):
+                temp = Data[i]
+                Data.remove(Data[i])
+                temp_list.append(temp)
+
+    print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
+
+    print('\n----- Similarity algorithm merge complete -----\n')
+
+    # ========== Statistic data ==========
+    # Data counter update
+    for a in Data:
+        if isinstance(a, dict) and len(a['from_article']) == 1:
+            num1 += 1
+        elif isinstance(a, dict) and len(a['from_article']) > 1:
+            num2 += 1
+        else:
+            num3 += 1
+
+    # Information
+    print('\n========== Complete ==========\n')
+    print(str(Database) + ' copies of data in total, before')
+    print(str(count1[0]) + ' copies of data have been merged by same name.')
+    print(str(count2[0]) + ' copies of duplicate data have been deleted')
+    print(str(len(Data)) + ' copies of data in total, now.\n')
+
+    print(str(num1) + ' copies of data are unique.')
+    print(str(num3) + ' copies of data are incomplete merged')
+
+    # Save into file
+    path = os.path.dirname(folder_path)  # parent path
+    path = os.path.join(path, "Author_data(merged)")
+    os.makedirs(path, exist_ok=True)
+    path = os.path.join(path, "Author_data(merged).json")
+
+    with open(path, 'w', encoding='utf-8') as file:
+        json.dump(temp_list, file, indent=4)
+
+    print('\nData has been added to ' + path)
+
+
+# =========== input the file path here ==========
+SameWeb_merge('.\\nameDivision')
+
+
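Merge_byNameAndEmail.py orders every record by the year of its first affiliation before the same-name pass, and incomplete merges are kept as nested lists. A small illustration of that sort key with made-up records:

records = [
    {'last_name': 'Li',  'affiliation': [{'year': '2015'}], 'from_article': ['a1']},
    {'last_name': 'Kim', 'affiliation': [{'year': '2003'}], 'from_article': ['a2']},
    {'last_name': 'Xu',  'affiliation': [{'year': '2010'}], 'from_article': ['a3']},
]

# Same key as in the script: the year of the first affiliation entry
records = sorted(records, key=lambda x: x['affiliation'][0]['year'])
print([r['last_name'] for r in records])   # ['Kim', 'Xu', 'Li']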
@@ -1,87 +1,71 @@
 import os
 import json
+from pprint import pprint
 
 '''
 ========== fileReader =========
-1. This program reads the data in the author-information json files that contain no email.
+1. This program reads the author information, retrieves the title of the source article, and stores it in a new dictionary.
 2. It looks up author_id and from_article in the author information and retrieves the title of the source article,
-   storing author_id and title as a dictionary in the pending list ar_list.
-3. ar_list is returned as the result.
+   storing the author information together with the title in the new dictionary.
+3. The new dictionary is saved to a json file.
 '''
 
 
 # Function to find the author data which does not have "email" information
 def Read(author_path, article_path):
     # Read data list
-    def au_read(path, file_names, list):
-        for file_name in file_names:
-            file_path = os.path.join(path, file_name)
-            with open(file_path, 'r', encoding='utf-8') as file:
-                data = json.load(file)
-                for Dict in range(len(data)-1, -1, -1):
-                    if data[Dict].get('affiliation', [{}])[0].get('email', 0) is None:
-                        list.append(data[Dict])
-                        # del data[Dict]
+    def au_read(path, list):
+        with open(path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+            for Dict in range(len(data)-1, -1, -1):
+                list.append(data[Dict])
+
         return list
 
-    def ar_read(path, file_name, list, ar_id, au_id):
+    def ar_read(path, file_name, ar_id, au_data, num):
         file_path = os.path.join(path, file_name)
         with open(file_path, 'r', encoding='utf-8') as file:
             data = json.load(file)
             for Dict in data:
-                if Dict.get('article_id') == ar_id:
-                    # A new dictionary to stored key information
-                    temp_data = {
-                        'title': Dict.get('title'),
-                        'author_id': au_id
-                    }
-
-                    list.append(temp_data)  # Add into list
-
-        return list
+                if Dict.get('article_id') == ar_id[0]:
+                    # A new dictionary to stored information
+                    au_data['from_article_title'] = Dict.get('title')
+                    num[0] += 1
 
     # ========== Main code ==========
     au_list = []  # List for author data
-    ar_list = []  # List for article data
-    ar_temp = []  # List for temp stored
-    num = 0  # Data number counter
+    num = [0]  # Data number counter
 
     # Read the file
-    au_names = os.listdir(author_path)
     ar_names = os.listdir(article_path)
 
     # Stored the author data which has no "email" information
-    au_list = au_read(author_path, au_names, au_list)
+    au_list = au_read(author_path, au_list)
 
     # Search the articles where the authors from
-    for au_data in au_list:
-        if len(ar_temp) == 100:
-            num += 100
-            ar_list.append(ar_temp)
-            ar_temp.clear()
-
-            print(str(num) + " copies of data has been stored.")
-
-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
-            ar_temp = ar_read(article_path, ar_names[3], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
-            ar_temp = ar_read(article_path, ar_names[0], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
-            ar_temp = ar_read(article_path, ar_names[1], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-        else:
-            ar_temp = ar_read(article_path, ar_names[2], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-    if len(ar_temp) > 0:  # Stored remaining data
-        ar_list.append(ar_temp)
-
-    print(len(ar_list))
-    return ar_list
+    for au_MergeList in au_list:
+        for au_data in au_MergeList:
+            if num[0] % 100 == 0 and num[0] != 0:
+                print(str(num[0]) + " copies of data have been done.")
+
+            if int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
+                ar_read(article_path, ar_names[3], au_data.get('from_article'), au_data, num)
+
+            elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
+                ar_read(article_path, ar_names[0], au_data.get('from_article'), au_data, num)
+
+            elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
+                ar_read(article_path, ar_names[1], au_data.get('from_article'), au_data, num)
+
+            else:
+                ar_read(article_path, ar_names[2], au_data.get('from_article'), au_data, num)
+
+    with open('./Author_data(merged)/Author_data(info_supplementary).json', 'w', encoding='utf-8') as file:
+        json.dump(au_list, file, indent=4)
+
+    print('All data have been stored into ./Author_data(merged)/Author_data(info_supplementary).json')
 
 
 # ========== Test code ==========
-# Read('./test_buffer/Author_output', './test_buffer/Article_output')
+Read('./Author_data(merged)/Author_data(merged).json', './test_buffer/Article_output')
 
 
@@ -130,7 +130,7 @@ Retry_author_aminerID = []
 
 
 # ---------- Call the API ----------
 for title in article_title:  # Get the article web-ID
     aminer_article_webID(title)
 
 if len(article_aminerID) > 0: