Update code for merging author data by API

2023-10-09 23:24:58 +08:00 · 2023-10-09 23:24:58 +08:00 · 193581cd6a
commit 193581cd6a
parent 9b33cbabe7
5 changed files with 186 additions and 67 deletions
--- a/FileMerger/Division_byName.py
+++ b/FileMerger/Division_byName.py
@ -27,12 +27,14 @@ def Division(folder_path):
    # Save into different files
    def Transf(data):
        os.makedirs("./nameDivision/", exist_ok=True)      # Create a new folder
-        list = [[] for _ in range(27)]                          # list of lists to stored data
+        list = [[] for _ in range(27)]                     # list of lists to stored data

        # Division into 27 files according to the first alpha,
        for Dict in data:
-            if Dict.get('last_name') is not None and len(Dict.get('last_name')[0].lower()) < 2 and \
-                    97 <= ord(Dict.get('last_name')[0].lower()) <= 122:
+            # print(Dict.get('last_name'))
+            # print(len(Dict.get('last_name')[0].lower()))
+            if Dict.get('last_name') is not None and Dict.get('last_name') != "" and \
+                    len(Dict.get('last_name')[0].lower()) < 2 and 97 <= ord(Dict.get('last_name')[0].lower()) <= 122:

                num = ord(Dict.get('last_name')[0].lower()) - 96
                list[num].append(Dict)
@ -55,7 +57,7 @@ def Division(folder_path):
    Transf(Read(folder_path))

 # ========== Test code ==========
-# Division('./test_buffer/Author_output')
+Division('./test_buffer/Author_output')



--- a/FileMerger/Merge.py
+++ b/FileMerger/Merge.py
@ -250,17 +250,18 @@ def SameWeb_merge(folder_path):

    print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')

-    st = time.time()    # Start time
-
-    if len(temp_list) > 1:
-        executor = ThreadPoolExecutor(max_workers=10)                           # Thread pool
-        futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
-        for future in as_completed(futures):
-            pass
-        wait(futures)
-
-    et = time.time()    # End time
-    print('\nThread pool has been run for ' + str(et-st) + 's')
+    # Bert model merge
+    # st = time.time()    # Start time
+    #
+    # if len(temp_list) > 1:
+    #     executor = ThreadPoolExecutor(max_workers=10)                           # Thread pool
+    #     futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
+    #     for future in as_completed(futures):
+    #         pass
+    #     wait(futures)
+    #
+    # et = time.time()    # End time
+    # print('\nThread pool has been run for ' + str(et-st) + 's')

    # Combine Data with temp_list
    for i in temp_list:
@ -308,7 +309,7 @@ def SameWeb_merge(folder_path):
 # =========== input the file path here ==========
 # SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
 # SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
-SameWeb_merge('.\ejde\ejde_buffer\Author_output')
-
+# SameWeb_merge('.\ejde\ejde_buffer\Author_output')
+SameWeb_merge('.\\nameDivision\\1')


--- a/FileMerger/Merge_byNameAndEmail.py
+++ b/FileMerger/Merge_byNameAndEmail.py
@ -0,0 +1,132 @@
+import json
+import os
+
+
+def SameWeb_merge(folder_path):
+    """Group single-article author records that share a full name.
+
+    Every ``*.json`` file found in the sub-folders of ``folder_path`` is
+    loaded and the records are ordered by ``affiliation[0]['year']``.
+    Records that are dictionaries whose ``from_article`` has exactly one
+    entry are grouped by (first_name, middle_name, last_name):
+
+    * a record repeating the ``from_article`` of the first record of its
+      group is dropped as a duplicate;
+    * any other record joins the group.
+
+    Groups with more than one member are written, each sorted by year, to
+    ``Author_data(merged)/Author_data(merged).json`` in the parent directory
+    of ``folder_path``.
+
+    The grouping is a single iterative pass.  The earlier recursive version
+    removed and re-inserted items of the list it was indexing, so a record
+    sliding into a freed slot was never used as the start of a group, and it
+    recursed once per merged record, which can exceed the recursion limit.
+
+    Args:
+        folder_path: Directory whose sub-directories contain the JSON files.
+
+    Returns:
+        None.  Progress and statistics are printed; the groups are saved.
+
+    Raises:
+        KeyError: If a record has no ``from_article`` or ``affiliation`` key.
+    """
+    def name_key(record):
+        # Full name deciding that two records describe the same author.
+        # NOTE(review): assumes the name parts are hashable (str or None).
+        return record.get("first_name"), record.get("middle_name"), record.get("last_name")
+
+    def by_year(record):
+        # Year of the first affiliation, used for every ordering below
+        return record['affiliation'][0]['year']
+
+    # ========== Main code ==========
+    Data = []  # List of all data
+
+    count1 = 0  # Same name merged data counter
+    count2 = 0  # Duplicate data counter
+
+    num1 = 0  # Records left with exactly one source article
+    num2 = 0  # Records left with several source articles (counted, not reported)
+    num3 = 0  # Records left that are neither of the above
+
+    # Add data into list
+    for num_folder in os.listdir(folder_path):
+        num_folder_path = os.path.join(folder_path, num_folder)
+        if not os.path.isdir(num_folder_path):  # Skip stray files lying beside the sub-folders
+            continue
+        for filename in os.listdir(num_folder_path):
+            if filename.endswith('.json'):
+                file_path = os.path.join(num_folder_path, filename)
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    data = json.load(file)
+                if len(data) > 0:
+                    Data.extend(data)
+
+    Database = len(Data)  # The length of the original data
+    Data = sorted(Data, key=by_year)
+
+    # ========== Merge ==========
+    # ----- Same name data merge -----
+    groups = {}  # Name key -> records with that name, first-seen record first
+    kept = []    # (record, key) in year order; key is None for records that are never grouped
+
+    for record in Data:
+        # Only dictionaries that come from exactly one article take part in the merge
+        if not (isinstance(record, dict) and len(record['from_article']) == 1):
+            kept.append((record, None))
+            continue
+
+        key = name_key(record)
+        group = groups.get(key)
+
+        if group is None:                                           # First record with this name
+            groups[key] = [record]
+            kept.append((record, key))
+
+        elif record['from_article'] == group[0]['from_article']:    # Remove same data
+            count2 += 1
+
+        else:
+            group.append(record)
+
+            # Update the counter
+            count1 += 1
+            if count1 % 100 == 0:
+                print(str(count1) + ' copies of data have been merged by same name.')
+
+    print('\n----- Same name data merge complete -----\n')
+
+    # ----- Similarity algorithm merge -----
+    # Groups with several members leave Data and go to the temporary list
+    temp_list = [sorted(group, key=by_year) for group in groups.values() if len(group) > 1]
+    Data = [record for record, key in kept if key is None or len(groups[key]) == 1]
+
+    print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
+
+    print('\n----- Similarity algorithm merge complete -----\n')
+
+    # ========== Statistic data ==========
+    # Data counter update
+    for a in Data:
+        if isinstance(a, dict) and len(a['from_article']) == 1:
+            num1 += 1
+        elif isinstance(a, dict) and len(a['from_article']) > 1:
+            num2 += 1
+        else:
+            num3 += 1
+
+    # Information
+    print('\n========== Complete ==========\n')
+    print(str(Database) + ' copies of data in total, before')
+    print(str(count1) + ' copies of data have been merged by same name.')
+    print(str(count2) + ' copies of duplicate data have been deleted')
+    print(str(len(Data)) + ' copies of data in total, now.\n')
+
+    print(str(num1) + ' copies of data are unique.')
+    print(str(num3) + ' copies of data are incomplete merged')
+
+    # Save into file
+    path = os.path.dirname(folder_path)  # parent path
+    path = os.path.join(path, "Author_data(merged)")
+    os.makedirs(path, exist_ok=True)
+    path = os.path.join(path, "Author_data(merged).json")
+
+    with open(path, 'w', encoding='utf-8') as file:
+        json.dump(temp_list, file, indent=4)
+
+    print('\nData has been added to ' + path)
+
+
+# =========== input the file path here ==========
+# Module-level call: the merge runs as soon as this file is executed or imported.
+SameWeb_merge('.\\nameDivision')
+
+
--- a/FileMerger/fileReader.py
+++ b/FileMerger/fileReader.py
@ -1,87 +1,71 @@
 import os
 import json
+from pprint import pprint

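 '''
    ========== fileReader =========
-    1. This program reads the entries that have no email from the author-information json files.
+    1. This program reads the author information, fetches the titles of the source articles and stores them in a new dictionary.
    2. It looks up author_id and from_article in the author information, retrieves the title of the source article,
-       and stores author_id and title as one dictionary in the pending list ar_list.
-    3. Returns ar_list as the result.
+       and stores the author information together with the title in the new dictionary.
+    3. Saves the new dictionary to a json file.
 '''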


 # Function to find the author data which does not have "email" information
 def Read(author_path, article_path):
    # Read data list
-    def au_read(path, file_names, list):
-        for file_name in file_names:
-            file_path = os.path.join(path, file_name)
-            with open(file_path, 'r', encoding='utf-8') as file:
-                data = json.load(file)
-                for Dict in range(len(data)-1, -1, -1):
-                    if data[Dict].get('affiliation', [{}])[0].get('email', 0) is None:
-                        list.append(data[Dict])
-                        # del data[Dict]
+    def au_read(path, list):
+        with open(path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+            for Dict in range(len(data)-1, -1, -1):
+                list.append(data[Dict])
+
        return list

-    def ar_read(path, file_name, list, ar_id, au_id):
+    def ar_read(path, file_name, ar_id, au_data, num):
        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            for Dict in data:
-                if Dict.get('article_id') == ar_id:
-                    # A new dictionary to stored key information
-                    temp_data = {
-                        'title': Dict.get('title'),
-                        'author_id': au_id
-                    }
-
-                    list.append(temp_data)  # Add into list
-
-        return list
+                if Dict.get('article_id') == ar_id[0]:
+                    # A new dictionary to stored information
+                    au_data['from_article_title'] = Dict.get('title')
+                    num[0] += 1

    # ========== Main code ==========
    au_list = []    # List for author data
-    ar_list = []    # List for article data
-    ar_temp = []    # List for temp stored
-    num = 0         # Data number counter
+    num = [0]         # Data number counter

    # Read the file
-    au_names = os.listdir(author_path)
    ar_names = os.listdir(article_path)

    # Stored the author data which has no "email" information
-    au_list = au_read(author_path, au_names, au_list)
+    au_list = au_read(author_path, au_list)

    # Search the articles where the authors from
-    for au_data in au_list:
-        if len(ar_temp) == 100:
-            num += 100
-            ar_list.append(ar_temp)
-            ar_temp.clear()
+    for au_MergeList in au_list:
+        for au_data in au_MergeList:
+            if num[0] % 100 == 0 and num[0] != 0:
+                print(str(num[0]) + " copies of data have been done.")

-            print(str(num) + " copies of data has been stored.")
+            if int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
+                ar_read(article_path, ar_names[3], au_data.get('from_article'), au_data, num)

-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
-            ar_temp = ar_read(article_path, ar_names[3], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+            elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
+                ar_read(article_path, ar_names[0], au_data.get('from_article'), au_data, num)

-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
-            ar_temp = ar_read(article_path, ar_names[0], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+            elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
+                ar_read(article_path, ar_names[1], au_data.get('from_article'), au_data, num)

-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
-            ar_temp = ar_read(article_path, ar_names[1], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+            else:
+                ar_read(article_path, ar_names[2], au_data.get('from_article'), au_data, num)

-        else:
-            ar_temp = ar_read(article_path, ar_names[2], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-    if len(ar_temp) > 0:            # Stored remaining data
-        ar_list.append(ar_temp)
-
-    print(len(ar_list))
-    return ar_list
+    with open('./Author_data(merged)/Author_data(info_supplementary).json', 'w', encoding='utf-8') as file:
+        json.dump(au_list, file, indent=4)

+    print('All data have been stored into ./Author_data(merged)/Author_data(info_supplementary).json')

 # ========== Test code ==========
-# Read('./test_buffer/Author_output', './test_buffer/Article_output')
+Read('./Author_data(merged)/Author_data(merged).json', './test_buffer/Article_output')


--- a/FileMerger/webSearch_merge.py
+++ b/FileMerger/webSearch_merge.py
@ -130,7 +130,7 @@ Retry_author_aminerID = []


 # ---------- Call the API ----------
-for title in article_title:             # Get the article web-ID
+for title in article_title:                     # Get the article web-ID
    aminer_article_webID(title)

    if len(article_aminerID) > 0: