From 193581cd6a96402efc896d351dfcc7b3b04e97f5 Mon Sep 17 00:00:00 2001
From: Chenxiao Xia
Date: Mon, 9 Oct 2023 23:24:58 +0800
Subject: [PATCH] Update code for merging author data by API

---
 FileMerger/Division_byName.py                |  10 +-
 FileMerger/Merge.py                          |  27 ++--
 FileMerger/Merge_byNameAndEmail.py           | 132 ++++++++++++++++++
 FileMerger/fileReader.py                     |  82 +++++------
 .../{webSearch_merge.py => webSearch_API.py} |   2 +-
 5 files changed, 186 insertions(+), 67 deletions(-)
 create mode 100644 FileMerger/Merge_byNameAndEmail.py
 rename FileMerger/{webSearch_merge.py => webSearch_API.py} (98%)

diff --git a/FileMerger/Division_byName.py b/FileMerger/Division_byName.py
index 9d868a4..62641f6 100644
--- a/FileMerger/Division_byName.py
+++ b/FileMerger/Division_byName.py
@@ -27,12 +27,14 @@ def Division(folder_path):
     # Save into different files
     def Transf(data):
         os.makedirs("./nameDivision/", exist_ok=True)  # Create a new folder
-        list = [[] for _ in range(27)]    # list of lists to stored data
+        list = [[] for _ in range(27)]  # list of lists to store data
 
         # Division into 27 files according to the first alpha,
         for Dict in data:
-            if Dict.get('last_name') is not None and len(Dict.get('last_name')[0].lower()) < 2 and \
-                    97 <= ord(Dict.get('last_name')[0].lower()) <= 122:
+            # print(Dict.get('last_name'))
+            # print(len(Dict.get('last_name')[0].lower()))
+            if Dict.get('last_name') is not None and Dict.get('last_name') != "" and \
+                    len(Dict.get('last_name')[0].lower()) < 2 and 97 <= ord(Dict.get('last_name')[0].lower()) <= 122:
                 num = ord(Dict.get('last_name')[0].lower()) - 96
                 list[num].append(Dict)
 
@@ -55,7 +57,7 @@ def Division(folder_path):
     Transf(Read(folder_path))
 
 
 # ========== Test code ==========
-# Division('./test_buffer/Author_output')
+Division('./test_buffer/Author_output')
 
 
diff --git a/FileMerger/Merge.py b/FileMerger/Merge.py
index 0d0c56c..75c01c9 100644
--- a/FileMerger/Merge.py
+++ b/FileMerger/Merge.py
@@ -250,17 +250,18 @@ def SameWeb_merge(folder_path):
 
     print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
 
-    st = time.time()  # Start time
-
-    if len(temp_list) > 1:
-        executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
-        futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
-        for future in as_completed(futures):
-            pass
-        wait(futures)
-
-    et = time.time()  # End time
-    print('\nThread pool has been run for ' + str(et-st) + 's')
+    # Bert model merge
+    # st = time.time()  # Start time
+    #
+    # if len(temp_list) > 1:
+    #     executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
+    #     futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
+    #     for future in as_completed(futures):
+    #         pass
+    #     wait(futures)
+    #
+    # et = time.time()  # End time
+    # print('\nThread pool has been run for ' + str(et-st) + 's')
 
     # Combine Data with temp_list
     for i in temp_list:
@@ -308,7 +309,7 @@ def SameWeb_merge(folder_path):
 
 
 # =========== input the file path here ==========
 # SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
 # SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
-SameWeb_merge('.\ejde\ejde_buffer\Author_output')
-
+# SameWeb_merge('.\ejde\ejde_buffer\Author_output')
+SameWeb_merge('.\\nameDivision\\1')
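Note: the call added to Merge.py above hard-codes a Windows-style path ('.\\nameDivision\\1'). A minimal, hypothetical alternative (not part of this patch) that avoids backslash escaping and stays portable across platforms would build the same path with os.path.join:

    import os
    SameWeb_merge(os.path.join('.', 'nameDivision', '1'))  # same folder as the literal '.\\nameDivision\\1'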
diff --git a/FileMerger/Merge_byNameAndEmail.py b/FileMerger/Merge_byNameAndEmail.py
new file mode 100644
index 0000000..def8f39
--- /dev/null
+++ b/FileMerger/Merge_byNameAndEmail.py
@@ -0,0 +1,132 @@
+import json
+import os
+
+
+def SameWeb_merge(folder_path):
+    # Function
+    def SameName_merge(i, count1, count2, Data, ml):
+        # Same name merge
+        for j in range(i + 1, len(Data)):
+            if j < len(Data):
+                a = Data[i]
+                aa = Data[j]
+
+                if isinstance(a, dict) and isinstance(aa, dict):
+                    if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
+                        if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
+                                "middle_name") and a.get("last_name") == aa.get("last_name"):
+
+                            if a['from_article'] == aa['from_article']:  # Remove same data
+                                Data.remove(Data[j])
+                                count2[0] += 1
+                                return SameName_merge(i, count1, count2, Data, ml)
+
+                            else:
+                                ml.append(aa)
+
+                                # Update the counter
+                                count1[0] += 1
+                                if count1[0] % 100 == 0 and count1[0] != 0:
+                                    print(str(count1[0]) + ' copies of data have been merged by same name.')
+
+                                # Delete extra elements
+                                Data.remove(Data[j])
+
+                                return SameName_merge(i, count1, count2, Data, ml)
+
+        if len(ml) > 0:
+            ml.append(Data[i])  # Add first element
+            Data.remove(Data[i])
+            ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year'))  # Sorted by year
+
+            # Add into Data list
+            if len(ml) == 1:
+                Data.insert(-1, ml[0])
+            else:
+                Data.insert(-1, ml)
+
+    # ========== Main code ==========
+    Data = []  # List of all data
+
+    count1 = [0]  # Same name merged data counter
+    count2 = [0]  # Duplicate data counter
+
+    num1 = 0  # Unique data counter
+    num2 = 0  # Complete merged data counter
+    num3 = 0  # Incomplete merged data counter
+
+    # Add data into list
+    for num_folder in os.listdir(folder_path):
+        num_folder_path = os.path.join(folder_path, num_folder)
+        for filename in os.listdir(num_folder_path):
+            if filename.endswith('.json'):
+                file_path = os.path.join(num_folder_path, filename)
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    data = json.load(file)
+                    if len(data) > 0:
+                        Data.extend(data)
+
+    Database = len(Data)  # The length of the original data
+    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
+
+    # ========== Merge ==========
+    # ----- Same name data merge -----
+    ml = []
+    if len(Data) > 1:
+        for i in range(len(Data)):
+            ml.clear()
+            SameName_merge(i, count1, count2, Data, ml)
+
+    print('\n----- Same name data merge complete -----\n')
+
+    # ----- Similarity algorithm merge -----
+    # Change the index of incomplete data before other data
+    temp_list = []  # Temp list for incomplete merged data
+
+    if len(Data) > 1:
+        for i in range(len(Data) - 1, -1, -1):
+            if isinstance(Data[i], list):
+                temp = Data[i]
+                Data.remove(Data[i])
+                temp_list.append(temp)
+
+    print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
+
+    print('\n----- Similarity algorithm merge complete -----\n')
+
+    # ========== Statistic data ==========
+    # Data counter update
+    for a in Data:
+        if isinstance(a, dict) and len(a['from_article']) == 1:
+            num1 += 1
+        elif isinstance(a, dict) and len(a['from_article']) > 1:
+            num2 += 1
+        else:
+            num3 += 1
+
+    # Information
+    print('\n========== Complete ==========\n')
+    print(str(Database) + ' copies of data in total, before')
+    print(str(count1[0]) + ' copies of data have been merged by same name.')
+    print(str(count2[0]) + ' copies of duplicate data have been deleted')
+    print(str(len(Data)) + ' copies of data in total, now.\n')
+
+    print(str(num1) + ' copies of data are unique.')
+    print(str(num3) + ' copies of data are incomplete merged')
+
+    # Save into file
+    path = os.path.dirname(folder_path)  # parent path
+    path = os.path.join(path, "Author_data(merged)")
+    os.makedirs(path, exist_ok=True)
+    path = os.path.join(path, "Author_data(merged).json")
+
+    with open(path, 'w', encoding='utf-8') as file:
+        json.dump(temp_list, file, indent=4)
+
+    print('\nData has been added to ' + path)
+
+
+# =========== input the file path here ==========
+SameWeb_merge('.\\nameDivision')
+
+
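Note on Merge_byNameAndEmail.py: SameName_merge calls itself once for every record it removes, so a long run of entries with the same name can exceed Python's default recursion limit (about 1000 frames). A rough iterative sketch of the same scan, assuming the same record fields (first_name, middle_name, last_name, from_article) and the same list-based counters, could look like the following; it is not part of the patch, the progress print is omitted, and the trailing block that sorts ml by year and re-inserts it would stay unchanged:

    def SameName_merge_iter(i, count1, count2, Data, ml):
        # Iterative variant of the same-name scan (sketch only).
        j = i + 1
        while j < len(Data):
            a, aa = Data[i], Data[j]
            if (isinstance(a, dict) and isinstance(aa, dict)
                    and len(a['from_article']) == 1 and len(aa['from_article']) == 1
                    and a.get('first_name') == aa.get('first_name')
                    and a.get('middle_name') == aa.get('middle_name')
                    and a.get('last_name') == aa.get('last_name')):
                if a['from_article'] == aa['from_article']:
                    count2[0] += 1        # exact duplicate: drop it
                else:
                    ml.append(aa)         # same name, different article: collect for merging
                    count1[0] += 1
                del Data[j]               # do not advance j; the next record shifts into position j
            else:
                j += 1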
diff --git a/FileMerger/fileReader.py b/FileMerger/fileReader.py
index 8f1ce33..c7f961d 100644
--- a/FileMerger/fileReader.py
+++ b/FileMerger/fileReader.py
@@ -1,87 +1,71 @@
 import os
 import json
+from pprint import pprint
 
 '''
     ========== fileReader =========
-    1. This program reads the records in the author-information json files that do not contain an email.
+    1. This program reads the author information, looks up the title of the source article, and stores it in a new dictionary.
     2. Get author_id and from_article from the author information, then look up the title of the source article in the article data,
-       and store author_id and title as one dictionary in the pending list ar_list.
-    3. Return ar_list as the result.
+       and store the author information together with the title in the new dictionary.
+    3. Save the new dictionaries to a json file.
 '''
 
 
 # Function to find the author data which does not have "email" information
 def Read(author_path, article_path):
     # Read data list
-    def au_read(path, file_names, list):
-        for file_name in file_names:
-            file_path = os.path.join(path, file_name)
-            with open(file_path, 'r', encoding='utf-8') as file:
-                data = json.load(file)
-                for Dict in range(len(data)-1, -1, -1):
-                    if data[Dict].get('affiliation', [{}])[0].get('email', 0) is None:
-                        list.append(data[Dict])
-                        # del data[Dict]
+    def au_read(path, list):
+        with open(path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+            for Dict in range(len(data)-1, -1, -1):
+                list.append(data[Dict])
 
         return list
 
-    def ar_read(path, file_name, list, ar_id, au_id):
+    def ar_read(path, file_name, ar_id, au_data, num):
         file_path = os.path.join(path, file_name)
         with open(file_path, 'r', encoding='utf-8') as file:
             data = json.load(file)
             for Dict in data:
-                if Dict.get('article_id') == ar_id:
-                    # A new dictionary to stored key information
-                    temp_data = {
-                        'title': Dict.get('title'),
-                        'author_id': au_id
-                    }
-
-                    list.append(temp_data)  # Add into list
-
-        return list
+                if Dict.get('article_id') == ar_id[0]:
+                    # A new dictionary to store information
+                    au_data['from_article_title'] = Dict.get('title')
+                    num[0] += 1
 
     # ========== Main code ==========
     au_list = []  # List for author data
-    ar_list = []  # List for article data
-    ar_temp = []  # List for temp stored
-    num = 0  # Data number counter
+    num = [0]  # Data number counter
 
     # Read the file
-    au_names = os.listdir(author_path)
     ar_names = os.listdir(article_path)
 
     # Stored the author data which has no "email" information
-    au_list = au_read(author_path, au_names, au_list)
+    au_list = au_read(author_path, au_list)
 
     # Search the articles where the authors from
-    for au_data in au_list:
-        if len(ar_temp) == 100:
-            num += 100
-            ar_list.append(ar_temp)
-            ar_temp.clear()
+    for au_MergeList in au_list:
+        for au_data in au_MergeList:
+            if num[0] % 100 == 0 and num[0] != 0:
+                print(str(num[0]) + " copies of data have been done.")
 
-            print(str(num) + " copies of data has been stored.")
+            if int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
+                ar_read(article_path, ar_names[3], au_data.get('from_article'), au_data, num)
 
-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
-            ar_temp = ar_read(article_path, ar_names[3], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+            elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
+                ar_read(article_path, ar_names[0], au_data.get('from_article'), au_data, num)
 
-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
-            ar_temp = ar_read(article_path, ar_names[0], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+            elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
+                ar_read(article_path, ar_names[1], au_data.get('from_article'), au_data, num)
 
-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
-            ar_temp = ar_read(article_path, ar_names[1], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+            else:
+                ar_read(article_path, ar_names[2], au_data.get('from_article'), au_data, num)
 
-        else:
-            ar_temp = ar_read(article_path, ar_names[2], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-    if len(ar_temp) > 0:  # Stored remaining data
-        ar_list.append(ar_temp)
-
-    print(len(ar_list))
-    return ar_list
+    with open('./Author_data(merged)/Author_data(info_supplementary).json', 'w', encoding='utf-8') as file:
+        json.dump(au_list, file, indent=4)
+    print('All data have been stored into ./Author_data(merged)/Author_data(info_supplementary).json')
 
 
 # ========== Test code ==========
-# Read('./test_buffer/Author_output', './test_buffer/Article_output')
+Read('./Author_data(merged)/Author_data(merged).json', './test_buffer/Article_output')
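Note on fileReader.py: the year buckets select the article file through ar_names[3], ar_names[0], ar_names[1] and ar_names[2], but os.listdir() returns entries in arbitrary order, so the index-to-year mapping can differ between machines. A hypothetical, more explicit selection (the file names below are placeholders, not the real files in Article_output) might look like:

    def article_file_for_year(year):
        # Sketch only: map the affiliation year to an explicit article file name.
        if year <= 2009:
            return 'articles_2009_and_earlier.json'  # placeholder name
        if year <= 2010:
            return 'articles_2010.json'              # placeholder name
        if year <= 2020:
            return 'articles_2011_2020.json'         # placeholder name
        return 'articles_after_2020.json'            # placeholder name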
diff --git a/FileMerger/webSearch_merge.py b/FileMerger/webSearch_API.py
similarity index 98%
rename from FileMerger/webSearch_merge.py
rename to FileMerger/webSearch_API.py
index 396f9df..3cc051f 100644
--- a/FileMerger/webSearch_merge.py
+++ b/FileMerger/webSearch_API.py
@@ -130,7 +130,7 @@ Retry_author_aminerID = []
 
 
 # ---------- Call the API ----------
-for title in article_title:    # Get the article web-ID
+for title in article_title:  # Get the article web-ID
     aminer_article_webID(title)
 
 if len(article_aminerID) > 0:
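Taken together, the updated scripts appear to form a small pipeline; each module runs its step at import time through the module-level call at the bottom of the file. A likely run order, inferred from the inputs and outputs above, is:

    # 1. Division_byName.py        splits Author_output by last-name initial into numbered subfolders under ./nameDivision/
    # 2. Merge_byNameAndEmail.py   merges same-name records into Author_data(merged)/Author_data(merged).json
    # 3. fileReader.py             adds from_article_title and writes Author_data(info_supplementary).json
    # 4. webSearch_API.py          looks up the collected article titles through the AMiner API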