diff --git a/00_File_merge/Merge.py b/00_File_merge/Merge.py
index a8614ec..0d0c56c 100644
--- a/00_File_merge/Merge.py
+++ b/00_File_merge/Merge.py
@@ -2,13 +2,12 @@ import json
 import os
 import re
 import time
-
 import unicodedata
+import torch
+
 from pprint import pprint
 from concurrent.futures import ThreadPoolExecutor, as_completed, wait
-
 from transformers import BertTokenizer, BertModel
-import torch
 from sklearn.metrics.pairwise import cosine_similarity
 
 '''
@@ -43,8 +42,11 @@ from sklearn.metrics.pairwise import cosine_similarity
 
     Suggestions:
         1. In the merge step, first convert Latin letters and other non-English characters to English before comparing (resolved)
-        2. Locally preprocess data that has arbitrarily added punctuation or extra information before comparing (fairly
-           complex, no good approach yet)
+        2. Locally preprocess data that has arbitrarily added punctuation or extra information before comparing (resolved
+           by deleting extra spaces and all punctuation before running the similarity comparison)
+        3. The similarity comparison needs a faster approach; comparing with the preloaded model takes too long and
+           the CPU usage is also high
+
 '''
@@ -61,7 +63,7 @@ def SameWeb_merge(folder_path):
             if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
                 if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
                         "middle_name") and a.get("last_name") == aa.get("last_name"):
-                    if a['from_article'] == aa['from_article']:
+                    if a['from_article'] == aa['from_article']:  # Remove same data
                         Data.remove(Data[j])
                         count2[0] += 1
                         return SameName_merge(i, count1, count2, Data, ml)
@@ -165,7 +167,7 @@ def SameWeb_merge(folder_path):
         text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
         text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()
 
-        # Delete space
+        # Delete extra spaces
         text1 = re.sub(r'\s+', ' ', text1).strip()
         text2 = re.sub(r'\s+', ' ', text2).strip()
 
@@ -211,6 +213,7 @@ def SameWeb_merge(folder_path):
     num1 = 0  # Unique data counter
     num2 = 0  # Complete merged data counter
     num3 = 0  # Incomplete merged data counter
+    num4 = 0  # Similarity algorithm merged data counter
 
     # Add data into list
    for filename in os.listdir(folder_path):
@@ -262,9 +265,10 @@ def SameWeb_merge(folder_path):
     # Combine Data with temp_list
     for i in temp_list:
         if len(i) == 1:
-            Data.append(i[0])
+            num4 += 1
+            Data.insert(0, i[0])
         else:
-            Data.append(i)
+            Data.insert(-1, i)
 
     print('\n----- Similarity algorithm merge complete -----\n')
 
@@ -287,6 +291,7 @@ def SameWeb_merge(folder_path):
 
     print(str(num1) + ' copies of data are unique.')
     print(str(num2) + ' copies of data are complete merged')
+    print(str(num4) + ' copies of data are complete merged by similarity algorithm')
     print(str(num3) + ' copies of data are incomplete merged')
 
     # Save into file
diff --git a/00_File_merge/fileReader.py b/00_File_merge/fileReader.py
new file mode 100644
index 0000000..8f1ce33
--- /dev/null
+++ b/00_File_merge/fileReader.py
@@ -0,0 +1,87 @@
+import os
+import json
+
+'''
+    ========== fileReader =========
+    1. This program reads the records in the author-information JSON files that do not contain an email.
+    2. It looks up author_id and from_article in the author information and retrieves the title of the
+       source article, storing author_id and title together as a dictionary in the pending list ar_list.
+    3. ar_list is returned as the result.
+'''
+
+
+# Function to find the author data which does not have "email" information
+def Read(author_path, article_path):
+    # Read data list
+    def au_read(path, file_names, list):
+        for file_name in file_names:
+            file_path = os.path.join(path, file_name)
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+                for Dict in range(len(data) - 1, -1, -1):
+                    if data[Dict].get('affiliation', [{}])[0].get('email', 0) is None:
+                        list.append(data[Dict])
+                        # del data[Dict]
+        return list
+
+    def ar_read(path, file_name, list, ar_id, au_id):
+        file_path = os.path.join(path, file_name)
+        with open(file_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+            for Dict in data:
+                if Dict.get('article_id') == ar_id:
+                    # A new dictionary to store key information
+                    temp_data = {
+                        'title': Dict.get('title'),
+                        'author_id': au_id
+                    }
+
+                    list.append(temp_data)  # Add into list
+
+        return list
+
+    # ========== Main code ==========
+    au_list = []  # List for author data
+    ar_list = []  # List for article data
+    ar_temp = []  # List for temporary batch storage
+    num = 0  # Data number counter
+
+    # Read the file names
+    au_names = os.listdir(author_path)
+    ar_names = os.listdir(article_path)
+
+    # Store the author data which has no "email" information
+    au_list = au_read(author_path, au_names, au_list)
+
+    # Search the articles the authors come from
+    for au_data in au_list:
+        if len(ar_temp) == 100:
+            num += 100
+            ar_list.append(ar_temp)
+            ar_temp = []  # Start a new batch so the batch just appended is not cleared
+
+            print(str(num) + " copies of data has been stored.")
+
+        if int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
+            ar_temp = ar_read(article_path, ar_names[3], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+
+        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
+            ar_temp = ar_read(article_path, ar_names[0], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+
+        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
+            ar_temp = ar_read(article_path, ar_names[1], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+
+        else:
+            ar_temp = ar_read(article_path, ar_names[2], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+
+    if len(ar_temp) > 0:  # Store remaining data
+        ar_list.append(ar_temp)
+
+    print(len(ar_list))
+    return ar_list
+
+
+# ========== Test code ==========
+# Read('./test_buffer/Author_output', './test_buffer/Article_output')
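
Note on the similarity step (not part of the patch): the Merge.py hunks above normalize both strings by stripping punctuation, lowercasing, and collapsing extra whitespace before the BERT + cosine_similarity comparison, and suggestion 3 records that the model-based comparison is slow and CPU-heavy. Below is a minimal sketch of that normalization paired with a stdlib difflib.SequenceMatcher ratio that could serve as a cheap pre-filter; the function names and the 0.9 threshold are illustrative assumptions, not code from this repository.

import re
from difflib import SequenceMatcher


def normalize(text):
    # Mirror the preprocessing added in SameWeb_merge: drop punctuation,
    # lowercase, and collapse runs of whitespace into single spaces.
    text = re.sub(r'[^\w\s]', '', str(text)).lower()
    return re.sub(r'\s+', ' ', text).strip()


def quick_similarity(text1, text2):
    # A rough, model-free ratio computed on the CPU; a pre-filter only,
    # not a replacement for the embedding-based comparison.
    return SequenceMatcher(None, normalize(text1), normalize(text2)).ratio()


# Example (hypothetical threshold): skip the expensive model comparison when
# the cheap score is already decisive.
# if quick_similarity(title_a, title_b) > 0.9:
#     treat the two records as the same entry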
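
Note on fileReader (not part of the patch): Read returns ar_list as a list of batches of up to 100 entries, where each entry is a dictionary pairing the source article's title with the author_id that referenced it. A short consumption sketch, reusing the paths from the commented test call; the loop variable names are illustrative.

from fileReader import Read

batches = Read('./test_buffer/Author_output', './test_buffer/Article_output')
for batch in batches:       # each batch is a list of up to 100 entries
    for entry in batch:     # each entry pairs an author_id with an article title
        print(entry['author_id'], '->', entry['title'])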