Fix bugs and add new code to search for author data without email information

This commit is contained in:
Chenxiao Xia 2023-09-20 23:29:42 +08:00
parent 2a1fcfc4cd
commit 2f6f86a48e
2 changed files with 101 additions and 9 deletions

View File

@ -2,13 +2,12 @@ import json
import os
import re
import time
import unicodedata
import torch
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
'''
@ -43,8 +42,11 @@ from sklearn.metrics.pairwise import cosine_similarity
Suggestions
1. In the merge step, first normalize Latin script and other non-English characters to English before comparison (resolved)
2. For data with arbitrarily added punctuation and extra information, pre-process it locally before comparison;
this is complicated and there is no good approach yet
2. For data with arbitrarily added punctuation and extra information, pre-process it locally before comparison (resolved:
strip extra spaces and all punctuation before running the similarity comparison)
3. The similarity comparison needs a faster approach; comparing through the preloaded model takes too long and
CPU usage is also high
'''
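# Illustrative sketch only (not the repo's code) of the model-based comparison
# that suggestion 3 above calls slow: embed both strings with the pretrained
# BERT model imported at the top, then score them with cosine similarity.
# The model name and mean pooling are assumptions; loading the model once,
# outside the function, would be cheaper in practice.
def bert_similarity(text1, text2):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    def embed(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool the last hidden state into one vector per string
        return outputs.last_hidden_state.mean(dim=1).numpy()
    return cosine_similarity(embed(text1), embed(text2))[0][0]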
@ -61,7 +63,7 @@ def SameWeb_merge(folder_path):
if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
"middle_name") and a.get("last_name") == aa.get("last_name"):
if a['from_article'] == aa['from_article']:
if a['from_article'] == aa['from_article']: # Remove the duplicate record
Data.remove(Data[j])
count2[0] += 1
return SameName_merge(i, count1, count2, Data, ml)
@ -165,7 +167,7 @@ def SameWeb_merge(folder_path):
text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()
# Delete space
# Delete extra spaces
text1 = re.sub(r'\s+', ' ', text1).strip()
text2 = re.sub(r'\s+', ' ', text2).strip()
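# For example (illustrative values), two noisy variants normalize identically:
#   'Dept. of C.S.,  MIT '  -> strip punctuation, lower -> 'dept of cs  mit '
#                           -> collapse whitespace      -> 'dept of cs mit'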
@ -211,6 +213,7 @@ def SameWeb_merge(folder_path):
num1 = 0 # Unique data counter
num2 = 0 # Completely merged data counter
num3 = 0 # Incompletely merged data counter
num4 = 0 # Similarity algorithm merged data counter
# Add data into list
for filename in os.listdir(folder_path):
@ -262,9 +265,10 @@ def SameWeb_merge(folder_path):
# Combine Data with temp_list
for i in temp_list:
if len(i) == 1:
Data.append(i[0])
num4 += 1
Data.insert(0, i[0])
else:
Data.append(i)
Data.insert(-1, i)
print('\n----- Similarity algorithm merge complete -----\n')
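# Note on the insert calls above (explanatory, not repo code):
# Data.insert(0, i[0]) puts similarity-merged single records at the front,
# while Data.insert(-1, i) places a group *before* the current last element,
# not after it: [1, 2, 3].insert(-1, 9) yields [1, 2, 9, 3].
# A plain Data.append(i) would add it at the end instead.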
@ -287,6 +291,7 @@ def SameWeb_merge(folder_path):
print(str(num1) + ' copies of data are unique.')
print(str(num2) + ' copies of data are completely merged')
print(str(num4) + ' copies of data are completely merged by the similarity algorithm')
print(str(num3) + ' copies of data are incompletely merged')
# Save into file

View File

@ -0,0 +1,87 @@
import os
import json
'''
========== fileReader ==========
1. This program reads the records in the author-information JSON files that contain no email data.
2. It looks up each author record to get its author_id and from_article, then searches the article data for the
   title of the source article; author_id and title are stored together as a dictionary in the pending list ar_list.
3. ar_list is returned as the result.
'''
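# Example of one collected entry (shape assumed from temp_data below):
#   {'title': 'Some Article Title', 'author_id': 42}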
# Function to find the author data which does not have "email" information
def Read(author_path, article_path):
    # Read the author files and collect every record whose email is missing
    def au_read(path, file_names, result):
        for file_name in file_names:
            file_path = os.path.join(path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            # Iterate in reverse so records could also be deleted in place
            for i in range(len(data) - 1, -1, -1):
                # Match records whose email field is explicitly null;
                # a missing key falls back to 0 and is skipped
                if data[i].get('affiliation', [{}])[0].get('email', 0) is None:
                    result.append(data[i])
                    # del data[i]
        return result
    # Search one article file for the source article and record its title
    def ar_read(path, file_name, result, ar_id, au_id):
        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        for record in data:
            if record.get('article_id') == ar_id:
                # A new dictionary to store the key information
                temp_data = {
                    'title': record.get('title'),
                    'author_id': au_id
                }
                result.append(temp_data)  # Add it to the result list
        return result
    # ========== Main code ==========
    au_list = []  # List for author data
    ar_list = []  # List for article data (batches)
    ar_temp = []  # Temporary buffer for the current batch
    num = 0       # Counter for stored records
    # List the data files
    au_names = os.listdir(author_path)
    ar_names = os.listdir(article_path)
    # Collect the author records that have no email information
    au_list = au_read(author_path, au_names, au_list)
    # Search for the articles the authors come from, flushing in batches of 100
    for au_data in au_list:
        if len(ar_temp) == 100:
            num += 100
            # Append a copy: clearing ar_temp in place would otherwise also
            # empty the batch just stored, since both names share one list
            ar_list.append(ar_temp.copy())
            ar_temp.clear()
            print(str(num) + " copies of data have been stored.")
        # Pick the article file by year range; note that os.listdir order is
        # not guaranteed, so the index-based file selection is fragile
        year = int(au_data.get('affiliation', [{}])[0].get('year', 0))
        if year <= 2009:
            ar_temp = ar_read(article_path, ar_names[3], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
        elif year <= 2010:
            ar_temp = ar_read(article_path, ar_names[0], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
        elif year <= 2020:
            ar_temp = ar_read(article_path, ar_names[1], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
        else:
            ar_temp = ar_read(article_path, ar_names[2], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
    if len(ar_temp) > 0:  # Store the remaining partial batch
        ar_list.append(ar_temp)
    print(len(ar_list))
    return ar_list
# ========== Test code ==========
# Read('./test_buffer/Author_output', './test_buffer/Article_output')
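A side note on the batching in Read above: appending ar_temp and then clearing it in place would empty the batch just stored, because both names refer to the same list object, which is why the code appends a copy. A standalone illustration (not from the repo):

batch = [1, 2]
batches = []
batches.append(batch)                # batches holds a reference, not a snapshot
batch.clear()                        # ...so this also empties batches[0]
print(batches)                       # [[]]
batches_fixed = []
batch2 = [1, 2]
batches_fixed.append(batch2.copy())  # store a copy instead
batch2.clear()
print(batches_fixed)                 # [[1, 2]]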