Fix bugs and add new code to search author data without email information
parent 2a1fcfc4cd
commit 2f6f86a48e
@@ -2,13 +2,12 @@ import json
 import os
 import re
 import time
 
 import unicodedata
+import torch
 
 from pprint import pprint
 from concurrent.futures import ThreadPoolExecutor, as_completed, wait
 
 from transformers import BertTokenizer, BertModel
-import torch
 from sklearn.metrics.pairwise import cosine_similarity
 
 '''
@@ -43,8 +42,11 @@ from sklearn.metrics.pairwise import cosine_similarity
 
 Suggestions:
 1. In the merge step, first convert Latin script and other non-English characters to English before comparing (resolved)
-2. For data with arbitrarily added punctuation and extra information, do local preprocessing before comparing (rather
-complex; no good approach for now)
+2. For data with arbitrarily added punctuation and extra information, do local preprocessing before comparing (resolved
+by deleting extra spaces and all punctuation before running the similarity comparison)
+3. The similarity comparison needs a faster approach; comparing with the preloaded model takes too long, and
+CPU usage is also high
 
 '''
 
 
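Suggestions 2 and 3 above describe the approach now used: strip punctuation and extra whitespace, then compare BERT embeddings with cosine similarity. A minimal sketch of that flow, using the libraries the module already imports; the model name 'bert-base-uncased', the helper names normalize and similarity, and the sample strings are illustrative assumptions rather than code taken from this repository.

import re

import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


def normalize(text):
    # Suggestion 2: drop all punctuation, lower-case, and collapse repeated whitespace
    text = re.sub(r'[^\w\s]', '', str(text)).lower()
    return re.sub(r'\s+', ' ', text).strip()


def similarity(text1, text2, tokenizer, model):
    # Encode both normalized strings with BERT and compare their [CLS] embeddings
    with torch.no_grad():
        vectors = []
        for text in (normalize(text1), normalize(text2)):
            inputs = tokenizer(text, return_tensors='pt', truncation=True)
            vectors.append(model(**inputs).last_hidden_state[:, 0, :].numpy())
    return cosine_similarity(vectors[0], vectors[1])[0][0]


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
print(similarity('Dept. of  Physics, M.I.T.!', 'Department of Physics MIT', tokenizer, model))

Loading the tokenizer and model once and reusing them for every pair, as above, is one way to limit the runtime and CPU cost raised in suggestion 3.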
@@ -61,7 +63,7 @@ def SameWeb_merge(folder_path):
     if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
         if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
                 "middle_name") and a.get("last_name") == aa.get("last_name"):
-            if a['from_article'] == aa['from_article']:
+            if a['from_article'] == aa['from_article']:  # Remove same data
                 Data.remove(Data[j])
                 count2[0] += 1
                 return SameName_merge(i, count1, count2, Data, ml)
@@ -165,7 +167,7 @@ def SameWeb_merge(folder_path):
     text1 = re.sub(r'[^\w\s]', '', str(text1)).lower()
     text2 = re.sub(r'[^\w\s]', '', str(text2)).lower()
 
-    # Delete space
+    # Delete extra spaces
     text1 = re.sub(r'\s+', ' ', text1).strip()
     text2 = re.sub(r'\s+', ' ', text2).strip()
 
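For reference, a small worked example of what the two re.sub passes above produce; the sample affiliation string is made up:

import re

text = 'Dept.  of  Computer Science,  Univ. of Example!'
text = re.sub(r'[^\w\s]', '', str(text)).lower()  # strip punctuation and lower-case
text = re.sub(r'\s+', ' ', text).strip()          # collapse repeated whitespace
print(text)  # dept of computer science univ of example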
@@ -211,6 +213,7 @@ def SameWeb_merge(folder_path):
     num1 = 0  # Unique data counter
     num2 = 0  # Complete merged data counter
     num3 = 0  # Incomplete merged data counter
+    num4 = 0  # Similarity algorithm merged data counter
 
     # Add data into list
     for filename in os.listdir(folder_path):
@@ -262,9 +265,10 @@ def SameWeb_merge(folder_path):
     # Combine Data with temp_list
     for i in temp_list:
         if len(i) == 1:
-            Data.append(i[0])
+            num4 += 1
+            Data.insert(0, i[0])
         else:
-            Data.append(i)
+            Data.insert(-1, i)
 
     print('\n----- Similarity algorithm merge complete -----\n')
 
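A quick illustration of the list placement used above, with made-up contents: insert(0, ...) puts the merged entry at the front of Data, while insert(-1, ...) puts it just before the last element rather than at the end.

Data = ['a', 'b', 'c']
Data.insert(0, 'x')   # Data is now ['x', 'a', 'b', 'c']
Data.insert(-1, 'y')  # Data is now ['x', 'a', 'b', 'y', 'c']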
@@ -287,6 +291,7 @@ def SameWeb_merge(folder_path):
 
     print(str(num1) + ' copies of data are unique.')
     print(str(num2) + ' copies of data are complete merged')
+    print(str(num4) + ' copies of data are complete merged by similarity algorithm')
     print(str(num3) + ' copies of data are incomplete merged')
 
     # Save into file
00_File_merge/fileReader.py (new file, 87 lines)
@@ -0,0 +1,87 @@
+import os
+import json
+
+'''
+========== fileReader =========
+1. This program reads the entries in the author-information json files that contain no email data.
+2. It uses author_id and from_article from each author entry to look up the title of the source article,
+and stores author_id and title together as a dictionary in the pending list ar_list.
+3. ar_list is returned as the result.
+'''
+
+
+# Function to find the author data which does not have "email" information
+def Read(author_path, article_path):
+    # Read data list
+    def au_read(path, file_names, list):
+        for file_name in file_names:
+            file_path = os.path.join(path, file_name)
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+                for Dict in range(len(data) - 1, -1, -1):
+                    if data[Dict].get('affiliation', [{}])[0].get('email', 0) is None:
+                        list.append(data[Dict])
+                        # del data[Dict]
+        return list
+
+    def ar_read(path, file_name, list, ar_id, au_id):
+        file_path = os.path.join(path, file_name)
+        with open(file_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+            for Dict in data:
+                if Dict.get('article_id') == ar_id:
+                    # A new dictionary to store key information
+                    temp_data = {
+                        'title': Dict.get('title'),
+                        'author_id': au_id
+                    }
+
+                    list.append(temp_data)  # Add into list
+
+        return list
+
+    # ========== Main code ==========
+    au_list = []  # List for author data
+    ar_list = []  # List for article data
+    ar_temp = []  # List for temporary storage
+    num = 0  # Data number counter
+
+    # Read the file
+    au_names = os.listdir(author_path)
+    ar_names = os.listdir(article_path)
+
+    # Store the author data which has no "email" information
+    au_list = au_read(author_path, au_names, au_list)
+
+    # Search the articles the authors come from
+    for au_data in au_list:
+        if len(ar_temp) == 100:
+            num += 100
+            ar_list.append(ar_temp)
+            ar_temp.clear()
+
+            print(str(num) + " copies of data has been stored.")
+
+        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
+            ar_temp = ar_read(article_path, ar_names[3], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+
+        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
+            ar_temp = ar_read(article_path, ar_names[0], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+
+        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
+            ar_temp = ar_read(article_path, ar_names[1], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+
+        else:
+            ar_temp = ar_read(article_path, ar_names[2], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
+
+    if len(ar_temp) > 0:  # Store remaining data
+        ar_list.append(ar_temp)
+
+    print(len(ar_list))
+    return ar_list
+
+
+# ========== Test code ==========
+# Read('./test_buffer/Author_output', './test_buffer/Article_output')
+
+
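A possible usage sketch for the new reader; the run location, directory names, and expected JSON fields (each author entry carrying 'author_id', 'from_article', and an 'affiliation' list whose first element may hold 'email' and 'year'; each article entry carrying 'article_id' and 'title') are assumptions inferred from the lookups in the code, not a documented schema.

# Run from inside 00_File_merge/ so that fileReader is importable
from fileReader import Read

batches = Read('./test_buffer/Author_output', './test_buffer/Article_output')
for batch in batches:        # Read returns ar_list, a list of batches
    for item in batch:       # each item is {'title': ..., 'author_id': ...}
        print(item['author_id'], item['title'])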