Update code for merging author data by API
parent 9b33cbabe7
commit 193581cd6a
@@ -27,12 +27,14 @@ def Division(folder_path):
     # Save into different files
     def Transf(data):
         os.makedirs("./nameDivision/", exist_ok=True)  # Create a new folder
         list = [[] for _ in range(27)]  # list of lists to stored data
 
         # Division into 27 files according to the first alpha,
         for Dict in data:
-            if Dict.get('last_name') is not None and len(Dict.get('last_name')[0].lower()) < 2 and \
-                    97 <= ord(Dict.get('last_name')[0].lower()) <= 122:
+            # print(Dict.get('last_name'))
+            # print(len(Dict.get('last_name')[0].lower()))
+            if Dict.get('last_name') is not None and Dict.get('last_name') != "" and \
+                    len(Dict.get('last_name')[0].lower()) < 2 and 97 <= ord(Dict.get('last_name')[0].lower()) <= 122:
 
                 num = ord(Dict.get('last_name')[0].lower()) - 96
                 list[num].append(Dict)
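The reworked guard above first rejects missing or empty last names, then keeps only entries whose first character lowercases to a single a-z letter, and maps that letter to a bucket index with ord(...) - 96. A minimal, self-contained illustration of the same check; the sample last names are made up:

names = ['alvarez', 'Brown', '', '张三']   # made-up sample last names

for last_name in names:
    if last_name is not None and last_name != "" and \
            len(last_name[0].lower()) < 2 and 97 <= ord(last_name[0].lower()) <= 122:
        print(last_name, '-> bucket', ord(last_name[0].lower()) - 96)   # 1..26 for a..z
    else:
        print(last_name, '-> skipped by the guard')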
@@ -55,7 +57,7 @@ def Division(folder_path):
     Transf(Read(folder_path))
 
 # ========== Test code ==========
-# Division('./test_buffer/Author_output')
+Division('./test_buffer/Author_output')
 
 
 
@@ -250,17 +250,18 @@ def SameWeb_merge(folder_path):
 
     print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
 
-    st = time.time()  # Start time
-    if len(temp_list) > 1:
-        executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
-        futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
-        for future in as_completed(futures):
-            pass
-        wait(futures)
-
-    et = time.time()  # End time
-    print('\nThread pool has been run for ' + str(et-st) + 's')
+    # Bert model merge
+    # st = time.time()  # Start time
+    #
+    # if len(temp_list) > 1:
+    #     executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
+    #     futures = [executor.submit(Similarity_merge, temp_list[i]) for i in range(len(temp_list))]
+    #     for future in as_completed(futures):
+    #         pass
+    #     wait(futures)
+    #
+    #     et = time.time()  # End time
+    #     print('\nThread pool has been run for ' + str(et-st) + 's')
 
     # Combine Data with temp_list
     for i in temp_list:
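The disabled block above used concurrent.futures to run Similarity_merge over the temporary list in parallel. For reference, a minimal runnable sketch of that thread-pool pattern; merge_task and the sample groups are stand-ins, not the project's Similarity_merge:

import time
from concurrent.futures import ThreadPoolExecutor, as_completed, wait

def merge_task(group):
    # Stand-in for Similarity_merge: pretend to merge one group of author records
    return len(group)

temp_list = [['rec1', 'rec2'], ['rec3'], ['rec4', 'rec5', 'rec6']]   # made-up payloads

st = time.time()                                   # Start time
if len(temp_list) > 1:
    executor = ThreadPoolExecutor(max_workers=10)  # Thread pool
    futures = [executor.submit(merge_task, temp_list[i]) for i in range(len(temp_list))]
    for future in as_completed(futures):
        pass                                       # results available via future.result()
    wait(futures)
    executor.shutdown()
et = time.time()                                   # End time
print('Thread pool has been run for ' + str(et - st) + 's')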
@@ -308,7 +309,7 @@ def SameWeb_merge(folder_path):
 # =========== input the file path here ==========
 # SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
 # SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
-SameWeb_merge('.\ejde\ejde_buffer\Author_output')
-
+# SameWeb_merge('.\ejde\ejde_buffer\Author_output')
+SameWeb_merge('.\\nameDivision\\1')
 
 
FileMerger/Merge_byNameAndEmail.py (new file, 132 lines)
@@ -0,0 +1,132 @@
+import json
+import os
+
+
+def SameWeb_merge(folder_path):
+    # Function
+    def SameName_merge(i, count1, count2, Data, ml):
+        # Same name merge
+        for j in range(i + 1, len(Data)):
+            if j < len(Data):
+                a = Data[i]
+                aa = Data[j]
+
+                if isinstance(a, dict) and isinstance(aa, dict):
+                    if len(a['from_article']) == 1 and len(aa['from_article']) == 1:
+                        if a.get("first_name") == aa.get("first_name") and a.get("middle_name") == aa.get(
+                                "middle_name") and a.get("last_name") == aa.get("last_name"):
+
+                            if a['from_article'] == aa['from_article']:  # Remove same data
+                                Data.remove(Data[j])
+                                count2[0] += 1
+                                return SameName_merge(i, count1, count2, Data, ml)
+
+                            else:
+                                ml.append(aa)
+
+                                # Update the counter
+                                count1[0] += 1
+                                if count1[0] % 100 == 0 and count1[0] != 0:
+                                    print(str(count1[0]) + ' copies of data have been merged by same name.')
+
+                                # Delete extra elements
+                                Data.remove(Data[j])
+
+                                return SameName_merge(i, count1, count2, Data, ml)
+
+        if len(ml) > 0:
+            ml.append(Data[i])  # Add first element
+            Data.remove(Data[i])
+            ml = sorted(ml, key=lambda x: x.get('affiliation')[0].get('year'))  # Sorted by year
+
+            # Add into Data list
+            if len(ml) == 1:
+                Data.insert(-1, ml[0])
+            else:
+                Data.insert(-1, ml)
+
+    # ========== Main code ==========
+    Data = []  # List of all data
+
+    count1 = [0]  # Same name merged data counter
+    count2 = [0]  # Duplicate data counter
+
+    num1 = 0  # Unique data counter
+    num2 = 0  # Complete merged data counter
+    num3 = 0  # Incomplete merged data counter
+
+    # Add data into list
+    for num_folder in os.listdir(folder_path):
+        num_folder_path = os.path.join(folder_path, num_folder)
+        for filename in os.listdir(num_folder_path):
+            if filename.endswith('.json'):
+                file_path = os.path.join(num_folder_path, filename)
+                with open(file_path, 'r', encoding='utf-8') as file:
+                    data = json.load(file)
+                    if len(data) > 0:
+                        Data.extend(data)
+
+    Database = len(Data)  # The length of the original data
+    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
+
+    # ========== Merge ==========
+    # ----- Same name data merge -----
+    ml = []
+    if len(Data) > 1:
+        for i in range(len(Data)):
+            ml.clear()
+            SameName_merge(i, count1, count2, Data, ml)
+
+    print('\n----- Same name data merge complete -----\n')
+
+    # ----- Similarity algorithm merge -----
+    # Change the index of incomplete data before other data
+    temp_list = []  # Temp list for incomplete merged data
+
+    if len(Data) > 1:
+        for i in range(len(Data) - 1, -1, -1):
+            if isinstance(Data[i], list):
+                temp = Data[i]
+                Data.remove(Data[i])
+                temp_list.append(temp)
+
+    print(str(len(temp_list)) + ' copies of incomplete merged data have been added to temporary list\n')
+
+    print('\n----- Similarity algorithm merge complete -----\n')
+
+    # ========== Statistic data ==========
+    # Data counter update
+    for a in Data:
+        if isinstance(a, dict) and len(a['from_article']) == 1:
+            num1 += 1
+        elif isinstance(a, dict) and len(a['from_article']) > 1:
+            num2 += 1
+        else:
+            num3 += 1
+
+    # Information
+    print('\n========== Complete ==========\n')
+    print(str(Database) + ' copies of data in total, before')
+    print(str(count1[0]) + ' copies of data have been merged by same name.')
+    print(str(count2[0]) + ' copies of duplicate data have been deleted')
+    print(str(len(Data)) + ' copies of data in total, now.\n')
+
+    print(str(num1) + ' copies of data are unique.')
+    print(str(num3) + ' copies of data are incomplete merged')
+
+    # Save into file
+    path = os.path.dirname(folder_path)  # parent path
+    path = os.path.join(path, "Author_data(merged)")
+    os.makedirs(path, exist_ok=True)
+    path = os.path.join(path, "Author_data(merged).json")
+
+    with open(path, 'w', encoding='utf-8') as file:
+        json.dump(temp_list, file, indent=4)
+
+    print('\nData has been added to ' + path)
+
+
+# =========== input the file path here ==========
+SameWeb_merge('.\\nameDivision')
+
+
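Merge_byNameAndEmail.py orders every record by the year of its first affiliation before the same-name pass, and incomplete merges are kept as nested lists. A small illustration of that sort key with made-up records:

records = [
    {'last_name': 'Li',  'affiliation': [{'year': '2015'}], 'from_article': ['a1']},
    {'last_name': 'Kim', 'affiliation': [{'year': '2003'}], 'from_article': ['a2']},
    {'last_name': 'Xu',  'affiliation': [{'year': '2010'}], 'from_article': ['a3']},
]

# Same key as in the script: the year of the first affiliation entry
records = sorted(records, key=lambda x: x['affiliation'][0]['year'])
print([r['last_name'] for r in records])   # ['Kim', 'Xu', 'Li']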
@@ -1,87 +1,71 @@
 import os
 import json
+from pprint import pprint
 
 '''
 ========== fileReader =========
-1. This program reads the data in the author-information json files that contain no email.
+1. This program reads the author information, retrieves the title of the source article, and stores it in a new dictionary.
 2. It looks up author_id and from_article in the author information and retrieves the title of the source article,
-   storing author_id and title as a dictionary in the pending list ar_list.
-3. ar_list is returned as the result.
+   storing the author information together with the title in the new dictionary.
+3. The new dictionary is saved to a json file.
 '''
 
 
 # Function to find the author data which does not have "email" information
 def Read(author_path, article_path):
     # Read data list
-    def au_read(path, file_names, list):
-        for file_name in file_names:
-            file_path = os.path.join(path, file_name)
-            with open(file_path, 'r', encoding='utf-8') as file:
-                data = json.load(file)
-                for Dict in range(len(data)-1, -1, -1):
-                    if data[Dict].get('affiliation', [{}])[0].get('email', 0) is None:
-                        list.append(data[Dict])
-                        # del data[Dict]
+    def au_read(path, list):
+        with open(path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+            for Dict in range(len(data)-1, -1, -1):
+                list.append(data[Dict])
+
         return list
 
-    def ar_read(path, file_name, list, ar_id, au_id):
+    def ar_read(path, file_name, ar_id, au_data, num):
         file_path = os.path.join(path, file_name)
         with open(file_path, 'r', encoding='utf-8') as file:
             data = json.load(file)
             for Dict in data:
-                if Dict.get('article_id') == ar_id:
-                    # A new dictionary to stored key information
-                    temp_data = {
-                        'title': Dict.get('title'),
-                        'author_id': au_id
-                    }
-
-                    list.append(temp_data)  # Add into list
-
-        return list
+                if Dict.get('article_id') == ar_id[0]:
+                    # A new dictionary to stored information
+                    au_data['from_article_title'] = Dict.get('title')
+                    num[0] += 1
 
     # ========== Main code ==========
     au_list = []  # List for author data
-    ar_list = []  # List for article data
-    ar_temp = []  # List for temp stored
-    num = 0  # Data number counter
+    num = [0]  # Data number counter
 
     # Read the file
-    au_names = os.listdir(author_path)
     ar_names = os.listdir(article_path)
 
     # Stored the author data which has no "email" information
-    au_list = au_read(author_path, au_names, au_list)
+    au_list = au_read(author_path, au_list)
 
     # Search the articles where the authors from
-    for au_data in au_list:
-        if len(ar_temp) == 100:
-            num += 100
-            ar_list.append(ar_temp)
-            ar_temp.clear()
-
-            print(str(num) + " copies of data has been stored.")
-
-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
-            ar_temp = ar_read(article_path, ar_names[3], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
-            ar_temp = ar_read(article_path, ar_names[0], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-        elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
-            ar_temp = ar_read(article_path, ar_names[1], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-        else:
-            ar_temp = ar_read(article_path, ar_names[2], ar_temp, au_data.get('from_article'), au_data.get('author_id'))
-
-    if len(ar_temp) > 0:  # Stored remaining data
-        ar_list.append(ar_temp)
-
-    print(len(ar_list))
-    return ar_list
+    for au_MergeList in au_list:
+        for au_data in au_MergeList:
+            if num[0] % 100 == 0 and num[0] != 0:
+                print(str(num[0]) + " copies of data have been done.")
+
+            if int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2009:
+                ar_read(article_path, ar_names[3], au_data.get('from_article'), au_data, num)
+
+            elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2010:
+                ar_read(article_path, ar_names[0], au_data.get('from_article'), au_data, num)
+
+            elif int(au_data.get('affiliation', [{}])[0].get('year', 0)) <= 2020:
+                ar_read(article_path, ar_names[1], au_data.get('from_article'), au_data, num)
+
+            else:
+                ar_read(article_path, ar_names[2], au_data.get('from_article'), au_data, num)
+
+    with open('./Author_data(merged)/Author_data(info_supplementary).json', 'w', encoding='utf-8') as file:
+        json.dump(au_list, file, indent=4)
+
+    print('All data have been stored into ./Author_data(merged)/Author_data(info_supplementary).json')
 
 
 # ========== Test code ==========
-# Read('./test_buffer/Author_output', './test_buffer/Article_output')
+Read('./Author_data(merged)/Author_data(merged).json', './test_buffer/Article_output')
 
 
@@ -130,7 +130,7 @@ Retry_author_aminerID = []
 
 
 # ---------- Call the API ----------
 for title in article_title:  # Get the article web-ID
     aminer_article_webID(title)
 
 if len(article_aminerID) > 0: