Merge remote-tracking branch 'origin/main'

ldy 2023-08-11 12:44:53 +08:00
commit 69b10a9f72
5 changed files with 44 additions and 136 deletions

View File

@@ -1,8 +1,7 @@
 import json
 import os
-from pprint import pprint
 import unicodedata
 from pprint import pprint
 '''
 ========== SameWeb_merge(folder_path) execution order ==========
@@ -53,13 +52,16 @@ def SameWeb_merge(folder_path):
                 # Normalize characters to ASCII English
                 fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
-                faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
+                faa = unicodedata.normalize('NFKD', aa['affiliation'][0]["affiliation"]).encode('ascii', 'ignore')
                 if fa != faa:
                     a['affiliation'].extend(aa['affiliation'])
-                elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
+                elif fa == faa and a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
                     a['affiliation'].extend(aa['affiliation'])
+                if len(a['affiliation']) > 1 and a['affiliation'][0] == a['affiliation'][1]:
+                    a['affiliation'].remove(a['affiliation'][0])
                 # Delete extra elements
                 Data.remove(aa)
@@ -82,7 +84,7 @@ def SameWeb_merge(folder_path):
                 if len(data) > 0:
                     Data.extend(data)
-    Database = len(Data)
+    Database = len(Data)  # The length of the original data
     Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
     # Same website data merge
@@ -96,7 +98,7 @@ def SameWeb_merge(folder_path):
     print(str(len(Data)) + ' copies of data in total, now.')
     # Save into file
-    path = os.path.dirname(folder_path)
+    path = os.path.dirname(folder_path)  # parent path
     path = os.path.join(path, "Author_data(merged)")
     os.makedirs(path, exist_ok=True)
     path = os.path.join(path, "Author_data(merged).json")
@@ -106,10 +108,10 @@ def SameWeb_merge(folder_path):
     print('\nData has been added to ' + path)
 # =========== input the file path here ==========
 # SameWeb_merge('.\EJQTDE\EJQTDE_buffer\Author_output')
 # SameWeb_merge('.\SpringerOpen\SpringerOpen_buffer\Author_output')
 # SameWeb_merge('.\ejde\ejde_buffer\Author_output')
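Read in isolation, the corrected logic compares a's most recent affiliation with aa's earliest one and then deduplicates. Below is a minimal runnable sketch of just this step; the record shapes and sample values are assumptions for illustration, not data from the repository:

import unicodedata

def ascii_fold(s):
    # Fold accents and other Latin marks away so spellings like
    # 'Universität' and 'Universitat' compare equal
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

def merge_affiliations(a, aa):
    # Compare a's latest affiliation against aa's earliest (the index fix in this commit)
    fa = ascii_fold(a['affiliation'][-1]['affiliation'])
    faa = ascii_fold(aa['affiliation'][0]['affiliation'])
    # Extend when the places differ, or when the place matches but the years differ
    if fa != faa or a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
        a['affiliation'].extend(aa['affiliation'])
    # The new guard: drop a leading exact duplicate entry
    if len(a['affiliation']) > 1 and a['affiliation'][0] == a['affiliation'][1]:
        del a['affiliation'][0]

a = {'affiliation': [{'affiliation': 'Universität Wien', 'year': 2010}]}
aa = {'affiliation': [{'affiliation': 'Universitat Wien', 'year': 2012}]}
merge_affiliations(a, aa)
print(len(a['affiliation']))  # 2: same place but different years, so both are kept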

View File

@@ -1,97 +0,0 @@
import json
import os
import unicodedata
'''
========== SameWeb_merge(folder_path) execution order ==========
1. Create a new list, read the json files in the folder into it one by one,
   and sort the list by year.
2. Traverse with the structure
       for a in Data:
           Merge(a, count, Data):
               for aa in Data:
   Each time a matching aa element is found it is merged into the a element,
   aa is deleted, and Merge is called again to re-traverse, until Data no
   longer contains a matching aa element.
3. In the end, all elements in the Data list have completed same-author merging.
*Tips: Merging author information from the same website works by comparing
   whether key values are identical. This performs well on websites with
   fairly standardized data formats and poorly on websites that do not
   standardize their data. Concretely, semantically identical words that mix
   Latin and English characters are not distinguishable, and synonyms,
   arbitrarily added punctuation, and extra information keep the strict
   comparison from accurately judging whether key values are the same.
Suggestions:
   1. In the merging step, first normalize Latin and other non-English
      characters to English, then compare (solved).
   2. For data with arbitrarily added punctuation and extra information, do
      some local preprocessing before comparing; this is more complex and
      there is no good approach yet (one possible direction is sketched after
      this file listing).
'''
def SameWeb_merge(folder_path):
    # Function
    def Merge(a, count, Data):
        for aa in Data:
            if a.get("firstname") == aa.get("firstname") and a.get("middlename") == aa.get("middlename") and a.get(
                    "lastname") == aa.get("lastname") and a != aa:
                # Add different key-elements of "from_article" into the first element
                if isinstance(a["from_article"], list):
                    if isinstance(aa["from_article"], list):
                        a["from_article"] += aa["from_article"]
                    else:
                        a["from_article"].append(aa["from_article"])
                elif isinstance(a["from_article"], str):
                    if isinstance(aa["from_article"], list):
                        a["from_article"] = [a["from_article"]] + aa["from_article"]
                    else:
                        a["from_article"] = [a["from_article"], aa["from_article"]]
                # Add different key-elements of "affiliation" into the first element
                a['affiliation'] = sorted(a['affiliation'], key=lambda x: x['year'])  # Sort by year
                # Normalize characters to ASCII English
                fa = unicodedata.normalize('NFKD', a['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
                faa = unicodedata.normalize('NFKD', aa['affiliation'][-1]["affiliation"]).encode('ascii', 'ignore')
                if fa != faa:
                    a['affiliation'].extend(aa['affiliation'])
                elif a['affiliation'][-1]['year'] != aa['affiliation'][0]['year']:
                    a['affiliation'].extend(aa['affiliation'])
                # Delete extra elements
                Data.remove(aa)
                # Update the counter
                count[0] += 1
                return Merge(a, count, Data)

    # ========== Main code ==========
    Data = []  # Empty list
    count = [0]  # counter
    # Add data into list
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if len(data) > 0:
                    Data.extend(data)
    Database = len(Data)
    Data = sorted(Data, key=lambda x: x['affiliation'][0]['year'])
    # Same website data merge
    for a in Data:
        Merge(a, count, Data)
    # Information
    print(str(count[0]) + ' copies of data have been merged.')
    print(str(Database) + ' copies of data in total, before.')
    print(str(len(Data)) + ' copies of data in total, now.')

# =========== input the file path here ==========
SameWeb_merge('./EJDE_buffer/Author_output')
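The docstring's second suggestion (data that differs only in stray punctuation or extra text) was left open. One possible direction, offered purely as a sketch rather than code from this repository, is to reduce both strings to a loose comparison key before comparing:

import string
import unicodedata

def loose_key(s):
    # Fold non-ASCII to ASCII, drop punctuation, collapse whitespace, lowercase
    s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
    s = s.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(s.lower().split())

print(loose_key('Dept. of Mathematics, Universität Wien') ==
      loose_key('Dept of Mathematics  Universitat Wien'))  # True

This would still miss true synonyms ('Dept' vs 'Department'), which is the hard part the docstring points at.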

View File

@@ -63,17 +63,17 @@ def Transf():
     # The path of storage
     author_output_file = [
-        './ejde_buffer/Author_output/Author_output_file(oldest).json',
-        './ejde_buffer/Author_output/Author_output_file(2010-2014).json',
-        './ejde_buffer/Author_output/Author_output_file(2015-2020).json',
-        './ejde_buffer/Author_output/Author_output_file(newest).json'
+        './ejde_buffer/Author_output/EJDE_Author_output_file(oldest).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(2010-2014).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(2015-2020).json',
+        './ejde_buffer/Author_output/EJDE_Author_output_file(newest).json'
     ]
     article_output_file = [
-        './ejde_buffer/Article_output/Article_output_file(oldest).json',
-        './ejde_buffer/Article_output/Article_output_file(2010-2014).json',
-        './ejde_buffer/Article_output/Article_output_file(2015-2020).json',
-        './ejde_buffer/Article_output/Article_output_file(newest).json'
+        './ejde_buffer/Article_output/EJDE_Article_output_file(oldest).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(2010-2014).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(2015-2020).json',
+        './ejde_buffer/Article_output/EJDE_Article_output_file(newest).json'
     ]
     # Read and write into files
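The same rename repeats below for EJQTDE and SpringerOpen: each output filename gains its journal prefix. If these lists were ever generated rather than hand-written, a hypothetical helper (not present in the repository) could build them from the prefix:

import os

def output_paths(buffer_dir, journal, kind):
    # kind is 'Author' or 'Article'; the periods match the four buckets used throughout
    periods = ['oldest', '2010-2014', '2015-2020', 'newest']
    return [os.path.join(buffer_dir, kind + '_output',
                         journal + '_' + kind + '_output_file(' + p + ').json')
            for p in periods]

print(output_paths('./ejde_buffer', 'EJDE', 'Author'))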

View File

@@ -35,7 +35,7 @@ def Transf():
     for Dict in data:
         if Dict.get('volume') is not None or Dict.get('affiliation', [{}])[0].get('year', 0) is not None:
-            # Filter articles
+            # Select data
             if (isinstance(Dict, dict) and int(
                     Dict.get('volume') or Dict.get('affiliation', [{}])[0].get('year', 0)) <= 2009):
                 data_oldest.append(Dict)
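For context, this selection step derives a record's year from 'volume' when present and otherwise from the first affiliation. A small sketch of that derivation, with an assumed record shape:

def year_of(record):
    # 'volume' doubles as the year here; fall back to the first affiliation's year
    return int(record.get('volume') or record.get('affiliation', [{}])[0].get('year', 0))

print(year_of({'affiliation': [{'year': 2008}]}) <= 2009)  # True -> data_oldest

Note that the enclosing guard is always true, since .get('year', 0) returns 0 rather than None when the key is missing.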
@@ -65,17 +65,17 @@ def Transf():
     # The path of storage
     author_output_file = [
-        './EJQTDE_buffer/Author_output/Author_output_file(oldest).json',
-        './EJQTDE_buffer/Author_output/Author_output_file(2010-2014).json',
-        './EJQTDE_buffer/Author_output/Author_output_file(2015-2020).json',
-        './EJQTDE_buffer/Author_output/Author_output_file(newest).json'
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(oldest).json',
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(2010-2014).json',
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(2015-2020).json',
+        './EJQTDE_buffer/Author_output/EJQTDE_Author_output_file(newest).json'
     ]
     article_output_file = [
-        './EJQTDE_buffer/Article_output/Article_output_file(oldest).json',
-        './EJQTDE_buffer/Article_output/Article_output_file(2010-2014).json',
-        './EJQTDE_buffer/Article_output/Article_output_file(2015-2020).json',
-        './EJQTDE_buffer/Article_output/Article_output_file(newest).json'
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(oldest).json',
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(2010-2014).json',
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(2015-2020).json',
+        './EJQTDE_buffer/Article_output/EJQTDE_Article_output_file(newest).json'
     ]
     # Read and write into files
@@ -95,6 +95,7 @@ def delete():
             file_path = os.path.join(folder_path, file_name)
             if os.path.isfile(file_path):
                 os.remove(file_path)
+        os.rmdir(folder_path)
     print('\nAttention: The temporary storage files have been deleted!')

View File

@@ -63,17 +63,17 @@ def Transf():
     # Storage paths
     author_output_file = [
-        './SpringerOpen_buffer/Author_output/Author_output_file(oldest).json',
-        './SpringerOpen_buffer/Author_output/Author_output_file(2010-2014).json',
-        './SpringerOpen_buffer/Author_output/Author_output_file(2015-2020).json',
-        './SpringerOpen_buffer/Author_output/Author_output_file(newest).json'
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(oldest).json',
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Author_output/SpringerOpen_Author_output_file(newest).json'
     ]
     article_output_file = [
-        './SpringerOpen_buffer/Article_output/Article_output_file(oldest).json',
-        './SpringerOpen_buffer/Article_output/Article_output_file(2010-2014).json',
-        './SpringerOpen_buffer/Article_output/Article_output_file(2015-2020).json',
-        './SpringerOpen_buffer/Article_output/Article_output_file(newest).json'
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(oldest).json',
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(2010-2014).json',
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(2015-2020).json',
+        './SpringerOpen_buffer/Article_output/SpringerOpen_Article_output_file(newest).json'
     ]
     # Read and write into files
@@ -85,12 +85,14 @@ def Transf():
 # Delete the temporary storage files
-def delete(folder_path):
-    file_names = os.listdir(folder_path)
-    for file_name in file_names:
-        file_path = os.path.join(folder_path, file_name)
-        if os.path.isfile(file_path):
-            os.remove(file_path)
+def delete():
+    folder_paths = ['./SpringerOpen_buffer/Author_TS', './SpringerOpen_buffer/Article_TS']
+    for folder_path in folder_paths:
+        file_names = os.listdir(folder_path)
+        for file_name in file_names:
+            file_path = os.path.join(folder_path, file_name)
+            if os.path.isfile(file_path):
+                os.remove(file_path)
+        os.rmdir(folder_path)
     print('\nAttention: The temporary storage files have been deleted!')
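Since the new delete() removes every file and then the directory itself, a shorter equivalent would be shutil.rmtree, assuming the folders contain nothing worth keeping. This is a comparison sketch only, not the commit's code:

import shutil

def delete():
    folder_paths = ['./SpringerOpen_buffer/Author_TS', './SpringerOpen_buffer/Article_TS']
    for folder_path in folder_paths:
        # Removes the folder and all its contents; the commit's version
        # (os.remove per file, then os.rmdir) handles only flat folders
        shutil.rmtree(folder_path, ignore_errors=True)
    print('\nAttention: The temporary storage files have been deleted!')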