ScholarDataMining/FileMerger/Division_byName.py
2023-10-09 23:24:58 +08:00

64 lines
2.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
import re
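'''
========== Division_byName ==========
1. This program further divides the author records by the first letter of last_name,
   which makes it easier to merge the records of authors without an email address later on.
2. The resulting nameDivision folder contains 27 subfolders. Folder 0 stores the authors whose
   last_name does not start with an English letter; the remaining letters are stored, in
   alphabetical order, in folders 1 - 26. The name of the json file inside each folder differs
   depending on the source website.
'''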
def _read_json_records(folder_path):
    """Load every ``.json`` file directly inside *folder_path* into one flat list.

    Each file is expected to hold a JSON array; the arrays are concatenated in
    the order ``os.listdir`` returns the files.
    """
    records = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                records.extend(json.load(file))
    print('\nComplete: Files reader.\n')
    return records


def _bucket_index(last_name):
    """Return the bucket number for *last_name*: 1-26 for a-z, otherwise 0."""
    # Missing, empty or non-string values (e.g. a JSON number) have no initial.
    if not isinstance(last_name, str) or not last_name:
        return 0
    initial = last_name[0].lower()
    # lower() can expand one character into several (e.g. 'İ'); such a result
    # is never a plain ASCII letter and must not be passed to ord().
    if len(initial) == 1 and 'a' <= initial <= 'z':
        return ord(initial) - ord('a') + 1
    return 0


def _source_name(folder_path):
    """Derive the output file stem (the website/buffer name) from *folder_path*.

    The first path component following ``./`` is used, e.g.
    ``./test_buffer/Author_output`` -> ``test_buffer``.  When the path does not
    contain that pattern, fall back to the name of the folder's parent
    directory (or the folder's own name) instead of raising ``IndexError``.
    """
    match = re.search(r"\./(.*?)/", folder_path)
    if match:
        return match.group(1)
    absolute = os.path.abspath(folder_path)
    return os.path.basename(os.path.dirname(absolute)) or os.path.basename(absolute)


def Division(folder_path):
    """Split author records into 27 buckets by the first letter of ``last_name``.

    All ``.json`` files in *folder_path* are read and merged, then every record
    is assigned to bucket 1-26 (``a``-``z``, case-insensitive) or to bucket 0
    when ``last_name`` is missing, empty, not a string, or does not start with
    an English letter.  Each bucket is written to
    ``./nameDivision/<bucket>/<source>.json`` (relative to the current working
    directory), where ``<source>`` is derived from *folder_path*.

    Args:
        folder_path: Directory containing the author ``.json`` files.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be
            read or written.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    records = _read_json_records(folder_path)

    # Resolve the output file name once, before anything is written, so that a
    # problematic path can never leave a half-populated output tree behind.
    web_name = _source_name(folder_path)

    os.makedirs("./nameDivision/", exist_ok=True)  # Create the output root

    # One bucket per letter plus bucket 0 for non-alphabetic last names
    buckets = [[] for _ in range(27)]
    for record in records:
        buckets[_bucket_index(record.get('last_name'))].append(record)

    # Every bucket gets its own folder and file, even when it is empty
    for num, bucket in enumerate(buckets):
        directory = "./nameDivision/" + str(num)
        os.makedirs(directory, exist_ok=True)
        file_path = os.path.join(directory, web_name + ".json")
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(bucket, file, indent=4)
        print('Complete: ' + str(len(bucket)) + ' copies of data have been saved into ' + directory)
# ========== Test code ==========
# Runs the division once on the sample buffer directory './test_buffer/Author_output'.
# NOTE(review): this call appears to sit at module level without an
# `if __name__ == "__main__":` guard, so it would also run whenever the module
# is imported — confirm that this is intended.
Division('./test_buffer/Author_output')