New function to divide author data according to their last name's first letter
This commit is contained in:
parent
34fb579f7c
commit
2a1fcfc4cd
61
00_File_merge/Division_byName.py
Normal file
61
00_File_merge/Division_byName.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
'''
|
||||||
|
========== Division_byName ==========
|
||||||
|
1. 本程序用于将作者信息再次按照 last_name 的首字母进行二次划分,便于后期对于无邮箱的作者信息进行合并。
|
||||||
|
2. 在最后输出的 nameDivision 文件夹中包含 27 个小文件夹,其中 0号 文件夹用来存储 last_name 首字母非英文字母的作者信息,
|
||||||
|
其余字母则按照排列顺序存储到 1 - 26号 文件夹之中,每个文件夹中的json文件名称会根据网站的不同有所区别。
|
||||||
|
'''
|
||||||
|
|
||||||
|
def _bucket_index(record):
    """Return the bucket number (0-26) for one author record.

    Buckets 1-26 correspond to 'a'-'z' (case-insensitive) for the first
    character of ``record['last_name']``. Bucket 0 collects every record
    whose last name is missing, ``None``, empty, not a string, or does not
    start with an ASCII letter.
    """
    last_name = record.get('last_name')
    # A missing, empty or non-string name has no usable first letter; the
    # original code crashed on "" (IndexError) and on numbers (TypeError).
    if not isinstance(last_name, str) or not last_name:
        return 0

    initial = last_name[0].lower()
    # lower() can expand a single character into several; such characters
    # are not ASCII letters, and ord() would reject them anyway.
    if len(initial) == 1 and 'a' <= initial <= 'z':
        return ord(initial) - ord('a') + 1
    return 0


def _read_records(folder_path):
    """Load every ``*.json`` file directly inside *folder_path* and
    concatenate their contents into one list."""
    records = []
    # Sorted so the merged order does not depend on the file system's
    # arbitrary directory listing order.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                records.extend(json.load(file))

    print('\nComplete: Files reader.\n')
    return records


def _site_name(folder_path):
    """Derive the output file stem from *folder_path*.

    The first path component following a ``./`` is used, as before
    (``'./site/Author_output'`` -> ``'site'``). When the path contains no
    such segment, fall back to the folder's own name instead of raising
    ``IndexError``.
    """
    match = re.search(r"\./(.*?)/", folder_path)
    if match:
        return match.group(1)
    return os.path.basename(os.path.normpath(folder_path)) or "output"


def Division(folder_path):
    """Split author records into 27 buckets by the first letter of last_name.

    Reads all ``*.json`` files in *folder_path*, then writes one JSON file
    per bucket to ``./nameDivision/<n>/<site>.json`` (relative to the current
    working directory), where ``<n>`` is 0 for names that do not start with
    an ASCII letter and 1-26 for 'a'-'z', and ``<site>`` is derived from
    *folder_path* (see ``_site_name``). All 27 files are written, including
    empty buckets.

    Args:
        folder_path: Directory containing the JSON files to divide. Each
            file is expected to hold a list of dict records.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    records = _read_records(folder_path)

    buckets = [[] for _ in range(27)]
    for record in records:
        buckets[_bucket_index(record)].append(record)

    # Loop-invariant: computed once rather than once per bucket.
    web_name = _site_name(folder_path)

    os.makedirs("./nameDivision/", exist_ok=True)
    for num, bucket in enumerate(buckets):
        directory = "./nameDivision/" + str(num)
        os.makedirs(directory, exist_ok=True)
        file_path = os.path.join(directory, web_name + ".json")

        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(bucket, file, indent=4)

        print(f'Complete: {len(bucket)} copies of data have been saved into {directory}')
|
||||||
|
|
||||||
|
# ========== Test code ==========
|
||||||
|
# Division('./test_buffer/Author_output')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user