"""Split author records into 27 buckets by the first letter of ``last_name``.

========== Division_byName ==========
1. Re-partitions author records by the first letter of ``last_name`` so that
   records of authors without an e-mail address are easier to merge later.
2. The output folder ``nameDivision`` contains 27 sub-folders. Folder ``0``
   stores records whose ``last_name`` does not start with an English letter;
   folders ``1``-``26`` store the letters ``a``-``z`` in alphabetical order.
   The name of the JSON file inside each folder depends on the source site.

Provenance: added as ``00_File_merge/Division_byName.py`` by the patch
"New function to divide author data according to their last name's first
letter" (commit 2a1fcfc, Chenxiao Xia, 2023-09-16).
"""
import os
import json
import re

# Bucket 0 is the catch-all; buckets 1..26 map to the letters a..z.
_BUCKET_COUNT = 27
_OUTPUT_ROOT = "./nameDivision/"
# Captures the first path component that follows a "./" prefix.
_SITE_PATTERN = re.compile(r"\./(.*?)/")


def _read_records(folder_path):
    """Return the concatenated contents of every ``*.json`` file in a folder.

    Each file is parsed with ``json.load`` and its content is appended with
    ``list.extend``, so every file is presumably a JSON array of records
    (NOTE(review): confirm against the producer of these files).
    """
    records = []

    # os.listdir() returns names in arbitrary order; sort so that the order
    # of records in the output files is reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                records.extend(json.load(file))

    print('\nComplete: Files reader.\n')

    return records


def _bucket_index(record):
    """Return the bucket (0-26) for one author record.

    Returns 1-26 when ``last_name`` starts with an ASCII letter (matched
    case-insensitively) and 0 otherwise, which includes a missing, ``None``,
    empty or non-string ``last_name``.
    """
    last_name = record.get('last_name')
    if not isinstance(last_name, str) or not last_name:
        return 0

    initial = last_name[0].lower()
    # lower() can expand one character into several (e.g. U+0130), hence the
    # explicit length check before comparing against the a-z range.
    if len(initial) == 1 and 'a' <= initial <= 'z':
        return ord(initial) - ord('a') + 1
    return 0


def _site_name(folder_path):
    """Return the base name used for the output JSON files.

    This is the first path component after a ``./`` prefix (for example
    ``site`` for ``./site/Author_output``). When the path has no such
    segment, the base name of the folder itself is used instead of failing.
    """
    match = _SITE_PATTERN.search(folder_path)
    if match is not None:
        return match.group(1)
    return os.path.basename(os.path.normpath(folder_path))


def _write_buckets(buckets, web_name):
    """Write each bucket to ``./nameDivision/<index>/<web_name>.json``."""
    for num, bucket in enumerate(buckets):
        directory = _OUTPUT_ROOT + str(num)
        os.makedirs(directory, exist_ok=True)
        file_path = os.path.join(directory, web_name + ".json")

        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(bucket, file, indent=4)
        print('Complete: ' + str(len(bucket)) + ' copies of data have been saved into ' + directory)


def Division(folder_path):
    """Divide the author records found in ``folder_path`` by last-name initial.

    Reads every ``*.json`` file directly inside ``folder_path``, groups the
    records into 27 buckets (see ``_bucket_index``) and writes one JSON file
    per bucket under ``./nameDivision/<0..26>/`` relative to the current
    working directory. Existing files with the same name are overwritten.

    Args:
        folder_path: Folder holding the input JSON files, typically of the
            form ``./<site>/<sub-folder>``; ``<site>`` becomes the output
            file name.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    records = _read_records(folder_path)

    os.makedirs(_OUTPUT_ROOT, exist_ok=True)
    buckets = [[] for _ in range(_BUCKET_COUNT)]
    for record in records:
        buckets[_bucket_index(record)].append(record)

    _write_buckets(buckets, _site_name(folder_path))


# ========== Test code ==========
# Division('./test_buffer/Author_output')