Revised for training
This commit is contained in:
40
scripts/concat_jsonl.py
Normal file
40
scripts/concat_jsonl.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""
|
||||
Concatenate all JSONL files in a directory into a single JSONL file.
|
||||
"""
|
||||
import argparse
|
||||
import pathlib
|
||||
|
||||
|
||||
def concat_jsonl_files(input_dir: pathlib.Path, output_file: pathlib.Path):
    """
    Read all .jsonl files in the input directory, concatenate their lines,
    and write them to the output file.

    Input files are processed in sorted path order. Whitespace-only lines
    are skipped, and every record written is newline-terminated so that the
    last record of one file can never fuse with the first record of the
    next. If the output file itself matches ``*.jsonl`` inside the input
    directory, it is excluded from the inputs. When no input files remain,
    a message is printed and the output file is not created or modified.

    Args:
        input_dir: Path to directory containing .jsonl files.
        output_file: Path to the resulting concatenated .jsonl file.
    """
    # Resolve once so the comparison also holds for relative paths/symlinks.
    output_resolved = output_file.resolve()
    # A pre-existing output file must not be read back as an input: it is
    # truncated on open and would be read while it is being written.
    jsonl_files = sorted(
        path
        for path in input_dir.glob('*.jsonl')
        if path.resolve() != output_resolved
    )
    if not jsonl_files:
        print(f"No JSONL files found in {input_dir}")
        return

    with output_file.open('w', encoding='utf-8') as fout:
        for jsonl in jsonl_files:
            with jsonl.open('r', encoding='utf-8') as fin:
                for line in fin:
                    if not line.strip():  # skip empty lines
                        continue
                    # The final line of a file may lack a trailing newline.
                    fout.write(line if line.endswith('\n') else line + '\n')
    print(f"Concatenated {len(jsonl_files)} files into {output_file}")
|
||||
|
||||
|
||||
def main():
    """Parse the command-line arguments and run the concatenation.

    Expects two positional arguments, both converted to ``pathlib.Path``:
    the directory holding the JSONL files and the output file to write.
    """
    cli = argparse.ArgumentParser(description="Concatenate JSONL files.")
    # Both positionals share the same type; declare them in call order.
    positionals = (
        ('input_dir', 'Directory with JSONL files'),
        ('output_file', 'Output JSONL file'),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, type=pathlib.Path, help=arg_help)
    options = cli.parse_args()
    concat_jsonl_files(options.input_dir, options.output_file)
|
||||
|
||||
|
||||
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user