41 lines
1.3 KiB
Python
41 lines
1.3 KiB
Python
"""
Concatenate all JSONL files in a directory into a single JSONL file.
"""
|
|
import argparse
|
|
import pathlib
|
|
|
|
|
|
def concat_jsonl_files(input_dir: pathlib.Path, output_file: pathlib.Path) -> None:
    """
    Concatenate every .jsonl file in a directory into a single JSONL file.

    Input files are matched with the non-recursive pattern ``*.jsonl`` and
    processed in sorted path order. Blank and whitespace-only lines are
    dropped. Every record written is guaranteed to end with a newline, so a
    file whose last line lacks one cannot be fused with the first record of
    the next file.

    If ``output_file`` itself lives in ``input_dir`` and matches ``*.jsonl``
    (for example, left over from a previous run), it is excluded from the
    inputs so the function never reads the file it is writing.

    When no input files are found, a message is printed and the output file
    is neither created nor modified.

    Args:
        input_dir: Path to directory containing .jsonl files.
        output_file: Path to the resulting concatenated .jsonl file.
    """
    # Compare resolved paths so relative paths and symlinks that point at the
    # output file are recognised as the same file.
    output_resolved = output_file.resolve()
    jsonl_files = sorted(
        path
        for path in input_dir.glob('*.jsonl')
        if path.resolve() != output_resolved
    )
    if not jsonl_files:
        print(f"No JSONL files found in {input_dir}")
        return

    with output_file.open('w', encoding='utf-8') as fout:
        for jsonl in jsonl_files:
            with jsonl.open('r', encoding='utf-8') as fin:
                for line in fin:
                    if not line.strip():  # skip empty lines
                        continue
                    # Only the final line of a file can lack a newline; add
                    # one so the next file's first record starts on its own
                    # line.
                    fout.write(line if line.endswith('\n') else line + '\n')
    print(f"Concatenated {len(jsonl_files)} files into {output_file}")
|
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Parses two positional arguments, ``input_dir`` and ``output_file`` (both
    converted to ``pathlib.Path``), and passes them to
    ``concat_jsonl_files``.
    """
    cli = argparse.ArgumentParser(description="Concatenate JSONL files.")
    # Both positionals share the same type, so declare them from a table.
    positionals = (
        ('input_dir', 'Directory with JSONL files'),
        ('output_file', 'Output JSONL file'),
    )
    for arg_name, help_text in positionals:
        cli.add_argument(arg_name, type=pathlib.Path, help=help_text)
    options = cli.parse_args()
    concat_jsonl_files(options.input_dir, options.output_file)
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|