""" Concatenate all JSONL files in a directory into a single JSONL file. """ import argparse import pathlib def concat_jsonl_files(input_dir: pathlib.Path, output_file: pathlib.Path): """ Read all .jsonl files in the input directory, concatenate their lines, and write them to the output file. Args: input_dir: Path to directory containing .jsonl files. output_file: Path to the resulting concatenated .jsonl file. """ jsonl_files = sorted(input_dir.glob('*.jsonl')) if not jsonl_files: print(f"No JSONL files found in {input_dir}") return with output_file.open('w', encoding='utf-8') as fout: for jsonl in jsonl_files: with jsonl.open('r', encoding='utf-8') as fin: for line in fin: if line.strip(): # skip empty lines fout.write(line) print(f"Concatenated {len(jsonl_files)} files into {output_file}") def main(): parser = argparse.ArgumentParser(description="Concatenate JSONL files.") parser.add_argument('input_dir', type=pathlib.Path, help='Directory with JSONL files') parser.add_argument('output_file', type=pathlib.Path, help='Output JSONL file') args = parser.parse_args() concat_jsonl_files(args.input_dir, args.output_file) if __name__ == "__main__": main()