bge_finetune/scripts/concat_jsonl.py
2025-07-23 14:54:46 +08:00

41 lines
1.3 KiB
Python

"""
Concatenate all JSONL files in a directory into a single JSONL file.
"""
import argparse
import pathlib
def concat_jsonl_files(input_dir: pathlib.Path, output_file: pathlib.Path):
    """
    Concatenate every ``.jsonl`` file in a directory into one JSONL file.

    Files matching ``*.jsonl`` directly inside ``input_dir`` (the search is
    not recursive) are processed in sorted path order. Blank or
    whitespace-only lines are dropped, and every record written to the
    output is terminated by a newline, even when it was the last line of a
    source file that had no trailing newline.

    If ``output_file`` itself matches the glob (for example, it was left in
    ``input_dir`` by a previous run), it is not treated as an input.

    When no input files are found, a message is printed and the output file
    is neither created nor modified.

    Args:
        input_dir: Path to directory containing .jsonl files.
        output_file: Path to the resulting concatenated .jsonl file.
    """
    # Resolve once so the output can be recognised among the globbed inputs
    # regardless of how the two paths were spelled (relative vs. absolute).
    output_resolved = output_file.resolve()
    jsonl_files = [
        path
        for path in sorted(input_dir.glob('*.jsonl'))
        # Reading the file that is simultaneously being written would first
        # truncate it and then feed the output back into itself.
        if path.resolve() != output_resolved
    ]
    if not jsonl_files:
        print(f"No JSONL files found in {input_dir}")
        return

    with output_file.open('w', encoding='utf-8') as fout:
        for jsonl in jsonl_files:
            with jsonl.open('r', encoding='utf-8') as fin:
                for line in fin:
                    if not line.strip():  # skip empty lines
                        continue
                    # The final line of a file may lack a newline; without
                    # one it would be glued to the next file's first record.
                    fout.write(line if line.endswith('\n') else line + '\n')
    print(f"Concatenated {len(jsonl_files)} files into {output_file}")
def main():
    """Command-line entry point.

    Parses two positional arguments, ``input_dir`` and ``output_file`` (both
    converted to ``pathlib.Path``), and passes them to
    ``concat_jsonl_files``.
    """
    cli = argparse.ArgumentParser(description="Concatenate JSONL files.")
    positional_args = (
        ('input_dir', 'Directory with JSONL files'),
        ('output_file', 'Output JSONL file'),
    )
    for arg_name, help_text in positional_args:
        cli.add_argument(arg_name, type=pathlib.Path, help=help_text)
    options = cli.parse_args()
    concat_jsonl_files(options.input_dir, options.output_file)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()