#!/usr/bin/env python3 """ convert_jsonl_with_fallback.py Like your original converter, but when inner JSON parsing fails (it often does when there are unescaped quotes inside the passage), we do a simple regex/string‐search to pull out passage, label, score. """ import argparse import json import os import re import sys from pathlib import Path def parse_args(): p = argparse.ArgumentParser( description="Convert JSONL with embedded JSON in 'data' to flat JSONL, with fallback." ) p.add_argument( "--input", "-i", required=True, help="Path to input .jsonl file or directory containing .jsonl files" ) p.add_argument( "--output-dir", "-o", required=True, help="Directory where converted files will be written" ) return p.parse_args() def strip_code_fence(s: str) -> str: """Remove ```json ... ``` or ``` ... ``` fences, return inner text.""" fenced = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL) m = fenced.search(s) return m.group(1) if m else s def fallback_extract(inner: str): """ Fallback extractor that: - finds the passage between "passage": " … ", - then pulls label and score via regex. Returns dict or None. """ # 1) passage: find between `"passage": "` and the next `",` p_start = inner.find('"passage": "') if p_start < 0: return None p_start += len('"passage": "') # we assume the passage ends at the first occurrence of `",` after p_start p_end = inner.find('",', p_start) if p_end < 0: return None passage = inner[p_start:p_end] # 2) label m_lbl = re.search(r'"label"\s*:\s*(\d+)', inner[p_end:]) if not m_lbl: return None label = int(m_lbl.group(1)) # 3) score m_sc = re.search(r'"score"\s*:\s*([0-9]*\.?[0-9]+)', inner[p_end:]) if not m_sc: return None score = float(m_sc.group(1)) return {"passage": passage, "label": label, "score": score} def process_file(in_path: Path, out_path: Path): with in_path.open("r", encoding="utf-8") as fin, \ out_path.open("w", encoding="utf-8") as fout: for lineno, line in enumerate(fin, 1): line = line.strip() if not line: continue # parse the outer JSON try: outer = json.loads(line) except json.JSONDecodeError: print(f"[WARN] line {lineno} in {in_path.name}: outer JSON invalid, skipping", file=sys.stderr) continue query = outer.get("query") data_field = outer.get("data") if not query or not data_field: # missing query or data → skip continue inner_text = strip_code_fence(data_field) # attempt normal JSON parse record = None try: inner = json.loads(inner_text) # must have all three keys if all(k in inner for k in ("passage", "label", "score")): record = { "query": query, "passage": inner["passage"], "label": inner["label"], "score": inner["score"] } # else record stays None → fallback below except json.JSONDecodeError: pass # if JSON parse gave nothing, try fallback if record is None: fb = fallback_extract(inner_text) if fb: record = { "query": query, **fb } print(f"[INFO] line {lineno} in {in_path.name}: used fallback extraction", file=sys.stderr) else: print(f"[WARN] line {lineno} in {in_path.name}: unable to extract fields, skipping", file=sys.stderr) continue fout.write(json.dumps(record, ensure_ascii=False) + "\n") def main(): args = parse_args() inp = Path(args.input) outdir = Path(args.output_dir) outdir.mkdir(parents=True, exist_ok=True) if inp.is_dir(): files = list(inp.glob("*.jsonl")) else: files = [inp] if not files: print(f"No .jsonl files found in {inp}", file=sys.stderr) sys.exit(1) for f in files: dest = outdir / f.name print(f"Converting {f} → {dest}", file=sys.stderr) process_file(f, dest) if __name__ == "__main__": main()