#!/usr/bin/env python3
"""
Comprehensive usage examples for the refactored BGE data pipeline.

Demonstrates all four supported formats: triplets, pairs, candidates, and
legacy nested.

Data Formats Supported:
1. Triplets: Query-triplets format (query, pos, neg) for BGE-M3 embedding training
2. Pairs: Flat pairs format (query, passage, label) for BGE-reranker training
3. Candidates: Unified format (query, candidates) with threshold-based label assignment
4. Legacy Nested: Legacy nested format with automatic conversion to flat pairs
"""

import os
import sys
import json
import tempfile

from transformers import AutoTokenizer

# Add the parent directory to sys.path so the local ``data`` package is
# importable when this script is run directly from the examples directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from data.dataset import BGEDataset, BGEM3Dataset, BGERerankerDataset
from data.optimization import optimize_data
|
def _write_jsonl(path, records):
    """Write *records* to *path* as UTF-8 JSON Lines, one object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for item in records:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')


def create_sample_data():
    """Create sample data files for all four supported formats.

    Returns:
        dict: File paths keyed by format name ('triplets', 'pairs',
        'candidates', 'legacy_nested') plus 'temp_dir', the temporary
        directory holding them. The caller is responsible for cleanup.
    """
    # Create temporary directory (left on disk so the user can inspect it)
    temp_dir = tempfile.mkdtemp(prefix="bge_examples_")
    print(f"📁 Creating sample data in: {temp_dir}")

    # 1. Triplets format (for BGE-M3 embedding training)
    triplets_file = os.path.join(temp_dir, "triplets_data.jsonl")
    triplets_data = [
        {
            "query": "What is machine learning?",
            "pos": [
                "Machine learning is a subset of artificial intelligence that enables computers to learn without explicit programming.",
                "ML algorithms use statistical techniques to learn patterns from data and make predictions."
            ],
            "neg": [
                "Weather forecasting uses meteorological data to predict atmospheric conditions.",
                "Cooking recipes require specific ingredients and step-by-step instructions."
            ],
            "pos_scores": [0.95, 0.88],
            "neg_scores": [0.15, 0.08],
            "prompt": "为此查询生成表示:",
            "type": "definition"
        },
        {
            "query": "How does deep learning work?",
            "pos": [
                "Deep learning uses neural networks with multiple layers to learn complex patterns.",
                "Deep neural networks automatically extract features from raw data through backpropagation."
            ],
            "neg": [
                "Baking bread requires yeast, flour, and proper kneading techniques.",
                "Solar panels convert sunlight into electrical energy through photovoltaic cells."
            ],
            "pos_scores": [0.92, 0.89],
            "neg_scores": [0.12, 0.18]
        }
    ]
    _write_jsonl(triplets_file, triplets_data)

    # 2. Pairs format (for BGE-reranker training)
    pairs_file = os.path.join(temp_dir, "pairs_data.jsonl")
    pairs_data = [
        {
            "query": "What is machine learning?",
            "passage": "Machine learning is a subset of artificial intelligence that enables computers to learn without explicit programming.",
            "label": 1,
            "score": 0.95,
            "qid": "q1",
            "pid": "p1"
        },
        {
            "query": "What is machine learning?",
            "passage": "Weather forecasting uses meteorological data to predict atmospheric conditions.",
            "label": 0,
            "score": 0.15,
            "qid": "q1",
            "pid": "p2"
        },
        {
            "query": "How does deep learning work?",
            "passage": "Deep learning uses neural networks with multiple layers to learn complex patterns.",
            "label": 1,
            "score": 0.92,
            "qid": "q2",
            "pid": "p3"
        },
        {
            "query": "How does deep learning work?",
            "passage": "Baking bread requires yeast, flour, and proper kneading techniques.",
            "label": 0,
            "score": 0.12,
            "qid": "q2",
            "pid": "p4"
        }
    ]
    _write_jsonl(pairs_file, pairs_data)

    # 3. Candidates format (unified format with threshold-based labels)
    candidates_file = os.path.join(temp_dir, "candidates_data.jsonl")
    candidates_data = [
        {
            "query": "What is artificial intelligence?",
            "qid": "q3",
            "candidates": [
                {
                    "text": "Artificial intelligence is the simulation of human intelligence in machines.",
                    "score": 0.94,
                    "pid": "c1"
                },
                {
                    "text": "AI systems can perform tasks that typically require human intelligence.",
                    "score": 0.87,
                    "pid": "c2"
                },
                {
                    "text": "Mountain climbing requires proper equipment and training.",
                    "score": 0.23,
                    "pid": "c3"
                },
                {
                    "text": "Chess is a strategic board game played between two players.",
                    "score": 0.31,
                    "pid": "c4"
                }
            ]
        },
        {
            "query": "Explain neural networks",
            "qid": "q4",
            "candidates": [
                {
                    "text": "Neural networks are computing systems inspired by biological neural networks.",
                    "score": 0.91,
                    "pid": "c5"
                },
                {
                    "text": "They consist of interconnected nodes that process information in layers.",
                    "score": 0.85,
                    "pid": "c6"
                },
                {
                    "text": "Gardening involves planting seeds and watering plants regularly.",
                    "score": 0.19,
                    "pid": "c7"
                }
            ]
        }
    ]
    _write_jsonl(candidates_file, candidates_data)

    # 4. Legacy nested format (for backward compatibility)
    legacy_file = os.path.join(temp_dir, "legacy_nested_data.jsonl")
    legacy_data = [
        {
            "query": "What is natural language processing?",
            "pos": [
                "Natural language processing is a field of AI that helps computers understand human language.",
                "NLP combines computational linguistics with machine learning to process text and speech."
            ],
            "neg": [
                "Automobile manufacturing involves assembly lines and quality control processes.",
                "Photography captures light through camera lenses to create images."
            ],
            "pos_scores": [0.93, 0.86],
            "neg_scores": [0.14, 0.22]
        },
        {
            "query": "How do transformers work in NLP?",
            "pos": [
                "Transformers use self-attention mechanisms to process sequential data in parallel.",
                "The transformer architecture revolutionized NLP with attention-based models like BERT and GPT."
            ],
            "neg": [
                "Electric vehicles use batteries to power electric motors for transportation.",
                "Coffee brewing requires proper water temperature and grinding consistency."
            ],
            "pos_scores": [0.89, 0.91],
            "neg_scores": [0.16, 0.13]
        }
    ]
    _write_jsonl(legacy_file, legacy_data)

    return {
        'triplets': triplets_file,
        'pairs': pairs_file,
        'candidates': candidates_file,
        'legacy_nested': legacy_file,
        'temp_dir': temp_dir
    }
def demonstrate_fine_tuning_loader():
    """Demonstrate fine-tuning data loader with all formats"""

    def _section(title):
        # Numbered sub-section header followed by its separator rule.
        print(title)
        print("-" * 60)

    print("\n" + "=" * 80)
    print("🎯 FINE-TUNING DATA LOADER EXAMPLES")
    print("=" * 80)

    # Build the sample files for every format up front.
    data_files = create_sample_data()

    # A small, widely available tokenizer keeps the demo lightweight.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # 1. Triplets format with BGE-M3 Dataset
    _section("\n📊 1. TRIPLETS FORMAT (BGE-M3 Embedding Training)")
    try:
        m3_dataset = BGEM3Dataset(
            data_path=data_files['triplets'],
            tokenizer=tokenizer,
            query_max_length=64,
            passage_max_length=256,
            is_train=True
        )
        print(f"✅ Loaded {len(m3_dataset)} triplet examples")
        print(f" Detected format: {m3_dataset.detected_format}")
        sample = m3_dataset[0]
        print(f" Sample keys: {list(sample.keys())}")
        print(f" Query shape: {sample['query_input_ids'].shape}")
        print(f" Passages shape: {sample['passage_input_ids'].shape}")
        print(f" Labels: {sample['labels']}")
    except Exception as e:
        print(f"❌ Error: {e}")

    # 2. Pairs format with BGE-Reranker Dataset
    _section("\n📊 2. PAIRS FORMAT (BGE-Reranker Training)")
    try:
        reranker_dataset = BGERerankerDataset(
            data_path=data_files['pairs'],
            tokenizer=tokenizer,
            query_max_length=64,
            passage_max_length=256,
            is_train=True,
            legacy_support=False
        )
        print(f"✅ Loaded {len(reranker_dataset)} pair examples")
        print(f" Detected format: {reranker_dataset.detected_format}")
        sample = reranker_dataset[0]
        print(f" Sample keys: {list(sample.keys())}")
        print(f" Input shape: {sample['input_ids'].shape}")
        print(f" Label: {sample['labels']}")
        print(f" Query: {sample['query_text'][:50]}...")
    except Exception as e:
        print(f"❌ Error: {e}")

    # 3. Candidates format with automatic conversion
    _section("\n📊 3. CANDIDATES FORMAT (Threshold-based Conversion)")
    try:
        unified_dataset = BGEDataset(
            data_path=data_files['candidates'],
            tokenizer=tokenizer,
            query_max_length=64,
            passage_max_length=256,
            is_train=True,
            format_type="candidates",
            candidates_threshold=0.5  # Scores >= 0.5 become label=1
        )
        print(f"✅ Loaded {len(unified_dataset)} exploded examples")
        print(f" Detected format: {unified_dataset.detected_format}")
        print(" Note: Each query with multiple candidates exploded into separate pairs")
        sample = unified_dataset[0]
        print(f" Sample keys: {list(sample.keys())}")
        print(f" Format after processing: {sample['format']}")
        print(f" Source: {sample.get('source', 'N/A')}")
    except Exception as e:
        print(f"❌ Error: {e}")

    # 4. Legacy nested format with automatic conversion
    _section("\n📊 4. LEGACY NESTED FORMAT (Automatic Conversion)")
    try:
        legacy_dataset = BGERerankerDataset(
            data_path=data_files['legacy_nested'],
            tokenizer=tokenizer,
            query_max_length=64,
            passage_max_length=256,
            is_train=True,
            legacy_support=True
        )
        print(f"✅ Loaded {len(legacy_dataset)} converted examples")
        print(f" Detected format: {legacy_dataset.detected_format}")
        print(" Note: Legacy nested format automatically converted to flat pairs")
        sample = legacy_dataset[0]
        print(f" Sample keys: {list(sample.keys())}")
        print(f" Format after processing: {sample['format']}")
        print(f" Source: {sample.get('source', 'N/A')}")
    except Exception as e:
        print(f"❌ Error: {e}")

    return data_files
def _run_optimization(header, label, note=None, **optimize_kwargs):
    """Run a single optimize_data() invocation and report the outcome.

    Args:
        header: Section banner printed above the separator rule.
        label: Short name used in the success/error messages.
        note: Optional extra line printed after a successful run.
        **optimize_kwargs: Forwarded verbatim to optimize_data().
    """
    print(header)
    print("-" * 60)
    try:
        cached_files = optimize_data(**optimize_kwargs)
        print(f"✅ {label} complete: {len(cached_files)} files")
        if note:
            print(note)
        for file in cached_files:
            print(f" 📦 {file}")
    except Exception as e:
        print(f"❌ {label} error: {e}")


def demonstrate_optimization_pipeline(data_files):
    """Demonstrate optimization pipeline with all formats.

    Args:
        data_files: Mapping returned by create_sample_data() — format name
            to JSONL path, plus 'temp_dir'.
    """
    print("\n" + "="*80)
    print("⚡ OPTIMIZATION PIPELINE EXAMPLES")
    print("="*80)

    cache_dir = os.path.join(data_files['temp_dir'], 'cache')
    os.makedirs(cache_dir, exist_ok=True)

    # 1. BGE-M3 optimization with triplets format
    _run_optimization(
        "\n🚀 1. BGE-M3 OPTIMIZATION (Triplets Format)",
        "M3 optimization",
        model_type='bge-m3',
        input_files=[data_files['triplets']],
        output_dir=cache_dir,
        tokenizer_path='bert-base-uncased',
        force=True,
        hard_negative_ratio=0.7,
        max_triplets_per_query=8,
        difficulty_threshold=0.1
    )

    # 2. BGE-Reranker optimization with pairs format
    _run_optimization(
        "\n🚀 2. BGE-RERANKER OPTIMIZATION (Pairs Format)",
        "Reranker optimization",
        model_type='bge-reranker',
        input_files=[data_files['pairs']],
        output_dir=cache_dir,
        tokenizer_path='bert-base-uncased',
        force=True,
        output_format='flat'
    )

    # 3. Candidates format optimization (converts to pairs for reranker)
    _run_optimization(
        "\n🚀 3. CANDIDATES FORMAT OPTIMIZATION",
        "Candidates optimization",
        note=" Note: Candidates exploded into pairs with threshold-based labels",
        model_type='bge-reranker',
        input_files=[data_files['candidates']],
        output_dir=cache_dir,
        tokenizer_path='bert-base-uncased',
        force=True,
        output_format='flat'
    )

    # 4. Candidates to M3 triplets conversion
    _run_optimization(
        "\n🚀 4. CANDIDATES TO M3 TRIPLETS CONVERSION",
        "Candidates-to-M3 optimization",
        note=" Note: Candidates split by threshold into pos/neg for triplets",
        model_type='bge-m3',
        input_files=[data_files['candidates']],
        output_dir=cache_dir,
        tokenizer_path='bert-base-uncased',
        force=True,
        hard_negative_ratio=0.8
    )

    # 5. Legacy nested format optimization
    _run_optimization(
        "\n🚀 5. LEGACY NESTED FORMAT OPTIMIZATION",
        "Legacy optimization",
        note=" Note: Legacy nested format preserved for compatibility",
        model_type='bge-reranker',
        input_files=[data_files['legacy_nested']],
        output_dir=cache_dir,
        tokenizer_path='bert-base-uncased',
        force=True,
        output_format='nested'  # Keep nested format
    )
def _show_schema(title, schema, note=None):
    """Print one schema section: title, separator rule, pretty JSON, note.

    Args:
        title: Section heading (includes its own leading newline).
        schema: Example record to pretty-print as JSON.
        note: Optional explanatory line printed after the JSON.
    """
    print(title)
    print("-" * 60)
    print(json.dumps(schema, indent=2, ensure_ascii=False))
    if note:
        print(note)


def show_data_format_schemas():
    """Show clean data format schemas without inline comments."""
    print("\n" + "="*80)
    print("📋 DATA FORMAT SCHEMAS")
    print("="*80)

    _show_schema(
        "\n1. 🎯 TRIPLETS FORMAT (BGE-M3 Embedding Training)",
        {
            "query": "What is machine learning?",
            "pos": [
                "Machine learning is a subset of artificial intelligence...",
                "ML algorithms learn patterns from data..."
            ],
            "neg": [
                "Weather forecasting uses meteorological data...",
                "Cooking recipes require specific ingredients..."
            ],
            "pos_scores": [0.95, 0.88],
            "neg_scores": [0.15, 0.08],
            "prompt": "为此查询生成表示:",
            "type": "definition"
        }
    )

    _show_schema(
        "\n2. 🔍 PAIRS FORMAT (BGE-Reranker Training)",
        {
            "query": "What is machine learning?",
            "passage": "Machine learning is a subset of artificial intelligence...",
            "label": 1,
            "score": 0.95,
            "qid": "q1",
            "pid": "p1"
        }
    )

    _show_schema(
        "\n3. 🎲 CANDIDATES FORMAT (Threshold-based Conversion)",
        {
            "query": "What is artificial intelligence?",
            "qid": "q1",
            "candidates": [
                {
                    "text": "AI is the simulation of human intelligence in machines.",
                    "score": 0.94,
                    "pid": "c1"
                },
                {
                    "text": "Mountain climbing requires proper equipment.",
                    "score": 0.23,
                    "pid": "c2"
                }
            ]
        },
        note="📝 Note: Scores >= threshold become label=1, others become label=0"
    )

    _show_schema(
        "\n4. 🔄 LEGACY NESTED FORMAT (Backward Compatibility)",
        {
            "query": "What is natural language processing?",
            "pos": [
                "NLP is a field of AI that helps computers understand human language.",
                "NLP combines computational linguistics with machine learning..."
            ],
            "neg": [
                "Automobile manufacturing involves assembly lines...",
                "Photography captures light through camera lenses..."
            ],
            "pos_scores": [0.93, 0.86],
            "neg_scores": [0.14, 0.22]
        },
        note="📝 Note: Automatically converted to flat pairs for reranker training"
    )
def main():
    """Main demonstration function"""
    header = "=" * 80

    print("🚀 BGE Data Pipeline Format Examples")
    print(header)
    for line in (
        "Demonstrating all four supported formats:",
        "1. Triplets (BGE-M3 embedding training)",
        "2. Pairs (BGE-reranker training)",
        "3. Candidates (unified format with threshold conversion)",
        "4. Legacy nested (backward compatibility)",
    ):
        print(line)
    print(header)

    try:
        # Show data schemas first, then run both pipelines end to end.
        show_data_format_schemas()
        data_files = demonstrate_fine_tuning_loader()
        demonstrate_optimization_pipeline(data_files)

        print("\n" + header)
        print("🎉 ALL EXAMPLES COMPLETED SUCCESSFULLY!")
        print(header)

        print("\n📋 SUMMARY:")
        for line in (
            "✅ Format Detection: Automatic detection of all four formats",
            "✅ Conversion Logic: Seamless conversion between formats",
            "✅ Threshold Assignment: Candidates → pairs with configurable threshold",
            "✅ Legacy Support: Automatic migration of nested → flat pairs",
            "✅ Optimization: Pre-tokenization and caching for 10-15x speedup",
            "✅ Backward Compatibility: Full support for existing datasets",
        ):
            print(line)

        print(f"\n📁 Sample data and cache files created in: {data_files['temp_dir']}")
        print(" You can examine the generated files to see the format conversions")

    except Exception as e:
        print(f"\n❌ Error during demonstration: {e}")
        import traceback
        traceback.print_exc()

    print("\n" + header)
# Run the full demonstration only when executed as a script, so the module
# can be imported without side effects.
if __name__ == "__main__":
    main()