# bge_finetune/config.toml

[api]
# API settings for data augmentation (used in data/augmentation.py)
# NOTE: You can override api_key by setting the environment variable BGE_API_KEY.
base_url = "https://api.deepseek.com/v1"
api_key = "your_api_key_here" # Replace with your actual API key or set the BGE_API_KEY env var
model = "deepseek-chat"
max_retries = 3 # Number of times to retry API calls on failure
timeout = 30 # Timeout for API requests (seconds)
temperature = 0.7 # Sampling temperature for augmentation
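#
# Illustrative sketch (not part of this config) of how data/augmentation.py might consume
# these settings and honor the BGE_API_KEY override. The OpenAI-compatible client usage is
# an assumption; DeepSeek exposes an OpenAI-compatible endpoint, but the project's actual
# client code may differ. Requires Python 3.11+ for tomllib.
#
#   import os, tomllib
#   from openai import OpenAI
#
#   with open("bge_finetune/config.toml", "rb") as f:
#       api_cfg = tomllib.load(f)["api"]
#   client = OpenAI(
#       base_url=api_cfg["base_url"],
#       api_key=os.environ.get("BGE_API_KEY", api_cfg["api_key"]),  # env var takes precedence
#       timeout=api_cfg["timeout"],
#       max_retries=api_cfg["max_retries"],
#   )
#   resp = client.chat.completions.create(
#       model=api_cfg["model"],
#       messages=[{"role": "user", "content": "Paraphrase: how do transformers work?"}],
#       temperature=api_cfg["temperature"],
#   )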
[training]
# Default training parameters (overridden by [hardware] or model-specific sections if present)
# If a parameter is present in both [training] and [hardware], [hardware] takes precedence.
default_batch_size = 4 # Default batch size per device for training
default_learning_rate = 1e-5 # Default learning rate
default_num_epochs = 3 # Default number of epochs
default_warmup_ratio = 0.1 # Warmup steps as a ratio of total steps
default_weight_decay = 0.01 # Weight decay for optimizer
gradient_accumulation_steps = 2 # Number of steps to accumulate gradients - reduced for small datasets
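#
# Minimal sketch of the documented precedence, where [hardware] keys override [training]
# defaults (an assumption about the config loader, not its actual code):
#
#   import tomllib
#   with open("bge_finetune/config.toml", "rb") as f:
#       cfg = tomllib.load(f)
#   effective = {**cfg["training"], **cfg.get("hardware", {})}
#   batch_size = effective.get("per_device_train_batch_size",
#                              cfg["training"]["default_batch_size"])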
[model_paths]
# Model source: 'huggingface' or 'modelscope'. Determines which hub to use for downloading/loading models.
source = "huggingface" # Options: 'huggingface', 'modelscope'
bge_m3 = "./models/bge-m3"
bge_reranker = "./models/bge-reranker-base"
# Cache directory for downloaded models (tokenizers, config files, etc.)
cache_dir = "./cache/models"
# To use ModelScope, set source = "modelscope" and provide the correct model id/path.
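#
# Hedged sketch of hub selection based on `source`. Both snapshot_download helpers are real,
# but their use here is an assumption about this repo; the model id shown is the Hugging Face
# id and the ModelScope id may differ:
#
#   def resolve_model(cfg, repo_id="BAAI/bge-m3"):
#       paths = cfg["model_paths"]
#       if paths["source"] == "modelscope":
#           from modelscope import snapshot_download
#           return snapshot_download(repo_id, cache_dir=paths["cache_dir"])
#       from huggingface_hub import snapshot_download
#       return snapshot_download(repo_id, cache_dir=paths["cache_dir"])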
[data]
# Enhanced data processing settings with unified schema support
max_query_length = 64 # Max length for query tokens
max_passage_length = 512 # Max length for passage tokens
max_seq_length = 512 # Max total input sequence length
# 0.0 disables the validation split and is for testing only; in production, set this to 0.3
validation_split = 0.0 # Fraction of data for validation
random_seed = 42 # Random seed for reproducibility
# Cache directory for processed/optimized datasets (.pkl files)
cache_dir = "./cache/data"
# Default output directory for optimization script
optimization_output_dir = "./cache/data"
# New: Enhanced dataset format settings
auto_detect_format = true # Enable automatic format detection
legacy_support = true # Support legacy nested reranker format
migrate_legacy_on_load = false # Automatically migrate legacy data during loading
embedding_schema_version = "v3" # Use enhanced embedding schema
reranker_schema_version = "v3" # Use enhanced reranker schema
# New: Enhanced sampling settings
use_score_weighted_sampling = true # Enable score-weighted positive sampling
multiple_positives_per_batch = true # Use multiple positives for richer contrastive signals
hard_negative_ratio = 0.7 # Ratio of hard to random negatives
difficulty_threshold = 0.1 # Minimum difficulty threshold for hard negatives
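#
# Sketch of score-weighted positive sampling as described above; the field names
# ("text", "score") are assumptions about the dataset schema:
#
#   import random
#   def sample_positive(positives):
#       # positives: list of dicts like {"text": ..., "score": ...}
#       weights = [max(p.get("score", 1.0), 1e-6) for p in positives]
#       return random.choices(positives, weights=weights, k=1)[0]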
[augmentation]
# Data augmentation settings (used in data/augmentation.py)
enable_query_augmentation = false # Enable query augmentation
enable_passage_augmentation = false # Enable passage augmentation
augmentation_methods = ["paraphrase", "back_translation"] # List of augmentation methods
api_first = true # Prefer the LLM API for all augmentation if enabled; fall back to rule-based methods if the API is unavailable
# Augmentation is now language-aware: prompts and templates are selected based on query language (Chinese/English)
num_augmentations_per_sample = 2 # Number of augmentations per sample
augmentation_temperature = 0.8 # Sampling temperature for augmentation
[hard_negative_mining]
# ANCE hard negative mining settings (used in data/hard_negative_mining.py)
enable_mining = true # Enable hard negative mining
num_hard_negatives = 15 # Number of hard negatives to mine
negative_range_min = 10 # Min index for negative mining
negative_range_max = 100 # Max index for negative mining
margin = 0.1 # Margin for ANCE mining
index_refresh_interval = 1000 # Steps between index refreshes
index_type = "IVF" # FAISS index type
nlist = 100 # Number of clusters for IVF
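#
# Simplified sketch of ANCE-style mining with a FAISS IVF index using the settings above;
# assumes L2-normalized float32 numpy embeddings and collapses the [10, 100) range to the
# first 15 candidates after the skipped top hits (the project's sampling may differ):
#
#   import faiss
#   def mine_hard_negatives(corpus_emb, query_emb, nlist=100, lo=10, hi=100, k=15):
#       d = corpus_emb.shape[1]
#       quantizer = faiss.IndexFlatIP(d)
#       index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
#       index.train(corpus_emb)
#       index.add(corpus_emb)
#       _, ids = index.search(query_emb, hi)
#       return ids[:, lo:lo + k]   # skip the top-ranked hits (likely positives), keep k negatives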
[evaluation]
# Evaluation settings (used in evaluation/evaluator.py)
eval_batch_size = 32 # Batch size for evaluation
k_values = [1, 3, 5, 10, 20, 50, 100] # k values for metrics
metrics = ["recall", "precision", "map", "mrr", "ndcg"] # Metrics to compute
save_predictions = true # Save predictions to file
predictions_dir = "./output/predictions" # Directory for predictions
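#
# Sketch of recall@k and mrr@k over a ranked list of doc ids per query; a simplified
# stand-in for evaluation/evaluator.py, not its exact implementation:
#
#   def recall_at_k(ranked_ids, relevant_ids, k):
#       return len(set(ranked_ids[:k]) & set(relevant_ids)) / max(len(relevant_ids), 1)
#
#   def mrr_at_k(ranked_ids, relevant_ids, k):
#       for rank, doc_id in enumerate(ranked_ids[:k], start=1):
#           if doc_id in relevant_ids:
#               return 1.0 / rank
#       return 0.0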
[logging]
# Logging configuration (used in utils/logging.py)
log_level = "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR)
log_to_file = true # Log to file in addition to stdout
log_dir = "./logs" # Directory for log files
tensorboard_dir = "./logs/tensorboard" # Directory for TensorBoard logs
wandb_project = "bge-finetuning" # Weights & Biases project name
wandb_entity = "" # Your wandb entity
use_wandb = false # Enable Weights & Biases logging
[distributed]
# Distributed training settings (used in utils/distributed.py)
# backend: 'nccl' for CUDA GPUs, 'hccl' for Huawei Ascend NPUs, 'gloo' for CPU
backend = "nccl" # Backend for distributed training (nccl for CUDA, hccl for Ascend, gloo for CPU)
init_method = "env://" # Initialization method for distributed training
find_unused_parameters = true # Find unused parameters in DDP
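#
# Sketch of how these settings typically map onto torch.distributed / DDP; the project's
# utils/distributed.py may wire this differently:
#
#   import os, torch, torch.distributed as dist
#   def wrap_ddp(model, backend="nccl", init_method="env://", find_unused=True):
#       dist.init_process_group(backend=backend, init_method=init_method)
#       local_rank = int(os.environ.get("LOCAL_RANK", 0))
#       torch.cuda.set_device(local_rank)
#       return torch.nn.parallel.DistributedDataParallel(
#           model.to(local_rank), device_ids=[local_rank],
#           find_unused_parameters=find_unused)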
[ascend]
# Ascend NPU-specific settings (used for Huawei hardware)
# Set device_type to 'npu' to enable Ascend support
# backend: 'hccl' is required for distributed training on Ascend
# mixed_precision: true enables float16/bfloat16 training if supported
# Example usage (uncomment and set as needed):
# device_type = "npu"
# backend = "hccl"
# mixed_precision = true
# visible_devices = "0,1,2,3" # Comma-separated list of NPU device IDs
# dataloader_num_workers = 4
# per_device_train_batch_size = 4
# per_device_eval_batch_size = 8
[optimization]
# Advanced optimization settings (used in models/losses.py, training/*_trainer.py)
gradient_checkpointing = true # Enable gradient checkpointing for memory savings
fp16 = true # Use mixed precision (float16) training
bf16 = false # Set to true for A100/H100 GPUs (bfloat16)
fp16_opt_level = "O1" # Apex AMP optimization level
max_grad_norm = 1.0 # Max gradient norm for clipping
adam_epsilon = 1e-8 # Epsilon for Adam optimizer
adam_beta1 = 0.9 # Beta1 for Adam optimizer
adam_beta2 = 0.999 # Beta2 for Adam optimizer
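#
# Sketch of a mixed-precision training step combining fp16, gradient clipping, and the Adam
# settings above. Standard torch.cuda.amp usage, not the project's trainer verbatim; assumes
# an HF-style model output with a .loss attribute:
#
#   import torch
#   scaler = torch.cuda.amp.GradScaler()
#   def train_step(model, batch, optimizer, max_grad_norm=1.0):
#       with torch.cuda.amp.autocast(dtype=torch.float16):   # use torch.bfloat16 when bf16 = true
#           loss = model(**batch).loss
#       scaler.scale(loss).backward()
#       scaler.unscale_(optimizer)
#       torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#       scaler.step(optimizer)
#       scaler.update()
#       optimizer.zero_grad()
#       return loss.detach()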
[hardware]
# Hardware-specific settings (overrides [training] for batch sizes, etc.)
cuda_visible_devices = "0,1,2,3" # Comma-separated list of visible CUDA devices
dataloader_num_workers = 4 # Number of workers for DataLoader
dataloader_pin_memory = true # Use pinned memory in DataLoader
per_device_train_batch_size = 4 # Batch size per device for training (overrides [training])
per_device_eval_batch_size = 8 # Batch size per device for evaluation
[checkpoint]
# Checkpointing settings (used in training/*_trainer.py)
save_strategy = "steps" # Save checkpoint by steps or epoch
save_steps = 1000 # Steps between checkpoints
save_total_limit = 3 # Max number of checkpoints to keep
load_best_model_at_end = true # Load best model at end of training
metric_for_best_model = "eval_recall@10" # Metric to select best model
greater_is_better = true # Whether higher metric is better
resume_from_checkpoint = "" # Path to checkpoint to resume from
[export]
# Model export settings (used in scripts/export.py)
export_format = ["pytorch", "onnx"] # Export formats
optimize_for_inference = true # Optimize model for inference
quantize = false # Enable quantization
quantization_bits = 8 # Number of bits for quantization
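#
# Sketch of a PyTorch-to-ONNX export pass; torch.onnx.export is real, but the dummy input
# shape, output naming, and opset are assumptions about scripts/export.py:
#
#   import torch
#   def export_onnx(model, tokenizer, path="model.onnx", max_len=512):
#       model.eval()
#       enc = tokenizer("example query", return_tensors="pt",
#                       padding="max_length", max_length=max_len, truncation=True)
#       torch.onnx.export(model, (enc["input_ids"], enc["attention_mask"]), path,
#                         input_names=["input_ids", "attention_mask"],
#                         output_names=["output"], opset_version=17,
#                         dynamic_axes={"input_ids": {0: "batch"},
#                                       "attention_mask": {0: "batch"}})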
[m3]
# Enhanced BGE-M3 model-specific parameters with new features
# If a parameter is present in both [m3] and [training]/[hardware], [m3] takes precedence for BGE-M3.
use_dense = true # Enable dense embedding head
use_sparse = true # Enable sparse (SPLADE-style) embedding head
use_colbert = true # Enable ColBERT multi-vector head
unified_finetuning = true # Unified fine-tuning for all heads
sentence_pooling_method = "cls" # Pooling method for sentence embeddings
normalize_embeddings = true # Normalize output embeddings
colbert_dim = -1 # Output dimension for ColBERT head (-1 uses model hidden size)
sparse_top_k = 100 # Top-k for sparse representations
# Training
train_group_size = 8 # Number of passages per query in training
temperature = 0.1 # InfoNCE loss temperature - increased from 0.02 for better stability
negatives_cross_device = true # Share negatives across devices/GPUs
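#
# Sketch of the InfoNCE objective that the temperature above scales (in-batch negatives,
# one positive per query); a simplification of the project's loss, not its exact code:
#
#   import torch, torch.nn.functional as F
#   def info_nce(q_emb, p_emb, temperature=0.1):
#       # q_emb: [B, d], p_emb: [B, d]; row i of p_emb is the positive for query i
#       q_emb = F.normalize(q_emb, dim=-1)
#       p_emb = F.normalize(p_emb, dim=-1)
#       scores = q_emb @ p_emb.T / temperature
#       labels = torch.arange(q_emb.size(0), device=q_emb.device)
#       return F.cross_entropy(scores, labels)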
# Enhanced hard negative mining
use_hard_negatives = true # Enable hard negative mining
num_hard_negatives = 15 # Number of hard negatives to mine
hard_negative_mining_steps = 1000 # Steps between hard negative mining refreshes
ance_negative_range = [10, 100] # Range for ANCE negative mining
ance_margin = 0.1 # Margin for ANCE mining
hard_negative_ratio = 0.7 # Ratio of hard to random negatives
difficulty_threshold = 0.1 # Minimum difficulty for hard negatives
max_triplets_per_query = 8 # Maximum triplets per query for training
# Enhanced self-distillation
use_self_distill = false # Enable self-knowledge distillation - changed default to false
self_distill_temperature = 4.0 # Temperature for self-distillation
self_distill_alpha = 0.5 # Alpha (weight) for self-distillation loss
# Enhanced query instruction
use_query_instruction = false # Add instruction to queries
query_instruction_for_retrieval = "Represent this sentence for searching relevant passages:"
query_instruction_for_reranking = ""
# Enhanced data augmentation
augment_queries = false # Enable query augmentation
augmentation_methods = ["paraphrase", "back_translation"] # List of augmentation methods
create_hard_negatives = false # Create hard negatives via augmentation
# Enhanced sampling features
use_score_weighted_sampling = true # Enable score-weighted positive sampling
multiple_positives = true # Use multiple positives per batch for richer contrastive signals
enhanced_contrastive_learning = true # Enable enhanced contrastive learning features
# Loss weights
# For multi-task learning, these weights control the contribution of each head/loss
dense_weight = 1.0
sparse_weight = 0.3
colbert_weight = 0.5
distillation_weight = 1.0
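#
# Sketch of how the per-head weights above combine the individual losses (assumed
# composition; the trainer's actual formula may include additional terms):
#
#   total_loss = (1.0 * dense_loss
#                 + 0.3 * sparse_loss
#                 + 0.5 * colbert_loss
#                 + 1.0 * distill_loss)   # distillation term only when use_self_distill = true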
# Evaluation
metric_for_best_model = "eval_recall@10"
greater_is_better = true
# Memory optimization
use_gradient_checkpointing = true # Enable gradient checkpointing
max_passages_per_query = 32 # Max passages per query in memory
# Multi-granularity
max_query_length = 64 # Max query length
max_passage_length = 512 # Max passage length
max_length = 8192 # Max total input length (multi-granularity)
# New: Enhanced dataset integration
use_new_dataset_api = true # Use integrated dataset API with format detection
auto_migrate_legacy = false # Automatically migrate legacy formats during training
[reranker]
# Enhanced BGE-Reranker model-specific parameters with flat pairs support
# If a parameter is present in both [reranker] and [training]/[hardware], [reranker] takes precedence for BGE-Reranker.
add_pooling_layer = false # Not used for cross-encoder
use_fp16 = true # Use mixed precision (float16) training
train_group_size = 16 # Number of passages per query for listwise training
loss_type = "listwise_ce" # Loss function type ('listwise_ce', 'pairwise_margin', 'combined', 'cross_entropy', 'binary_cross_entropy')
margin = 1.0 # Margin for pairwise loss
temperature = 1.0 # Temperature for listwise CE loss
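#
# Sketch of a listwise cross-entropy loss over groups of train_group_size candidates,
# assuming the positive passage sits at index 0 of each group (a common convention, not
# confirmed for this repo):
#
#   import torch, torch.nn.functional as F
#   def listwise_ce(logits, group_size=16, temperature=1.0):
#       # logits: [B * group_size] cross-encoder scores, positive first in each group
#       scores = logits.view(-1, group_size) / temperature
#       labels = torch.zeros(scores.size(0), dtype=torch.long, device=scores.device)
#       return F.cross_entropy(scores, labels)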
# Enhanced knowledge distillation
use_knowledge_distillation = false # Enable knowledge distillation from teacher model
teacher_model_name_or_path = "" # Path to teacher model for distillation
distillation_temperature = 3.0 # Temperature for distillation
distillation_alpha = 0.7 # Alpha (weight) for distillation loss
# Enhanced hard negatives
use_hard_negatives = true # Enable hard negative mining
hard_negative_ratio = 0.8 # Ratio of hard to random negatives
# Enhanced denoising
use_denoising = false # Enable denoising strategy
noise_probability = 0.1 # Probability of noise for denoising
# Performance optimizations
max_pairs_per_device = 128 # Max query-passage pairs per device
accumulate_gradients_from_all_devices = true # Accumulate gradients from all devices
max_seq_length = 512 # Max combined query+passage length
query_max_length = 64 # Max query length (if separate encoding)
passage_max_length = 512 # Max passage length (if separate encoding)
# Enhanced training features
shuffle_passages = true # Shuffle passage order during training
position_bias_cutoff = 10 # Top-k passages to consider for position bias
use_gradient_caching = true # Enable gradient caching for efficiency
gradient_cache_batch_size = 32 # Batch size for gradient caching
eval_group_size = 100 # Number of candidates during evaluation
metric_for_best_model = "eval_mrr@10" # Metric to select best model
eval_normalize_scores = true # Normalize scores during evaluation
# Loss weights for combined training
listwise_weight = 0.7 # Loss weight for listwise loss
pairwise_weight = 0.3 # Loss weight for pairwise loss
# Data quality controls
filter_empty_passages = true # Filter out empty passages
min_passage_length = 10 # Minimum passage length to keep
max_passage_length_ratio = 50.0 # Max ratio between query and passage length
# Advanced features
gradient_checkpointing = true # Enable gradient checkpointing
recompute_scores = false # Recompute scores instead of storing all
use_dynamic_teacher = false # Enable dynamic teacher for RocketQAv2-style training
teacher_update_steps = 1000 # Steps between teacher updates
teacher_momentum = 0.999 # Momentum for teacher updates
# New: Enhanced format support
support_flat_pairs = true # Support flat pairs format (query, passage, label)
support_nested_format = true # Support legacy nested format for backward compatibility
auto_convert_nested_to_flat = false # Automatically convert nested to flat during training
use_new_dataset_api = true # Use integrated dataset API with format detection
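#
# Sketch contrasting the two reranker data layouts named above; the flat-pair fields
# (query, passage, label) come from this config, while the nested field names
# ("pos"/"neg") are assumptions about the legacy schema:
#
#   nested = {"query": "q1", "pos": ["relevant passage"], "neg": ["irrelevant passage"]}
#   flat_pairs = [
#       {"query": "q1", "passage": "relevant passage", "label": 1},
#       {"query": "q1", "passage": "irrelevant passage", "label": 0},
#   ]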