[api]
# API settings for data augmentation (used in data/augmentation.py)
# NOTE: You can override api_key by setting the environment variable BGE_API_KEY.
base_url = "https://api.deepseek.com/v1"
api_key = "sk-1bf2a963751_deletethis_b4e5e96d319810f8926e5"  # Replace with your actual API key or set BGE_API_KEY env var
model = "deepseek-chat"
max_retries = 3   # Number of times to retry API calls on failure
timeout = 30      # Timeout for API requests (seconds)
temperature = 0.7 # Sampling temperature for augmentation

[training]
# Default training parameters (overridden by [hardware] or model-specific sections if present)
# If a parameter is present in both [training] and [hardware], [hardware] takes precedence.
default_batch_size = 4          # Default batch size per device for training
default_learning_rate = 1e-5    # Default learning rate
default_num_epochs = 3          # Default number of epochs
default_warmup_ratio = 0.1      # Warmup steps as a ratio of total steps
default_weight_decay = 0.01     # Weight decay for optimizer
gradient_accumulation_steps = 2 # Number of steps to accumulate gradients; reduced for small datasets

[model_paths]
# Model source: 'huggingface' or 'modelscope'. Determines which hub to use for downloading/loading models.
source = "huggingface"  # Options: 'huggingface', 'modelscope'
bge_m3 = "./models/bge-m3"
bge_reranker = "./models/bge-reranker-base"
# Cache directory for downloaded models (tokenizers, config files, etc.)
cache_dir = "./cache/models"
# To use ModelScope, set source = "modelscope" and provide the correct model id/path.

[data]
# Enhanced data processing settings with unified schema support
max_query_length = 64    # Max length for query tokens
max_passage_length = 512 # Max length for passage tokens
max_seq_length = 512     # Max total input sequence length
# 0.0 is for testing only; in production, set to 0.3
validation_split = 0.0   # Fraction of data for validation
random_seed = 42         # Random seed for reproducibility
# Cache directory for processed/optimized datasets (.pkl files)
cache_dir = "./cache/data"
# Default output directory for optimization script
optimization_output_dir = "./cache/data"
# New: Enhanced dataset format settings
auto_detect_format = true       # Enable automatic format detection
legacy_support = true           # Support legacy nested reranker format
migrate_legacy_on_load = false  # Automatically migrate legacy data during loading
embedding_schema_version = "v3" # Use enhanced embedding schema
reranker_schema_version = "v3"  # Use enhanced reranker schema
# New: Enhanced sampling settings
use_score_weighted_sampling = true  # Enable score-weighted positive sampling
multiple_positives_per_batch = true # Use multiple positives for richer contrastive signals
hard_negative_ratio = 0.7           # Ratio of hard to random negatives
difficulty_threshold = 0.1          # Minimum difficulty threshold for hard negatives

[augmentation]
# Data augmentation settings (used in data/augmentation.py)
enable_query_augmentation = false   # Enable query augmentation
enable_passage_augmentation = false # Enable passage augmentation
augmentation_methods = ["paraphrase", "back_translation"]  # List of augmentation methods
api_first = true  # Prefer the LLM API for all augmentation if enabled; fall back to rule-based methods if the API is unavailable
# Augmentation is language-aware: prompts and templates are selected based on query language (Chinese/English)
num_augmentations_per_sample = 2  # Number of augmentations per sample
augmentation_temperature = 0.8    # Sampling temperature for augmentation
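# Illustrative example (commented out): one way to turn on LLM-based query augmentation
# for a small dataset. These reuse the keys defined above with sample values; they are
# not additional settings.
# enable_query_augmentation = true
# augmentation_methods = ["paraphrase"]
# num_augmentations_per_sample = 1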
[hard_negative_mining]
# ANCE hard negative mining settings (used in data/hard_negative_mining.py)
enable_mining = true          # Enable hard negative mining
num_hard_negatives = 15       # Number of hard negatives to mine
negative_range_min = 10       # Min index for negative mining
negative_range_max = 100      # Max index for negative mining
margin = 0.1                  # Margin for ANCE mining
index_refresh_interval = 1000 # Steps between index refreshes
index_type = "IVF"            # FAISS index type
nlist = 100                   # Number of clusters for IVF

[evaluation]
# Evaluation settings (used in evaluation/evaluator.py)
eval_batch_size = 32  # Batch size for evaluation
k_values = [1, 3, 5, 10, 20, 50, 100]  # k values for metrics
metrics = ["recall", "precision", "map", "mrr", "ndcg"]  # Metrics to compute
save_predictions = true  # Save predictions to file
predictions_dir = "./output/predictions"  # Directory for predictions

[logging]
# Logging configuration (used in utils/logging.py)
log_level = "INFO"  # Logging level (DEBUG, INFO, WARNING, ERROR)
log_to_file = true  # Log to file in addition to stdout
log_dir = "./logs"  # Directory for log files
tensorboard_dir = "./logs/tensorboard"  # Directory for TensorBoard logs
wandb_project = "bge-finetuning"  # Weights & Biases project name
wandb_entity = ""   # Your wandb entity
use_wandb = false   # Enable Weights & Biases logging

[distributed]
# Distributed training settings (used in utils/distributed.py)
# backend: 'nccl' for CUDA GPUs, 'hccl' for Huawei Ascend NPUs, 'gloo' for CPU
backend = "nccl"               # Backend for distributed training
init_method = "env://"         # Initialization method for distributed training
find_unused_parameters = true  # Find unused parameters in DDP

[ascend]
# Ascend NPU-specific settings (used for Huawei hardware)
# Set device_type to 'npu' to enable Ascend support
# backend: 'hccl' is required for distributed training on Ascend
# mixed_precision: true enables float16/bfloat16 training if supported
# Uncomment and set as needed:
# device_type = "npu"
# backend = "hccl"
# mixed_precision = true
# visible_devices = "0,1,2,3"  # Comma-separated list of NPU device IDs
# dataloader_num_workers = 4
# per_device_train_batch_size = 4
# per_device_eval_batch_size = 8

[optimization]
# Advanced optimization settings (used in models/losses.py, training/*_trainer.py)
gradient_checkpointing = true  # Enable gradient checkpointing for memory savings
fp16 = true            # Use mixed precision (float16) training
bf16 = false           # Set to true for A100/H100 GPUs (bfloat16)
fp16_opt_level = "O1"  # Apex AMP optimization level
max_grad_norm = 1.0    # Max gradient norm for clipping
adam_epsilon = 1e-8    # Epsilon for Adam optimizer
adam_beta1 = 0.9       # Beta1 for Adam optimizer
adam_beta2 = 0.999     # Beta2 for Adam optimizer
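# Illustrative example (commented out): a possible mixed-precision setup for GPUs with
# native bfloat16 support (e.g., A100/H100), using the existing fp16/bf16 keys above.
# fp16 = false
# bf16 = true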
[hardware]
# Hardware-specific settings (overrides [training] for batch sizes, etc.)
cuda_visible_devices = "0,1,2,3"  # Comma-separated list of visible CUDA devices
dataloader_num_workers = 4        # Number of workers for DataLoader
dataloader_pin_memory = true      # Use pinned memory in DataLoader
per_device_train_batch_size = 4   # Batch size per device for training (overrides [training])
per_device_eval_batch_size = 8    # Batch size per device for evaluation

[checkpoint]
# Checkpointing settings (used in training/*_trainer.py)
save_strategy = "steps"        # Save checkpoints by steps or by epoch
save_steps = 1000              # Steps between checkpoints
save_total_limit = 3           # Max number of checkpoints to keep
load_best_model_at_end = true  # Load best model at end of training
metric_for_best_model = "eval_recall@10"  # Metric to select best model
greater_is_better = true       # Whether higher metric is better
resume_from_checkpoint = ""    # Path to checkpoint to resume from

[export]
# Model export settings (used in scripts/export.py)
export_format = ["pytorch", "onnx"]  # Export formats
optimize_for_inference = true  # Optimize model for inference
quantize = false               # Enable quantization
quantization_bits = 8          # Number of bits for quantization
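# Illustrative example (commented out): exporting only an ONNX model with 8-bit
# quantization, using the keys defined above with sample values.
# export_format = ["onnx"]
# quantize = true
# quantization_bits = 8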
[m3]
# Enhanced BGE-M3 model-specific parameters with new features
# If a parameter is present in both [m3] and [training]/[hardware], [m3] takes precedence for BGE-M3.
use_dense = true                 # Enable dense embedding head
use_sparse = true                # Enable sparse (SPLADE-style) embedding head
use_colbert = true               # Enable ColBERT multi-vector head
unified_finetuning = true        # Unified fine-tuning for all heads
sentence_pooling_method = "cls"  # Pooling method for sentence embeddings
normalize_embeddings = true      # Normalize output embeddings
colbert_dim = -1                 # Output dimension for ColBERT head (-1 uses model hidden size)
sparse_top_k = 100               # Top-k for sparse representations

# Training
train_group_size = 8             # Number of passages per query in training
temperature = 0.1                # InfoNCE loss temperature; increased from 0.02 for better stability
negatives_cross_device = true    # Share negatives across devices/GPUs

# Enhanced hard negative mining
use_hard_negatives = true          # Enable hard negative mining
num_hard_negatives = 15            # Number of hard negatives to mine
hard_negative_mining_steps = 1000  # Steps between hard negative mining refreshes
ance_negative_range = [10, 100]    # Range for ANCE negative mining
ance_margin = 0.1                  # Margin for ANCE mining
hard_negative_ratio = 0.7          # Ratio of hard to random negatives
difficulty_threshold = 0.1         # Minimum difficulty for hard negatives
max_triplets_per_query = 8         # Maximum triplets per query for training

# Enhanced self-distillation
use_self_distill = false        # Enable self-knowledge distillation; default changed to false
self_distill_temperature = 4.0  # Temperature for self-distillation
self_distill_alpha = 0.5        # Alpha (weight) for self-distillation loss

# Enhanced query instruction
use_query_instruction = false   # Add instruction to queries
query_instruction_for_retrieval = "Represent this sentence for searching relevant passages:"
query_instruction_for_reranking = ""

# Enhanced data augmentation
augment_queries = false         # Enable query augmentation
augmentation_methods = ["paraphrase", "back_translation"]  # List of augmentation methods
create_hard_negatives = false   # Create hard negatives via augmentation

# Enhanced sampling features
use_score_weighted_sampling = true    # Enable score-weighted positive sampling
multiple_positives = true             # Use multiple positives per batch for richer contrastive signals
enhanced_contrastive_learning = true  # Enable enhanced contrastive learning features

# Loss weights
# For multi-task learning, these weights control the contribution of each head/loss
dense_weight = 1.0
sparse_weight = 0.3
colbert_weight = 0.5
distillation_weight = 1.0

# Evaluation
metric_for_best_model = "eval_recall@10"
greater_is_better = true

# Memory optimization
use_gradient_checkpointing = true  # Enable gradient checkpointing
max_passages_per_query = 32        # Max passages per query in memory

# Multi-granularity
max_query_length = 64    # Max query length
max_passage_length = 512 # Max passage length
max_length = 8192        # Max total input length (multi-granularity)

# New: Enhanced dataset integration
use_new_dataset_api = true   # Use integrated dataset API with format detection
auto_migrate_legacy = false  # Automatically migrate legacy formats during training
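# Illustrative example (commented out): a possible dense-only BGE-M3 fine-tuning setup
# that disables the sparse and ColBERT heads. Sample values for the existing keys above;
# adjust loss weights as appropriate for your run.
# use_sparse = false
# use_colbert = false
# unified_finetuning = false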
[reranker]
# Enhanced BGE-Reranker model-specific parameters with flat pairs support
# If a parameter is present in both [reranker] and [training]/[hardware], [reranker] takes precedence for BGE-Reranker.
add_pooling_layer = false  # Not used for cross-encoder
use_fp16 = true            # Use mixed precision (float16) training
train_group_size = 16      # Number of passages per query for listwise training
loss_type = "listwise_ce"  # Loss function type ('listwise_ce', 'pairwise_margin', 'combined', 'cross_entropy', 'binary_cross_entropy')
margin = 1.0               # Margin for pairwise loss
temperature = 1.0          # Temperature for listwise CE loss

# Enhanced knowledge distillation
use_knowledge_distillation = false  # Enable knowledge distillation from teacher model
teacher_model_name_or_path = ""     # Path to teacher model for distillation
distillation_temperature = 3.0      # Temperature for distillation
distillation_alpha = 0.7            # Alpha (weight) for distillation loss

# Enhanced hard negatives
use_hard_negatives = true   # Enable hard negative mining
hard_negative_ratio = 0.8   # Ratio of hard to random negatives

# Enhanced denoising
use_denoising = false       # Enable denoising strategy
noise_probability = 0.1     # Probability of noise for denoising

# Performance optimizations
max_pairs_per_device = 128  # Max query-passage pairs per device
accumulate_gradients_from_all_devices = true  # Accumulate gradients from all devices
max_seq_length = 512        # Max combined query+passage length
query_max_length = 64       # Max query length (if separate encoding)
passage_max_length = 512    # Max passage length (if separate encoding)

# Enhanced training features
shuffle_passages = true         # Shuffle passage order during training
position_bias_cutoff = 10       # Top-k passages to consider for position bias
use_gradient_caching = true     # Enable gradient caching for efficiency
gradient_cache_batch_size = 32  # Batch size for gradient caching
eval_group_size = 100           # Number of candidates during evaluation
metric_for_best_model = "eval_mrr@10"  # Metric to select best model
eval_normalize_scores = true    # Normalize scores during evaluation

# Loss weights for combined training
listwise_weight = 0.7  # Loss weight for listwise loss
pairwise_weight = 0.3  # Loss weight for pairwise loss

# Data quality controls
filter_empty_passages = true     # Filter out empty passages
min_passage_length = 10          # Minimum passage length to keep
max_passage_length_ratio = 50.0  # Max ratio between query and passage length

# Advanced features
gradient_checkpointing = true  # Enable gradient checkpointing
recompute_scores = false       # Recompute scores instead of storing all
use_dynamic_teacher = false    # Enable dynamic teacher for RocketQAv2-style training
teacher_update_steps = 1000    # Steps between teacher updates
teacher_momentum = 0.999       # Momentum for teacher updates

# New: Enhanced format support
support_flat_pairs = true    # Support flat pairs format (query, passage, label)
support_nested_format = true # Support legacy nested format for backward compatibility
auto_convert_nested_to_flat = false  # Automatically convert nested to flat during training
use_new_dataset_api = true   # Use integrated dataset API with format detection
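# Illustrative example (commented out): enabling knowledge distillation for the reranker
# using the existing keys above. The teacher path is a placeholder; point it at whichever
# teacher checkpoint you actually use.
# use_knowledge_distillation = true
# teacher_model_name_or_path = "./models/bge-reranker-large"
# distillation_temperature = 3.0
# distillation_alpha = 0.7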