# bge_finetune/config.toml

[api]
# API settings for data augmentation (used in data/augmentation.py)
# NOTE: You can override api_key by setting the environment variable BGE_API_KEY.
base_url = "https://api.deepseek.com/v1"
api_key = "your_api_key_here" # Replace with your actual API key or set the BGE_API_KEY env var
model = "deepseek-chat"
max_retries = 3 # Number of times to retry API calls on failure
timeout = 30 # Timeout for API requests (seconds)
temperature = 0.7 # Sampling temperature for augmentation
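#
# Illustrative sketch (not part of this config) of how data/augmentation.py might consume
# these settings and honor the BGE_API_KEY override. The OpenAI-compatible client usage is
# an assumption; DeepSeek exposes an OpenAI-compatible endpoint, but the project's actual
# client code may differ. Requires Python 3.11+ for tomllib.
#
#   import os, tomllib
#   from openai import OpenAI
#
#   with open("bge_finetune/config.toml", "rb") as f:
#       api_cfg = tomllib.load(f)["api"]
#   client = OpenAI(
#       base_url=api_cfg["base_url"],
#       api_key=os.environ.get("BGE_API_KEY", api_cfg["api_key"]),  # env var takes precedence
#       timeout=api_cfg["timeout"],
#       max_retries=api_cfg["max_retries"],
#   )
#   resp = client.chat.completions.create(
#       model=api_cfg["model"],
#       messages=[{"role": "user", "content": "Paraphrase: how do transformers work?"}],
#       temperature=api_cfg["temperature"],
#   )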
[training]
# Default training parameters (overridden by [hardware] or model-specific sections if present)
# If a parameter is present in both [training] and [hardware], [hardware] takes precedence.
default_batch_size = 4 # Default batch size per device for training
default_learning_rate = 1e-5 # Default learning rate
default_num_epochs = 3 # Default number of epochs
default_warmup_ratio = 0.1 # Warmup steps as a ratio of total steps
default_weight_decay = 0.01 # Weight decay for optimizer
gradient_accumulation_steps = 2 # Number of steps to accumulate gradients - reduced for small datasets
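#
# Minimal sketch of the documented precedence, where [hardware] keys override [training]
# defaults (an assumption about the config loader, not its actual code):
#
#   import tomllib
#   with open("bge_finetune/config.toml", "rb") as f:
#       cfg = tomllib.load(f)
#   effective = {**cfg["training"], **cfg.get("hardware", {})}
#   batch_size = effective.get("per_device_train_batch_size",
#                              cfg["training"]["default_batch_size"])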
[model_paths]
# Model source: 'huggingface' or 'modelscope'. Determines which hub to use for downloading/loading models.
source = "huggingface" # Options: 'huggingface', 'modelscope'
bge_m3 = "./models/bge-m3"
bge_reranker = "./models/bge-reranker-base"
# Cache directory for downloaded models (tokenizers, config files, etc.)
cache_dir = "./cache/models"
# To use ModelScope, set source = "modelscope" and provide the correct model id/path.
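#
# Hedged sketch of hub selection based on `source`. Both snapshot_download helpers are real,
# but their use here is an assumption about this repo; the model id shown is the Hugging Face
# id and the ModelScope id may differ:
#
#   def resolve_model(cfg, repo_id="BAAI/bge-m3"):
#       paths = cfg["model_paths"]
#       if paths["source"] == "modelscope":
#           from modelscope import snapshot_download
#           return snapshot_download(repo_id, cache_dir=paths["cache_dir"])
#       from huggingface_hub import snapshot_download
#       return snapshot_download(repo_id, cache_dir=paths["cache_dir"])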
[data]
# Enhanced data processing settings with unified schema support
max_query_length = 64 # Max length for query tokens
max_passage_length = 512 # Max length for passage tokens
max_seq_length = 512 # Max total input sequence length
# 0.0 disables the validation split and is for testing only; in production, set this to 0.3
validation_split = 0.0 # Fraction of data for validation
random_seed = 42 # Random seed for reproducibility
# Cache directory for processed/optimized datasets (.pkl files)
cache_dir = "./cache/data"
# Default output directory for optimization script
optimization_output_dir = "./cache/data"
# New: Enhanced dataset format settings
auto_detect_format = true # Enable automatic format detection
legacy_support = true # Support legacy nested reranker format
migrate_legacy_on_load = false # Automatically migrate legacy data during loading
embedding_schema_version = "v3" # Use enhanced embedding schema
reranker_schema_version = "v3" # Use enhanced reranker schema
# New: Enhanced sampling settings
use_score_weighted_sampling = true # Enable score-weighted positive sampling
multiple_positives_per_batch = true # Use multiple positives for richer contrastive signals
hard_negative_ratio = 0.7 # Ratio of hard to random negatives
difficulty_threshold = 0.1 # Minimum difficulty threshold for hard negatives
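#
# Sketch of score-weighted positive sampling as described above; the field names
# ("text", "score") are assumptions about the dataset schema:
#
#   import random
#   def sample_positive(positives):
#       # positives: list of dicts like {"text": ..., "score": ...}
#       weights = [max(p.get("score", 1.0), 1e-6) for p in positives]
#       return random.choices(positives, weights=weights, k=1)[0]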
[augmentation]
# Data augmentation settings (used in data/augmentation.py)
enable_query_augmentation = false # Enable query augmentation
enable_passage_augmentation = false # Enable passage augmentation
augmentation_methods = ["paraphrase", "back_translation"] # List of augmentation methods
api_first = true # Prefer the LLM API for all augmentation if enabled; fall back to rule-based methods if the API is unavailable
# Augmentation is now language-aware: prompts and templates are selected based on query language (Chinese/English)
num_augmentations_per_sample = 2 # Number of augmentations per sample
augmentation_temperature = 0.8 # Sampling temperature for augmentation
[hard_negative_mining]
# ANCE hard negative mining settings (used in data/hard_negative_mining.py)
enable_mining = true # Enable hard negative mining
num_hard_negatives = 15 # Number of hard negatives to mine
negative_range_min = 10 # Min index for negative mining
negative_range_max = 100 # Max index for negative mining
margin = 0.1 # Margin for ANCE mining
index_refresh_interval = 1000 # Steps between index refreshes
index_type = "IVF" # FAISS index type
nlist = 100 # Number of clusters for IVF
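#
# Simplified sketch of ANCE-style mining with a FAISS IVF index using the settings above;
# assumes L2-normalized float32 numpy embeddings and collapses the [10, 100) range to the
# first 15 candidates after the skipped top hits (the project's sampling may differ):
#
#   import faiss
#   def mine_hard_negatives(corpus_emb, query_emb, nlist=100, lo=10, hi=100, k=15):
#       d = corpus_emb.shape[1]
#       quantizer = faiss.IndexFlatIP(d)
#       index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
#       index.train(corpus_emb)
#       index.add(corpus_emb)
#       _, ids = index.search(query_emb, hi)
#       return ids[:, lo:lo + k]   # skip the top-ranked hits (likely positives), keep k negatives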
[evaluation]
# Evaluation settings (used in evaluation/evaluator.py)
eval_batch_size = 32 # Batch size for evaluation
k_values = [1, 3, 5, 10, 20, 50, 100] # k values for metrics
metrics = ["recall", "precision", "map", "mrr", "ndcg"] # Metrics to compute
save_predictions = true # Save predictions to file
predictions_dir = "./output/predictions" # Directory for predictions
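#
# Sketch of recall@k and mrr@k over a ranked list of doc ids per query; a simplified
# stand-in for evaluation/evaluator.py, not its exact implementation:
#
#   def recall_at_k(ranked_ids, relevant_ids, k):
#       return len(set(ranked_ids[:k]) & set(relevant_ids)) / max(len(relevant_ids), 1)
#
#   def mrr_at_k(ranked_ids, relevant_ids, k):
#       for rank, doc_id in enumerate(ranked_ids[:k], start=1):
#           if doc_id in relevant_ids:
#               return 1.0 / rank
#       return 0.0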
[logging]
# Logging configuration (used in utils/logging.py)
log_level = "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR)
log_to_file = true # Log to file in addition to stdout
log_dir = "./logs" # Directory for log files
tensorboard_dir = "./logs/tensorboard" # Directory for TensorBoard logs
wandb_project = "bge-finetuning" # Weights & Biases project name
wandb_entity = "" # Your wandb entity
use_wandb = false # Enable Weights & Biases logging
[distributed]
# Distributed training settings (used in utils/distributed.py)
# backend: 'nccl' for CUDA GPUs, 'hccl' for Huawei Ascend NPUs, 'gloo' for CPU
backend = "nccl" # Backend for distributed training (nccl for CUDA, hccl for Ascend, gloo for CPU)
init_method = "env://" # Initialization method for distributed training
find_unused_parameters = true # Find unused parameters in DDP
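#
# Sketch of how these settings typically map onto torch.distributed / DDP; the project's
# utils/distributed.py may wire this differently:
#
#   import os, torch, torch.distributed as dist
#   def wrap_ddp(model, backend="nccl", init_method="env://", find_unused=True):
#       dist.init_process_group(backend=backend, init_method=init_method)
#       local_rank = int(os.environ.get("LOCAL_RANK", 0))
#       torch.cuda.set_device(local_rank)
#       return torch.nn.parallel.DistributedDataParallel(
#           model.to(local_rank), device_ids=[local_rank],
#           find_unused_parameters=find_unused)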
[ascend]
# Ascend NPU-specific settings (used for Huawei hardware)
# Set device_type to 'npu' to enable Ascend support
# backend: 'hccl' is required for distributed training on Ascend
# mixed_precision: true enables float16/bfloat16 training if supported
# Example usage (uncomment and set as needed):
# device_type = "npu"
# backend = "hccl"
# mixed_precision = true
# visible_devices = "0,1,2,3" # Comma-separated list of NPU device IDs
# dataloader_num_workers = 4
# per_device_train_batch_size = 4
# per_device_eval_batch_size = 8
[optimization]
# Advanced optimization settings (used in models/losses.py, training/*_trainer.py)
gradient_checkpointing = true # Enable gradient checkpointing for memory savings
fp16 = true # Use mixed precision (float16) training
bf16 = false # Set to true for A100/H100 GPUs (bfloat16)
fp16_opt_level = "O1" # Apex AMP optimization level
max_grad_norm = 1.0 # Max gradient norm for clipping
adam_epsilon = 1e-8 # Epsilon for Adam optimizer
adam_beta1 = 0.9 # Beta1 for Adam optimizer
adam_beta2 = 0.999 # Beta2 for Adam optimizer
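#
# Sketch of a mixed-precision training step combining fp16, gradient clipping, and the Adam
# settings above. Standard torch.cuda.amp usage, not the project's trainer verbatim; assumes
# an HF-style model output with a .loss attribute:
#
#   import torch
#   scaler = torch.cuda.amp.GradScaler()
#   def train_step(model, batch, optimizer, max_grad_norm=1.0):
#       with torch.cuda.amp.autocast(dtype=torch.float16):   # use torch.bfloat16 when bf16 = true
#           loss = model(**batch).loss
#       scaler.scale(loss).backward()
#       scaler.unscale_(optimizer)
#       torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#       scaler.step(optimizer)
#       scaler.update()
#       optimizer.zero_grad()
#       return loss.detach()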
[hardware]
# Hardware-specific settings (overrides [training] for batch sizes, etc.)
cuda_visible_devices = "0,1,2,3" # Comma-separated list of visible CUDA devices
dataloader_num_workers = 4 # Number of workers for DataLoader
dataloader_pin_memory = true # Use pinned memory in DataLoader
per_device_train_batch_size = 4 # Batch size per device for training (overrides [training])
per_device_eval_batch_size = 8 # Batch size per device for evaluation
[checkpoint]
# Checkpointing settings (used in training/*_trainer.py)
save_strategy = "steps" # Save checkpoint by steps or epoch
save_steps = 1000 # Steps between checkpoints
save_total_limit = 3 # Max number of checkpoints to keep
load_best_model_at_end = true # Load best model at end of training
metric_for_best_model = "eval_recall@10" # Metric to select best model
greater_is_better = true # Whether higher metric is better
resume_from_checkpoint = "" # Path to checkpoint to resume from
[export]
# Model export settings (used in scripts/export.py)
export_format = ["pytorch", "onnx"] # Export formats
optimize_for_inference = true # Optimize model for inference
quantize = false # Enable quantization
quantization_bits = 8 # Number of bits for quantization
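#
# Sketch of a PyTorch-to-ONNX export pass; torch.onnx.export is real, but the dummy input
# shape, output naming, and opset are assumptions about scripts/export.py:
#
#   import torch
#   def export_onnx(model, tokenizer, path="model.onnx", max_len=512):
#       model.eval()
#       enc = tokenizer("example query", return_tensors="pt",
#                       padding="max_length", max_length=max_len, truncation=True)
#       torch.onnx.export(model, (enc["input_ids"], enc["attention_mask"]), path,
#                         input_names=["input_ids", "attention_mask"],
#                         output_names=["output"], opset_version=17,
#                         dynamic_axes={"input_ids": {0: "batch"},
#                                       "attention_mask": {0: "batch"}})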
[m3]
# Enhanced BGE-M3 model-specific parameters with new features
# If a parameter is present in both [m3] and [training]/[hardware], [m3] takes precedence for BGE-M3.
use_dense = true # Enable dense embedding head
use_sparse = true # Enable sparse (SPLADE-style) embedding head
use_colbert = true # Enable ColBERT multi-vector head
unified_finetuning = true # Unified fine-tuning for all heads
sentence_pooling_method = "cls" # Pooling method for sentence embeddings
normalize_embeddings = true # Normalize output embeddings
colbert_dim = -1 # Output dimension for ColBERT head (-1 uses model hidden size)
sparse_top_k = 100 # Top-k for sparse representations
# Training
train_group_size = 8 # Number of passages per query in training
temperature = 0.1 # InfoNCE loss temperature - increased from 0.02 for better stability
negatives_cross_device = true # Share negatives across devices/GPUs
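#
# Sketch of the InfoNCE objective that the temperature above scales (in-batch negatives,
# one positive per query); a simplification of the project's loss, not its exact code:
#
#   import torch, torch.nn.functional as F
#   def info_nce(q_emb, p_emb, temperature=0.1):
#       # q_emb: [B, d], p_emb: [B, d]; row i of p_emb is the positive for query i
#       q_emb = F.normalize(q_emb, dim=-1)
#       p_emb = F.normalize(p_emb, dim=-1)
#       scores = q_emb @ p_emb.T / temperature
#       labels = torch.arange(q_emb.size(0), device=q_emb.device)
#       return F.cross_entropy(scores, labels)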
# Enhanced hard negative mining
use_hard_negatives = true # Enable hard negative mining
num_hard_negatives = 15 # Number of hard negatives to mine
hard_negative_mining_steps = 1000 # Steps between hard negative mining refreshes
ance_negative_range = [10, 100] # Range for ANCE negative mining
ance_margin = 0.1 # Margin for ANCE mining
hard_negative_ratio = 0.7 # Ratio of hard to random negatives
difficulty_threshold = 0.1 # Minimum difficulty for hard negatives
max_triplets_per_query = 8 # Maximum triplets per query for training
# Enhanced self-distillation
use_self_distill = false # Enable self-knowledge distillation - changed default to false
self_distill_temperature = 4.0 # Temperature for self-distillation
self_distill_alpha = 0.5 # Alpha (weight) for self-distillation loss
# Enhanced query instruction
use_query_instruction = false # Add instruction to queries
query_instruction_for_retrieval = "Represent this sentence for searching relevant passages:"
query_instruction_for_reranking = ""
# Enhanced data augmentation
augment_queries = false # Enable query augmentation
augmentation_methods = ["paraphrase", "back_translation"] # List of augmentation methods
create_hard_negatives = false # Create hard negatives via augmentation
# Enhanced sampling features
use_score_weighted_sampling = true # Enable score-weighted positive sampling
multiple_positives = true # Use multiple positives per batch for richer contrastive signals
enhanced_contrastive_learning = true # Enable enhanced contrastive learning features
# Loss weights
# For multi-task learning, these weights control the contribution of each head/loss
dense_weight = 1.0
sparse_weight = 0.3
colbert_weight = 0.5
distillation_weight = 1.0
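#
# Sketch of how the per-head weights above combine the individual losses (assumed
# composition; the trainer's actual formula may include additional terms):
#
#   total_loss = (1.0 * dense_loss
#                 + 0.3 * sparse_loss
#                 + 0.5 * colbert_loss
#                 + 1.0 * distill_loss)   # distillation term only when use_self_distill = true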
# Evaluation
metric_for_best_model = "eval_recall@10"
greater_is_better = true
# Memory optimization
use_gradient_checkpointing = true # Enable gradient checkpointing
max_passages_per_query = 32 # Max passages per query in memory
# Multi-granularity
max_query_length = 64 # Max query length
max_passage_length = 512 # Max passage length
max_length = 8192 # Max total input length (multi-granularity)
# New: Enhanced dataset integration
use_new_dataset_api = true # Use integrated dataset API with format detection
auto_migrate_legacy = false # Automatically migrate legacy formats during training
[reranker]
# Enhanced BGE-Reranker model-specific parameters with flat pairs support
# If a parameter is present in both [reranker] and [training]/[hardware], [reranker] takes precedence for BGE-Reranker.
add_pooling_layer = false # Not used for cross-encoder
use_fp16 = true # Use mixed precision (float16) training
train_group_size = 16 # Number of passages per query for listwise training
loss_type = "listwise_ce" # Loss function type ('listwise_ce', 'pairwise_margin', 'combined', 'cross_entropy', 'binary_cross_entropy')
margin = 1.0 # Margin for pairwise loss
temperature = 1.0 # Temperature for listwise CE loss
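#
# Sketch of a listwise cross-entropy loss over groups of train_group_size candidates,
# assuming the positive passage sits at index 0 of each group (a common convention, not
# confirmed for this repo):
#
#   import torch, torch.nn.functional as F
#   def listwise_ce(logits, group_size=16, temperature=1.0):
#       # logits: [B * group_size] cross-encoder scores, positive first in each group
#       scores = logits.view(-1, group_size) / temperature
#       labels = torch.zeros(scores.size(0), dtype=torch.long, device=scores.device)
#       return F.cross_entropy(scores, labels)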
# Enhanced knowledge distillation
use_knowledge_distillation = false # Enable knowledge distillation from teacher model
teacher_model_name_or_path = "" # Path to teacher model for distillation
distillation_temperature = 3.0 # Temperature for distillation
distillation_alpha = 0.7 # Alpha (weight) for distillation loss
# Enhanced hard negatives
use_hard_negatives = true # Enable hard negative mining
hard_negative_ratio = 0.8 # Ratio of hard to random negatives
# Enhanced denoising
use_denoising = false # Enable denoising strategy
noise_probability = 0.1 # Probability of noise for denoising
# Performance optimizations
max_pairs_per_device = 128 # Max query-passage pairs per device
accumulate_gradients_from_all_devices = true # Accumulate gradients from all devices
max_seq_length = 512 # Max combined query+passage length
query_max_length = 64 # Max query length (if separate encoding)
passage_max_length = 512 # Max passage length (if separate encoding)
# Enhanced training features
shuffle_passages = true # Shuffle passage order during training
position_bias_cutoff = 10 # Top-k passages to consider for position bias
use_gradient_caching = true # Enable gradient caching for efficiency
gradient_cache_batch_size = 32 # Batch size for gradient caching
eval_group_size = 100 # Number of candidates during evaluation
metric_for_best_model = "eval_mrr@10" # Metric to select best model
eval_normalize_scores = true # Normalize scores during evaluation
# Loss weights for combined training
listwise_weight = 0.7 # Loss weight for listwise loss
pairwise_weight = 0.3 # Loss weight for pairwise loss
# Data quality controls
filter_empty_passages = true # Filter out empty passages
min_passage_length = 10 # Minimum passage length to keep
max_passage_length_ratio = 50.0 # Max ratio between query and passage length
# Advanced features
gradient_checkpointing = true # Enable gradient checkpointing
recompute_scores = false # Recompute scores instead of storing all
use_dynamic_teacher = false # Enable dynamic teacher for RocketQAv2-style training
teacher_update_steps = 1000 # Steps between teacher updates
teacher_momentum = 0.999 # Momentum for teacher updates
# New: Enhanced format support
support_flat_pairs = true # Support flat pairs format (query, passage, label)
support_nested_format = true # Support legacy nested format for backward compatibility
auto_convert_nested_to_flat = false # Automatically convert nested to flat during training
use_new_dataset_api = true # Use integrated dataset API with format detection
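#
# Sketch contrasting the two reranker data layouts named above; the flat-pair fields
# (query, passage, label) come from this config, while the nested field names
# ("pos"/"neg") are assumptions about the legacy schema:
#
#   nested = {"query": "q1", "pos": ["relevant passage"], "neg": ["irrelevant passage"]}
#   flat_pairs = [
#       {"query": "q1", "passage": "relevant passage", "label": 1},
#       {"query": "q1", "passage": "irrelevant passage", "label": 0},
#   ]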