[api]
# API settings for data augmentation (used in data/augmentation.py)
# NOTE: You can override api_key by setting the environment variable BGE_API_KEY.
base_url = "https://api.deepseek.com/v1"
api_key = "your_api_key_here" # Replace with your actual API key or set the BGE_API_KEY env var
model = "deepseek-chat"
max_retries = 3 # Number of times to retry API calls on failure
timeout = 30 # Timeout for API requests (seconds)
temperature = 0.7 # Sampling temperature for augmentation

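# Example of supplying the key via the environment instead of this file
# (a minimal sketch; the variable name comes from the NOTE above, the shell line is illustrative):
#   export BGE_API_KEY="sk-..."
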
[training]
# Default training parameters (overridden by [hardware] or model-specific sections if present)
# If a parameter is present in both [training] and [hardware], [hardware] takes precedence.
default_batch_size = 4 # Default batch size per device for training
default_learning_rate = 1e-5 # Default learning rate
default_num_epochs = 3 # Default number of epochs
default_warmup_ratio = 0.1 # Warmup steps as a ratio of total steps
default_weight_decay = 0.01 # Weight decay for optimizer
gradient_accumulation_steps = 2 # Number of steps to accumulate gradients - reduced for small datasets

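# How these combine into the effective batch size (assuming the usual multiplication
# in the trainers; the device count below is illustrative):
#   effective batch = per_device_batch_size x gradient_accumulation_steps x num_devices
#   e.g. 4 x 2 x 4 GPUs = 32 samples per optimizer step
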
[model_paths]
# Model source: 'huggingface' or 'modelscope'. Determines which hub to use for downloading/loading models.
source = "huggingface" # Options: 'huggingface', 'modelscope'
bge_m3 = "./models/bge-m3"
bge_reranker = "./models/bge-reranker-base"
# Cache directory for downloaded models (tokenizers, config files, etc.)
cache_dir = "./cache/models"
# To use ModelScope, set source = "modelscope" and provide the correct model id/path.

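# For example (the ids shown are the usual BAAI repo names; verify them on the ModelScope hub):
#   source = "modelscope"
#   bge_m3 = "BAAI/bge-m3"
#   bge_reranker = "BAAI/bge-reranker-base"
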
[data]
# Enhanced data processing settings with unified schema support
max_query_length = 64 # Max length for query tokens
max_passage_length = 512 # Max length for passage tokens
max_seq_length = 512 # Max total input sequence length
# 0.0 disables the validation split (testing only); in production, set to e.g. 0.3
validation_split = 0.0 # Fraction of data for validation
random_seed = 42 # Random seed for reproducibility

# Cache directory for processed/optimized datasets (.pkl files)
cache_dir = "./cache/data"
# Default output directory for optimization script
optimization_output_dir = "./cache/data"

# New: Enhanced dataset format settings
auto_detect_format = true # Enable automatic format detection
legacy_support = true # Support legacy nested reranker format
migrate_legacy_on_load = false # Automatically migrate legacy data during loading
embedding_schema_version = "v3" # Use enhanced embedding schema
reranker_schema_version = "v3" # Use enhanced reranker schema

# New: Enhanced sampling settings
use_score_weighted_sampling = true # Enable score-weighted positive sampling
multiple_positives_per_batch = true # Use multiple positives for richer contrastive signals
hard_negative_ratio = 0.7 # Ratio of hard to random negatives
difficulty_threshold = 0.1 # Minimum difficulty threshold for hard negatives

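# The ratio above is presumably read as the share of negatives that are hard: with
# 10 negatives per query and a ratio of 0.7, roughly 7 come from hard-negative
# mining and 3 are sampled at random (illustrative, not taken from the code).
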
[augmentation]
# Data augmentation settings (used in data/augmentation.py)
enable_query_augmentation = false # Enable query augmentation
enable_passage_augmentation = false # Enable passage augmentation
augmentation_methods = ["paraphrase", "back_translation"] # List of augmentation methods
api_first = true # Prefer the LLM API for all augmentation when enabled; fall back to rule-based methods if the API is unavailable
# Augmentation is now language-aware: prompts and templates are selected based on query language (Chinese/English)
num_augmentations_per_sample = 2 # Number of augmentations per sample
augmentation_temperature = 0.8 # Sampling temperature for augmentation

[hard_negative_mining]
# ANCE hard negative mining settings (used in data/hard_negative_mining.py)
enable_mining = true # Enable hard negative mining
num_hard_negatives = 15 # Number of hard negatives to mine
negative_range_min = 10 # Min index for negative mining
negative_range_max = 100 # Max index for negative mining
margin = 0.1 # Margin for ANCE mining
index_refresh_interval = 1000 # Steps between index refreshes
index_type = "IVF" # FAISS index type
nlist = 100 # Number of clusters for IVF

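# Illustration of the range above (an assumption about how the miner uses it):
# negatives are drawn from passages ranked roughly 10th to 100th in the model's own
# retrieval results for a query -- close enough to the top to be hard, far enough
# down to be unlikely true positives.
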
[evaluation]
# Evaluation settings (used in evaluation/evaluator.py)
eval_batch_size = 32 # Batch size for evaluation
k_values = [1, 3, 5, 10, 20, 50, 100] # k values for metrics
metrics = ["recall", "precision", "map", "mrr", "ndcg"] # Metrics to compute
save_predictions = true # Save predictions to file
predictions_dir = "./output/predictions" # Directory for predictions

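# Presumably each metric is reported at every cutoff in k_values (e.g. recall@10,
# ndcg@20), which is also where metric names like "eval_recall@10" below come from.
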
[logging]
# Logging configuration (used in utils/logging.py)
log_level = "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR)
log_to_file = true # Log to file in addition to stdout
log_dir = "./logs" # Directory for log files
tensorboard_dir = "./logs/tensorboard" # Directory for TensorBoard logs
wandb_project = "bge-finetuning" # Weights & Biases project name
wandb_entity = "" # Your wandb entity
use_wandb = false # Enable Weights & Biases logging

[distributed]
# Distributed training settings (used in utils/distributed.py)
# backend: 'nccl' for CUDA GPUs, 'hccl' for Huawei Ascend NPUs, 'gloo' for CPU
backend = "nccl" # Backend for distributed training (nccl for CUDA, hccl for Ascend, gloo for CPU)
init_method = "env://" # Initialization method for distributed training
find_unused_parameters = true # Find unused parameters in DDP

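# With init_method = "env://", PyTorch reads MASTER_ADDR, MASTER_PORT, RANK and
# WORLD_SIZE from the environment. A typical launch (the script name and flag are
# illustrative, not taken from this repo):
#   torchrun --nproc_per_node=4 train.py --config config.toml
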
[ascend]
# Ascend NPU-specific settings (used for Huawei hardware)
# Set device_type to 'npu' to enable Ascend support
# backend: 'hccl' is required for distributed training on Ascend
# mixed_precision: true enables float16/bfloat16 training if supported
# Uncomment and set as needed:
# device_type = "npu"
# backend = "hccl"
# mixed_precision = true
# visible_devices = "0,1,2,3" # Comma-separated list of NPU device IDs
# dataloader_num_workers = 4
# per_device_train_batch_size = 4
# per_device_eval_batch_size = 8

[optimization]
# Advanced optimization settings (used in models/losses.py, training/*_trainer.py)
gradient_checkpointing = true # Enable gradient checkpointing for memory savings
fp16 = true # Use mixed precision (float16) training
bf16 = false # Set to true for A100/H100 GPUs (bfloat16)
fp16_opt_level = "O1" # Apex AMP optimization level
max_grad_norm = 1.0 # Max gradient norm for clipping
adam_epsilon = 1e-8 # Epsilon for Adam optimizer
adam_beta1 = 0.9 # Beta1 for Adam optimizer
adam_beta2 = 0.999 # Beta2 for Adam optimizer

[hardware]
# Hardware-specific settings (overrides [training] for batch sizes, etc.)
cuda_visible_devices = "0,1,2,3" # Comma-separated list of visible CUDA devices
dataloader_num_workers = 4 # Number of workers for DataLoader
dataloader_pin_memory = true # Use pinned memory in DataLoader
per_device_train_batch_size = 4 # Batch size per device for training (overrides [training])
per_device_eval_batch_size = 8 # Batch size per device for evaluation

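# cuda_visible_devices presumably ends up as the CUDA_VISIBLE_DEVICES environment
# variable at startup; e.g. "0,1" would restrict training to the first two GPUs.
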
[checkpoint]
# Checkpointing settings (used in training/*_trainer.py)
save_strategy = "steps" # Checkpoint save strategy: 'steps' or 'epoch'
save_steps = 1000 # Steps between checkpoints
save_total_limit = 3 # Max number of checkpoints to keep
load_best_model_at_end = true # Load best model at end of training
metric_for_best_model = "eval_recall@10" # Metric to select best model
greater_is_better = true # Whether a higher metric value is better
resume_from_checkpoint = "" # Path to checkpoint to resume from

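# Example of resuming from a saved checkpoint (the path is illustrative):
#   resume_from_checkpoint = "./output/checkpoint-1000"
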
[export]
# Model export settings (used in scripts/export.py)
export_format = ["pytorch", "onnx"] # Export formats
optimize_for_inference = true # Optimize model for inference
quantize = false # Enable quantization
quantization_bits = 8 # Number of bits for quantization

[m3]
# Enhanced BGE-M3 model-specific parameters with new features
# If a parameter is present in both [m3] and [training]/[hardware], [m3] takes precedence for BGE-M3.
use_dense = true # Enable dense embedding head
use_sparse = true # Enable sparse (SPLADE-style) embedding head
use_colbert = true # Enable ColBERT multi-vector head
unified_finetuning = true # Unified fine-tuning for all heads
sentence_pooling_method = "cls" # Pooling method for sentence embeddings
normalize_embeddings = true # Normalize output embeddings
colbert_dim = -1 # Output dimension for ColBERT head (-1 uses model hidden size)
sparse_top_k = 100 # Top-k for sparse representations

# Training
train_group_size = 8 # Number of passages per query in training
temperature = 0.1 # InfoNCE loss temperature - increased from 0.02 for better stability
negatives_cross_device = true # Share negatives across devices/GPUs

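# Rough shape of the temperature-scaled InfoNCE objective (illustrative notation,
# not copied from models/losses.py):
#   L = -log( exp(sim(q, p+) / T) / sum_i exp(sim(q, p_i) / T) )
# A lower T sharpens the softmax over negatives; 0.1 trades some of that sharpness
# for numerical stability, as the comment above notes.
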
# Enhanced hard negative mining
use_hard_negatives = true # Enable hard negative mining
num_hard_negatives = 15 # Number of hard negatives to mine
hard_negative_mining_steps = 1000 # Steps between hard negative mining refreshes
ance_negative_range = [10, 100] # Range for ANCE negative mining
ance_margin = 0.1 # Margin for ANCE mining
hard_negative_ratio = 0.7 # Ratio of hard to random negatives
difficulty_threshold = 0.1 # Minimum difficulty for hard negatives
max_triplets_per_query = 8 # Maximum triplets per query for training

# Enhanced self-distillation
use_self_distill = false # Enable self-knowledge distillation - changed default to false
self_distill_temperature = 4.0 # Temperature for self-distillation
self_distill_alpha = 0.5 # Alpha (weight) for self-distillation loss

# Enhanced query instruction
use_query_instruction = false # Add instruction to queries
query_instruction_for_retrieval = "Represent this sentence for searching relevant passages:"
query_instruction_for_reranking = ""

# Enhanced data augmentation
augment_queries = false # Enable query augmentation
augmentation_methods = ["paraphrase", "back_translation"] # List of augmentation methods
create_hard_negatives = false # Create hard negatives via augmentation

# Enhanced sampling features
use_score_weighted_sampling = true # Enable score-weighted positive sampling
multiple_positives = true # Use multiple positives per batch for richer contrastive signals
enhanced_contrastive_learning = true # Enable enhanced contrastive learning features

# Loss weights
# For multi-task learning, these weights control the contribution of each head/loss
dense_weight = 1.0
sparse_weight = 0.3
colbert_weight = 0.5
distillation_weight = 1.0

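# How these weights are presumably combined (a sketch of the multi-task objective,
# not the exact trainer code):
#   total_loss = dense_weight * L_dense + sparse_weight * L_sparse + colbert_weight * L_colbert
#                (+ distillation_weight * L_self_distill when use_self_distill = true)
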
# Evaluation
metric_for_best_model = "eval_recall@10"
greater_is_better = true

# Memory optimization
use_gradient_checkpointing = true # Enable gradient checkpointing
max_passages_per_query = 32 # Max passages per query in memory

# Multi-granularity
max_query_length = 64 # Max query length
max_passage_length = 512 # Max passage length
max_length = 8192 # Max total input length (multi-granularity)

# New: Enhanced dataset integration
use_new_dataset_api = true # Use integrated dataset API with format detection
auto_migrate_legacy = false # Automatically migrate legacy formats during training

[reranker]
# Enhanced BGE-Reranker model-specific parameters with flat pairs support
# If a parameter is present in both [reranker] and [training]/[hardware], [reranker] takes precedence for BGE-Reranker.
add_pooling_layer = false # Not used for cross-encoder
use_fp16 = true # Use mixed precision (float16) training
train_group_size = 16 # Number of passages per query for listwise training
loss_type = "listwise_ce" # Loss function type ('listwise_ce', 'pairwise_margin', 'combined', 'cross_entropy', 'binary_cross_entropy')
margin = 1.0 # Margin for pairwise loss
temperature = 1.0 # Temperature for listwise CE loss

# Enhanced knowledge distillation
use_knowledge_distillation = false # Enable knowledge distillation from teacher model
teacher_model_name_or_path = "" # Path to teacher model for distillation
distillation_temperature = 3.0 # Temperature for distillation
distillation_alpha = 0.7 # Alpha (weight) for distillation loss

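# The usual form of such a distilled objective (an assumption about this implementation):
#   loss = distillation_alpha * T^2 * KL(softmax(student/T) || softmax(teacher/T))
#        + (1 - distillation_alpha) * task_loss,   with T = distillation_temperature
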
# Enhanced hard negatives
use_hard_negatives = true # Enable hard negative mining
hard_negative_ratio = 0.8 # Ratio of hard to random negatives

# Enhanced denoising
use_denoising = false # Enable denoising strategy
noise_probability = 0.1 # Probability of noise for denoising

# Performance optimizations
max_pairs_per_device = 128 # Max query-passage pairs per device
accumulate_gradients_from_all_devices = true # Accumulate gradients from all devices
max_seq_length = 512 # Max combined query+passage length
query_max_length = 64 # Max query length (if separate encoding)
passage_max_length = 512 # Max passage length (if separate encoding)

# Enhanced training features
shuffle_passages = true # Shuffle passage order during training
position_bias_cutoff = 10 # Top-k passages to consider for position bias
use_gradient_caching = true # Enable gradient caching for efficiency
gradient_cache_batch_size = 32 # Batch size for gradient caching
eval_group_size = 100 # Number of candidates during evaluation
metric_for_best_model = "eval_mrr@10" # Metric to select best model
eval_normalize_scores = true # Normalize scores during evaluation

# Loss weights for combined training
listwise_weight = 0.7 # Loss weight for listwise loss
pairwise_weight = 0.3 # Loss weight for pairwise loss

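# Presumed combination when loss_type = "combined" (a sketch, not the exact code):
#   loss = listwise_weight * L_listwise_ce + pairwise_weight * L_pairwise_margin
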
# Data quality controls
filter_empty_passages = true # Filter out empty passages
min_passage_length = 10 # Minimum passage length to keep
max_passage_length_ratio = 50.0 # Max ratio between query and passage length

# Advanced features
gradient_checkpointing = true # Enable gradient checkpointing
recompute_scores = false # Recompute scores instead of storing all
use_dynamic_teacher = false # Enable dynamic teacher for RocketQAv2-style training
teacher_update_steps = 1000 # Steps between teacher updates
teacher_momentum = 0.999 # Momentum for teacher updates

# New: Enhanced format support
support_flat_pairs = true # Support flat pairs format (query, passage, label)
support_nested_format = true # Support legacy nested format for backward compatibility
auto_convert_nested_to_flat = false # Automatically convert nested to flat during training
use_new_dataset_api = true # Use integrated dataset API with format detection