init commit

2025-07-22 16:55:25 +08:00
commit 36003b83e2
67 changed files with 76613 additions and 0 deletions
--- a/utils/config_loader.py
+++ b/utils/config_loader.py
@@ -0,0 +1,518 @@
+"""
+Centralized configuration loader for BGE fine-tuning project
+"""
+import os
+import toml
+import logging
+from typing import Dict, Any, Optional, Tuple
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_model_path(
+    model_key: str,
+    config_instance: 'ConfigLoader',
+    model_name_or_path: Optional[str] = None,
+    cache_dir: Optional[str] = None
+) -> Tuple[str, str]:
+    """
+    Resolve model path with automatic local/remote fallback
+    
+    Args:
+        model_key: Key in model_paths config (e.g., 'bge_m3', 'bge_reranker')
+        config_instance: ConfigLoader instance
+        model_name_or_path: Optional override from CLI
+        cache_dir: Optional cache directory
+        
+    Returns:
+        Tuple of (resolved_model_path, resolved_cache_dir)
+    """
+    # Default remote model names for each model type
+    remote_models = {
+        'bge_m3': 'BAAI/bge-m3',
+        'bge_reranker': 'BAAI/bge-reranker-base',
+    }
+    
+    # Get model source (huggingface or modelscope)
+    model_source = config_instance.get('model_paths', 'source', 'huggingface')
+    
+    # If ModelScope is used, adjust model names
+    if model_source == 'modelscope':
+        remote_models = {
+            'bge_m3': 'damo/nlp_gte_sentence-embedding_chinese-base',
+            'bge_reranker': 'damo/nlp_corom_passage-ranking_english-base',
+        }
+    
+    # 1. Check CLI override first (only if explicitly provided)
+    if model_name_or_path is not None:
+        resolved_model_path = model_name_or_path
+        logger.info(f"[CLI OVERRIDE] Using model path: {resolved_model_path}")
+    else:
+        # 2. Check local path from config
+        local_path_raw = config_instance.get('model_paths', model_key, None)
+        
+        # Handle list/string conversion
+        if isinstance(local_path_raw, list):
+            local_path = local_path_raw[0] if local_path_raw else None
+        else:
+            local_path = local_path_raw
+        
+        if local_path and isinstance(local_path, str) and os.path.exists(local_path):
+            # Check if it's a valid model directory
+            if _is_valid_model_directory(local_path):
+                resolved_model_path = local_path
+                logger.info(f"[LOCAL] Using local model: {resolved_model_path}")
+            else:
+                logger.warning(f"[LOCAL] Invalid model directory: {local_path}, falling back to remote")
+                resolved_model_path = remote_models.get(model_key, f'BAAI/{model_key}')
+        else:
+            # 3. Fall back to remote model
+            resolved_model_path = remote_models.get(model_key, f'BAAI/{model_key}')
+            logger.info(f"[REMOTE] Using remote model: {resolved_model_path}")
+            
+            # If local path was configured but doesn't exist, we'll download to that location
+            if local_path and isinstance(local_path, str):
+                logger.info(f"[DOWNLOAD] Will download to configured path: {local_path}")
+                # Create directory if it doesn't exist
+                os.makedirs(local_path, exist_ok=True)
+    
+    # Resolve cache directory
+    if cache_dir is not None:
+        resolved_cache_dir = cache_dir
+    else:
+        cache_dir_raw = config_instance.get('model_paths', 'cache_dir', './cache/data')
+        if isinstance(cache_dir_raw, list):
+            resolved_cache_dir = cache_dir_raw[0] if cache_dir_raw else './cache/data'
+        else:
+            resolved_cache_dir = cache_dir_raw if cache_dir_raw else './cache/data'
+    
+    # Ensure cache directory exists
+    if isinstance(resolved_cache_dir, str):
+        os.makedirs(resolved_cache_dir, exist_ok=True)
+    
+    return resolved_model_path, resolved_cache_dir
+
+
+def _is_valid_model_directory(path: str) -> bool:
+    """Check if a directory contains a valid model"""
+    required_files = ['config.json']
+    optional_files = ['tokenizer_config.json', 'vocab.txt', 'tokenizer.json']
+    model_files = ['pytorch_model.bin', 'model.safetensors', 'pytorch_model.bin.index.json']
+    
+    path_obj = Path(path)
+    if not path_obj.exists() or not path_obj.is_dir():
+        return False
+    
+    # Check for required files
+    for file in required_files:
+        if not (path_obj / file).exists():
+            return False
+    
+    # Check for at least one model file
+    has_model_file = any((path_obj / file).exists() for file in model_files)
+    if not has_model_file:
+        return False
+    
+    # Check for at least one tokenizer file (for embedding models)
+    has_tokenizer_file = any((path_obj / file).exists() for file in optional_files)
+    if not has_tokenizer_file:
+        logger.warning(f"Model directory {path} missing tokenizer files, but proceeding anyway")
+    
+    return True
+
+
+def download_model_if_needed(
+    model_name_or_path: str,
+    local_path: Optional[str] = None,
+    cache_dir: Optional[str] = None,
+    model_source: str = 'huggingface'
+) -> str:
+    """
+    Download model if it's a remote identifier and local path is specified
+    
+    Args:
+        model_name_or_path: Model identifier or path
+        local_path: Target local path for download
+        cache_dir: Cache directory
+        model_source: 'huggingface' or 'modelscope'
+        
+    Returns:
+        Final model path (local if downloaded, original if already local)
+    """
+    # If it's already a local path, return as is
+    if os.path.exists(model_name_or_path):
+        return model_name_or_path
+    
+    # If no local path specified, return original (will be handled by transformers)
+    if not local_path:
+        return model_name_or_path
+    
+    # Check if local path already exists and is valid
+    if os.path.exists(local_path) and _is_valid_model_directory(local_path):
+        logger.info(f"Model already exists locally: {local_path}")
+        return local_path
+    
+    # Download the model
+    try:
+        logger.info(f"Downloading model {model_name_or_path} to {local_path}")
+        
+        if model_source == 'huggingface':
+            from transformers import AutoModel, AutoTokenizer
+            
+            # Download model and tokenizer
+            model = AutoModel.from_pretrained(
+                model_name_or_path,
+                cache_dir=cache_dir,
+                trust_remote_code=True
+            )
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name_or_path,
+                cache_dir=cache_dir,
+                trust_remote_code=True
+            )
+            
+            # Save to local path
+            os.makedirs(local_path, exist_ok=True)
+            model.save_pretrained(local_path)
+            tokenizer.save_pretrained(local_path)
+            
+            logger.info(f"Model downloaded successfully to {local_path}")
+            return local_path
+            
+        elif model_source == 'modelscope':
+            try:
+                from modelscope import snapshot_download  # type: ignore
+                
+                # Download using ModelScope
+                downloaded_path = snapshot_download(
+                    model_name_or_path,
+                    cache_dir=cache_dir,
+                    local_dir=local_path
+                )
+                
+                logger.info(f"Model downloaded successfully from ModelScope to {downloaded_path}")
+                return downloaded_path
+                
+            except ImportError:
+                logger.warning("ModelScope not available, falling back to HuggingFace")
+                return download_model_if_needed(
+                    model_name_or_path, local_path, cache_dir, 'huggingface'
+                )
+        
+    except Exception as e:
+        logger.error(f"Failed to download model {model_name_or_path}: {e}")
+        logger.info("Returning original model path - transformers will handle caching")
+    
+    # Fallback return
+    return model_name_or_path
+
+
+class ConfigLoader:
+    """Centralized configuration management for BGE fine-tuning
+
+    Parameter Precedence (highest to lowest):
+        1. Command-line arguments (CLI)
+        2. Model-specific config ([m3], [reranker])
+        3. [hardware] section in config.toml
+        4. [training] section in config.toml
+        5. Hardcoded defaults in code
+    """
+
+    _instance = None
+    _config = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(ConfigLoader, cls).__new__(cls)
+        return cls._instance
+
+    def __init__(self):
+        if self._config is None:
+            self._load_config()
+
+    def _find_config_file(self) -> str:
+        """Find config.toml file in project hierarchy"""
+        # Check environment variable FIRST (highest priority)
+        if 'BGE_CONFIG_PATH' in os.environ:
+            config_path = Path(os.environ['BGE_CONFIG_PATH'])
+            if config_path.exists():
+                return str(config_path)
+            else:
+                logger.warning(f"BGE_CONFIG_PATH set to {config_path} but file doesn't exist, falling back to default")
+
+        current_dir = Path(__file__).parent
+
+        # Search up the directory tree for config.toml
+        for _ in range(5):  # Limit search depth
+            config_path = current_dir / "config.toml"
+            if config_path.exists():
+                return str(config_path)
+            current_dir = current_dir.parent
+
+        # Default location
+        return os.path.join(os.getcwd(), "config.toml")
+
+    def _load_config(self):
+        """Load configuration from TOML file"""
+        config_path = self._find_config_file()
+
+        try:
+            if os.path.exists(config_path):
+                with open(config_path, 'r', encoding='utf-8') as f:
+                    loaded = toml.load(f)
+                    if loaded is None:
+                        loaded = {}
+                    self._config = dict(loaded)
+                logger.info(f"Loaded configuration from {config_path}")
+            else:
+                logger.warning(f"Config file not found at {config_path}, using defaults")
+                self._config = self._get_default_config()
+                self._save_default_config(config_path)
+                logger.warning("Default config created. Please check your configuration file.")
+        except Exception as e:
+            logger.error(f"Error loading config: {e}")
+            self._config = self._get_default_config()
+
+        # Validation for unset/placeholder values
+        self._validate_config()
+
+    def _validate_config(self):
+        """Validate config for unset/placeholder values"""
+        api_key = self.get('api', 'api_key', None)
+        if not api_key or api_key == 'your-api-key-here':
+            logger.warning("API key is unset or placeholder. Set it in config.toml or via the BGE_API_KEY environment variable.")
+
+    def _get_default_config(self) -> Dict[str, Any]:
+        """Get default configuration"""
+        return {
+            'api': {
+                'base_url': 'https://api.deepseek.com/v1',
+                'api_key': 'your-api-key-here',
+                'model': 'deepseek-chat',
+                'max_retries': 3,
+                'timeout': 30,
+                'temperature': 0.7
+            },
+            'training': {
+                'default_batch_size': 4,
+                'default_learning_rate': 1e-5,
+                'default_num_epochs': 3,
+                'default_warmup_ratio': 0.1,
+                'default_weight_decay': 0.01,
+                'gradient_accumulation_steps': 4
+            },
+            'model_paths': {
+                'bge_m3': './models/bge-m3',
+                'bge_reranker': './models/bge-reranker-base',
+                'cache_dir': './cache/data'
+            },
+            'data': {
+                'max_query_length': 64,
+                'max_passage_length': 512,
+                'max_seq_length': 512,
+                'validation_split': 0.1,
+                'random_seed': 42
+            },
+            'augmentation': {
+                'enable_query_augmentation': False,
+                'enable_passage_augmentation': False,
+                'augmentation_methods': ['paraphrase', 'back_translation'],
+                'num_augmentations_per_sample': 2,
+                'augmentation_temperature': 0.8
+            },
+            'hard_negative_mining': {
+                'enable_mining': True,
+                'num_hard_negatives': 15,
+                'negative_range_min': 10,
+                'negative_range_max': 100,
+                'margin': 0.1,
+                'index_refresh_interval': 1000,
+                'index_type': 'IVF',
+                'nlist': 100
+            },
+            'evaluation': {
+                'eval_batch_size': 32,
+                'k_values': [1, 3, 5, 10, 20, 50, 100],
+                'metrics': ['recall', 'precision', 'map', 'mrr', 'ndcg'],
+                'save_predictions': True,
+                'predictions_dir': './output/predictions'
+            },
+            'logging': {
+                'log_level': 'INFO',
+                'log_to_file': True,
+                'log_dir': './logs',
+                'tensorboard_dir': './logs/tensorboard',
+                'wandb_project': 'bge-finetuning',
+                'wandb_entity': '',
+                'use_wandb': False
+            },
+            'distributed': {
+                'backend': 'nccl',
+                'init_method': 'env://',
+                'find_unused_parameters': True
+            },
+            'optimization': {
+                'gradient_checkpointing': True,
+                'fp16': True,
+                'bf16': False,
+                'fp16_opt_level': 'O1',
+                'max_grad_norm': 1.0,
+                'adam_epsilon': 1e-8,
+                'adam_beta1': 0.9,
+                'adam_beta2': 0.999
+            },
+            'hardware': {
+                'cuda_visible_devices': '0,1,2,3',
+                'dataloader_num_workers': 4,
+                'dataloader_pin_memory': True,
+                'per_device_train_batch_size': 4,
+                'per_device_eval_batch_size': 8
+            },
+            'checkpoint': {
+                'save_strategy': 'steps',
+                'save_steps': 1000,
+                'save_total_limit': 3,
+                'load_best_model_at_end': True,
+                'metric_for_best_model': 'eval_recall@10',
+                'greater_is_better': True,
+                'resume_from_checkpoint': ''
+            },
+            'export': {
+                'export_format': ['pytorch', 'onnx'],
+                'optimize_for_inference': True,
+                'quantize': False,
+                'quantization_bits': 8
+            }
+        }
+
+    def _save_default_config(self, config_path: str):
+        """Save default configuration to file"""
+        try:
+            os.makedirs(os.path.dirname(config_path), exist_ok=True)
+            with open(config_path, 'w', encoding='utf-8') as f:
+                config_to_save = self._config if isinstance(self._config, dict) else {}
+                toml.dump(config_to_save, f)
+            logger.info(f"Created default config at {config_path}")
+        except Exception as e:
+            logger.error(f"Error saving default config: {e}")
+
+    def get(self, section: str, key: str, default=None):
+        """Get a config value, logging its source (CLI, config, default)."""
+        # CLI overrides are handled outside this loader; here we check config and default
+        value = default
+        source = 'default'
+        if isinstance(self._config, dict) and section in self._config and key in self._config[section]:
+            value = self._config[section][key]
+            source = 'config.toml'
+        # Robust list/tuple conversion
+        if isinstance(value, str) and (',' in value or ';' in value):
+            try:
+                value = [v.strip() for v in value.replace(';', ',').split(',') if v.strip()]
+                source += ' (parsed as list)'
+            except Exception as e:
+                logging.warning(f"Failed to parse list for {section}.{key}: {e}")
+        log_func = logging.info if source != 'default' else logging.warning
+        log_func(f"Config param {section}.{key} = {value} (source: {source})")
+        return value
+
+    def get_section(self, section: str) -> Dict[str, Any]:
+        """Get entire configuration section"""
+        if isinstance(self._config, dict):
+            return self._config.get(section, {})
+        return {}
+
+    def update(self, key: str, value: Any):
+        """Update configuration value. Logs a warning if overriding an existing value."""
+        if self._config is None:
+            self._config = {}
+        
+        keys = key.split('.')
+        config = self._config
+
+        # Navigate to the parent of the final key
+        for k in keys[:-1]:
+            if k not in config:
+                config[k] = {}
+            config = config[k]
+
+        # Log if overriding
+        old_value = config.get(keys[-1], None)
+        if old_value is not None and old_value != value:
+            logger.warning(f"Parameter '{key}' overridden: {old_value} -> {value} (higher-precedence source)")
+        config[keys[-1]] = value
+
+    def reload(self):
+        """Reload configuration from file"""
+        self._config = None
+        self._load_config()
+
+
+# Global instance
+config = ConfigLoader()
+
+
+def get_config() -> ConfigLoader:
+    """Get global configuration instance"""
+    return config
+
+
+def load_config_for_training(
+        config_path: Optional[str] = None,
+        override_params: Optional[Dict[str, Any]] = None
+) -> Dict[str, Any]:
+    """
+    Load configuration specifically for training scripts
+
+    Args:
+        config_path: Optional path to config file
+        override_params: Parameters to override from command line
+
+    Returns:
+        Dictionary with all configuration parameters
+    """
+    global config
+
+    # Reload with specific path if provided
+    if config_path:
+        old_path = os.environ.get('BGE_CONFIG_PATH')
+        os.environ['BGE_CONFIG_PATH'] = config_path
+        config.reload()
+        if old_path:
+            os.environ['BGE_CONFIG_PATH'] = old_path
+        else:
+            del os.environ['BGE_CONFIG_PATH']
+
+    # Get all configuration
+    training_config = {
+        'api': config.get_section('api'),
+        'training': config.get_section('training'),
+        'model_paths': config.get_section('model_paths'),
+        'data': config.get_section('data'),
+        'augmentation': config.get_section('augmentation'),
+        'hard_negative_mining': config.get_section('hard_negative_mining'),
+        'evaluation': config.get_section('evaluation'),
+        'logging': config.get_section('logging'),
+        'distributed': config.get_section('distributed'),
+        'optimization': config.get_section('optimization'),
+        'hardware': config.get_section('hardware'),
+        'checkpoint': config.get_section('checkpoint'),
+        'export': config.get_section('export'),
+        'm3': config.get_section('m3'),
+        'reranker': config.get_section('reranker'),
+    }
+
+    # Apply overrides if provided
+    if override_params:
+        for key, value in override_params.items():
+            config.update(key, value)
+            # Also update the returned dict
+            keys = key.split('.')
+            current = training_config
+            for k in keys[:-1]:
+                if k not in current:
+                    current[k] = {}
+                current = current[k]
+            current[keys[-1]] = value
+
+    return training_config