#!/usr/bin/env python3
"""
BGE Validation Suite Runner

This script runs a complete validation suite for BGE fine-tuned models,
combining multiple validation approaches and generating comprehensive reports.

Usage:
    # Auto-discover models and run full suite
    python scripts/run_validation_suite.py --auto_discover

    # Run with specific models
    python scripts/run_validation_suite.py \\
        --retriever_model ./output/bge-m3-enhanced/final_model \\
        --reranker_model ./output/bge-reranker/final_model

    # Run only specific validation types
    python scripts/run_validation_suite.py \\
        --retriever_model ./output/bge-m3-enhanced/final_model \\
        --validation_types quick comprehensive comparison
"""

import os
import sys
import json
import argparse
import subprocess
import logging
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Any

# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from scripts.validation_utils import ModelDiscovery, TestDataManager, ValidationReportAnalyzer

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ValidationSuiteRunner:
    """Orchestrates comprehensive validation of BGE models"""

    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.results_dir = Path(config.get('output_dir', './validation_suite_results'))
        self.results_dir.mkdir(parents=True, exist_ok=True)

        # Initialize components
        self.model_discovery = ModelDiscovery(config.get('workspace_root', '.'))
        self.data_manager = TestDataManager(config.get('data_dir', 'data/datasets'))

        # Suite results
        self.suite_results = {
            'start_time': datetime.now().isoformat(),
            'config': config,
            'validation_results': {},
            'summary': {},
            'issues': [],
            'recommendations': []
        }

    def run_suite(self) -> Dict[str, Any]:
        """Run the complete validation suite"""
        logger.info("šŸš€ Starting BGE Validation Suite")
        logger.info("=" * 60)

        try:
            # Phase 1: Discovery and Setup
            self._phase_discovery()

            # Phase 2: Quick Validation
            if 'quick' in self.config.get('validation_types', ['quick']):
                self._phase_quick_validation()

            # Phase 3: Comprehensive Validation
            if 'comprehensive' in self.config.get('validation_types', ['comprehensive']):
                self._phase_comprehensive_validation()

            # Phase 4: Comparison Benchmarks
            if 'comparison' in self.config.get('validation_types', ['comparison']):
                self._phase_comparison_benchmarks()

            # Phase 5: Analysis and Reporting
            self._phase_analysis()

            # Phase 6: Generate Final Report
            self._generate_final_report()

            self.suite_results['end_time'] = datetime.now().isoformat()
            self.suite_results['status'] = 'completed'
            logger.info("āœ… Validation Suite Completed Successfully!")

        except Exception as e:
            logger.error(f"āŒ Validation Suite Failed: {e}")
            self.suite_results['status'] = 'failed'
            self.suite_results['error'] = str(e)
            raise

        return self.suite_results

    def _phase_discovery(self):
        """Phase 1: Discover models and datasets"""
        logger.info("šŸ“” Phase 1: Discovery and Setup")

        # Auto-discover models if requested
        if self.config.get('auto_discover', False):
            discovered_models = self.model_discovery.find_trained_models()

            # Set models from discovery if not provided explicitly
            if not self.config.get('retriever_model'):
                if discovered_models['retrievers']:
                    model_name = list(discovered_models['retrievers'].keys())[0]
                    self.config['retriever_model'] = discovered_models['retrievers'][model_name]['path']
                    logger.info(f"   šŸ“Š Auto-discovered retriever: {model_name}")

            if not self.config.get('reranker_model'):
                if discovered_models['rerankers']:
                    model_name = list(discovered_models['rerankers'].keys())[0]
                    self.config['reranker_model'] = discovered_models['rerankers'][model_name]['path']
                    logger.info(f"   šŸ† Auto-discovered reranker: {model_name}")

            self.suite_results['discovered_models'] = discovered_models

        # Discover test datasets
        discovered_datasets = self.data_manager.find_test_datasets()
        self.suite_results['available_datasets'] = discovered_datasets

        # Validate that the configured models exist on disk
        issues = []
        if self.config.get('retriever_model'):
            if not os.path.exists(self.config['retriever_model']):
                issues.append(f"Retriever model not found: {self.config['retriever_model']}")

        if self.config.get('reranker_model'):
            if not os.path.exists(self.config['reranker_model']):
                issues.append(f"Reranker model not found: {self.config['reranker_model']}")

        if issues:
            for issue in issues:
                logger.error(f"   āŒ {issue}")
            self.suite_results['issues'].extend(issues)
            return

        logger.info("   āœ… Discovery phase completed")

    def _phase_quick_validation(self):
        """Phase 2: Quick validation tests"""
        logger.info("⚔ Phase 2: Quick Validation")

        try:
            # Run quick validation script
            cmd = ['python', 'scripts/quick_validation.py']

            if self.config.get('retriever_model'):
                cmd.extend(['--retriever_model', self.config['retriever_model']])
            if self.config.get('reranker_model'):
                cmd.extend(['--reranker_model', self.config['reranker_model']])
            if self.config.get('retriever_model') and self.config.get('reranker_model'):
                cmd.append('--test_pipeline')

            # Save results to file
            quick_results_file = self.results_dir / 'quick_validation_results.json'
            cmd.extend(['--output_file', str(quick_results_file)])

            logger.info(f"   Running: {' '.join(cmd)}")
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                logger.info("   āœ… Quick validation completed")

                # Load results
                if quick_results_file.exists():
                    with open(quick_results_file, 'r') as f:
                        quick_results = json.load(f)
                    self.suite_results['validation_results']['quick'] = quick_results
            else:
                logger.error(f"   āŒ Quick validation failed: {result.stderr}")
                self.suite_results['issues'].append(f"Quick validation failed: {result.stderr}")

        except Exception as e:
            logger.error(f"   āŒ Error in quick validation: {e}")
            self.suite_results['issues'].append(f"Quick validation error: {e}")

    def _phase_comprehensive_validation(self):
        """Phase 3: Comprehensive validation"""
        logger.info("šŸ”¬ Phase 3: Comprehensive Validation")

        try:
            # Run comprehensive validation script
            cmd = ['python', 'scripts/comprehensive_validation.py']

            if self.config.get('retriever_model'):
                cmd.extend(['--retriever_finetuned', self.config['retriever_model']])
            if self.config.get('reranker_model'):
                cmd.extend(['--reranker_finetuned', self.config['reranker_model']])

            # Use specific datasets if provided
            if self.config.get('test_datasets'):
                cmd.extend(['--test_datasets'] + self.config['test_datasets'])

            # Set output directory
            comprehensive_dir = self.results_dir / 'comprehensive'
            cmd.extend(['--output_dir', str(comprehensive_dir)])

            logger.info(f"   Running: {' '.join(cmd)}")
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                logger.info("   āœ… Comprehensive validation completed")

                # Load results
                results_file = comprehensive_dir / 'validation_results.json'
                if results_file.exists():
                    with open(results_file, 'r') as f:
                        comp_results = json.load(f)
                    self.suite_results['validation_results']['comprehensive'] = comp_results
            else:
                logger.error(f"   āŒ Comprehensive validation failed: {result.stderr}")
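                # Record the failure so it surfaces in the suite issues and the final report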
                self.suite_results['issues'].append(f"Comprehensive validation failed: {result.stderr}")

        except Exception as e:
            logger.error(f"   āŒ Error in comprehensive validation: {e}")
            self.suite_results['issues'].append(f"Comprehensive validation error: {e}")

    def _phase_comparison_benchmarks(self):
        """Phase 4: Run comparison benchmarks"""
        logger.info("šŸ“Š Phase 4: Comparison Benchmarks")

        # Find suitable datasets for comparison
        datasets = self.data_manager.find_test_datasets()

        try:
            # Run retriever comparison if model exists
            if self.config.get('retriever_model'):
                self._run_retriever_comparison(datasets)

            # Run reranker comparison if model exists
            if self.config.get('reranker_model'):
                self._run_reranker_comparison(datasets)

        except Exception as e:
            logger.error(f"   āŒ Error in comparison benchmarks: {e}")
            self.suite_results['issues'].append(f"Comparison benchmarks error: {e}")

    def _run_retriever_comparison(self, datasets: Dict[str, List[str]]):
        """Run retriever comparison benchmarks"""
        logger.info("   šŸ” Running retriever comparison...")

        # Find suitable datasets for retriever testing
        test_datasets = datasets.get('retriever_datasets', []) + datasets.get('general_datasets', [])

        if not test_datasets:
            logger.warning("   āš ļø No suitable datasets found for retriever comparison")
            return

        for dataset_path in test_datasets[:2]:  # Limit to 2 datasets for speed
            if not os.path.exists(dataset_path):
                continue

            try:
                cmd = [
                    'python', 'scripts/compare_retriever.py',
                    '--finetuned_model_path', self.config['retriever_model'],
                    '--baseline_model_path', self.config.get('retriever_baseline', 'BAAI/bge-m3'),
                    '--data_path', dataset_path,
                    '--batch_size', str(self.config.get('batch_size', 16)),
                    '--max_samples', str(self.config.get('max_samples', 500)),
                    '--output', str(self.results_dir / f'retriever_comparison_{Path(dataset_path).stem}.txt')
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)

                if result.returncode == 0:
                    logger.info(f"   āœ… Retriever comparison on {Path(dataset_path).name}")
                else:
                    logger.warning(f"   āš ļø Retriever comparison failed on {Path(dataset_path).name}")

            except Exception as e:
                logger.warning(f"   āš ļø Error comparing retriever on {dataset_path}: {e}")

    def _run_reranker_comparison(self, datasets: Dict[str, List[str]]):
        """Run reranker comparison benchmarks"""
        logger.info("   šŸ† Running reranker comparison...")

        # Find suitable datasets for reranker testing
        test_datasets = datasets.get('reranker_datasets', []) + datasets.get('general_datasets', [])

        if not test_datasets:
            logger.warning("   āš ļø No suitable datasets found for reranker comparison")
            return

        for dataset_path in test_datasets[:2]:  # Limit to 2 datasets for speed
            if not os.path.exists(dataset_path):
                continue

            try:
                cmd = [
                    'python', 'scripts/compare_reranker.py',
                    '--finetuned_model_path', self.config['reranker_model'],
                    '--baseline_model_path', self.config.get('reranker_baseline', 'BAAI/bge-reranker-base'),
                    '--data_path', dataset_path,
                    '--batch_size', str(self.config.get('batch_size', 16)),
                    '--max_samples', str(self.config.get('max_samples', 500)),
                    '--output', str(self.results_dir / f'reranker_comparison_{Path(dataset_path).stem}.txt')
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)

                if result.returncode == 0:
                    logger.info(f"   āœ… Reranker comparison on {Path(dataset_path).name}")
                else:
                    logger.warning(f"   āš ļø Reranker comparison failed on {Path(dataset_path).name}")

            except Exception as e:
                logger.warning(f"   āš ļø Error comparing reranker on {dataset_path}: {e}")

    def _phase_analysis(self):
        """Phase 5: Analyze all results"""
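        # Aggregates comprehensive-validation output (if present) into a single analysis via ValidationReportAnalyzer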
        logger.info("šŸ“ˆ Phase 5: Results Analysis")

        try:
            # Analyze comprehensive results if available
            comprehensive_dir = self.results_dir / 'comprehensive'
            if comprehensive_dir.exists():
                analyzer = ValidationReportAnalyzer(str(comprehensive_dir))
                analysis = analyzer.analyze_results()
                self.suite_results['analysis'] = analysis
                logger.info("   āœ… Results analysis completed")
            else:
                logger.warning("   āš ļø No comprehensive results found for analysis")

        except Exception as e:
            logger.error(f"   āŒ Error in results analysis: {e}")
            self.suite_results['issues'].append(f"Analysis error: {e}")

    def _generate_final_report(self):
        """Generate final comprehensive report"""
        logger.info("šŸ“ Phase 6: Generating Final Report")

        try:
            # Create summary
            self._create_suite_summary()

            # Save suite results
            suite_results_file = self.results_dir / 'validation_suite_results.json'
            with open(suite_results_file, 'w', encoding='utf-8') as f:
                json.dump(self.suite_results, f, indent=2, ensure_ascii=False)

            # Generate HTML report
            self._generate_html_report()

            logger.info(f"   šŸ“Š Reports saved to: {self.results_dir}")

        except Exception as e:
            logger.error(f"   āŒ Error generating final report: {e}")
            self.suite_results['issues'].append(f"Report generation error: {e}")

    def _create_suite_summary(self):
        """Create suite summary"""
        summary = {
            'total_validations': 0,
            'successful_validations': 0,
            'failed_validations': 0,
            'overall_verdict': 'unknown',
            'key_findings': [],
            'critical_issues': []
        }

        # Count validations
        for validation_type, results in self.suite_results.get('validation_results', {}).items():
            summary['total_validations'] += 1
            if results:
                summary['successful_validations'] += 1
            else:
                summary['failed_validations'] += 1

        # Determine overall verdict
        if summary['successful_validations'] > 0 and summary['failed_validations'] == 0:
            if self.suite_results.get('analysis', {}).get('overall_status') == 'excellent':
                summary['overall_verdict'] = 'excellent'
            elif self.suite_results.get('analysis', {}).get('overall_status') in ['good', 'mixed']:
                summary['overall_verdict'] = 'good'
            else:
                summary['overall_verdict'] = 'fair'
        elif summary['successful_validations'] > summary['failed_validations']:
            summary['overall_verdict'] = 'partial'
        else:
            summary['overall_verdict'] = 'poor'

        # Extract key findings from analysis
        if 'analysis' in self.suite_results:
            analysis = self.suite_results['analysis']
            if 'recommendations' in analysis:
                summary['key_findings'] = analysis['recommendations'][:5]  # Top 5 recommendations

        # Extract critical issues
        summary['critical_issues'] = self.suite_results.get('issues', [])

        self.suite_results['summary'] = summary

    def _generate_html_report(self):
        """Generate HTML summary report"""
        html_file = self.results_dir / 'validation_suite_report.html'
        summary = self.suite_results.get('summary', {})

        verdict_colors = {
            'excellent': '#28a745',
            'good': '#17a2b8',
            'fair': '#ffc107',
            'partial': '#fd7e14',
            'poor': '#dc3545',
            'unknown': '#6c757d'
        }
        verdict_color = verdict_colors.get(summary.get('overall_verdict', 'unknown'), '#6c757d')

        html_content = f"""<!DOCTYPE html>
<html>
<head><meta charset="utf-8"><title>BGE Validation Suite Report</title></head>
<body>

    <div class="header">
        <h1>šŸš€ BGE Validation Suite Report</h1>
        <p>Comprehensive Model Performance Analysis</p>
        <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
    </div>

    <div class="summary">
        <h2 style="color: {verdict_color}">Overall Verdict: {summary.get('overall_verdict', 'UNKNOWN').upper()}</h2>
        <div class="stat">{summary.get('total_validations', 0)} Total Validations</div>
        <div class="stat">{summary.get('successful_validations', 0)} Successful</div>
        <div class="stat">{summary.get('failed_validations', 0)} Failed</div>
    </div>
"""

        # Add key findings
        if summary.get('key_findings'):
            html_content += """
    <div class="section">
        <h2>šŸ” Key Findings</h2>
        <ul>
"""
            for finding in summary['key_findings']:
                html_content += f"            <li>{finding}</li>\n"
            html_content += "        </ul>\n    </div>\n"

        # Add critical issues if any
        if summary.get('critical_issues'):
            html_content += """
    <div class="section">
        <h2>āš ļø Critical Issues</h2>
        <ul>
"""
            for issue in summary['critical_issues']:
                html_content += f"            <li>{issue}</li>\n"
            html_content += "        </ul>\n    </div>\n"

        # Add validation timeline
        html_content += f"""
    <div class="section">
        <h2>ā±ļø Validation Timeline</h2>
        <p>Suite Started: {self.suite_results.get('start_time', 'Unknown')}</p>
"""
        for validation_type in self.suite_results.get('validation_results', {}).keys():
            html_content += f"""
        <p>{validation_type.title()} Validation: Completed</p>
"""
        html_content += f"""
        <p>Suite Completed: {self.suite_results.get('end_time', 'In Progress')}</p>
    </div>

    <div class="section">
        <h2>šŸ“ Detailed Results</h2>
        <p>For detailed validation results, check the following files in {self.results_dir}:</p>
        <ul>
            <li>validation_suite_results.json (full machine-readable suite results)</li>
            <li>quick_validation_results.json (quick validation output)</li>
            <li>comprehensive/ (comprehensive validation results and analysis)</li>
            <li>retriever_comparison_*.txt / reranker_comparison_*.txt (benchmark comparisons)</li>
        </ul>
    </div>
</body>
</html>
""" with open(html_file, 'w', encoding='utf-8') as f: f.write(html_content) logger.info(f" šŸ“Š HTML report: {html_file}") def parse_args(): """Parse command line arguments""" parser = argparse.ArgumentParser( description="Run comprehensive BGE validation suite", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Auto-discover models and run full suite python scripts/run_validation_suite.py --auto-discover # Run with specific models python scripts/run_validation_suite.py \\ --retriever_model ./output/bge-m3-enhanced/final_model \\ --reranker_model ./output/bge-reranker/final_model # Run only specific validation types python scripts/run_validation_suite.py \\ --retriever_model ./output/bge-m3-enhanced/final_model \\ --validation_types quick comprehensive """ ) # Model paths parser.add_argument("--retriever_model", type=str, default=None, help="Path to fine-tuned retriever model") parser.add_argument("--reranker_model", type=str, default=None, help="Path to fine-tuned reranker model") parser.add_argument("--auto_discover", action="store_true", help="Auto-discover trained models in workspace") # Baseline models parser.add_argument("--retriever_baseline", type=str, default="BAAI/bge-m3", help="Baseline retriever model") parser.add_argument("--reranker_baseline", type=str, default="BAAI/bge-reranker-base", help="Baseline reranker model") # Validation configuration parser.add_argument("--validation_types", type=str, nargs="+", default=["quick", "comprehensive", "comparison"], choices=["quick", "comprehensive", "comparison"], help="Types of validation to run") parser.add_argument("--test_datasets", type=str, nargs="+", default=None, help="Specific test datasets to use") # Performance settings parser.add_argument("--batch_size", type=int, default=16, help="Batch size for evaluation") parser.add_argument("--max_samples", type=int, default=1000, help="Maximum samples per dataset for speed") # Directories parser.add_argument("--workspace_root", type=str, default=".", help="Workspace root directory") parser.add_argument("--data_dir", type=str, default="data/datasets", help="Test datasets directory") parser.add_argument("--output_dir", type=str, default="./validation_suite_results", help="Output directory for all results") return parser.parse_args() def main(): """Main function""" args = parse_args() # Validate input if not args.auto_discover and not args.retriever_model and not args.reranker_model: print("āŒ Error: Must specify models or use --auto_discover") print(" Use --retriever_model, --reranker_model, or --auto_discover") return 1 # Create configuration config = { 'retriever_model': args.retriever_model, 'reranker_model': args.reranker_model, 'auto_discover': args.auto_discover, 'retriever_baseline': args.retriever_baseline, 'reranker_baseline': args.reranker_baseline, 'validation_types': args.validation_types, 'test_datasets': args.test_datasets, 'batch_size': args.batch_size, 'max_samples': args.max_samples, 'workspace_root': args.workspace_root, 'data_dir': args.data_dir, 'output_dir': args.output_dir } # Run validation suite try: runner = ValidationSuiteRunner(config) results = runner.run_suite() # Print final summary summary = results.get('summary', {}) verdict = summary.get('overall_verdict', 'unknown') print("\n" + "=" * 80) print("šŸŽÆ VALIDATION SUITE SUMMARY") print("=" * 80) verdict_emojis = { 'excellent': '🌟', 'good': 'āœ…', 'fair': 'šŸ‘Œ', 'partial': 'āš ļø', 'poor': 'āŒ', 'unknown': 'ā“' } print(f"{verdict_emojis.get(verdict, 'ā“')} Overall Verdict: 
{verdict.upper()}") print(f"šŸ“Š Validations: {summary.get('successful_validations', 0)}/{summary.get('total_validations', 0)} successful") if summary.get('key_findings'): print(f"\nšŸ” Key Findings:") for i, finding in enumerate(summary['key_findings'][:3], 1): print(f" {i}. {finding}") if summary.get('critical_issues'): print(f"\nāš ļø Critical Issues:") for issue in summary['critical_issues'][:3]: print(f" • {issue}") print(f"\nšŸ“ Detailed results: {config['output_dir']}") print("🌐 Open validation_suite_report.html in your browser for full report") return 0 if verdict in ['excellent', 'good'] else 1 except KeyboardInterrupt: print("\nāŒ Validation suite interrupted by user") return 1 except Exception as e: print(f"\nāŒ Validation suite failed: {e}") return 1 if __name__ == "__main__": exit(main())
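
# Programmatic use (illustrative sketch, not part of the CLI): the suite can also be
# driven from Python by building the config dict directly; keys mirror the argparse
# options defined above, and the model path below is a placeholder.
#
#     runner = ValidationSuiteRunner({
#         'retriever_model': './output/bge-m3-enhanced/final_model',
#         'validation_types': ['quick'],
#         'output_dir': './validation_suite_results',
#     })
#     results = runner.run_suite()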