#!/usr/bin/env python3
"""
Test script to verify the evaluation pipeline works with mock mode.
"""

import os
import sys

# Add src to path for imports
sys.path.append('src')

from evaluator import evaluator
from models_registry import models_registry


def test_evaluation_pipeline():
    """Test the complete evaluation pipeline with mock mode."""
    print("🧪 Testing Evaluation Pipeline with Mock Mode")
    print("=" * 50)

    # Enable mock mode
    os.environ["MOCK_MODE"] = "true"

    # Test parameters
    dataset_name = "nyc_taxi_small"
    dialect = "presto"
    case_id = "avg_fare_amount"
    model_name = "CodeLlama-7B-Instruct"

    # Load prompt template
    template_path = f"prompts/template_{dialect}.txt"
    with open(template_path, 'r') as f:
        prompt_template = f.read()

    print("Testing evaluation:")
    print(f"  Dataset: {dataset_name}")
    print(f"  Dialect: {dialect}")
    print(f"  Case: {case_id}")
    print(f"  Model: {model_name}")
    print()

    try:
        # Run evaluation
        result = evaluator.evaluate_model_on_case(
            model_name, dataset_name, case_id, dialect, prompt_template
        )

        print("✅ Evaluation completed successfully!")
        print()
        print("Results:")
        print(f"  Model: {result['model_name']}")
        print(f"  Question: {result['question']}")
        print(f"  Reference SQL: {result['reference_sql']}")
        print(f"  Generated SQL: {result['candidate_sql']}")
        print(f"  Composite Score: {result['composite_score']:.4f}")
        print(f"  Correctness: {result['correctness_exact']:.2f}")
        print(f"  Execution Success: {result['exec_success']:.2f}")
        print(f"  Result Match F1: {result['result_match_f1']:.4f}")
        print(f"  Latency: {result['latency_ms']:.1f}ms")
        print(f"  Dialect OK: {result['dialect_ok']:.2f}")

        # Check if we got reasonable results
        if result['composite_score'] > 0:
            print("\n🎉 SUCCESS: Evaluation pipeline is working!")
            return True
        else:
            print("\n❌ ISSUE: All scores are zero")
            return False

    except Exception as e:
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = test_evaluation_pipeline()
    sys.exit(0 if success else 1)