#!/usr/bin/env python3
"""
Test script to verify the evaluation pipeline works with mock mode.
"""
import os
import sys

# Add src to path for imports
sys.path.append('src')

from evaluator import evaluator
from models_registry import models_registry
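# Assumed interface (not defined in this file): `evaluator.evaluate_model_on_case`
# is expected to return a dict containing at least the keys this script reads
# below: model_name, question, reference_sql, candidate_sql, composite_score,
# correctness_exact, exec_success, result_match_f1, latency_ms, dialect_ok.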

def test_evaluation_pipeline():
    """Test the complete evaluation pipeline with mock mode."""
    print("🧪 Testing Evaluation Pipeline with Mock Mode")
    print("=" * 50)

    # Enable mock mode
    os.environ["MOCK_MODE"] = "true"

    # Test parameters
    dataset_name = "nyc_taxi_small"
    dialect = "presto"
    case_id = "avg_fare_amount"
    model_name = "CodeLlama-7B-Instruct"

    # Load prompt template
    template_path = f"prompts/template_{dialect}.txt"
    with open(template_path, 'r') as f:
        prompt_template = f.read()

    print("Testing evaluation:")
    print(f"  Dataset: {dataset_name}")
    print(f"  Dialect: {dialect}")
    print(f"  Case: {case_id}")
    print(f"  Model: {model_name}")
    print()

    try:
        # Run evaluation
        result = evaluator.evaluate_model_on_case(
            model_name, dataset_name, case_id, dialect, prompt_template
        )

        print("✅ Evaluation completed successfully!")
        print()
        print("Results:")
        print(f"  Model: {result['model_name']}")
        print(f"  Question: {result['question']}")
        print(f"  Reference SQL: {result['reference_sql']}")
        print(f"  Generated SQL: {result['candidate_sql']}")
        print(f"  Composite Score: {result['composite_score']:.4f}")
        print(f"  Correctness: {result['correctness_exact']:.2f}")
        print(f"  Execution Success: {result['exec_success']:.2f}")
        print(f"  Result Match F1: {result['result_match_f1']:.4f}")
        print(f"  Latency: {result['latency_ms']:.1f}ms")
        print(f"  Dialect OK: {result['dialect_ok']:.2f}")

        # Check if we got reasonable results
        if result['composite_score'] > 0:
            print("\n🎉 SUCCESS: Evaluation pipeline is working!")
            return True
        else:
            print("\n❌ ISSUE: All scores are zero")
            return False

    except Exception as e:
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = test_evaluation_pipeline()
    sys.exit(0 if success else 1)
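

# ---------------------------------------------------------------------------
# Reference sketch (hypothetical): a minimal mock-mode stand-in for the
# `evaluate_model_on_case` interface this test assumes. It is NOT the
# project's real `src/evaluator.py`; the class name and SQL/latency values
# are placeholders chosen only to illustrate the expected result shape, and
# nothing in the test above ever invokes it.
# ---------------------------------------------------------------------------
class MockEvaluatorSketch:
    """Illustrative stand-in that returns a well-formed mock result dict."""

    def evaluate_model_on_case(self, model_name, dataset_name, case_id,
                               dialect, prompt_template):
        placeholder_sql = "SELECT 1"  # placeholder; not a real reference query
        return {
            "model_name": model_name,
            "question": f"(mock question for case {case_id} on {dataset_name})",
            "reference_sql": placeholder_sql,
            "candidate_sql": placeholder_sql,
            "composite_score": 1.0,
            "correctness_exact": 1.0,
            "exec_success": 1.0,
            "result_match_f1": 1.0,
            "latency_ms": 0.0,  # mock latency
            "dialect_ok": 1.0,
        }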