#!/usr/bin/env python3
"""
Test script to verify the evaluation pipeline works with mock mode.
"""
import os
import sys

# Add src to path for imports
sys.path.append('src')

from evaluator import evaluator
from models_registry import models_registry
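# Note: `evaluator` appears to be a module-level instance exported by
# src/evaluator.py; `models_registry` is imported here but not referenced
# directly in this script.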


def test_evaluation_pipeline():
    """Test the complete evaluation pipeline with mock mode."""
    print("🧪 Testing Evaluation Pipeline with Mock Mode")
    print("=" * 50)

    # Enable mock mode
    os.environ["MOCK_MODE"] = "true"
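    # (MOCK_MODE is assumed to make the evaluator return canned responses
    # instead of calling a real model backend.)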

    # Test parameters
    dataset_name = "nyc_taxi_small"
    dialect = "presto"
    case_id = "avg_fare_amount"
    model_name = "CodeLlama-7B-Instruct"

    # Load prompt template
    template_path = f"prompts/template_{dialect}.txt"
    with open(template_path, 'r') as f:
        prompt_template = f.read()

    print("Testing evaluation:")
    print(f" Dataset: {dataset_name}")
    print(f" Dialect: {dialect}")
    print(f" Case: {case_id}")
    print(f" Model: {model_name}")
    print()
    try:
        # Run evaluation
        result = evaluator.evaluate_model_on_case(
            model_name, dataset_name, case_id, dialect, prompt_template
        )
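        # evaluate_model_on_case is expected to return a dict of metrics;
        # the keys read below mirror what the evaluator reports.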
print("✅ Evaluation completed successfully!")
print()
print("Results:")
print(f" Model: {result['model_name']}")
print(f" Question: {result['question']}")
print(f" Reference SQL: {result['reference_sql']}")
print(f" Generated SQL: {result['candidate_sql']}")
print(f" Composite Score: {result['composite_score']:.4f}")
print(f" Correctness: {result['correctness_exact']:.2f}")
print(f" Execution Success: {result['exec_success']:.2f}")
print(f" Result Match F1: {result['result_match_f1']:.4f}")
print(f" Latency: {result['latency_ms']:.1f}ms")
print(f" Dialect OK: {result['dialect_ok']:.2f}")

        # Check if we got reasonable results
        if result['composite_score'] > 0:
            print("\n🎉 SUCCESS: Evaluation pipeline is working!")
            return True
        else:
            print("\n❌ ISSUE: Composite score is zero")
            return False
    except Exception as e:
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = test_evaluation_pipeline()
    sys.exit(0 if success else 1)