#!/usr/bin/env python3
"""
Test script to verify the evaluation pipeline works with mock mode.
"""
import os
import sys

# Add src to path for imports
sys.path.append('src')

from evaluator import evaluator
from models_registry import models_registry
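# Note: `evaluator` appears to be a module-level instance exported by
# src/evaluator.py; `models_registry` is imported here but not referenced
# directly in this script.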


def test_evaluation_pipeline():
    """Test the complete evaluation pipeline with mock mode."""
    print("🧪 Testing Evaluation Pipeline with Mock Mode")
    print("=" * 50)

    # Enable mock mode
    os.environ["MOCK_MODE"] = "true"
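    # (MOCK_MODE is assumed to make the evaluator return canned responses
    # instead of calling a real model backend.)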

    # Test parameters
    dataset_name = "nyc_taxi_small"
    dialect = "presto"
    case_id = "avg_fare_amount"
    model_name = "CodeLlama-7B-Instruct"

    # Load prompt template
    template_path = f"prompts/template_{dialect}.txt"
    with open(template_path, 'r') as f:
        prompt_template = f.read()

    print("Testing evaluation:")
    print(f" Dataset: {dataset_name}")
    print(f" Dialect: {dialect}")
    print(f" Case: {case_id}")
    print(f" Model: {model_name}")
    print()
    try:
        # Run evaluation
        result = evaluator.evaluate_model_on_case(
            model_name, dataset_name, case_id, dialect, prompt_template
        )
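        # evaluate_model_on_case is expected to return a dict of metrics;
        # the keys read below mirror what the evaluator reports.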
print("✅ Evaluation completed successfully!")
print()
print("Results:")
print(f" Model: {result['model_name']}")
print(f" Question: {result['question']}")
print(f" Reference SQL: {result['reference_sql']}")
print(f" Generated SQL: {result['candidate_sql']}")
print(f" Composite Score: {result['composite_score']:.4f}")
print(f" Correctness: {result['correctness_exact']:.2f}")
print(f" Execution Success: {result['exec_success']:.2f}")
print(f" Result Match F1: {result['result_match_f1']:.4f}")
print(f" Latency: {result['latency_ms']:.1f}ms")
print(f" Dialect OK: {result['dialect_ok']:.2f}")

        # Check if we got reasonable results
        if result['composite_score'] > 0:
            print("\n🎉 SUCCESS: Evaluation pipeline is working!")
            return True
        else:
            print("\n❌ ISSUE: Composite score is zero")
            return False
    except Exception as e:
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = test_evaluation_pipeline()
    sys.exit(0 if success else 1)