Spaces:
Sleeping
Sleeping
| # Metrics Configuration | |
| metrics: | |
| # Scoring weights for composite score calculation (the values below sum to 1.0) | |
| weights: | |
| correctness_exact: 0.40 | |
| exec_success: 0.25 | |
| result_match_f1: 0.15 | |
| dialect_ok: 0.10 | |
| readability: 0.05 | |
| latency: 0.05 | |
| # Metric descriptions | |
| descriptions: | |
| correctness_exact: "Binary score (0/1) for exact result match" | |
| exec_success: "Binary score (0/1) for successful SQL execution" | |
| result_match_f1: "F1 score for partial result matching" | |
| latency: "Response time in milliseconds" | |
| readability: "Score based on SQL structure and formatting" | |
| dialect_ok: "Binary score (0/1) for successful SQL transpilation" | |
| # Thresholds and limits | |
| thresholds: | |
| max_latency_ms: 30000 # 30-second timeout | |
| min_score: 0.0 | |
| max_score: 1.0 | |
| # Display formatting | |
| formatting: | |
| composite_score: "{:.4f}" | |
| correctness_exact: "{:.2f}" | |
| exec_success: "{:.2f}" | |
| result_match_f1: "{:.4f}" | |
| latency_ms: "{:.1f}ms" | |
| dialect_ok: "{:.2f}" | |
| readability: "{:.2f}" | |
| # Mock SQL Generation Patterns | |
| mock_sql: | |
| patterns: | |
| count_queries: | |
| - "how many" | |
| - "count" | |
| average_queries: | |
| - "average" | |
| - "avg" | |
| total_queries: | |
| - "total" | |
| - "amount" | |
| passenger_queries: | |
| - "passenger" | |
| templates: | |
| count_trips: "SELECT COUNT(*) as total_trips FROM trips" | |
| count_generic: "SELECT COUNT(*) FROM trips" | |
| avg_fare: "SELECT AVG(fare_amount) as avg_fare FROM trips" | |
| avg_generic: "SELECT AVG(total_amount) FROM trips" | |
| total_amount: "SELECT SUM(total_amount) as total_collected FROM trips" | |
| passenger_count: "SELECT passenger_count, COUNT(*) as trip_count FROM trips GROUP BY passenger_count" | |
| default: "SELECT * FROM trips LIMIT 10" | |