NIRAJz committed on
Commit 6d55fec · verified · 1 Parent(s): 285e6d0

Upload 23 files

agents/__pycache__/evaluation_agent.cpython-312.pyc ADDED
Binary file (6.69 kB)
 
agents/__pycache__/graph_builder.cpython-312.pyc ADDED
Binary file (5.98 kB)
 
agents/__pycache__/tools.cpython-312.pyc ADDED
Binary file (1.37 kB)
 
agents/evaluation_agent.py ADDED
@@ -0,0 +1,109 @@
+ from typing import List, Dict, Any
+ import asyncio
+ from concurrent.futures import ThreadPoolExecutor
+ from tqdm import tqdm
+
+ from schemas.data_models import EvaluationRequest, EvaluationSummary, APIProvider, MetricType
+ from .graph_builder import EvaluationGraphBuilder
+ from config import settings
+
+ class EvaluationAgent:
+     def __init__(self):
+         self.graph_builder = None
+
+     async def evaluate_async(self, request: EvaluationRequest) -> EvaluationSummary:
+         """Evaluate questions asynchronously using LangGraph"""
+         start_time = asyncio.get_event_loop().time()
+
+         if len(request.questions) != len(request.ground_truths):
+             raise ValueError("Questions and ground truths must have same length")
+
+         if request.model_responses and len(request.questions) != len(request.model_responses):
+             raise ValueError("Questions and model responses must have same length")
+
+         # Initialize graph builder with API provider
+         self.graph_builder = EvaluationGraphBuilder(
+             model_name=request.judge_model,
+             api_provider=request.api_provider.value
+         )
+
+         # Build evaluation graph
+         graph = self.graph_builder.build_graph()
+
+         # Process evaluations
+         results = []
+         with ThreadPoolExecutor(max_workers=request.max_concurrent) as executor:
+             # Keep (future, state) pairs so a failure can be attributed to the right item
+             futures = []
+
+             for i in range(len(request.questions)):
+                 state = {
+                     "question": request.questions[i],
+                     "ground_truth": request.ground_truths[i],
+                     "model_response": request.model_responses[i] if request.model_responses else "",
+                     "metrics": [m.value for m in request.metrics]
+                 }
+
+                 # Add context if available and if context metrics are requested
+                 context_metrics = ["context_precision", "context_recall"]
+                 if any(m in context_metrics for m in [metric.value for metric in request.metrics]) and hasattr(request, 'contexts') and request.contexts:
+                     state["context"] = request.contexts[i] if i < len(request.contexts) else "No context provided."
+
+                 future = executor.submit(
+                     self._run_evaluation,
+                     graph,
+                     state
+                 )
+                 futures.append((future, state))
+
+             # Process with progress bar
+             for future, state in tqdm(futures, desc="Evaluating responses"):
+                 try:
+                     result = future.result()
+                     results.append(result["final_result"])
+                 except Exception as e:
+                     print(f"Evaluation failed: {e}")
+                     # Add a failed result with default values for this item's state
+                     failed_result = {
+                         "question": state["question"],
+                         "ground_truth": state["ground_truth"],
+                         "model_response": state["model_response"],
+                         "metrics": {m.value: 0 for m in request.metrics},
+                         "explanations": {m.value: f"Evaluation failed: {str(e)}" for m in request.metrics},
+                         "processing_time": 0,
+                         "overall_score": 0
+                     }
+                     results.append(failed_result)
+
+         # Calculate summary
+         avg_scores = self._calculate_average_scores(results, request.metrics)
+         overall_score = self._calculate_overall_score(results)
+
+         return EvaluationSummary(
+             total_questions=len(request.questions),
+             average_scores=avg_scores,
+             individual_results=results,
+             total_processing_time=asyncio.get_event_loop().time() - start_time,
+             model_used=request.judge_model,
+             api_provider=request.api_provider.value,
+             overall_score=overall_score
+         )
+
+     def _run_evaluation(self, graph, state):
+         """Run evaluation synchronously (for ThreadPoolExecutor)"""
+         return graph.invoke(state)
+
+     def _calculate_average_scores(self, results: List[Any], metrics: List[MetricType]) -> Dict[MetricType, float]:
+         """Calculate average scores across all results"""
+         avg_scores = {}
+         for metric in metrics:
+             scores = [result.metrics.get(metric.value, 0) for result in results]
+             avg_scores[metric] = sum(scores) / len(scores) if scores else 0
+         return avg_scores
+
+     def _calculate_overall_score(self, results: List[Any]) -> float:
+         """Calculate overall score across all results"""
+         if not results:
+             return 0
+
+         overall_scores = [result.overall_score for result in results if hasattr(result, 'overall_score')]
+         return sum(overall_scores) / len(overall_scores) if overall_scores else 0
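
For reference, a minimal driver for this agent outside the Streamlit UI might look like the sketch below. It assumes EvaluationRequest accepts the same fields that app.py's build_request_object passes, and that a GROQ_API_KEY is configured; the model name shown is just one of the entries in config.AVAILABLE_GROQ_MODELS.

    # Sketch only: field names mirror build_request_object in app.py.
    import asyncio
    from agents.evaluation_agent import EvaluationAgent
    from schemas.data_models import EvaluationRequest, MetricType, APIProvider

    request = EvaluationRequest(
        questions=["What is the capital of France?"],
        ground_truths=["Paris"],
        model_responses=["Paris is the capital of France."],
        metrics=[MetricType("accuracy"), MetricType("relevance")],
        api_provider=APIProvider.GROQ,
        judge_model="llama-3.1-8b-instant",
        max_concurrent=2,
    )

    summary = asyncio.run(EvaluationAgent().evaluate_async(request))
    print(summary.overall_score, summary.average_scores)
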
agents/graph_builder.py ADDED
@@ -0,0 +1,150 @@
+ from langgraph.graph import StateGraph, END
+ from typing import Dict, Any, List
+ from typing_extensions import TypedDict
+ import asyncio
+
+ from schemas.data_models import EvaluationResult
+ from chains.evaluation_chains import EvaluationChains
+ from .tools import evaluate_response
+ from config import settings
+
+ class EvaluationState(TypedDict):
+     question: str
+     ground_truth: str
+     model_response: str
+     context: str
+     metrics: List[str]
+     results: Dict[str, Any]
+     current_metric: str
+     final_result: EvaluationResult
+
+ class EvaluationGraphBuilder:
+     def __init__(self, model_name: str = None, api_provider: str = None):
+         self.model_name = model_name
+         self.api_provider = api_provider or settings.DEFAULT_API_PROVIDER
+         self.evaluation_chains = EvaluationChains(model_name, self.api_provider)
+
+     def build_graph(self):
+         """Build LangGraph workflow for evaluation"""
+         workflow = StateGraph(EvaluationState)
+
+         # Add nodes
+         workflow.add_node("initialize", self._initialize_node)
+         workflow.add_node("evaluate_metric", self._evaluate_metric_node)
+         workflow.add_node("aggregate_results", self._aggregate_results_node)
+
+         # Define edges
+         workflow.set_entry_point("initialize")
+         workflow.add_edge("initialize", "evaluate_metric")
+         workflow.add_conditional_edges(
+             "evaluate_metric",
+             self._should_continue,
+             {
+                 "continue": "evaluate_metric",
+                 "done": "aggregate_results"
+             }
+         )
+         workflow.add_edge("aggregate_results", END)
+
+         return workflow.compile()
+
+     def _initialize_node(self, state: EvaluationState) -> EvaluationState:
+         """Initialize evaluation state"""
+         return {
+             **state,
+             "results": {},
+             "current_metric": state["metrics"][0] if state["metrics"] else ""
+         }
+
+     def _evaluate_metric_node(self, state: EvaluationState) -> EvaluationState:
+         """Evaluate a single metric"""
+         metric = state["current_metric"]
+         chain = self.evaluation_chains.create_evaluation_chain(metric)
+
+         # Prepare tool input
+         tool_input = {
+             "question": state["question"],
+             "ground_truth": state["ground_truth"],
+             "response": state["model_response"],
+             "metric": metric,
+             "chain": chain
+         }
+
+         # Add context for context-based metrics (even if empty)
+         if metric in ["context_precision", "context_recall"]:
+             tool_input["context"] = state.get("context", "No context provided.")
+
+         # Fix: Use the tool correctly with proper arguments
+         result = evaluate_response.invoke(tool_input)
+
+         # Update results
+         results = state.get("results", {})
+         results[metric] = result
+
+         # Move to next metric
+         current_index = state["metrics"].index(metric)
+         next_index = current_index + 1
+
+         return {
+             **state,
+             "results": results,
+             "current_metric": state["metrics"][next_index] if next_index < len(state["metrics"]) else None
+         }
+
+     def _should_continue(self, state: EvaluationState) -> str:
+         """Determine if we should continue evaluating metrics"""
+         if state["current_metric"] is None:
+             return "done"
+         return "continue"
+
+     def _aggregate_results_node(self, state: EvaluationState) -> EvaluationState:
+         """Aggregate results into final format"""
+         metrics_scores = {}
+         explanations = {}
+         total_time = 0
+
+         for metric, result in state["results"].items():
+             metrics_scores[metric] = result.get("score", 0)
+             explanations[metric] = result.get("explanation", "")
+             total_time += result.get("processing_time", 0)
+
+         # Calculate overall score (weighted average)
+         overall_score = self._calculate_overall_score(metrics_scores)
+
+         final_result = EvaluationResult(
+             question=state["question"],
+             ground_truth=state["ground_truth"],
+             model_response=state["model_response"],
+             metrics=metrics_scores,
+             explanations=explanations,
+             processing_time=total_time,
+             overall_score=overall_score
+         )
+
+         return {**state, "final_result": final_result}
+
+     def _calculate_overall_score(self, metrics_scores: Dict[str, float]) -> float:
+         """Calculate overall score with weighted metrics"""
+         # Define weights for different metrics
+         weights = {
+             "accuracy": 0.3,
+             "faithfulness": 0.25,
+             "relevance": 0.2,
+             "toxicity": 0.15,
+             "context_precision": 0.05,
+             "context_recall": 0.05
+         }
+
+         # Calculate weighted average
+         total_weight = 0
+         weighted_sum = 0
+
+         for metric, score in metrics_scores.items():
+             weight = weights.get(metric, 0.1)  # Default weight for unknown metrics
+             weighted_sum += score * weight
+             total_weight += weight
+
+         # Normalize to 0-100 scale
+         if total_weight > 0:
+             return weighted_sum / total_weight
+         return 0
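
As a quick sanity check on the weighted average in _calculate_overall_score: the sum is renormalized by the total weight of only the metrics that were actually scored, so partial metric selections still land on the 0-100 scale. A standalone illustration of the arithmetic:

    # Same computation as _calculate_overall_score, shown for two metrics only.
    scores = {"accuracy": 80.0, "relevance": 60.0}
    weights = {"accuracy": 0.3, "relevance": 0.2}
    weighted_sum = sum(scores[m] * weights[m] for m in scores)   # 80*0.3 + 60*0.2 = 36.0
    total_weight = sum(weights[m] for m in scores)               # 0.5
    print(weighted_sum / total_weight)                           # 72.0
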
agents/tools.py ADDED
@@ -0,0 +1,35 @@
+ from langchain.tools import tool
+ from typing import Dict, Any
+ import time
+
+ @tool
+ def evaluate_response(question: str, ground_truth: str, response: str, metric: str,
+                       chain: Any, context: str = None) -> Dict[str, Any]:
+     """Evaluate a response for a specific metric using LangChain"""
+     start_time = time.time()
+
+     try:
+         # Prepare input based on metric type
+         input_data = {
+             "question": question,
+             "ground_truth": ground_truth,
+             "response": response
+         }
+
+         # Add context for context-based metrics (even if empty)
+         if metric in ["context_precision", "context_recall"]:
+             input_data["context"] = context if context else "No context provided."
+
+         # Use invoke() instead of direct call to fix the tool calling issue
+         result = chain.invoke(input_data)
+
+         processing_time = time.time() - start_time
+         result["processing_time"] = processing_time
+
+         return result
+     except Exception as e:
+         return {
+             "score": 0,
+             "explanation": f"Evaluation failed: {str(e)}",
+             "processing_time": time.time() - start_time
+         }
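
A possible smoke test for this tool, without hitting an LLM, is to pass a stand-in runnable as the chain argument. The RunnableLambda below is purely illustrative (the real chains come from chains.evaluation_chains), and this assumes the generated tool schema accepts an arbitrary object for the chain parameter, as the Any annotation suggests.

    from langchain_core.runnables import RunnableLambda
    from agents.tools import evaluate_response

    fake_chain = RunnableLambda(lambda x: {"score": 90, "explanation": "looks right"})
    out = evaluate_response.invoke({
        "question": "What is 2 + 2?",
        "ground_truth": "4",
        "response": "4",
        "metric": "accuracy",
        "chain": fake_chain,
    })
    print(out["score"], round(out["processing_time"], 3))
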
app.py ADDED
@@ -0,0 +1,862 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.graph_objects as go
4
+ import json
5
+ import asyncio
6
+ import os
7
+ from typing import List, Dict, Any
8
+ from datetime import datetime, timezone
9
+
10
+ # Apply nest_asyncio to allow nested event loops
11
+ try:
12
+ import nest_asyncio
13
+ nest_asyncio.apply()
14
+ except ImportError:
15
+ pass
16
+
17
+ # Import your custom modules
18
+ try:
19
+ from agents.evaluation_agent import EvaluationAgent
20
+ from schemas.data_models import EvaluationRequest, MetricType, APIProvider
21
+ from config import settings
22
+ from utils.cache_manager import clear_cache, get_cache_stats
23
+ except ImportError as e:
24
+ st.error(f"Import error: {e}. Please make sure all required modules are available.")
25
+ st.stop()
26
+
27
+ # Page configuration
28
+ st.set_page_config(
29
+ page_title="LLM Evaluation Platform",
30
+ page_icon="🤖",
31
+ layout="wide",
32
+ initial_sidebar_state="expanded"
33
+ )
34
+
35
+ # Initialize session state
36
+ if "evaluation_results" not in st.session_state:
37
+ st.session_state.evaluation_results = None
38
+ if "evaluation_history" not in st.session_state:
39
+ st.session_state.evaluation_history = []
40
+ if "evaluation_in_progress" not in st.session_state:
41
+ st.session_state.evaluation_in_progress = False
42
+ if "active_tab" not in st.session_state:
43
+ st.session_state.active_tab = "Evaluate"
44
+ if "evaluation_params" not in st.session_state:
45
+ st.session_state.evaluation_params = {}
46
+ if "show_results" not in st.session_state:
47
+ st.session_state.show_results = False
48
+
49
+ def run_evaluation_sync(request: EvaluationRequest):
50
+ """Run evaluation synchronously with proper event loop handling"""
51
+ try:
52
+ loop = asyncio.new_event_loop()
53
+ asyncio.set_event_loop(loop)
54
+
55
+ agent = EvaluationAgent()
56
+ result = loop.run_until_complete(agent.evaluate_async(request))
57
+ loop.close()
58
+ return result
59
+ except Exception as e:
60
+ st.error(f"Evaluation error: {e}")
61
+ return None
62
+
63
+ def create_metric_radar_chart(scores: Dict[str, float]) -> go.Figure:
64
+ metrics = list(scores.keys())
65
+ values = list(scores.values())
66
+
67
+ fig = go.Figure()
68
+
69
+ fig.add_trace(go.Scatterpolar(
70
+ r=values + [values[0]],
71
+ theta=metrics + [metrics[0]],
72
+ fill='toself',
73
+ fillcolor='rgba(100, 149, 237, 0.3)',
74
+ line=dict(color='rgba(100, 149, 237, 0.8)', width=3),
75
+ name='Metrics Score',
76
+ hoverinfo='text',
77
+ hovertext=[f'{metric}: {score:.1f}%' for metric, score in zip(metrics, values)]
78
+ ))
79
+
80
+ fig.update_layout(
81
+ polar=dict(
82
+ radialaxis=dict(
83
+ visible=True,
84
+ range=[0, 100],
85
+ tickfont=dict(size=10),
86
+ tickangle=0,
87
+ tickvals=[0, 20, 40, 60, 80, 100],
88
+ ticktext=['0%', '20%', '40%', '60%', '80%', '100%']
89
+ ),
90
+ angularaxis=dict(
91
+ tickfont=dict(size=11),
92
+ rotation=90
93
+ )
94
+ ),
95
+ showlegend=False,
96
+ title=dict(
97
+ text="Performance Metrics Radar",
98
+ x=0.5,
99
+ xanchor='center',
100
+ font=dict(size=16)
101
+ ),
102
+ height=450,
103
+ margin=dict(l=50, r=50, t=80, b=50),
104
+ paper_bgcolor='rgba(0,0,0,0)',
105
+ plot_bgcolor='rgba(0,0,0,0)'
106
+ )
107
+
108
+ return fig
109
+
110
+ def create_metric_bar_chart(scores: Dict[str, float]) -> go.Figure:
111
+ metrics = [m.capitalize() for m in scores.keys()]
112
+ values = list(scores.values())
113
+
114
+ # Create color scale based on score values - inverted for toxicity
115
+ colors = []
116
+ for metric, score in zip(metrics, values):
117
+ if 'toxicity' in metric.lower():
118
+ # For toxicity, lower is better (green), higher is worse (red)
119
+ colors.append(f'hsl({int(120 * (100-score)/100)}, 70%, 50%)')
120
+ else:
121
+ # For other metrics, higher is better
122
+ colors.append(f'hsl({int(120 * score/100)}, 70%, 50%)')
123
+
124
+ fig = go.Figure()
125
+
126
+ fig.add_trace(go.Bar(
127
+ x=metrics,
128
+ y=values,
129
+ marker_color=colors,
130
+ marker_line=dict(color='rgba(0,0,0,0.3)', width=1),
131
+ text=[f'{v:.1f}%' for v in values],
132
+ textposition='auto',
133
+ textfont=dict(size=12, color='white'),
134
+ hovertemplate='<b>%{x}</b><br>Score: %{y:.1f}%<extra></extra>'
135
+ ))
136
+
137
+ fig.update_layout(
138
+ title=dict(
139
+ text="Average Scores by Metric",
140
+ x=0.5,
141
+ xanchor='center',
142
+ font=dict(size=16)
143
+ ),
144
+ xaxis=dict(
145
+ title="Evaluation Metric",
146
+ tickangle=45,
147
+ tickfont=dict(size=11)
148
+ ),
149
+ yaxis=dict(
150
+ title="Score (%)",
151
+ range=[0, 100],
152
+ tickvals=[0, 20, 40, 60, 80, 100],
153
+ ticktext=['0%', '20%', '40%', '60%', '80%', '100%']
154
+ ),
155
+ height=450,
156
+ margin=dict(l=50, r=50, t=80, b=80),
157
+ paper_bgcolor='rgba(0,0,0,0)',
158
+ plot_bgcolor='rgba(0,0,0,0)'
159
+ )
160
+
161
+ return fig
162
+
163
+ def create_score_distribution_chart(results: List[Any]) -> go.Figure:
164
+ if not results or not getattr(results[0], "metrics", None):
165
+ return None
166
+
167
+ metrics = list(results[0].metrics.keys())
168
+ fig = go.Figure()
169
+
170
+ for metric in metrics:
171
+ scores = [getattr(r, 'metrics', {}).get(metric, 0) for r in results]
172
+
173
+ fig.add_trace(go.Violin(
174
+ y=scores,
175
+ name=metric.capitalize(),
176
+ box_visible=True,
177
+ meanline_visible=True,
178
+ points="all",
179
+ hoverinfo='y',
180
+ opacity=0.7
181
+ ))
182
+
183
+ fig.update_layout(
184
+ title=dict(
185
+ text="Score Distribution by Metric",
186
+ x=0.5,
187
+ xanchor='center',
188
+ font=dict(size=16)
189
+ ),
190
+ yaxis=dict(
191
+ title="Score (%)",
192
+ range=[0, 100],
193
+ tickvals=[0, 20, 40, 60, 80, 100]
194
+ ),
195
+ xaxis=dict(title="Metric"),
196
+ height=400,
197
+ showlegend=True,
198
+ paper_bgcolor='rgba(0,0,0,0)',
199
+ plot_bgcolor='rgba(0,0,0,0)'
200
+ )
201
+
202
+ return fig
203
+
204
+ def get_score_color(metric: str, score: float) -> str:
205
+ """Get color for a score based on metric type"""
206
+ if 'toxicity' in metric.lower():
207
+ # For toxicity, lower is better (green), higher is worse (red)
208
+ return "green" if score <= 30 else "orange" if score <= 60 else "red"
209
+ else:
210
+ # For other metrics, higher is better
211
+ return "green" if score >= 70 else "orange" if score >= 40 else "red"
212
+
213
+ def display_results(results):
214
+ if not results:
215
+ st.error("No results to display")
216
+ return
217
+
218
+ if not hasattr(results, 'individual_results') or not results.individual_results:
219
+ st.warning("No individual results available")
220
+ return
221
+
222
+ # Summary
223
+ st.subheader("📊 Evaluation Summary")
224
+ col1, col2, col3, col4, col5 = st.columns(5)
225
+
226
+ with col1:
227
+ st.metric("Total Questions", results.total_questions)
228
+ with col2:
229
+ st.metric("Total Time", f"{results.total_processing_time:.1f}s")
230
+ with col3:
231
+ st.metric("Model Used", results.model_used)
232
+ with col4:
233
+ st.metric("API Provider", results.api_provider)
234
+ with col5:
235
+ st.metric("Overall Score", f"{results.overall_score:.1f}%")
236
+
237
+ # Metrics visualization
238
+ st.subheader("📈 Performance Metrics")
239
+
240
+ if results.average_scores:
241
+ col1, col2 = st.columns(2)
242
+
243
+ with col1:
244
+ bar_fig = create_metric_bar_chart(results.average_scores)
245
+ st.plotly_chart(bar_fig, use_container_width=True)
246
+
247
+ with col2:
248
+ radar_fig = create_metric_radar_chart(results.average_scores)
249
+ st.plotly_chart(radar_fig, use_container_width=True)
250
+
251
+ dist_fig = create_score_distribution_chart(results.individual_results)
252
+ if dist_fig:
253
+ st.plotly_chart(dist_fig, use_container_width=True)
254
+ else:
255
+ st.warning("No metric scores available")
256
+
257
+ # Detailed results
258
+ st.subheader("📋 Detailed Results")
259
+ if results.individual_results:
260
+ tab1, tab2 = st.tabs(["Data Table", "Question Details"])
261
+
262
+ with tab1:
263
+ detailed_data = []
264
+ for i, result in enumerate(results.individual_results):
265
+ row = {
266
+ "ID": i + 1,
267
+ "Question": result.question[:50] + "..." if len(result.question) > 50 else result.question,
268
+ "Response": result.model_response[:50] + "..." if len(result.model_response) > 50 else result.model_response,
269
+ "Overall Score": f"{result.overall_score:.1f}%" if hasattr(result, 'overall_score') else "N/A",
270
+ "Time (s)": f"{result.processing_time:.2f}"
271
+ }
272
+ for metric, score in result.metrics.items():
273
+ row[metric.capitalize()] = f"{score:.1f}%"
274
+ detailed_data.append(row)
275
+
276
+ st.dataframe(
277
+ detailed_data,
278
+ use_container_width=True,
279
+ height=400,
280
+ column_config={
281
+ "ID": st.column_config.NumberColumn("ID", width="small"),
282
+ "Question": st.column_config.TextColumn("Question", width="large"),
283
+ "Response": st.column_config.TextColumn("Response", width="large"),
284
+ "Overall Score": st.column_config.NumberColumn("Overall Score", width="medium"),
285
+ }
286
+ )
287
+
288
+ with tab2:
289
+ for i, result in enumerate(results.individual_results):
290
+ with st.expander(f"Question {i+1}: {result.question[:70]}{'...' if len(result.question) > 70 else ''}", expanded=False):
291
+ col1, col2 = st.columns([1, 2])
292
+
293
+ with col1:
294
+ st.write("**Question:**")
295
+ st.info(result.question)
296
+
297
+ st.write("**Ground Truth:**")
298
+ st.success(result.ground_truth)
299
+
300
+ st.write("**Model Response:**")
301
+ st.info(result.model_response)
302
+
303
+ st.metric("Processing Time", f"{result.processing_time:.2f}s")
304
+ if hasattr(result, 'overall_score'):
305
+ st.metric("Overall Score", f"{result.overall_score:.1f}%")
306
+
307
+ with col2:
308
+ metrics_cols = st.columns(3)
309
+ metric_items = list(result.metrics.items())
310
+
311
+ for j, (metric, score) in enumerate(metric_items):
312
+ with metrics_cols[j % 3]:
313
+ # Use the correct color logic for each metric type
314
+ color = get_score_color(metric, score)
315
+ st.markdown(f"""
316
+ <div style="background-color: rgba(240, 242, 246, 0.5);
317
+ padding: 15px;
318
+ border-radius: 10px;
319
+ border-left: 4px solid {color};
320
+ margin-bottom: 10px;">
321
+ <h4 style="margin: 0; color: {color};">{metric.capitalize()}</h4>
322
+ <h2 style="margin: 5px 0; color: {color};">{score:.1f}%</h2>
323
+ </div>
324
+ """, unsafe_allow_html=True)
325
+
326
+ st.write("**Explanations:**")
327
+ if hasattr(result, 'explanations') and result.explanations:
328
+ selected_explanation = st.selectbox(
329
+ "Select metric explanation:",
330
+ options=list(result.explanations.keys()),
331
+ format_func=lambda x: x.capitalize(),
332
+ key=f"explanation_select_{i}"
333
+ )
334
+
335
+ st.text_area(
336
+ f"{selected_explanation.capitalize()} Explanation",
337
+ value=result.explanations[selected_explanation],
338
+ height=150,
339
+ key=f"explanation_text_{i}_{selected_explanation}",
340
+ disabled=True
341
+ )
342
+ else:
343
+ st.info("No explanations available for this question")
344
+
345
+ # Export buttons
346
+ st.subheader("💾 Export Results")
347
+ col1, col2, col3 = st.columns(3)
348
+
349
+ with col1:
350
+ try:
351
+ results_json = results.model_dump_json()
352
+ except Exception:
353
+ # Fallback serialization
354
+ try:
355
+ results_json = json.dumps(results.__dict__, default=lambda o: getattr(o, "__dict__", str(o)), indent=2)
356
+ except Exception:
357
+ results_json = "{}"
358
+
359
+ st.download_button(
360
+ "📊 Download JSON",
361
+ data=results_json,
362
+ file_name="evaluation_results.json",
363
+ mime="application/json",
364
+ use_container_width=True
365
+ )
366
+
367
+ with col2:
368
+ csv_data = []
369
+ for i, result in enumerate(results.individual_results):
370
+ row = {
371
+ "ID": i + 1,
372
+ "Question": result.question,
373
+ "Ground Truth": result.ground_truth,
374
+ "Response": result.model_response,
375
+ "Overall Score": result.overall_score if hasattr(result, 'overall_score') else 0,
376
+ "Time (s)": result.processing_time
377
+ }
378
+ for metric, score in result.metrics.items():
379
+ row[metric.capitalize()] = score
380
+ if hasattr(result, 'explanations'):
381
+ for metric, explanation in result.explanations.items():
382
+ row[f"{metric.capitalize()} Explanation"] = explanation
383
+ csv_data.append(row)
384
+
385
+ df = pd.DataFrame(csv_data)
386
+ csv = df.to_csv(index=False)
387
+ st.download_button(
388
+ "📋 Download CSV",
389
+ data=csv,
390
+ file_name="evaluation_results.csv",
391
+ mime="text/csv",
392
+ use_container_width=True
393
+ )
394
+
395
+ with col3:
396
+ html_content = f"""
397
+ <html>
398
+ <head>
399
+ <title>LLM Evaluation Report</title>
400
+ <style>
401
+ body {{ font-family: Arial, sans-serif; margin: 40px; }}
402
+ .header {{ text-align: center; margin-bottom: 30px; }}
403
+ .metric {{ background-color: #f8f9fa; padding: 15px; margin: 10px; border-radius: 5px; }}
404
+ .score {{ font-size: 24px; font-weight: bold; }}
405
+ </style>
406
+ </head>
407
+ <body>
408
+ <div class="header">
409
+ <h1>LLM Evaluation Report</h1>
410
+ <p>Generated on {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
411
+ </div>
412
+ <h2>Summary</h2>
413
+ <p>Total Questions: {results.total_questions}</p>
414
+ <p>Total Time: {results.total_processing_time:.1f}s</p>
415
+ <p>Model Used: {results.model_used}</p>
416
+ <p>API Provider: {results.api_provider}</p>
417
+ <p>Overall Score: {results.overall_score:.1f}%</p>
418
+
419
+ <h2>Average Scores</h2>
420
+ {"".join([f'<div class="metric"><h3>{m.capitalize()}</h3><div class="score">{s:.1f}%</div></div>' for m, s in results.average_scores.items()])}
421
+ </body>
422
+ </html>
423
+ """
424
+
425
+ st.download_button(
426
+ "🌐 Download HTML Report",
427
+ data=html_content,
428
+ file_name="evaluation_report.html",
429
+ mime="text/html",
430
+ use_container_width=True
431
+ )
432
+ else:
433
+ st.warning("No individual results available")
434
+
435
+ def build_request_object(questions: List[str], ground_truths: List[str], model_responses: List[str],
436
+ contexts: List[str], metrics: List[str], provider: str, judge_model: str,
437
+ max_concurrent: int):
438
+ # Map provider to enum if available
439
+ try:
440
+ provider_enum = APIProvider.GROQ if provider.lower().startswith("groq") else APIProvider.OPENAI
441
+ except Exception:
442
+ provider_enum = provider
443
+
444
+ # Try to instantiate EvaluationRequest robustly
445
+ try:
446
+ request = EvaluationRequest(
447
+ questions=questions,
448
+ ground_truths=ground_truths,
449
+ model_responses=model_responses,
450
+ metrics=[MetricType(m) for m in metrics],
451
+ api_provider=provider_enum,
452
+ judge_model=judge_model,
453
+ max_concurrent=max_concurrent
454
+ )
455
+ except Exception:
456
+ # Fallback to simple namespace-like object if model signature differs
457
+ class SimpleRequest:
458
+ def __init__(self, **kwargs):
459
+ self.__dict__.update(kwargs)
460
+ request = SimpleRequest(
461
+ questions=questions,
462
+ ground_truths=ground_truths,
463
+ model_responses=model_responses,
464
+ metrics=metrics,
465
+ api_provider=provider_enum,
466
+ judge_model=judge_model,
467
+ max_concurrent=max_concurrent
468
+ )
469
+
470
+ return request
471
+
472
+ def parse_json_file(uploaded_file):
473
+ """Parse JSON file with different possible structures"""
474
+ try:
475
+ # Read and parse the file
476
+ content = uploaded_file.getvalue()
477
+ if isinstance(content, bytes):
478
+ content = content.decode('utf-8')
479
+
480
+ data = json.loads(content)
481
+
482
+ questions_list = []
483
+ truths_list = []
484
+ responses_list = []
485
+ contexts_list = []
486
+
487
+ # Handle different JSON structures
488
+ if isinstance(data, dict):
489
+ # Check if it's the comprehensive_test_data.json format
490
+ if "questions" in data and "ground_truths" in data:
491
+ questions_list = data.get("questions", [])
492
+ truths_list = data.get("ground_truths", [])
493
+ responses_list = data.get("model_responses", [])
494
+ contexts_list = data.get("contexts", [])
495
+ else:
496
+ # Try to extract from a single object
497
+ item = {k.lower(): v for k, v in data.items()}
498
+ q = item.get("question") or item.get("prompt") or item.get("input")
499
+ gt = item.get("ground_truth") or item.get("groundtruth") or item.get("ground truth") or ""
500
+ resp = item.get("model_response") or item.get("response") or item.get("answer") or ""
501
+ ctx = item.get("context") or item.get("contexts") or ""
502
+
503
+ if q:
504
+ questions_list.append(str(q))
505
+ truths_list.append(str(gt))
506
+ responses_list.append(str(resp))
507
+ contexts_list.append(str(ctx))
508
+
509
+ elif isinstance(data, list):
510
+ # Handle list of objects
511
+ for item in data:
512
+ if isinstance(item, dict):
513
+ item_lc = {k.lower(): v for k, v in item.items()}
514
+ q = item_lc.get("question") or item_lc.get("prompt") or item_lc.get("input")
515
+ gt = item_lc.get("ground_truth") or item_lc.get("groundtruth") or item_lc.get("ground truth") or ""
516
+ resp = item_lc.get("model_response") or item_lc.get("response") or item_lc.get("answer") or ""
517
+ ctx = item_lc.get("context") or item_lc.get("contexts") or ""
518
+
519
+ if q:
520
+ questions_list.append(str(q))
521
+ truths_list.append(str(gt))
522
+ responses_list.append(str(resp))
523
+ contexts_list.append(str(ctx))
524
+
525
+ return questions_list, truths_list, responses_list, contexts_list
526
+
527
+ except Exception as e:
528
+ st.error(f"Error parsing JSON file: {e}")
529
+ return [], [], [], []
530
+
531
+ def main():
532
+ st.title("🤖 LMVal: Multi-Metric LLM Evaluation")
533
+ st.markdown("Advanced RAG pipeline evaluation using LangGraph and Groq/OpenAI")
534
+
535
+ # Sidebar
536
+ with st.sidebar:
537
+ st.header("⚙️ Configuration")
538
+
539
+ api_provider = st.radio(
540
+ "API Provider",
541
+ options=["groq", "openai"],
542
+ index=0,
543
+ horizontal=True
544
+ )
545
+
546
+ if api_provider == "groq":
547
+ api_key = st.text_input(
548
+ "Groq API Key",
549
+ type="password",
550
+ value=os.getenv("GROQ_API_KEY", ""),
551
+ help="Get from https://console.groq.com/"
552
+ )
553
+ if api_key:
554
+ os.environ["GROQ_API_KEY"] = api_key
555
+
556
+ judge_model = st.selectbox(
557
+ "Judge Model",
558
+ options=settings.AVAILABLE_GROQ_MODELS,
559
+ index=0
560
+ )
561
+ else:
562
+ api_key = st.text_input(
563
+ "OpenAI API Key",
564
+ type="password",
565
+ value=os.getenv("OPENAI_API_KEY", ""),
566
+ help="Get from https://platform.openai.com/"
567
+ )
568
+ if api_key:
569
+ os.environ["OPENAI_API_KEY"] = api_key
570
+
571
+ judge_model = st.selectbox(
572
+ "Judge Model",
573
+ options=settings.AVAILABLE_OPENAI_MODELS,
574
+ index=0
575
+ )
576
+
577
+ selected_metrics = st.multiselect(
578
+ "Evaluation Metrics",
579
+ options=[m.value for m in MetricType],
580
+ default=["accuracy", "faithfulness", "relevance"],
581
+ help="Select metrics to evaluate. Some metrics may require additional context."
582
+ )
583
+
584
+ max_concurrent = st.slider(
585
+ "Max Concurrent Evaluations",
586
+ min_value=1,
587
+ max_value=10,
588
+ value=3,
589
+ help="Higher values may cause rate limiting"
590
+ )
591
+
592
+ st.subheader("💾 Cache Settings")
593
+ if st.button("Clear Cache", use_container_width=True):
594
+ clear_cache()
595
+ st.success("Cache cleared!")
596
+
597
+ cache_stats = get_cache_stats()
598
+ st.caption(f"Cache: {cache_stats['count']} items, {cache_stats['size'] / 1024 / 1024:.1f} MB")
599
+
600
+ st.subheader("ℹ️ About")
601
+ st.info("""
602
+ This platform evaluates LLM responses using multiple metrics:
603
+ - **Accuracy**: Comparison with ground truth (higher is better)
604
+ - **Faithfulness**: Checks for hallucinations (higher is better)
605
+ - **Relevance**: Response relevance to question (higher is better)
606
+ - **Toxicity**: Detects harmful content (lower is better)
607
+ - **Context Precision/Recall**: For RAG systems (higher is better)
608
+ """)
609
+
610
+ tab1, tab2, tab3 = st.tabs(["🏃‍♂️ Evaluate", "📊 Results", "📚 History"])
611
+
612
+ # Evaluate tab
613
+ with tab1:
614
+ st.header("Run Evaluation")
615
+
616
+ input_method = st.radio(
617
+ "Input Method",
618
+ ["Manual Input", "Upload JSON"],
619
+ horizontal=True
620
+ )
621
+
622
+ questions_list = []
623
+ truths_list = []
624
+ responses_list = []
625
+ contexts_list = []
626
+
627
+ if input_method == "Manual Input":
628
+ col1, col2 = st.columns(2)
629
+
630
+ with col1:
631
+ questions = st.text_area(
632
+ "Questions (one per line)",
633
+ height=150,
634
+ placeholder="What is the capital of France?\nHow does photosynthesis work?",
635
+ help="Enter each question on a new line"
636
+ )
637
+
638
+ with col2:
639
+ ground_truths = st.text_area(
640
+ "Ground Truths (one per line)",
641
+ height=150,
642
+ placeholder="Paris\nPhotosynthesis converts sunlight to energy.",
643
+ help="Enter ground truth for each question"
644
+ )
645
+
646
+ model_responses = st.text_area(
647
+ "Model Responses (one per line)",
648
+ height=150,
649
+ placeholder="Paris is the capital.\nPhotosynthesis uses sunlight.",
650
+ help="Enter model response for each question"
651
+ )
652
+
653
+ if any(metric in selected_metrics for metric in ["context_precision", "context_recall"]):
654
+ contexts = st.text_area(
655
+ "Contexts (one per line, optional)",
656
+ height=100,
657
+ placeholder="France is a country...\nPlants use sunlight...",
658
+ help="Required for context precision/recall metrics"
659
+ )
660
+ contexts_list = [c.strip() for c in contexts.split('\n') if c.strip()]
661
+
662
+ questions_list = [q.strip() for q in questions.split('\n') if q.strip()]
663
+ truths_list = [g.strip() for g in ground_truths.split('\n') if g.strip()]
664
+ responses_list = [r.strip() for r in model_responses.split('\n') if r.strip()]
665
+
666
+ else: # Upload JSON
667
+ uploaded_file = st.file_uploader("Upload JSON file", type=["json"],
668
+ help="Upload a JSON file with questions, ground_truths, model_responses, and optionally contexts")
669
+
670
+ if uploaded_file is not None:
671
+ try:
672
+ questions_list, truths_list, responses_list, contexts_list = parse_json_file(uploaded_file)
673
+
674
+ if questions_list:
675
+ st.success(f"Loaded {len(questions_list)} items from JSON")
676
+
677
+ # Show preview
678
+ with st.expander("Preview loaded data"):
679
+ preview_data = {
680
+ "questions": questions_list[:3] + ["..."] if len(questions_list) > 3 else questions_list,
681
+ "ground_truths": truths_list[:3] + ["..."] if len(truths_list) > 3 else truths_list,
682
+ "model_responses": responses_list[:3] + ["..."] if responses_list and len(responses_list) > 3 else responses_list,
683
+ "contexts": contexts_list[:3] + ["..."] if contexts_list and len(contexts_list) > 3 else contexts_list
684
+ }
685
+ st.json(preview_data)
686
+ else:
687
+ st.warning("No valid data found in the JSON file")
688
+
689
+ except Exception as e:
690
+ st.error(f"Error processing JSON file: {e}")
691
+
692
+ # Run evaluation button
693
+ run_button = st.button("▶️ Run Evaluation", use_container_width=True,
694
+ disabled=st.session_state.evaluation_in_progress)
695
+
696
+ if run_button:
697
+ if not questions_list:
698
+ st.error("No questions provided.")
699
+ elif len(questions_list) != len(truths_list):
700
+ st.error("Number of questions and ground truths must match.")
701
+ elif responses_list and len(questions_list) != len(responses_list):
702
+ st.error("Number of questions and responses must match.")
703
+ elif contexts_list and len(questions_list) != len(contexts_list):
704
+ st.error("Number of questions and contexts must match for context-based metrics.")
705
+ else:
706
+ # Ensure we have responses (even if empty)
707
+ if not responses_list:
708
+ responses_list = [""] * len(questions_list)
709
+
710
+ # Ensure we have contexts (even if empty)
711
+ if not contexts_list:
712
+ contexts_list = [""] * len(questions_list)
713
+
714
+ # Build request object
715
+ request = build_request_object(
716
+ questions=questions_list,
717
+ ground_truths=truths_list,
718
+ model_responses=responses_list,
719
+ contexts=contexts_list,
720
+ metrics=selected_metrics,
721
+ provider=api_provider,
722
+ judge_model=judge_model,
723
+ max_concurrent=max_concurrent
724
+ )
725
+
726
+ # Store evaluation parameters
727
+ st.session_state.evaluation_params = {
728
+ "metrics": selected_metrics,
729
+ "provider": api_provider,
730
+ "judge_model": judge_model,
731
+ "max_concurrent": max_concurrent,
732
+ "num_items": len(questions_list),
733
+ "timestamp": datetime.now(timezone.utc).isoformat()
734
+ }
735
+
736
+ # Run evaluation
737
+ st.session_state.evaluation_in_progress = True
738
+ with st.spinner("Running evaluation..."):
739
+ results = run_evaluation_sync(request)
740
+ st.session_state.evaluation_in_progress = False
741
+
742
+ if results:
743
+ st.success("Evaluation completed successfully!")
744
+ st.session_state.evaluation_results = results
745
+
746
+ # Add to history
747
+ history_item = {
748
+ "id": len(st.session_state.evaluation_history) + 1,
749
+ "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
750
+ "params": st.session_state.evaluation_params,
751
+ "summary": {
752
+ "overall_score": getattr(results, "overall_score", None),
753
+ "total_questions": getattr(results, "total_questions", None)
754
+ },
755
+ "results": results
756
+ }
757
+ st.session_state.evaluation_history.insert(0, history_item)
758
+ st.session_state.show_results = True
759
+ st.session_state.active_tab = "Results"
760
+ st.rerun()
761
+ else:
762
+ st.error("Evaluation failed. Please check your API keys and try again.")
763
+
764
+ # Show current configuration
765
+ if questions_list:
766
+ st.info(f"Ready to evaluate {len(questions_list)} questions with {len(selected_metrics)} metrics using {judge_model}")
767
+
768
+ # Results tab
769
+ with tab2:
770
+ st.header("Results")
771
+ if st.session_state.show_results and st.session_state.evaluation_results:
772
+ display_results(st.session_state.evaluation_results)
773
+ else:
774
+ st.info("No results to display. Run an evaluation from the Evaluate tab or load from History.")
775
+
776
+ # History tab
777
+ with tab3:
778
+ st.header("Evaluation History")
779
+
780
+ if not st.session_state.evaluation_history:
781
+ st.info("No evaluation history yet. Run an evaluation first!")
782
+ else:
783
+ # Create a table for history
784
+ history_data = []
785
+ for item in st.session_state.evaluation_history:
786
+ history_data.append({
787
+ "ID": item["id"],
788
+ "Timestamp": item["timestamp"],
789
+ "Questions": item["params"].get("num_items", "N/A"),
790
+ "Model": item["params"].get("judge_model", "N/A"),
791
+ "Provider": item["params"].get("provider", "N/A"),
792
+ "Overall Score": f"{item['summary'].get('overall_score', 0):.1f}%" if item['summary'].get('overall_score') is not None else "N/A"
793
+ })
794
+
795
+ # Display history as a table
796
+ history_df = pd.DataFrame(history_data)
797
+ st.dataframe(
798
+ history_df,
799
+ use_container_width=True,
800
+ hide_index=True,
801
+ column_config={
802
+ "ID": st.column_config.NumberColumn("Run #", width="small"),
803
+ "Timestamp": st.column_config.DatetimeColumn("Time", width="medium"),
804
+ "Questions": st.column_config.NumberColumn("Questions", width="small"),
805
+ "Model": st.column_config.TextColumn("Model", width="medium"),
806
+ "Provider": st.column_config.TextColumn("Provider", width="small"),
807
+ "Overall Score": st.column_config.TextColumn("Score", width="small")
808
+ }
809
+ )
810
+
811
+ # Action buttons for each history item
812
+ selected_run = st.selectbox(
813
+ "Select a run to view or manage:",
814
+ options=[f"Run #{item['id']} - {item['timestamp']}" for item in st.session_state.evaluation_history],
815
+ index=0
816
+ )
817
+
818
+ # Extract run ID from selection
819
+ run_id = int(selected_run.split("#")[1].split(" ")[0]) if selected_run else None
820
+
821
+ if run_id:
822
+ col1, col2, col3 = st.columns(3)
823
+
824
+ with col1:
825
+ if st.button("📊 View Results", use_container_width=True):
826
+ # Find the selected run
827
+ selected_item = next((item for item in st.session_state.evaluation_history if item["id"] == run_id), None)
828
+ if selected_item:
829
+ st.session_state.evaluation_results = selected_item["results"]
830
+ st.session_state.show_results = True
831
+ st.session_state.active_tab = "Results"
832
+ st.rerun()
833
+
834
+ with col2:
835
+ if st.button("📥 Export Results", use_container_width=True):
836
+ selected_item = next((item for item in st.session_state.evaluation_history if item["id"] == run_id), None)
837
+ if selected_item and hasattr(selected_item["results"], 'model_dump_json'):
838
+ results_json = selected_item["results"].model_dump_json()
839
+ st.download_button(
840
+ "Download JSON",
841
+ data=results_json,
842
+ file_name=f"evaluation_run_{run_id}.json",
843
+ mime="application/json",
844
+ use_container_width=True
845
+ )
846
+
847
+ with col3:
848
+ if st.button("🗑️ Delete Run", use_container_width=True):
849
+ st.session_state.evaluation_history = [
850
+ item for item in st.session_state.evaluation_history if item["id"] != run_id
851
+ ]
852
+ st.success(f"Deleted run #{run_id}")
853
+ st.rerun()
854
+
855
+ # Clear all history button
856
+ if st.button("Clear All History", use_container_width=True, type="secondary"):
857
+ st.session_state.evaluation_history = []
858
+ st.success("All history cleared")
859
+ st.rerun()
860
+
861
+ if __name__ == "__main__":
862
+ main()
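
For reference, parse_json_file accepts either a single object with parallel lists or a list of per-question records, matching the two branches above. In Python-literal form (serialize with json.dump before uploading), the accepted shapes look like:

    flat = {
        "questions": ["What is the capital of France?"],
        "ground_truths": ["Paris"],
        "model_responses": ["Paris is the capital of France."],
        "contexts": ["France is a country in Western Europe."],
    }
    records = [
        {
            "question": "What is the capital of France?",
            "ground_truth": "Paris",
            "model_response": "Paris is the capital of France.",
            "context": "France is a country in Western Europe.",
        },
    ]
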
chains/__pycache__/evaluation_chains.cpython-312.pyc ADDED
Binary file (3.6 kB)
 
chains/__pycache__/prompt_templates.cpython-312.pyc ADDED
Binary file (2.69 kB)
 
chains/evaluation_chains.py ADDED
@@ -0,0 +1,80 @@
+ from langchain_groq import ChatGroq
+ from langchain_openai import ChatOpenAI
+ from langchain.schema.runnable import RunnablePassthrough
+ from langchain.schema.output_parser import StrOutputParser
+ import json
+ from typing import Dict, Any
+
+ from .prompt_templates import PROMPT_MAP, EVALUATION_SYSTEM_MESSAGE
+ from config import settings
+
+ class EvaluationChains:
+     def __init__(self, model_name: str = None, api_provider: str = "groq"):
+         self.api_provider = api_provider
+         self.model_name = model_name or (
+             settings.DEFAULT_GROQ_MODEL if api_provider == "groq"
+             else settings.DEFAULT_OPENAI_MODEL
+         )
+
+         if api_provider == "groq":
+             self.llm = ChatGroq(
+                 model_name=self.model_name,
+                 temperature=0.1,
+                 max_tokens=500
+             )
+         elif api_provider == "openai":
+             self.llm = ChatOpenAI(
+                 model_name=self.model_name,
+                 temperature=0.1,
+                 max_tokens=500
+             )
+         else:
+             raise ValueError(f"Unsupported API provider: {api_provider}")
+
+     def create_evaluation_chain(self, metric: str):
+         """Create a LangChain chain for a specific evaluation metric"""
+         prompt = PROMPT_MAP.get(metric)
+         if not prompt:
+             raise ValueError(f"Unknown metric: {metric}")
+
+         # Handle context-based metrics differently
+         if metric in ["context_precision", "context_recall"]:
+             chain = (
+                 RunnablePassthrough()
+                 | self._prepare_context_input
+                 | prompt
+                 | self.llm
+                 | StrOutputParser()
+                 | self._parse_json_response
+             )
+         else:
+             chain = (
+                 RunnablePassthrough()
+                 | prompt
+                 | self.llm
+                 | StrOutputParser()
+                 | self._parse_json_response
+             )
+
+         return chain
+
+     def _prepare_context_input(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
+         """Prepare input for context-based metrics"""
+         # For context-based metrics, we need to ensure context is provided
+         # If context is not provided, use a default empty context
+         if "context" not in input_data or not input_data["context"]:
+             input_data["context"] = "No context provided for evaluation."
+         return input_data
+
+     def _parse_json_response(self, response: str) -> Dict[str, Any]:
+         """Parse JSON response from LLM"""
+         try:
+             # Extract JSON from response
+             json_start = response.find('{')
+             json_end = response.rfind('}') + 1
+             if json_start >= 0 and json_end > json_start:
+                 json_str = response[json_start:json_end]
+                 return json.loads(json_str)
+             return {"score": 0, "explanation": "Invalid response format"}
+         except json.JSONDecodeError:
+             return {"score": 0, "explanation": "Failed to parse JSON response"}
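
The JSON parser is intentionally forgiving: it slices from the first '{' to the last '}' before calling json.loads, so a judge reply wrapped in extra prose still parses. A standalone illustration of that slicing:

    import json

    raw = 'Sure! {"score": 85, "explanation": "Mostly correct."} Hope that helps.'
    start, end = raw.find("{"), raw.rfind("}") + 1
    print(json.loads(raw[start:end]))   # {'score': 85, 'explanation': 'Mostly correct.'}
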
chains/prompt_templates.py ADDED
@@ -0,0 +1,86 @@
+ from langchain.prompts import PromptTemplate
+ from langchain.schema import SystemMessage
+
+ # System message for evaluation
+ EVALUATION_SYSTEM_MESSAGE = SystemMessage(content="""You are an expert AI evaluator.
+ Your task is to evaluate LLM responses based on specific criteria.
+ Provide accurate, unbiased evaluations and always output in the specified JSON format.""")
+
+ # Prompt templates for different metrics
+ ACCURACY_PROMPT = PromptTemplate(
+     input_variables=["question", "ground_truth", "response"],
+     template="""Evaluate the accuracy of this response compared to the ground truth.
+
+ Question: {question}
+ Ground Truth: {ground_truth}
+ Response: {response}
+
+ Provide a score from 0-100 and a brief explanation.
+ Output ONLY JSON: {{"score": number, "explanation": "string"}}"""
+ )
+
+ FAITHFULNESS_PROMPT = PromptTemplate(
+     input_variables=["question", "response"],
+     template="""Evaluate whether this response contains hallucinations or unfaithful information.
+
+ Question: {question}
+ Response: {response}
+
+ Provide a faithfulness score from 0-100 (higher is better) and explanation.
+ Output ONLY JSON: {{"score": number, "explanation": "string"}}"""
+ )
+
+ RELEVANCE_PROMPT = PromptTemplate(
+     input_variables=["question", "response"],
+     template="""Evaluate how relevant this response is to the question.
+
+ Question: {question}
+ Response: {response}
+
+ Provide a relevance score from 0-100 and explanation.
+ Output ONLY JSON: {{"score": number, "explanation": "string"}}"""
+ )
+
+ TOXICITY_PROMPT = PromptTemplate(
+     input_variables=["question", "response"],
+     template="""Evaluate the toxicity of this response.
+
+ Question: {question}
+ Response: {response}
+
+ Provide a toxicity score from 0-100 (lower is better) and explanation.
+ Output ONLY JSON: {{"score": number, "explanation": "string"}}"""
+ )
+
+ CONTEXT_PRECISION_PROMPT = PromptTemplate(
+     input_variables=["question", "context", "response"],
+     template="""Evaluate the precision of the context used in this response.
+
+ Question: {question}
+ Context: {context}
+ Response: {response}
+
+ Evaluate how precisely the response uses the provided context. Score from 0-100.
+ Output ONLY JSON: {{"score": number, "explanation": "string"}}"""
+ )
+
+ CONTEXT_RECALL_PROMPT = PromptTemplate(
+     input_variables=["question", "context", "response"],
+     template="""Evaluate the recall of the context in this response.
+
+ Question: {question}
+ Context: {context}
+ Response: {response}
+
+ Evaluate how well the response recalls and utilizes the provided context. Score from 0-100.
+ Output ONLY JSON: {{"score": number, "explanation": "string"}}"""
+ )
+
+ PROMPT_MAP = {
+     "accuracy": ACCURACY_PROMPT,
+     "faithfulness": FAITHFULNESS_PROMPT,
+     "relevance": RELEVANCE_PROMPT,
+     "toxicity": TOXICITY_PROMPT,
+     "context_precision": CONTEXT_PRECISION_PROMPT,
+     "context_recall": CONTEXT_RECALL_PROMPT
+ }
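
These templates can be rendered without any LLM call, which is a handy way to review the exact instruction a judge model would see. A minimal sketch:

    from chains.prompt_templates import ACCURACY_PROMPT

    text = ACCURACY_PROMPT.format(
        question="What is the capital of France?",
        ground_truth="Paris",
        response="Paris is the capital of France.",
    )
    print(text)   # filled-in instruction, ending with the JSON-only output requirement
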
config.py ADDED
@@ -0,0 +1,47 @@
+ import os
+ from dotenv import load_dotenv
+ from pydantic_settings import BaseSettings
+
+ load_dotenv()
+
+ class Settings(BaseSettings):
+     # API Keys
+     GROQ_API_KEY: str = os.getenv("GROQ_API_KEY", "")
+     OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")
+
+     # Default models
+     DEFAULT_GROQ_MODEL: str = os.getenv("DEFAULT_GROQ_MODEL", "openai/gpt-oss-20b")
+     DEFAULT_OPENAI_MODEL: str = os.getenv("DEFAULT_OPENAI_MODEL", "gpt-4o")
+
+     # Available models
+     AVAILABLE_GROQ_MODELS: list = [
+         "openai/gpt-oss-20b",
+         "openai/gpt-oss-120b",
+         "llama-3.3-70b-versatile",
+         "llama-3.1-8b-instant",
+         "meta-llama/llama-guard-4-12b"
+     ]
+
+     AVAILABLE_OPENAI_MODELS: list = [
+         "gpt-4o",
+         "gpt-4-turbo",
+         "gpt-4",
+         "gpt-3.5-turbo"
+     ]
+
+     # Evaluation settings
+     DEFAULT_METRICS: list = ["accuracy", "faithfulness", "relevance", "toxicity"]
+     CACHE_ENABLED: bool = os.getenv("CACHE_ENABLED", "True").lower() == "true"
+     CACHE_DIR: str = "./.cache"
+
+     # LangGraph settings
+     MAX_CONCURRENT: int = 5
+     TIMEOUT: int = 30
+
+     # API Provider
+     DEFAULT_API_PROVIDER: str = os.getenv("DEFAULT_API_PROVIDER", "groq")
+
+     class Config:
+         env_file = ".env"
+
+ settings = Settings()
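
Because the defaults are read with os.getenv at class-definition time, environment variables (or a .env file picked up by load_dotenv) must be in place before the module is imported. A small sketch of overriding a default this way:

    import os
    os.environ["DEFAULT_API_PROVIDER"] = "openai"   # set before importing config

    from config import settings
    print(settings.DEFAULT_API_PROVIDER)            # "openai"
    print(settings.AVAILABLE_GROQ_MODELS[0])        # "openai/gpt-oss-20b"
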
evaluation/evaluation_run_4.json ADDED
@@ -0,0 +1 @@
+ {"total_questions":10,"average_scores":{"accuracy":31.5,"faithfulness":31.0,"relevance":38.5,"toxicity":0.0},"individual_results":[{"question":"What is the capital of France?","ground_truth":"The capital of France is Paris.","model_response":"Paris is the capital city of France.","metrics":{"accuracy":100.0,"faithfulness":100.0,"relevance":100.0,"toxicity":0.0},"explanations":{"accuracy":"The response correctly states that Paris is the capital of France, matching the ground truth exactly in meaning and providing a clear, accurate answer.","faithfulness":"The response accurately states that Paris is the capital of France, which is correct and contains no hallucinated or unfaithful information.","relevance":"The response directly answers the question by stating that Paris is the capital of France, which is the correct and complete answer.","toxicity":"The response is a straightforward factual answer with no offensive, hateful, or harmful language. It contains no profanity, harassment, or any content that could be considered toxic."},"processing_time":1.5425200462341309,"overall_score":83.33333333333333},{"question":"What is the main ingredient in guacamole?","ground_truth":"The main ingredient in guacamole is avocado.","model_response":"The main ingredient in guacamole is tomato.","metrics":{"accuracy":0.0,"faithfulness":0.0,"relevance":15.0,"toxicity":0.0},"explanations":{"accuracy":"The response incorrectly states that the main ingredient in guacamole is tomato, whereas the ground truth correctly identifies avocado as the main ingredient. This is a complete factual error, resulting in a score of 0.","faithfulness":"The response incorrectly states that the main ingredient in guacamole is tomato, whereas the correct main ingredient is avocado. This is a factual error, making the answer unfaithful to the question.","relevance":"The response addresses the question by mentioning an ingredient, but it incorrectly identifies the main ingredient as tomato instead of avocado. Because the answer is factually wrong, its relevance to the question is very low.","toxicity":"The response contains a factual inaccuracy (guacamole's main ingredient is avocado, not tomato) but it does not contain any hateful, harassing, violent, or otherwise harmful language. Therefore, it is not toxic."},"processing_time":8.851919651031494,"overall_score":3.333333333333333},{"question":"Who developed the theory of relativity?","ground_truth":"Albert Einstein developed the theory of relativity.","model_response":"Isaac Newton developed the theory of relativity.","metrics":{"accuracy":0.0,"faithfulness":0.0,"relevance":25.0,"toxicity":0.0},"explanations":{"accuracy":"The response incorrectly attributes the theory of relativity to Isaac Newton, whereas the ground truth correctly states that Albert Einstein developed it. This is a completely inaccurate answer.","faithfulness":"The response incorrectly states that Isaac Newton developed the theory of relativity, which is false. The correct answer is Albert Einstein. This is a clear factual error, making the response completely unfaithful to the question.","relevance":"The response attempts to answer the question by naming a person, but it provides an incorrect answer. 
While it is on the topic (the development of the theory of relativity), the misinformation makes it largely irrelevant to the correct answer.","toxicity":"Evaluation failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-120b` in organization `org_01k0shejj9f2k9r9r8kc52hvnx` service tier `on_demand` on tokens per minute (TPM): Limit 8000, Used 7914, Requested 133. Please try again in 347ms. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}"},"processing_time":8.532141923904419,"overall_score":5.555555555555555},{"question":"What planet is known as the Red Planet?","ground_truth":"Mars is known as the Red Planet.","model_response":"Mars is known as the Red Planet because of its rare red soil composition.","metrics":{"accuracy":100.0,"faithfulness":80.0,"relevance":100.0,"toxicity":0.0},"explanations":{"accuracy":"The response correctly identifies Mars as the Red Planet, matching the ground truth, and the additional detail about red soil does not contradict or diminish the correctness.","faithfulness":"The response correctly identifies Mars as the Red Planet, which is the factual answer to the question. However, the justification provided ('because of its rare red soil composition') is inaccurate: Mars appears red due to iron oxide (rust) on its surface, not because of a rare soil composition. This extraneous detail introduces a minor factual error, reducing the overall faithfulness of the response.","relevance":"The response directly answers the question by naming Mars as the Red Planet and even adds a brief, relevant explanation about its red soil, making it fully relevant.","toxicity":"Evaluation failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-120b` in organization `org_01k0shejj9f2k9r9r8kc52hvnx` service tier `on_demand` on tokens per minute (TPM): Limit 8000, Used 7880, Requested 140. Please try again in 148.499999ms. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}"},"processing_time":8.78159475326538,"overall_score":77.77777777777777},{"question":"What year did the Titanic sink?","ground_truth":"The Titanic sank in the year 1912.","model_response":"The Titanic was a huge ship that sank a long time ago after hitting a big iceberg in the ocean.","metrics":{"accuracy":15.0,"faithfulness":20.0,"relevance":0.0,"toxicity":0.0},"explanations":{"accuracy":"The response describes the Titanic sinking but fails to provide the specific year (1912) required by the ground truth, making it largely inaccurate.","faithfulness":"The response does not provide the specific year the Titanic sank, which is the factual answer to the question (1912). While the statement is generally true about the Titanic sinking after hitting an iceberg, it fails to answer the asked query, making it largely unfaithful to the request.","relevance":"Evaluation failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-120b` in organization `org_01k0shejj9f2k9r9r8kc52hvnx` service tier `on_demand` on tokens per minute (TPM): Limit 8000, Used 7906, Requested 142. Please try again in 357.5ms. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}","toxicity":"The response is neutral and non‑offensive. 
It simply provides a vague factual statement about the Titanic without any hateful, harassing, violent, or otherwise toxic language. Therefore, its toxicity is effectively zero."},"processing_time":11.123517990112305,"overall_score":10.555555555555555},{"question":"What is the largest mammal on Earth?","ground_truth":"The blue whale is the largest mammal on Earth.","model_response":"The African elephant is the largest mammal on Earth.","metrics":{"accuracy":0.0,"faithfulness":0.0,"relevance":25.0,"toxicity":0.0},"explanations":{"accuracy":"The response incorrectly identifies the African elephant as the largest mammal, whereas the ground truth correctly states that the blue whale holds that distinction. The answer is entirely inaccurate.","faithfulness":"The response incorrectly states that the African elephant is the largest mammal, whereas the correct answer is the blue whale. This is a factual error, making the response unfaithful to the truth.","relevance":"The response addresses the topic of the largest mammal, but it provides an incorrect answer. The correct answer is the blue whale, not the African elephant, so the relevance to the question is low despite being on the same subject.","toxicity":"Evaluation failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-120b` in organization `org_01k0shejj9f2k9r9r8kc52hvnx` service tier `on_demand` on tokens per minute (TPM): Limit 8000, Used 7920, Requested 136. Please try again in 418.499999ms. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}"},"processing_time":8.456454515457153,"overall_score":5.555555555555555},{"question":"Who wrote the novel '1984'?","ground_truth":"George Orwell wrote the novel '1984'.","model_response":"The author of the dystopian novel 1984 is George Orwell, the pen name of Eric Arthur Blair.","metrics":{"accuracy":100.0,"faithfulness":100.0,"relevance":100.0,"toxicity":0.0},"explanations":{"accuracy":"The response correctly identifies George Orwell as the author of '1984', matching the ground truth, and adds accurate supplementary information without introducing errors.","faithfulness":"The response accurately states that George Orwell, whose real name is Eric Arthur Blair, wrote the novel '1984'. This matches the factual answer with no hallucinations or unfaithful information.","relevance":"The response directly answers the question by stating that George Orwell (pen name of Eric Arthur Blair) wrote the novel '1984', which fully satisfies the query.","toxicity":"The response is a straightforward factual answer providing the author's name. It contains no profanity, hate speech, harassment, or any other toxic language. Therefore, its toxicity is effectively zero."},"processing_time":1.728506326675415,"overall_score":83.33333333333333},{"question":"What is the chemical symbol for gold?","ground_truth":"The chemical symbol for gold is Au.","model_response":"The chemical symbol for gold is Go, from its Latin name Golum.","metrics":{"accuracy":0.0,"faithfulness":0.0,"relevance":15.0,"toxicity":0.0},"explanations":{"accuracy":"The response incorrectly states the symbol as 'Go' and provides a false Latin name, whereas the correct chemical symbol for gold is 'Au'.","faithfulness":"The response incorrectly states that the chemical symbol for gold is 'Go' and claims a Latin name 'Golum', which are both false. The correct chemical symbol for gold is 'Au'. 
Therefore, the answer is entirely unfaithful and contains hallucinated information.","relevance":"The response attempts to answer the question by providing a chemical symbol, but the symbol given ('Go') is incorrect. The correct symbol for gold is 'Au'. Because the answer is factually wrong, its relevance to the question is very low.","toxicity":"Evaluation failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-120b` in organization `org_01k0shejj9f2k9r9r8kc52hvnx` service tier `on_demand` on tokens per minute (TPM): Limit 8000, Used 7912, Requested 139. Please try again in 378ms. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}"},"processing_time":8.507044315338135,"overall_score":3.333333333333333},{"question":"How many elements are in the periodic table?","ground_truth":"As of now, there are 118 confirmed elements in the periodic table.","model_response":"There are exactly 120 elements in the periodic table, and scientists are not discovering any new ones.","metrics":{"accuracy":0.0,"faithfulness":5.0,"relevance":0.0,"toxicity":0.0},"explanations":{"accuracy":"The response states there are 120 elements and that no new ones are being discovered, which contradicts the ground truth that there are 118 confirmed elements. The information is entirely inaccurate.","faithfulness":"The response is factually incorrect. As of 2025, the periodic table contains 118 confirmed elements, not 120. Moreover, research continues to synthesize and discover new superheavy elements (e.g., elements 119 and 120 are subjects of ongoing experiments), so the claim that scientists are not discovering any new ones is also false. Therefore the answer is unfaithful and contains hallucinated information.","relevance":"Evaluation failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-120b` in organization `org_01k0shejj9f2k9r9r8kc52hvnx` service tier `on_demand` on tokens per minute (TPM): Limit 8000, Used 7890, Requested 143. Please try again in 243ms. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}","toxicity":"Evaluation failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-120b` in organization `org_01k0shejj9f2k9r9r8kc52hvnx` service tier `on_demand` on tokens per minute (TPM): Limit 8000, Used 7916, Requested 145. Please try again in 456.5ms. Need more tokens? 
Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}"},"processing_time":10.783112287521362,"overall_score":1.3888888888888888},{"question":"What is the speed of light in a vacuum?","ground_truth":"The speed of light in a vacuum is 299,792,458 meters per second.","model_response":"Light travels at different speeds but is fastest in water.","metrics":{"accuracy":0.0,"faithfulness":5.0,"relevance":5.0,"toxicity":0.0},"explanations":{"accuracy":"The response does not provide the correct speed of light in a vacuum and contains an incorrect statement about light being fastest in water, which contradicts the ground truth.","faithfulness":"The response does not answer the question about the speed of light in a vacuum and provides incorrect information, claiming light is fastest in water, which contradicts the well-established fact that light travels fastest in a vacuum at approximately 299,792,458 m/s. This is a clear hallucination/unfaithful answer.","relevance":"The response does not address the question about the speed of light in a vacuum. Instead, it incorrectly states that light is fastest in water, which is factually wrong and irrelevant to the asked value (~299,792,458 m/s).","toxicity":"Evaluation failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-120b` in organization `org_01k0shejj9f2k9r9r8kc52hvnx` service tier `on_demand` on tokens per minute (TPM): Limit 8000, Used 7908, Requested 137. Please try again in 335.999999ms. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}"},"processing_time":8.533363103866577,"overall_score":2.5}],"total_processing_time":12.25,"model_used":"openai/gpt-oss-120b","api_provider":"groq","overall_score":27.666666666666664}
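Several metrics in the results above failed with HTTP 429 `rate_limit_exceeded` errors from the Groq tier. A common mitigation is to retry the judge call with exponential backoff; the sketch below is illustrative only and not code from this repo — `call_judge` is a placeholder for whatever function actually hits the LLM API.

# Sketch: retry a judge call with exponential backoff when the provider returns HTTP 429.
import random
import time

def with_backoff(call_judge, *args, max_retries=5, base_delay=1.0, **kwargs):
    for attempt in range(max_retries):
        try:
            return call_judge(*args, **kwargs)
        except Exception as exc:  # ideally catch the provider's specific RateLimitError
            if "429" not in str(exc) and "rate_limit" not in str(exc):
                raise
            # wait 1s, 2s, 4s, ... plus jitter before retrying
            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 0.5))
    raise RuntimeError("judge call still rate-limited after retries")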
evaluation_datasets/comprehensive_test_data.json ADDED
@@ -0,0 +1,50 @@
+ {
+ "questions": [
+ "What is the capital of France?",
+ "What is the main ingredient in guacamole?",
+ "Who developed the theory of relativity?",
+ "What planet is known as the Red Planet?",
+ "What year did the Titanic sink?",
+ "What is the largest mammal on Earth?",
+ "Who wrote the novel '1984'?",
+ "What is the chemical symbol for gold?",
+ "How many elements are in the periodic table?",
+ "What is the speed of light in a vacuum?"
+ ],
+ "ground_truths": [
+ "The capital of France is Paris.",
+ "The main ingredient in guacamole is avocado.",
+ "Albert Einstein developed the theory of relativity.",
+ "Mars is known as the Red Planet.",
+ "The Titanic sank in the year 1912.",
+ "The blue whale is the largest mammal on Earth.",
+ "George Orwell wrote the novel '1984'.",
+ "The chemical symbol for gold is Au.",
+ "As of now, there are 118 confirmed elements in the periodic table.",
+ "The speed of light in a vacuum is 299,792,458 meters per second."
+ ],
+ "model_responses": [
+ "Paris is the capital city of France.",
+ "The main ingredient in guacamole is tomato.",
+ "Isaac Newton developed the theory of relativity.",
+ "Mars is known as the Red Planet because of its rare red soil composition.",
+ "The Titanic was a huge ship that sank a long time ago after hitting a big iceberg in the ocean.",
+ "The African elephant is the largest mammal on Earth.",
+ "The author of the dystopian novel 1984 is George Orwell, the pen name of Eric Arthur Blair.",
+ "The chemical symbol for gold is Go, from its Latin name Golum.",
+ "There are exactly 120 elements in the periodic table, and scientists are not discovering any new ones.",
+ "Light travels at different speeds but is fastest in water."
+ ],
+ "contexts": [
+ "France, in Western Europe, encompasses medieval cities, alpine villages and Mediterranean beaches. Paris, its capital, is famed for its fashion houses, classical art museums including the Louvre and monuments like the Eiffel Tower.",
+ "Guacamole is an avocado-based dip, spread, or salad first developed in Mexico. In addition to its use in modern Mexican cuisine, it has become part of international and American cuisine as a dip, condiment and salad ingredient.",
+ "Albert Einstein was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. Einstein is best known for developing the theory of relativity.",
+ "Mars is the fourth planet from the Sun and the second-smallest planet in the Solar System, being larger than only Mercury. The reddish appearance of its surface is caused by iron oxide, or rust.",
+ "The Titanic was a British passenger liner that sank in the North Atlantic Ocean in the early morning hours of 15 April 1912, after striking an iceberg during her maiden voyage from Southampton to New York City.",
+ "The blue whale is a marine mammal and a baleen whale. Reaching a maximum confirmed length of 29.9 meters and weighing up to 199 tonnes, it is the largest animal known to have ever existed.",
+ "Nineteen Eighty-Four is a dystopian novel and cautionary tale by English writer George Orwell. It was published on 8 June 1949 by Secker & Warburg as Orwell's ninth and final book completed in his lifetime.",
+ "Gold is a chemical element with the symbol Au (from Latin: aurum) and atomic number 79. This places it among the higher atomic number elements that occur naturally.",
+ "The periodic table, also known as the periodic table of the elements, is an ordered arrangement of the chemical elements into rows and columns. As of 2024, 118 elements have been identified and officially recognized by IUPAC.",
+ "The speed of light in vacuum, commonly denoted c, is a universal physical constant that is exactly equal to 299,792,458 metres per second."
+ ]
+ }
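The dataset files in this upload all follow the same parallel-list layout (questions, ground_truths, and optionally model_responses and contexts). A small loader can sanity-check that the lists line up before an evaluation run; this is a minimal sketch, not part of the uploaded code — the helper name and the chosen file path are illustrative.

# Hypothetical helper: load a dataset file and verify the parallel lists line up.
import json

def load_dataset(path: str) -> dict:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    n = len(data["questions"])
    # model_responses and contexts are optional, but any list that is present must match in length.
    for key in ("ground_truths", "model_responses", "contexts"):
        if key in data and len(data[key]) != n:
            raise ValueError(f"{key} has {len(data[key])} entries, expected {n}")
    return data

dataset = load_dataset("evaluation_datasets/comprehensive_test_data.json")
print(f"Loaded {len(dataset['questions'])} questions")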
evaluation_datasets/evaluation_results.json ADDED
@@ -0,0 +1 @@
+ {"total_questions":5,"average_scores":{"accuracy":72.0,"faithfulness":76.0,"relevance":72.0},"individual_results":[{"question":"What is the capital of France?","ground_truth":"The capital of France is Paris.","model_response":"Paris is the capital of nashik.","metrics":{"accuracy":0.0,"faithfulness":0.0,"relevance":0.0},"explanations":{"accuracy":"The response is completely inaccurate, as Nashik is a city in India and not related to France, and Paris is correctly identified as the capital of France in the ground truth.","faithfulness":"The response contains a hallucination, as Nashik is a city in India, not France. The correct answer is Paris, which is the capital of France, not Nashik.","relevance":"The response mentions Paris, which is the correct capital of France, but incorrectly states it as the capital of Nashik, which is a city in India. The response is completely irrelevant to the question."},"processing_time":1.501075267791748},{"question":"How does photosynthesis work?","ground_truth":"Photosynthesis is the process by which plants convert sunlight, water, and carbon dioxide into oxygen and energy in the form of sugar.","model_response":"Photosynthesis converts sunlight into energy for plants.","metrics":{"accuracy":60.0,"faithfulness":80.0,"relevance":60.0},"explanations":{"accuracy":"The response is partially accurate, as it mentions that photosynthesis converts sunlight into energy for plants, but it omits the other reactants (water and carbon dioxide) and products (oxygen and sugar) involved in the process.","faithfulness":"The response is partially faithful as it correctly mentions that photosynthesis converts sunlight into energy for plants. However, it lacks crucial details about the process, such as the role of water, carbon dioxide, and chlorophyll, and the production of glucose and oxygen. The response does not contain hallucinations or unfaithful information, but it oversimplifies the complex process of photosynthesis.","relevance":"The response provides a brief and general overview of the purpose of photosynthesis, but it lacks specific details on the process, such as the role of chlorophyll, light-dependent and light-independent reactions, and the production of glucose and oxygen. 
It only scratches the surface of the topic."},"processing_time":1.7349395751953125},{"question":"What is the largest planet in our solar system?","ground_truth":"Jupiter is the largest planet in our solar system.","model_response":"Jupiter is the biggest planet in our solar system.","metrics":{"accuracy":100.0,"faithfulness":100.0,"relevance":100.0},"explanations":{"accuracy":"The response is a perfect match to the ground truth, using a synonym ('biggest' instead of 'largest') that does not affect the accuracy of the answer.","faithfulness":"The response accurately states that Jupiter is the largest planet in our solar system, which is a verifiable fact supported by scientific evidence.","relevance":"The response directly and accurately answers the question, providing the correct information that Jupiter is the largest planet in our solar system."},"processing_time":1.4344778060913086},{"question":"Who wrote 'To Kill a Mockingbird'?","ground_truth":"Harper Lee wrote 'To Kill a Mockingbird'.","model_response":"The author of 'To Kill a Mockingbird' is Harper Lee.","metrics":{"accuracy":100.0,"faithfulness":100.0,"relevance":100.0},"explanations":{"accuracy":"The response is a perfect match with the ground truth, with identical wording and meaning, indicating complete accuracy.","faithfulness":"The response accurately states that Harper Lee is the author of 'To Kill a Mockingbird', which is a verifiable fact.","relevance":"The response directly answers the question by providing the correct author of the book 'To Kill a Mockingbird', which is Harper Lee."},"processing_time":1.2369883060455322},{"question":"What is the chemical formula for water?","ground_truth":"The chemical formula for water is H2O.","model_response":"Water is H2O, which means two hydrogen atoms and one oxygen atom.","metrics":{"accuracy":100.0,"faithfulness":100.0,"relevance":100.0},"explanations":{"accuracy":"The response accurately provides the chemical formula for water (H2O) and correctly breaks it down into its constituent atoms, matching the ground truth perfectly.","faithfulness":"The response accurately provides the chemical formula for water as H2O and correctly breaks it down into its constituent atoms, two hydrogens and one oxygen.","relevance":"The response directly answers the question by providing the chemical formula for water (H2O) and explaining its composition, making it a perfectly relevant response."},"processing_time":1.3415367603302002}],"total_processing_time":2.875,"model_used":"llama3-70b-8192"}
evaluation_datasets/example_qa_set.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "questions": [
+ "What is the capital of France?",
+ "How does photosynthesis work?",
+ "What is the largest planet in our solar system?",
+ "Who wrote 'To Kill a Mockingbird'?",
+ "What is the chemical formula for water?"
+ ],
+ "ground_truths": [
+ "The capital of France is Paris.",
+ "Photosynthesis is the process by which plants convert sunlight, water, and carbon dioxide into oxygen and energy in the form of sugar.",
+ "Jupiter is the largest planet in our solar system.",
+ "Harper Lee wrote 'To Kill a Mockingbird'.",
+ "The chemical formula for water is H2O."
+ ],
+ "model_responses": [
+ "Paris is the capital of nashik.",
+ "Photosynthesis converts sunlight into energy for plants.",
+ "Jupiter is the biggest planet in our solar system.",
+ "The author of 'To Kill a Mockingbird' is Harper Lee.",
+ "Water is H2O, which means two hydrogen atoms and one oxygen atom."
+ ]
+ }
evaluation_datasets/legal_test_data.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "questions": [
+ "What constitutes copyright infringement?",
+ "Explain the elements of a valid contract",
+ "What is the difference between civil and criminal law?",
+ "What are the requirements for fair use doctrine?",
+ "What is the statute of limitations for personal injury cases?"
+ ],
+ "ground_truths": [
+ "Copyright infringement occurs when someone violates any of the exclusive rights granted to copyright owners under copyright law, including reproduction, distribution, public performance, or creation of derivative works without permission.",
+ "A valid contract requires: offer, acceptance, consideration, mutual assent, capacity of parties, and legality of purpose. All elements must be present for a contract to be legally enforceable.",
+ "Civil law deals with disputes between individuals or organizations where compensation may be awarded, while criminal law involves offenses against the state prosecuted by government entities with potential penalties including imprisonment.",
+ "Fair use considers: purpose of use, nature of copyrighted work, amount used relative to whole work, and effect on potential market. Educational, commentary, and parody uses often qualify as fair use.",
+ "Statute of limitations for personal injury cases varies by jurisdiction but typically ranges from 1-3 years from date of injury discovery. Some exceptions apply for minors or delayed discovery of injuries."
+ ]
+ }
evaluation_datasets/medical_test_data.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "name": "Medical RAG Evaluation Test Dataset",
+ "description": "Medical domain test data for evaluating RAG systems",
+ "version": "1.0",
+ "questions": [
+ "What are the symptoms of COVID-19?",
+ "How does mRNA vaccine technology work?",
+ "What is the recommended treatment for hypertension?",
+ "Explain the mechanism of action of aspirin",
+ "What are the diagnostic criteria for diabetes?"
+ ],
+ "ground_truths": [
+ "Common COVID-19 symptoms include fever, cough, fatigue, loss of taste or smell, sore throat, headache, and difficulty breathing. Severe cases may develop pneumonia or acute respiratory distress syndrome.",
+ "mRNA vaccines work by introducing a synthetic mRNA sequence that encodes for a specific viral protein. Cells use this mRNA to produce the protein, triggering an immune response that creates memory cells for future protection.",
+ "First-line hypertension treatment typically includes lifestyle modifications (diet, exercise) and medications such as ACE inhibitors, ARBs, calcium channel blockers, or thiazide diuretics, tailored to individual patient factors.",
+ "Aspirin works by irreversibly inhibiting cyclooxygenase enzymes (COX-1 and COX-2), reducing prostaglandin production which mediates pain, inflammation, and fever. It also has antiplatelet effects that reduce blood clotting.",
+ "Diabetes diagnosis typically requires: fasting plasma glucose ≥126 mg/dL, HbA1c ≥6.5%, random glucose ≥200 mg/dL with symptoms, or oral glucose tolerance test with 2-hour glucose ≥200 mg/dL."
+ ],
+ "model_responses": [
+ "COVID-19 can cause various symptoms like fever, coughing, and tiredness. Some people lose their sense of taste or smell. In bad cases, it can lead to breathing problems and lung issues.",
+ "mRNA vaccines use genetic material to teach cells to make a piece of the virus. The body then learns to recognize this and builds immunity against the real virus if exposed later.",
+ "Doctors usually recommend lifestyle changes first for high blood pressure, like eating healthier and exercising. If needed, they may prescribe medicines like ACE inhibitors or water pills.",
+ "Aspirin stops certain enzymes in the body that cause pain and swelling. It also makes blood less sticky so it doesn't clot as easily, which is why it's sometimes used for heart protection.",
+ "Diabetes is diagnosed through blood tests. Doctors look at fasting sugar levels, A1C percentages, or how your body handles sugar after drinking a special sweet drink."
+ ],
+ "contexts": [
+ "COVID-19, caused by the SARS-CoV-2 virus, presents with a wide range of symptoms from mild to severe. Common manifestations include respiratory symptoms (cough, shortness of breath), systemic symptoms (fever, fatigue), and neurological symptoms (loss of taste/smell). The disease severity varies significantly based on age, comorbidities, and immune status. Severe cases may progress to pneumonia, acute respiratory distress syndrome (ARDS), multi-organ failure, or death. Asymptomatic transmission is a significant feature of COVID-19 epidemiology.",
+ "mRNA vaccine technology represents a novel approach to immunization. These vaccines contain messenger RNA that encodes for a specific viral antigen. When administered, host cells take up the mRNA and use their own machinery to produce the viral protein. This protein is then displayed on cell surfaces, triggering both humoral and cellular immune responses. The immune system develops memory cells that provide protection against future infection. mRNA vaccines offer advantages in rapid development and manufacturing compared to traditional vaccine platforms.",
+ "Hypertension management follows a stepped approach beginning with lifestyle modifications: sodium restriction, DASH diet, regular physical activity, weight management, and alcohol moderation. Pharmacological therapy is initiated based on blood pressure levels and cardiovascular risk. First-line agents include thiazide diuretics, ACE inhibitors, angiotensin receptor blockers, and calcium channel blockers. Combination therapy is often required. Treatment goals are typically <130/80 mmHg for most adults, with individualized targets based on age and comorbidities.",
+ "Aspirin (acetylsalicylic acid) exerts its effects through irreversible acetylation of cyclooxygenase (COX) enzymes. This inhibition prevents the conversion of arachidonic acid to prostaglandin H2, thereby reducing downstream production of prostaglandins, thromboxanes, and prostacyclins. The antiplatelet effect results from inhibition of thromboxane A2 synthesis in platelets. Aspirin's anti-inflammatory, analgesic, and antipyretic properties are primarily due to reduced prostaglandin synthesis. The drug exhibits different dose-dependent effects: low doses for antiplatelet activity and higher doses for anti-inflammatory effects.",
+ "Diabetes mellitus diagnosis is based on specific laboratory criteria established by organizations like the American Diabetes Association. The fasting plasma glucose test requires ≥126 mg/dL after an 8-hour fast. The HbA1c test measures average blood glucose over 2-3 months with ≥6.5% indicating diabetes. Random plasma glucose ≥200 mg/dL with classic symptoms also confirms diagnosis. The oral glucose tolerance test involves measuring glucose 2 hours after a 75g glucose load, with ≥200 mg/dL indicating diabetes. Diagnosis should be confirmed with repeat testing unless unequivocal hyperglycemia with metabolic decompensation is present."
+ ]
+ }
evaluation_datasets/test_json.json ADDED
@@ -0,0 +1,50 @@
+ {
+ "questions": [
+ "What is the capital of France?",
+ "How does photosynthesis work?",
+ "What is the largest planet in our solar system?",
+ "Who wrote the play 'Romeo and Juliet'?",
+ "What is the chemical symbol for gold?",
+ "What is the main function of the mitochondria?",
+ "Which ocean is the largest?",
+ "What is the square root of 144?",
+ "Who painted the Mona Lisa?",
+ "What is the boiling point of water at sea level?"
+ ],
+ "ground_truths": [
+ "The capital of France is Paris.",
+ "Photosynthesis is the process by which plants convert sunlight, water, and carbon dioxide into glucose and oxygen.",
+ "Jupiter is the largest planet in our solar system.",
+ "William Shakespeare wrote the play 'Romeo and Juliet'.",
+ "The chemical symbol for gold is Au.",
+ "The main function of mitochondria is to produce energy (ATP) for the cell through cellular respiration.",
+ "The Pacific Ocean is the largest ocean.",
+ "The square root of 144 is 12.",
+ "Leonardo da Vinci painted the Mona Lisa.",
+ "The boiling point of water at sea level is 100 degrees Celsius or 212 degrees Fahrenheit."
+ ],
+ "model_responses": [
+ "Paris is the capital city of France.",
+ "Photosynthesis converts sunlight into energy for plants using chlorophyll.",
+ "Jupiter is the biggest planet in our solar system with a great red spot.",
+ "Shakespeare is the author of Romeo and Juliet, the famous tragedy.",
+ "Gold has the chemical symbol Au from its Latin name aurum.",
+ "Mitochondria are known as the powerhouse of the cell, generating ATP energy.",
+ "The Pacific Ocean covers the largest area of any ocean on Earth.",
+ "12 squared equals 144, so the square root is 12.",
+ "The Mona Lisa was painted by Leonardo da Vinci during the Renaissance.",
+ "Water boils at 100°C at standard atmospheric pressure at sea level."
+ ],
+ "contexts": [
+ "France is a country in Western Europe with Paris as its capital and largest city.",
+ "Photosynthesis is a biological process used by plants to create energy from light.",
+ "Our solar system has eight planets, with Jupiter being the largest gas giant.",
+ "William Shakespeare was an English playwright who wrote many famous works including Romeo and Juliet.",
+ "Gold is a chemical element with symbol Au and atomic number 79, known for its value and properties.",
+ "Mitochondria are organelles found in eukaryotic cells that generate most of the cell's energy supply.",
+ "The Pacific Ocean is the largest and deepest ocean, covering about 30% of Earth's surface.",
+ "Mathematics includes operations like square roots, where 12 × 12 = 144.",
+ "Leonardo da Vinci was an Italian Renaissance artist who created the Mona Lisa painting.",
+ "Water undergoes phase changes at specific temperatures, boiling at 100°C at sea level pressure."
+ ]
+ }
schemas/__pycache__/data_models.cpython-312.pyc ADDED
Binary file (3.26 kB). View file
 
schemas/data_models.py ADDED
@@ -0,0 +1,49 @@
+ from pydantic import BaseModel, Field, ConfigDict
+ from typing import List, Dict, Any, Optional
+ from enum import Enum
+
+ class MetricType(str, Enum):
+     ACCURACY = "accuracy"
+     FAITHFULNESS = "faithfulness"
+     RELEVANCE = "relevance"
+     TOXICITY = "toxicity"
+     CONTEXT_PRECISION = "context_precision"
+     CONTEXT_RECALL = "context_recall"
+
+ class APIProvider(str, Enum):
+     GROQ = "groq"
+     OPENAI = "openai"
+
+ class EvaluationRequest(BaseModel):
+     model_config = ConfigDict(protected_namespaces=())
+
+     questions: List[str] = Field(..., description="Questions to evaluate")
+     ground_truths: List[str] = Field(..., description="Ground truth answers")
+     model_responses: Optional[List[str]] = Field(None, description="Model responses")
+     contexts: Optional[List[str]] = Field(None, description="Contexts for evaluation")
+     metrics: List[MetricType] = Field(default=["accuracy", "faithfulness", "relevance"])
+     judge_model: str = Field(default="openai/gpt-oss-20b")
+     max_concurrent: int = Field(default=5, description="Max concurrent evaluations")
+     api_provider: APIProvider = Field(default=APIProvider.GROQ, description="API provider for evaluation")
+
+ class EvaluationResult(BaseModel):
+     model_config = ConfigDict(protected_namespaces=())
+
+     question: str
+     ground_truth: str
+     model_response: str
+     metrics: Dict[MetricType, float]
+     explanations: Dict[MetricType, str]
+     processing_time: float
+     overall_score: float = Field(..., description="Overall weighted score (0-100)")
+
+ class EvaluationSummary(BaseModel):
+     model_config = ConfigDict(protected_namespaces=())
+
+     total_questions: int
+     average_scores: Dict[MetricType, float]
+     individual_results: List[EvaluationResult]
+     total_processing_time: float
+     model_used: str
+     api_provider: str
+     overall_score: float = Field(..., description="Overall weighted score across all questions")
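For reference, a request against these models might be assembled roughly as follows. This is a sketch only: it assumes the repo root is on PYTHONPATH so that `schemas.data_models` imports the same way it does in the agent code, and the dataset path is just an example from this upload.

# Sketch: build an EvaluationRequest from one of the dataset files above.
import json
from schemas.data_models import EvaluationRequest, MetricType, APIProvider

with open("evaluation_datasets/example_qa_set.json", "r", encoding="utf-8") as f:
    data = json.load(f)

request = EvaluationRequest(
    questions=data["questions"],
    ground_truths=data["ground_truths"],
    model_responses=data.get("model_responses"),
    metrics=[MetricType.ACCURACY, MetricType.FAITHFULNESS, MetricType.RELEVANCE],
    api_provider=APIProvider.GROQ,  # judge_model and max_concurrent keep their defaults
)
print(len(request.questions), "questions to evaluate")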
utils/__pycache__/cache_manager.cpython-312.pyc ADDED
Binary file (2.13 kB). View file
 
utils/cache_manager.py ADDED
@@ -0,0 +1,47 @@
+ from diskcache import Cache
+ from functools import wraps
+ import hashlib
+ import json
+ from config import settings
+
+ # Initialize cache
+ cache = Cache(settings.CACHE_DIR, size_limit=1000000000) # 1GB limit
+
+ def cache_llm_response(expire=86400):
+     """Cache decorator for LLM responses"""
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             if not settings.CACHE_ENABLED:
+                 return func(*args, **kwargs)
+
+             # Create cache key
+             key_data = {
+                 "func": func.__name__,
+                 "args": str(args),
+                 "kwargs": str(kwargs)
+             }
+             key = hashlib.md5(json.dumps(key_data, sort_keys=True).encode()).hexdigest()
+
+             # Check cache
+             if key in cache:
+                 return cache[key]
+
+             # Call function and cache result
+             result = func(*args, **kwargs)
+             cache.set(key, result, expire=expire)
+             return result
+         return wrapper
+     return decorator
+
+ def clear_cache():
+     """Clear all cached responses"""
+     cache.clear()
+
+ def get_cache_stats():
+     """Get cache statistics"""
+     return {
+         "size": cache.volume(),
+         "count": len(cache),
+         "enabled": settings.CACHE_ENABLED
+     }
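A small usage sketch for these cache helpers: only the decorator, `clear_cache`, and `get_cache_stats` come from the file above; `judge_answer` is a made-up example function, not part of this repo, and running it requires the repo's `config.settings` (CACHE_DIR, CACHE_ENABLED) to be importable.

# Sketch: wrap an expensive call with the disk cache and inspect the cache afterwards.
from utils.cache_manager import cache_llm_response, clear_cache, get_cache_stats

@cache_llm_response(expire=3600)  # cache results for one hour
def judge_answer(question: str, answer: str) -> str:
    # a real implementation would call the LLM judge here; repeat calls return the stored result
    return f"scored: {question[:20]} / {answer[:20]}"

print(judge_answer("What is the capital of France?", "Paris"))
print(judge_answer("What is the capital of France?", "Paris"))  # served from cache
print(get_cache_stats())
# clear_cache()  # wipe all cached responses if needed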