Spaces:

NIRAJz
/

LMVal-Multi-Metric-LLM-Evaluation

Sleeping

File size: 35,983 Bytes

6d55fec

import streamlit as st
import pandas as pd
import plotly.graph_objects as go
import json
import asyncio
import os
from typing import List, Dict, Any
from datetime import datetime, timezone

# Apply nest_asyncio to allow nested event loops
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass

# Import your custom modules
try:
    from agents.evaluation_agent import EvaluationAgent
    from schemas.data_models import EvaluationRequest, MetricType, APIProvider
    from config import settings
    from utils.cache_manager import clear_cache, get_cache_stats
except ImportError as e:
    st.error(f"Import error: {e}. Please make sure all required modules are available.")
    st.stop()

# Page configuration
st.set_page_config(
    page_title="LLM Evaluation Platform",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Initialize session state
if "evaluation_results" not in st.session_state:
    st.session_state.evaluation_results = None
if "evaluation_history" not in st.session_state:
    st.session_state.evaluation_history = []
if "evaluation_in_progress" not in st.session_state:
    st.session_state.evaluation_in_progress = False
if "active_tab" not in st.session_state:
    st.session_state.active_tab = "Evaluate"
if "evaluation_params" not in st.session_state:
    st.session_state.evaluation_params = {}
if "show_results" not in st.session_state:
    st.session_state.show_results = False

def run_evaluation_sync(request: EvaluationRequest):
    """Run evaluation synchronously with proper event loop handling"""
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        agent = EvaluationAgent()
        result = loop.run_until_complete(agent.evaluate_async(request))
        loop.close()
        return result
    except Exception as e:
        st.error(f"Evaluation error: {e}")
        return None

def create_metric_radar_chart(scores: Dict[str, float]) -> go.Figure:
    metrics = list(scores.keys())
    values = list(scores.values())

    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=values + [values[0]],
        theta=metrics + [metrics[0]],
        fill='toself',
        fillcolor='rgba(100, 149, 237, 0.3)',
        line=dict(color='rgba(100, 149, 237, 0.8)', width=3),
        name='Metrics Score',
        hoverinfo='text',
        hovertext=[f'{metric}: {score:.1f}%' for metric, score in zip(metrics, values)]
    ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                tickfont=dict(size=10),
                tickangle=0,
                tickvals=[0, 20, 40, 60, 80, 100],
                ticktext=['0%', '20%', '40%', '60%', '80%', '100%']
            ),
            angularaxis=dict(
                tickfont=dict(size=11),
                rotation=90
            )
        ),
        showlegend=False,
        title=dict(
            text="Performance Metrics Radar",
            x=0.5,
            xanchor='center',
            font=dict(size=16)
        ),
        height=450,
        margin=dict(l=50, r=50, t=80, b=50),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)'
    )

    return fig

def create_metric_bar_chart(scores: Dict[str, float]) -> go.Figure:
    metrics = [m.capitalize() for m in scores.keys()]
    values = list(scores.values())

    # Create color scale based on score values - inverted for toxicity
    colors = []
    for metric, score in zip(metrics, values):
        if 'toxicity' in metric.lower():
            # For toxicity, lower is better (green), higher is worse (red)
            colors.append(f'hsl({int(120 * (100-score)/100)}, 70%, 50%)')
        else:
            # For other metrics, higher is better
            colors.append(f'hsl({int(120 * score/100)}, 70%, 50%)')

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=metrics,
        y=values,
        marker_color=colors,
        marker_line=dict(color='rgba(0,0,0,0.3)', width=1),
        text=[f'{v:.1f}%' for v in values],
        textposition='auto',
        textfont=dict(size=12, color='white'),
        hovertemplate='<b>%{x}</b><br>Score: %{y:.1f}%<extra></extra>'
    ))

    fig.update_layout(
        title=dict(
            text="Average Scores by Metric",
            x=0.5,
            xanchor='center',
            font=dict(size=16)
        ),
        xaxis=dict(
            title="Evaluation Metric",
            tickangle=45,
            tickfont=dict(size=11)
        ),
        yaxis=dict(
            title="Score (%)",
            range=[0, 100],
            tickvals=[0, 20, 40, 60, 80, 100],
            ticktext=['0%', '20%', '40%', '60%', '80%', '100%']
        ),
        height=450,
        margin=dict(l=50, r=50, t=80, b=80),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)'
    )

    return fig

def create_score_distribution_chart(results: List[Any]) -> go.Figure:
    if not results or not getattr(results[0], "metrics", None):
        return None

    metrics = list(results[0].metrics.keys())
    fig = go.Figure()

    for metric in metrics:
        scores = [getattr(r, 'metrics', {}).get(metric, 0) for r in results]

        fig.add_trace(go.Violin(
            y=scores,
            name=metric.capitalize(),
            box_visible=True,
            meanline_visible=True,
            points="all",
            hoverinfo='y',
            opacity=0.7
        ))

    fig.update_layout(
        title=dict(
            text="Score Distribution by Metric",
            x=0.5,
            xanchor='center',
            font=dict(size=16)
        ),
        yaxis=dict(
            title="Score (%)",
            range=[0, 100],
            tickvals=[0, 20, 40, 60, 80, 100]
        ),
        xaxis=dict(title="Metric"),
        height=400,
        showlegend=True,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)'
    )

    return fig

def get_score_color(metric: str, score: float) -> str:
    """Get color for a score based on metric type"""
    if 'toxicity' in metric.lower():
        # For toxicity, lower is better (green), higher is worse (red)
        return "green" if score <= 30 else "orange" if score <= 60 else "red"
    else:
        # For other metrics, higher is better
        return "green" if score >= 70 else "orange" if score >= 40 else "red"

def display_results(results):
    if not results:
        st.error("No results to display")
        return

    if not hasattr(results, 'individual_results') or not results.individual_results:
        st.warning("No individual results available")
        return

    # Summary
    st.subheader("📊 Evaluation Summary")
    col1, col2, col3, col4, col5 = st.columns(5)

    with col1:
        st.metric("Total Questions", results.total_questions)
    with col2:
        st.metric("Total Time", f"{results.total_processing_time:.1f}s")
    with col3:
        st.metric("Model Used", results.model_used)
    with col4:
        st.metric("API Provider", results.api_provider)
    with col5:
        st.metric("Overall Score", f"{results.overall_score:.1f}%")

    # Metrics visualization
    st.subheader("📈 Performance Metrics")

    if results.average_scores:
        col1, col2 = st.columns(2)

        with col1:
            bar_fig = create_metric_bar_chart(results.average_scores)
            st.plotly_chart(bar_fig, use_container_width=True)

        with col2:
            radar_fig = create_metric_radar_chart(results.average_scores)
            st.plotly_chart(radar_fig, use_container_width=True)

        dist_fig = create_score_distribution_chart(results.individual_results)
        if dist_fig:
            st.plotly_chart(dist_fig, use_container_width=True)
    else:
        st.warning("No metric scores available")

    # Detailed results
    st.subheader("📋 Detailed Results")
    if results.individual_results:
        tab1, tab2 = st.tabs(["Data Table", "Question Details"])

        with tab1:
            detailed_data = []
            for i, result in enumerate(results.individual_results):
                row = {
                    "ID": i + 1,
                    "Question": result.question[:50] + "..." if len(result.question) > 50 else result.question,
                    "Response": result.model_response[:50] + "..." if len(result.model_response) > 50 else result.model_response,
                    "Overall Score": f"{result.overall_score:.1f}%" if hasattr(result, 'overall_score') else "N/A",
                    "Time (s)": f"{result.processing_time:.2f}"
                }
                for metric, score in result.metrics.items():
                    row[metric.capitalize()] = f"{score:.1f}%"
                detailed_data.append(row)

            st.dataframe(
                detailed_data,
                use_container_width=True,
                height=400,
                column_config={
                    "ID": st.column_config.NumberColumn("ID", width="small"),
                    "Question": st.column_config.TextColumn("Question", width="large"),
                    "Response": st.column_config.TextColumn("Response", width="large"),
                    "Overall Score": st.column_config.NumberColumn("Overall Score", width="medium"),
                }
            )

        with tab2:
            for i, result in enumerate(results.individual_results):
                with st.expander(f"Question {i+1}: {result.question[:70]}{'...' if len(result.question) > 70 else ''}", expanded=False):
                    col1, col2 = st.columns([1, 2])

                    with col1:
                        st.write("**Question:**")
                        st.info(result.question)

                        st.write("**Ground Truth:**")
                        st.success(result.ground_truth)

                        st.write("**Model Response:**")
                        st.info(result.model_response)

                        st.metric("Processing Time", f"{result.processing_time:.2f}s")
                        if hasattr(result, 'overall_score'):
                            st.metric("Overall Score", f"{result.overall_score:.1f}%")

                    with col2:
                        metrics_cols = st.columns(3)
                        metric_items = list(result.metrics.items())

                        for j, (metric, score) in enumerate(metric_items):
                            with metrics_cols[j % 3]:
                                # Use the correct color logic for each metric type
                                color = get_score_color(metric, score)
                                st.markdown(f"""

                                <div style="background-color: rgba(240, 242, 246, 0.5); 

                                            padding: 15px; 

                                            border-radius: 10px; 

                                            border-left: 4px solid {color};

                                            margin-bottom: 10px;">

                                    <h4 style="margin: 0; color: {color};">{metric.capitalize()}</h4>

                                    <h2 style="margin: 5px 0; color: {color};">{score:.1f}%</h2>

                                </div>

                                """, unsafe_allow_html=True)

                        st.write("**Explanations:**")
                        if hasattr(result, 'explanations') and result.explanations:
                            selected_explanation = st.selectbox(
                                "Select metric explanation:",
                                options=list(result.explanations.keys()),
                                format_func=lambda x: x.capitalize(),
                                key=f"explanation_select_{i}"
                            )

                            st.text_area(
                                f"{selected_explanation.capitalize()} Explanation",
                                value=result.explanations[selected_explanation],
                                height=150,
                                key=f"explanation_text_{i}_{selected_explanation}",
                                disabled=True
                            )
                        else:
                            st.info("No explanations available for this question")

        # Export buttons
        st.subheader("💾 Export Results")
        col1, col2, col3 = st.columns(3)

        with col1:
            try:
                results_json = results.model_dump_json()
            except Exception:
                # Fallback serialization
                try:
                    results_json = json.dumps(results.__dict__, default=lambda o: getattr(o, "__dict__", str(o)), indent=2)
                except Exception:
                    results_json = "{}"

            st.download_button(
                "📊 Download JSON",
                data=results_json,
                file_name="evaluation_results.json",
                mime="application/json",
                use_container_width=True
            )

        with col2:
            csv_data = []
            for i, result in enumerate(results.individual_results):
                row = {
                    "ID": i + 1,
                    "Question": result.question,
                    "Ground Truth": result.ground_truth,
                    "Response": result.model_response,
                    "Overall Score": result.overall_score if hasattr(result, 'overall_score') else 0,
                    "Time (s)": result.processing_time
                }
                for metric, score in result.metrics.items():
                    row[metric.capitalize()] = score
                if hasattr(result, 'explanations'):
                    for metric, explanation in result.explanations.items():
                        row[f"{metric.capitalize()} Explanation"] = explanation
                csv_data.append(row)

            df = pd.DataFrame(csv_data)
            csv = df.to_csv(index=False)
            st.download_button(
                "📋 Download CSV",
                data=csv,
                file_name="evaluation_results.csv",
                mime="text/csv",
                use_container_width=True
            )

        with col3:
            html_content = f"""

            <html>

            <head>

                <title>LLM Evaluation Report</title>

                <style>

                    body {{ font-family: Arial, sans-serif; margin: 40px; }}

                    .header {{ text-align: center; margin-bottom: 30px; }}

                    .metric {{ background-color: #f8f9fa; padding: 15px; margin: 10px; border-radius: 5px; }}

                    .score {{ font-size: 24px; font-weight: bold; }}

                </style>

            </head>

            <body>

                <div class="header">

                    <h1>LLM Evaluation Report</h1>

                    <p>Generated on {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}</p>

                </div>

                <h2>Summary</h2>

                <p>Total Questions: {results.total_questions}</p>

                <p>Total Time: {results.total_processing_time:.1f}s</p>

                <p>Model Used: {results.model_used}</p>

                <p>API Provider: {results.api_provider}</p>

                <p>Overall Score: {results.overall_score:.1f}%</p>



                <h2>Average Scores</h2>

                {"".join([f'<div class="metric"><h3>{m.capitalize()}</h3><div class="score">{s:.1f}%</div></div>' for m, s in results.average_scores.items()])}

            </body>

            </html>

            """

            st.download_button(
                "🌐 Download HTML Report",
                data=html_content,
                file_name="evaluation_report.html",
                mime="text/html",
                use_container_width=True
            )
    else:
        st.warning("No individual results available")

def build_request_object(questions: List[str], ground_truths: List[str], model_responses: List[str],

                         contexts: List[str], metrics: List[str], provider: str, judge_model: str,

                         max_concurrent: int):
    # Map provider to enum if available
    try:
        provider_enum = APIProvider.GROQ if provider.lower().startswith("groq") else APIProvider.OPENAI
    except Exception:
        provider_enum = provider

    # Try to instantiate EvaluationRequest robustly
    try:
        request = EvaluationRequest(
            questions=questions,
            ground_truths=ground_truths,
            model_responses=model_responses,
            metrics=[MetricType(m) for m in metrics],
            api_provider=provider_enum,
            judge_model=judge_model,
            max_concurrent=max_concurrent
        )
    except Exception:
        # Fallback to simple namespace-like object if model signature differs
        class SimpleRequest:
            def __init__(self, **kwargs):
                self.__dict__.update(kwargs)
        request = SimpleRequest(
            questions=questions,
            ground_truths=ground_truths,
            model_responses=model_responses,
            metrics=metrics,
            api_provider=provider_enum,
            judge_model=judge_model,
            max_concurrent=max_concurrent
        )

    return request

def parse_json_file(uploaded_file):
    """Parse JSON file with different possible structures"""
    try:
        # Read and parse the file
        content = uploaded_file.getvalue()
        if isinstance(content, bytes):
            content = content.decode('utf-8')
        
        data = json.loads(content)
        
        questions_list = []
        truths_list = []
        responses_list = []
        contexts_list = []
        
        # Handle different JSON structures
        if isinstance(data, dict):
            # Check if it's the comprehensive_test_data.json format
            if "questions" in data and "ground_truths" in data:
                questions_list = data.get("questions", [])
                truths_list = data.get("ground_truths", [])
                responses_list = data.get("model_responses", [])
                contexts_list = data.get("contexts", [])
            else:
                # Try to extract from a single object
                item = {k.lower(): v for k, v in data.items()}
                q = item.get("question") or item.get("prompt") or item.get("input")
                gt = item.get("ground_truth") or item.get("groundtruth") or item.get("ground truth") or ""
                resp = item.get("model_response") or item.get("response") or item.get("answer") or ""
                ctx = item.get("context") or item.get("contexts") or ""
                
                if q:
                    questions_list.append(str(q))
                    truths_list.append(str(gt))
                    responses_list.append(str(resp))
                    contexts_list.append(str(ctx))
        
        elif isinstance(data, list):
            # Handle list of objects
            for item in data:
                if isinstance(item, dict):
                    item_lc = {k.lower(): v for k, v in item.items()}
                    q = item_lc.get("question") or item_lc.get("prompt") or item_lc.get("input")
                    gt = item_lc.get("ground_truth") or item_lc.get("groundtruth") or item_lc.get("ground truth") or ""
                    resp = item_lc.get("model_response") or item_lc.get("response") or item.lc.get("answer") or ""
                    ctx = item_lc.get("context") or item_lc.get("contexts") or ""
                    
                    if q:
                        questions_list.append(str(q))
                        truths_list.append(str(gt))
                        responses_list.append(str(resp))
                        contexts_list.append(str(ctx))
        
        return questions_list, truths_list, responses_list, contexts_list
        
    except Exception as e:
        st.error(f"Error parsing JSON file: {e}")
        return [], [], [], []

def main():
    st.title("🤖 LMVal: Multi-Metric LLM Evaluation")
    st.markdown("Advanced RAG pipeline evaluation using LangGraph and Groq/OpenAI")

    # Sidebar
    with st.sidebar:
        st.header("⚙️ Configuration")

        api_provider = st.radio(
            "API Provider",
            options=["groq", "openai"],
            index=0,
            horizontal=True
        )

        if api_provider == "groq":
            api_key = st.text_input(
                "Groq API Key",
                type="password",
                value=os.getenv("GROQ_API_KEY", ""),
                help="Get from https://console.groq.com/"
            )
            if api_key:
                os.environ["GROQ_API_KEY"] = api_key
            
            judge_model = st.selectbox(
                "Judge Model",
                options=settings.AVAILABLE_GROQ_MODELS,
                index=0
            )
        else:
            api_key = st.text_input(
                "OpenAI API Key",
                type="password",
                value=os.getenv("OPENAI_API_KEY", ""),
                help="Get from https://platform.openai.com/"
            )
            if api_key:
                os.environ["OPENAI_API_KEY"] = api_key
            
            judge_model = st.selectbox(
                "Judge Model",
                options=settings.AVAILABLE_OPENAI_MODELS,
                index=0
            )

        selected_metrics = st.multiselect(
            "Evaluation Metrics",
            options=[m.value for m in MetricType],
            default=["accuracy", "faithfulness", "relevance"],
            help="Select metrics to evaluate. Some metrics may require additional context."
        )

        max_concurrent = st.slider(
            "Max Concurrent Evaluations",
            min_value=1,
            max_value=10,
            value=3,
            help="Higher values may cause rate limiting"
        )

        st.subheader("💾 Cache Settings")
        if st.button("Clear Cache", use_container_width=True):
            clear_cache()
            st.success("Cache cleared!")

        cache_stats = get_cache_stats()
        st.caption(f"Cache: {cache_stats['count']} items, {cache_stats['size'] / 1024 / 1024:.1f} MB")

        st.subheader("ℹ️ About")
        st.info("""

        This platform evaluates LLM responses using multiple metrics:

        - **Accuracy**: Comparison with ground truth (higher is better)

        - **Faithfulness**: Checks for hallucinations (higher is better)

        - **Relevance**: Response relevance to question (higher is better)

        - **Toxicity**: Detects harmful content (lower is better)

        - **Context Precision/Recall**: For RAG systems (higher is better)

        """)

    tab1, tab2, tab3 = st.tabs(["🏃‍♂️ Evaluate", "📊 Results", "📚 History"])

    # Evaluate tab
    with tab1:
        st.header("Run Evaluation")

        input_method = st.radio(
            "Input Method",
            ["Manual Input", "Upload JSON"],
            horizontal=True
        )

        questions_list = []
        truths_list = []
        responses_list = []
        contexts_list = []

        if input_method == "Manual Input":
            col1, col2 = st.columns(2)

            with col1:
                questions = st.text_area(
                    "Questions (one per line)",
                    height=150,
                    placeholder="What is the capital of France?\nHow does photosynthesis work?",
                    help="Enter each question on a new line"
                )

            with col2:
                ground_truths = st.text_area(
                    "Ground Truths (one per line)",
                    height=150,
                    placeholder="Paris\nPhotosynthesis converts sunlight to energy.",
                    help="Enter ground truth for each question"
                )

            model_responses = st.text_area(
                "Model Responses (one per line)",
                height=150,
                placeholder="Paris is the capital.\nPhotosynthesis uses sunlight.",
                help="Enter model response for each question"
                )

            if any(metric in selected_metrics for metric in ["context_precision", "context_recall"]):
                contexts = st.text_area(
                    "Contexts (one per line, optional)",
                    height=100,
                    placeholder="France is a country...\nPlants use sunlight...",
                    help="Required for context precision/recall metrics"
                )
                contexts_list = [c.strip() for c in contexts.split('\n') if c.strip()]

            questions_list = [q.strip() for q in questions.split('\n') if q.strip()]
            truths_list = [g.strip() for g in ground_truths.split('\n') if g.strip()]
            responses_list = [r.strip() for r in model_responses.split('\n') if r.strip()]

        else:  # Upload JSON
            uploaded_file = st.file_uploader("Upload JSON file", type=["json"], 
                                           help="Upload a JSON file with questions, ground_truths, model_responses, and optionally contexts")
            
            if uploaded_file is not None:
                try:
                    questions_list, truths_list, responses_list, contexts_list = parse_json_file(uploaded_file)
                    
                    if questions_list:
                        st.success(f"Loaded {len(questions_list)} items from JSON")
                        
                        # Show preview
                        with st.expander("Preview loaded data"):
                            preview_data = {
                                "questions": questions_list[:3] + ["..."] if len(questions_list) > 3 else questions_list,
                                "ground_truths": truths_list[:3] + ["..."] if len(truths_list) > 3 else truths_list,
                                "model_responses": responses_list[:3] + ["..."] if responses_list and len(responses_list) > 3 else responses_list,
                                "contexts": contexts_list[:3] + ["..."] if contexts_list and len(contexts_list) > 3 else contexts_list
                            }
                            st.json(preview_data)
                    else:
                        st.warning("No valid data found in the JSON file")
                        
                except Exception as e:
                    st.error(f"Error processing JSON file: {e}")

        # Run evaluation button
        run_button = st.button("▶️ Run Evaluation", use_container_width=True, 
                              disabled=st.session_state.evaluation_in_progress)
        
        if run_button:
            if not questions_list:
                st.error("No questions provided.")
            elif len(questions_list) != len(truths_list):
                st.error("Number of questions and ground truths must match.")
            elif responses_list and len(questions_list) != len(responses_list):
                st.error("Number of questions and responses must match.")
            elif contexts_list and len(questions_list) != len(contexts_list):
                st.error("Number of questions and contexts must match for context-based metrics.")
            else:
                # Ensure we have responses (even if empty)
                if not responses_list:
                    responses_list = [""] * len(questions_list)
                
                # Ensure we have contexts (even if empty)
                if not contexts_list:
                    contexts_list = [""] * len(questions_list)
                
                # Build request object
                request = build_request_object(
                    questions=questions_list,
                    ground_truths=truths_list,
                    model_responses=responses_list,
                    contexts=contexts_list,
                    metrics=selected_metrics,
                    provider=api_provider,
                    judge_model=judge_model,
                    max_concurrent=max_concurrent
                )

                # Store evaluation parameters
                st.session_state.evaluation_params = {
                    "metrics": selected_metrics,
                    "provider": api_provider,
                    "judge_model": judge_model,
                    "max_concurrent": max_concurrent,
                    "num_items": len(questions_list),
                    "timestamp": datetime.now(timezone.utc).isoformat()
                }

                # Run evaluation
                st.session_state.evaluation_in_progress = True
                with st.spinner("Running evaluation..."):
                    results = run_evaluation_sync(request)
                    st.session_state.evaluation_in_progress = False

                if results:
                    st.success("Evaluation completed successfully!")
                    st.session_state.evaluation_results = results
                    
                    # Add to history
                    history_item = {
                        "id": len(st.session_state.evaluation_history) + 1,
                        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                        "params": st.session_state.evaluation_params,
                        "summary": {
                            "overall_score": getattr(results, "overall_score", None),
                            "total_questions": getattr(results, "total_questions", None)
                        },
                        "results": results
                    }
                    st.session_state.evaluation_history.insert(0, history_item)
                    st.session_state.show_results = True
                    st.session_state.active_tab = "Results"
                    st.rerun()
                else:
                    st.error("Evaluation failed. Please check your API keys and try again.")

        # Show current configuration
        if questions_list:
            st.info(f"Ready to evaluate {len(questions_list)} questions with {len(selected_metrics)} metrics using {judge_model}")

    # Results tab
    with tab2:
        st.header("Results")
        if st.session_state.show_results and st.session_state.evaluation_results:
            display_results(st.session_state.evaluation_results)
        else:
            st.info("No results to display. Run an evaluation from the Evaluate tab or load from History.")

    # History tab
    with tab3:
        st.header("Evaluation History")
        
        if not st.session_state.evaluation_history:
            st.info("No evaluation history yet. Run an evaluation first!")
        else:
            # Create a table for history
            history_data = []
            for item in st.session_state.evaluation_history:
                history_data.append({
                    "ID": item["id"],
                    "Timestamp": item["timestamp"],
                    "Questions": item["params"].get("num_items", "N/A"),
                    "Model": item["params"].get("judge_model", "N/A"),
                    "Provider": item["params"].get("provider", "N/A"),
                    "Overall Score": f"{item['summary'].get('overall_score', 0):.1f}%" if item['summary'].get('overall_score') is not None else "N/A"
                })
            
            # Display history as a table
            history_df = pd.DataFrame(history_data)
            st.dataframe(
                history_df,
                use_container_width=True,
                hide_index=True,
                column_config={
                    "ID": st.column_config.NumberColumn("Run #", width="small"),
                    "Timestamp": st.column_config.DatetimeColumn("Time", width="medium"),
                    "Questions": st.column_config.NumberColumn("Questions", width="small"),
                    "Model": st.column_config.TextColumn("Model", width="medium"),
                    "Provider": st.column_config.TextColumn("Provider", width="small"),
                    "Overall Score": st.column_config.TextColumn("Score", width="small")
                }
            )
            
            # Action buttons for each history item
            selected_run = st.selectbox(
                "Select a run to view or manage:",
                options=[f"Run #{item['id']} - {item['timestamp']}" for item in st.session_state.evaluation_history],
                index=0
            )
            
            # Extract run ID from selection
            run_id = int(selected_run.split("#")[1].split(" ")[0]) if selected_run else None
            
            if run_id:
                col1, col2, col3 = st.columns(3)
                
                with col1:
                    if st.button("📊 View Results", use_container_width=True):
                        # Find the selected run
                        selected_item = next((item for item in st.session_state.evaluation_history if item["id"] == run_id), None)
                        if selected_item:
                            st.session_state.evaluation_results = selected_item["results"]
                            st.session_state.show_results = True
                            st.session_state.active_tab = "Results"
                            st.rerun()
                
                with col2:
                    if st.button("📥 Export Results", use_container_width=True):
                        selected_item = next((item for item in st.session_state.evaluation_history if item["id"] == run_id), None)
                        if selected_item and hasattr(selected_item["results"], 'model_dump_json'):
                            results_json = selected_item["results"].model_dump_json()
                            st.download_button(
                                "Download JSON",
                                data=results_json,
                                file_name=f"evaluation_run_{run_id}.json",
                                mime="application/json",
                                use_container_width=True
                            )
                
                with col3:
                    if st.button("🗑️ Delete Run", use_container_width=True):
                        st.session_state.evaluation_history = [
                            item for item in st.session_state.evaluation_history if item["id"] != run_id
                        ]
                        st.success(f"Deleted run #{run_id}")
                        st.rerun()
            
            # Clear all history button
            if st.button("Clear All History", use_container_width=True, type="secondary"):
                st.session_state.evaluation_history = []
                st.success("All history cleared")
                st.rerun()

if __name__ == "__main__":
    main()