model_trace

Runtime error

App Files Files Community

Ahmed Ahmed committed on Jul 25

Commit

70ea05e

1 Parent(s): 46cc1f1

initial commit

Browse files

Files changed (6) hide show

app.py +37 -0
explain.md +292 -0
requirements.txt +4 -2
src/about.py +1 -0
src/evaluation/dynamic_eval.py +44 -0
src/evaluation/perplexity_eval.py +66 -0

app.py CHANGED Viewed

@@ -27,6 +27,7 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 def restart_space():
@@ -89,6 +90,19 @@ def init_leaderboard(dataframe):
     )
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -188,6 +202,29 @@ with demo:
                 submission_result,
             )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(

 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
 def restart_space():
     )
def run_perplexity_test(model_name, revision, precision):
    """Run a perplexity evaluation on demand and format the outcome as text.

    Args:
        model_name: Hugging Face model identifier entered by the user.
        revision: Model revision; a blank value falls back to ``"main"``.
        precision: Precision label forwarded to the evaluation runner.

    Returns:
        str: A status message -- the prompt to enter a model name, the
        formatted perplexity on success, or the error text on failure.
    """
    # Normalise the raw form values so that a missing or whitespace-only name
    # is rejected here rather than failing later during model loading.
    model_name = (model_name or "").strip()
    if not model_name:
        return "Please enter a model name."

    # A cleared revision field should behave like the default branch.
    revision = (revision or "").strip() or "main"

    success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
    if not success:
        return f"❌ Evaluation failed: {result}"

    return (
        "✅ Perplexity evaluation completed!\n"
        f"Perplexity: {result:.4f}\n\n"
        "Results have been saved and will appear in the leaderboard shortly."
    )
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
                 submission_result,
             )
+        with gr.TabItem("🧪 Dynamic Testing", elem_id="dynamic-testing-tab", id=4):
+            gr.Markdown("## Run Perplexity Evaluation")
+            with gr.Row():
+                with gr.Column():
+                    dynamic_model_name = gr.Textbox(label="Model name", placeholder="org/model-name")
+                    dynamic_revision = gr.Textbox(label="Revision", placeholder="main", value="main")
+                    dynamic_precision = gr.Dropdown(
+                        choices=["float16", "bfloat16"],
+                        label="Precision",
+                        value="float16"
+                    )
+                with gr.Column():
+                    dynamic_test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
+                    dynamic_result = gr.Markdown()
+            dynamic_test_button.click(
+                run_perplexity_test,
+                [dynamic_model_name, dynamic_revision, dynamic_precision],
+                dynamic_result
+            )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(

explain.md ADDED Viewed

	@@ -0,0 +1,292 @@

+# Model Trace - Hugging Face Space Explanation
+## Overview
+This repository hosts a **Hugging Face Space** that creates a dynamic leaderboard for evaluating language models. The space provides a web interface where users can submit models for evaluation and view results in a ranked leaderboard format.
+## How It Works
+### Architecture
+The system consists of several key components:
+1. **Frontend Interface** (`app.py`): A Gradio web application with three main tabs:
+   - **🏅 LLM Benchmark**: Displays the main leaderboard
+   - **📝 About**: Shows information about the evaluation process
+   - **🚀 Submit here!**: Allows users to submit models for evaluation
+2. **Data Storage**: Uses Hugging Face datasets to store:
+   - **Evaluation Requests**: Models waiting to be evaluated
+   - **Evaluation Results**: Completed evaluation results
+3. **Evaluation Queue System**: Models go through different states:
+   - **PENDING**: Submitted but not yet evaluated
+   - **RUNNING**: Currently being evaluated
+   - **FINISHED**: Evaluation completed
+### Data Flow
+1. **Model Submission**: Users submit models through the web interface
+2. **Validation**: System checks if the model exists on Hugging Face Hub and has proper metadata
+3. **Queue Management**: Valid models are added to the evaluation queue
+4. **Evaluation**: External evaluation system processes the models (not included in this repo)
+5. **Results Display**: Completed evaluations appear in the leaderboard
+### Configuration
+The main configuration files are:
+- **`src/envs.py`**: Repository settings and API tokens
+- **`src/about.py`**: Task definitions and leaderboard metadata
+- **`src/display/utils.py`**: Column definitions and display settings
+## Current Evaluation Tasks
+The system is currently configured to evaluate models on:
+- **ANLI** (Adversarial NLI) - accuracy metric
+- **LogiQA** - normalized accuracy metric
+## Adding Dynamic Perplexity Testing
+To add perplexity evaluation as a dynamic test, you'll need to make several modifications:
+### 1. Update Task Configuration
+First, modify `src/about.py` to add perplexity as a new task:
+```python
+class Tasks(Enum):
+    # Existing tasks
+    task0 = Task("anli_r1", "acc", "ANLI")
+    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # Add perplexity task
+    task2 = Task("perplexity", "perplexity", "Perplexity")
+```
+### 2. Create Perplexity Evaluation Script
+Create a new file `src/evaluation/perplexity_eval.py`:
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import numpy as np
+def evaluate_perplexity(model_name, revision="main", test_text=None):
+    """
+    Evaluate perplexity on a fixed piece of text.
+    Args:
+        model_name: Hugging Face model identifier
+        revision: Model revision/commit hash
+        test_text: Text to evaluate perplexity on (default if None)
+    Returns:
+        float: Perplexity score (lower is better)
+    """
+    # Default test text if none provided
+    if test_text is None:
+        test_text = """The quick brown fox jumps over the lazy dog. This is a standard test sentence that contains all the letters of the English alphabet. It is commonly used for testing fonts and keyboards."""
+    # Load model and tokenizer
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        revision=revision,
+        torch_dtype=torch.float16,
+        device_map="auto"
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
+    # Tokenize the text
+    inputs = tokenizer(test_text, return_tensors="pt")
+    # Move to same device as model
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    # Calculate loss
+    with torch.no_grad():
+        outputs = model(**inputs, labels=inputs["input_ids"])
+        loss = outputs.loss
+    # Calculate perplexity
+    perplexity = torch.exp(loss).item()
+    return perplexity
+def create_perplexity_result(model_name, revision, precision, perplexity_score):
+    """
+    Create a result file in the expected format.
+    """
+    return {
+        "config": {
+            "model_dtype": f"torch.{precision}",
+            "model_name": model_name,
+            "model_sha": revision,
+        },
+        "results": {
+            "perplexity": {
+                "perplexity": perplexity_score,
+            }
+        }
+    }
+```
+### 3. Add Dynamic Evaluation Endpoint
+Create a new file `src/evaluation/dynamic_eval.py`:
+```python
+import json
+import os
+from datetime import datetime
+from src.evaluation.perplexity_eval import evaluate_perplexity, create_perplexity_result
+from src.envs import EVAL_RESULTS_PATH, API, RESULTS_REPO
+def run_dynamic_perplexity_eval(model_name, revision="main", precision="float16"):
+    """
+    Run perplexity evaluation and save results.
+    """
+    try:
+        # Run evaluation
+        perplexity_score = evaluate_perplexity(model_name, revision)
+        # Create result structure
+        result = create_perplexity_result(model_name, revision, precision, perplexity_score)
+        # Save result file
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        result_filename = f"results_{model_name.replace('/', '_')}_{timestamp}.json"
+        # Create directory structure
+        org, model = model_name.split("/") if "/" in model_name else ("", model_name)
+        result_dir = os.path.join(EVAL_RESULTS_PATH, org) if org else EVAL_RESULTS_PATH
+        os.makedirs(result_dir, exist_ok=True)
+        result_path = os.path.join(result_dir, result_filename)
+        with open(result_path, "w") as f:
+            json.dump(result, f, indent=2)
+        # Upload to Hugging Face dataset
+        API.upload_file(
+            path_or_fileobj=result_path,
+            path_in_repo=result_path.split("eval-results/")[1],
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add perplexity results for {model_name}",
+        )
+        return True, perplexity_score
+    except Exception as e:
+        return False, str(e)
+```
+### 4. Add Dynamic Testing Interface
+Modify `app.py` to add a new tab for dynamic testing:
+```python
+# Add this import
+from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
+# Add this function
+def run_perplexity_test(model_name, revision, precision):
+    """Run perplexity evaluation on demand."""
+    if not model_name:
+        return "Please enter a model name."
+    success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
+    if success:
+        return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\nResults have been saved and will appear in the leaderboard shortly."
+    else:
+        return f"❌ Evaluation failed: {result}"
+# Add this to the demo interface (inside the gr.Blocks)
+with gr.TabItem("🧪 Dynamic Testing", elem_id="dynamic-testing-tab", id=4):
+    gr.Markdown("## Run Perplexity Evaluation")
+    with gr.Row():
+        with gr.Column():
+            dynamic_model_name = gr.Textbox(label="Model name", placeholder="org/model-name")
+            dynamic_revision = gr.Textbox(label="Revision", placeholder="main", value="main")
+            dynamic_precision = gr.Dropdown(
+                choices=["float16", "bfloat16"],
+                label="Precision",
+                value="float16"
+            )
+        with gr.Column():
+            dynamic_test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
+            dynamic_result = gr.Markdown()
+    dynamic_test_button.click(
+        run_perplexity_test,
+        [dynamic_model_name, dynamic_revision, dynamic_precision],
+        dynamic_result
+    )
+```
+### 5. Update Requirements
+Add any additional dependencies to `requirements.txt`:
+```txt
+# Add if not already present
+torch
+transformers
+accelerate
+```
+### 6. Configure Environment
+Update `src/envs.py` to point to your repositories:
+```python
+OWNER = "your-org-name"  # Change this
+```
+You'll need to create two Hugging Face datasets:
+- `your-org-name/requests` - for evaluation requests
+- `your-org-name/results` - for evaluation results
+## How to Use the Dynamic Testing
+1. **Deploy the Space**: Push your changes to a Hugging Face Space
+2. **Set Environment Variables**: Add `HF_TOKEN` with write permissions
+3. **Test Models**: Use the "Dynamic Testing" tab to evaluate models on demand
+4. **View Results**: Results will appear in the main leaderboard
+## Key Features of Dynamic Testing
+- **On-Demand Evaluation**: Test models immediately, without waiting in the evaluation queue
+- **Fixed Text**: Uses consistent test text for fair comparison
+- **Automatic Ranking**: Lower perplexity scores rank higher
+- **Real-time Results**: See results immediately after evaluation
+- **Integration**: Results automatically appear in the main leaderboard
+## Customization Options
+You can customize the perplexity evaluation by:
+1. **Changing Test Text**: Modify the default text in `perplexity_eval.py`
+2. **Adding Multiple Texts**: Evaluate on multiple texts and average results
+3. **Different Metrics**: Add other metrics like BLEU, ROUGE, etc.
+4. **Model Loading Options**: Customize model loading parameters
+5. **Batch Processing**: Process multiple models in sequence
+## Security Considerations
+- Models must be public on Hugging Face Hub
+- Evaluation runs in the Space's environment
+- Results are publicly visible
+- Consider rate limiting for dynamic testing
+This setup provides a complete dynamic testing system that integrates seamlessly with the existing leaderboard infrastructure.
+## Models to Test
+- `openai-community/gpt2`
+- `EleutherAI/gpt-neo-1.3B`
+- `openai-community/gpt2-large`

requirements.txt CHANGED Viewed

@@ -11,6 +11,8 @@ numpy
 pandas
 python-dateutil
 tqdm
-transformers
 tokenizers>=0.15.0
-sentencepiece

 pandas
 python-dateutil
 tqdm
+transformers>=4.30.0
 tokenizers>=0.15.0
+sentencepiece
+torch>=2.0.0
+accelerate>=0.20.0

src/about.py CHANGED Viewed

@@ -14,6 +14,7 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("anli_r1", "acc", "ANLI")
     task1 = Task("logiqa", "acc_norm", "LogiQA")
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("anli_r1", "acc", "ANLI")
     task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task2 = Task("perplexity", "perplexity", "Perplexity")
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

src/evaluation/dynamic_eval.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import json
+import os
+from datetime import datetime
+from src.evaluation.perplexity_eval import evaluate_perplexity, create_perplexity_result
+from src.envs import EVAL_RESULTS_PATH, API, RESULTS_REPO
def run_dynamic_perplexity_eval(model_name, revision="main", precision="float16"):
    """
    Run perplexity evaluation, save the result as JSON and upload it.

    Args:
        model_name: Hugging Face model identifier, optionally ``org/name``.
        revision: Model revision passed to the evaluation.
        precision: Precision label recorded in the result metadata.

    Returns:
        tuple: ``(True, perplexity_score)`` on success, otherwise
        ``(False, error_message)``.
    """
    try:
        # NOTE(review): ``precision`` is only written into the result
        # metadata; it is not forwarded to ``evaluate_perplexity`` here.
        perplexity_score = evaluate_perplexity(model_name, revision)
        result = create_perplexity_result(model_name, revision, precision, perplexity_score)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        result_filename = f"results_{model_name.replace('/', '_')}_{timestamp}.json"

        # Only the segment before the first "/" is used as the organisation
        # directory, so identifiers with extra slashes no longer break
        # tuple unpacking.
        org, sep, _ = model_name.partition("/")
        if not sep:
            org = ""
        result_dir = os.path.join(EVAL_RESULTS_PATH, org) if org else EVAL_RESULTS_PATH
        os.makedirs(result_dir, exist_ok=True)

        result_path = os.path.join(result_dir, result_filename)
        with open(result_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)

        # Derive the in-repo path relative to the results root instead of
        # splitting on a hard-coded "eval-results/" substring, which raises
        # IndexError whenever the root path does not contain that segment.
        path_in_repo = os.path.relpath(result_path, EVAL_RESULTS_PATH).replace(os.sep, "/")

        API.upload_file(
            path_or_fileobj=result_path,
            path_in_repo=path_in_repo,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add perplexity results for {model_name}",
        )
        return True, perplexity_score
    except Exception as e:
        # Deliberate catch-all: every failure is reported back to the caller
        # as text. Fall back to the exception type when the message is empty
        # so the caller never receives a blank error.
        return False, str(e) or type(e).__name__

src/evaluation/perplexity_eval.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import numpy as np
def evaluate_perplexity(model_name, revision="main", test_text=None, precision="float16"):
    """
    Evaluate perplexity on a fixed piece of text.

    Args:
        model_name: Hugging Face model identifier
        revision: Model revision/commit hash
        test_text: Text to evaluate perplexity on (default if None)
        precision: Name of the torch dtype used to load the model, with or
            without a ``torch.`` prefix (e.g. ``"float16"``, ``"bfloat16"``)

    Returns:
        float: Perplexity score (lower is better)

    Raises:
        ValueError: If ``precision`` does not name a torch dtype, or if the
            text tokenizes to fewer than two tokens.
    """
    # Resolve and validate the dtype before the expensive model load.
    dtype_name = str(precision)
    if dtype_name.startswith("torch."):
        dtype_name = dtype_name[len("torch."):]
    torch_dtype = getattr(torch, dtype_name, None)
    if not isinstance(torch_dtype, torch.dtype):
        raise ValueError(f"Unsupported precision: {precision!r}")

    # Default test text if none provided
    # NOTE(review): the literal keeps the line breaks and leading indentation
    # of the source lines; it is left unchanged so scores stay comparable.
    if test_text is None:
        test_text = """Artificial intelligence has transformed the way we live and work, bringing both opportunities and challenges.
        From autonomous vehicles to language models that can engage in human-like conversation, AI technologies are becoming increasingly
        sophisticated. However, with this advancement comes the responsibility to ensure these systems are developed and deployed ethically,
        with careful consideration for privacy, fairness, and transparency. The future of AI will likely depend on how well we balance innovation
        with these important social considerations."""

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        revision=revision,
        torch_dtype=torch_dtype,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)

    # Tokenize the text
    inputs = tokenizer(test_text, return_tensors="pt")
    # The loss is a next-token loss, so it needs at least one
    # (context, target) pair; shorter input would give an undefined loss.
    if inputs["input_ids"].shape[-1] < 2:
        raise ValueError("test_text must tokenize to at least two tokens to compute perplexity")

    # Move to same device as model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Calculate loss
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss

    # Exponentiate in float32: a half-precision exp overflows to inf once the
    # loss exceeds roughly 11.09 (float16 max is 65504).
    perplexity = torch.exp(loss.float()).item()
    return perplexity
def create_perplexity_result(model_name, revision, precision, perplexity_score):
    """
    Create a result payload in the expected format.

    Args:
        model_name: Model identifier stored under ``config.model_name``.
        revision: Revision stored under ``config.model_sha``.
        precision: Precision label; accepted with or without a ``torch.``
            prefix (so a ``torch.dtype`` object also works).
        perplexity_score: Numeric perplexity; converted to a built-in float.

    Returns:
        dict: ``{"config": {...}, "results": {"perplexity": {...}}}``.
    """
    # Normalise the label so "float16", "torch.float16" and torch.float16 all
    # yield "torch.float16" instead of a doubled "torch.torch." prefix.
    dtype_label = str(precision)
    if not dtype_label.startswith("torch."):
        dtype_label = f"torch.{dtype_label}"

    config = {
        "model_dtype": dtype_label,
        "model_name": model_name,
        "model_sha": revision,
    }
    # A built-in float keeps the payload JSON-serialisable even when the
    # score arrives as a numpy or tensor scalar.
    results = {
        "perplexity": {
            "perplexity": float(perplexity_score),
        }
    }
    return {"config": config, "results": results}