Commit 9d7aae7
Parent: 7850eab
add evaluator integration
Files changed:
- app.py: +50 -0
- pyproject.toml: +1 -0
- requirements.txt: +2 -0
- src/evaluator/evaluate.py: +182 -0
- src/evaluator/run_evaluator.py: +27 -0
app.py
CHANGED
@@ -26,6 +26,7 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult, Tasks
 
 
 def restart_space():
@@ -87,6 +88,28 @@ def init_leaderboard(dataframe):
         interactive=False,
     )
 
+# Add model evaluation functionality
+def evaluate_and_update(model_name, revision, precision, weight_type):
+    """Evaluate a model and update the leaderboard"""
+    try:
+        # Run evaluation
+        eval_result = evaluate_model(model_name, revision, precision, weight_type)
+
+        # Add evaluation to queue
+        add_new_eval(
+            model_name=model_name,
+            revision=revision,
+            precision=precision,
+            weight_type=weight_type,
+            results=eval_result.results
+        )
+
+        # Update leaderboard
+        LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+        return "Evaluation started successfully! Check the leaderboard for updates."
+    except Exception as e:
+        return f"Error during evaluation: {str(e)}"
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -98,6 +121,33 @@ with demo:
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(INTRODUCTION_TEXT)
+            gr.Markdown(LLM_BENCHMARKS_TEXT)
+            gr.Markdown(EVALUATION_QUEUE_TEXT)
+
+        with gr.TabItem("🚀 Evaluate Model", elem_id="evaluate-tab", id=3):
+            with gr.Row():
+                model_name = gr.Textbox(label="Model Name")
+                revision = gr.Textbox(label="Revision", value="main")
+            with gr.Row():
+                precision = gr.Dropdown(
+                    choices=[p.value for p in Precision],
+                    label="Precision",
+                    value="fp32"
+                )
+                weight_type = gr.Dropdown(
+                    choices=[w.value for w in WeightType],
+                    label="Weight Type",
+                    value="pytorch"
+                )
+            evaluate_button = gr.Button("Evaluate Model")
+            status_output = gr.Textbox(label="Evaluation Status")
+
+            evaluate_button.click(
+                fn=evaluate_and_update,
+                inputs=[model_name, revision, precision, weight_type],
+                outputs=[status_output]
+            )
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
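Note on the hunk above: the assignment to LEADERBOARD_DF inside evaluate_and_update only binds a local name, so the module-level dataframe the leaderboard was built from is not refreshed. A minimal sketch of one way to make the refresh visible at module level (a hypothetical helper, not part of this commit, reusing the names already imported in app.py):

# Hypothetical helper (not in this commit): rebind the module-level dataframe
# so later reads of LEADERBOARD_DF see newly written results.
def refresh_leaderboard_df():
    global LEADERBOARD_DF
    LEADERBOARD_DF = get_leaderboard_df(
        EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
    )
    return LEADERBOARD_DF

evaluate_and_update could then call refresh_leaderboard_df() in place of the bare assignment.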
pyproject.toml
CHANGED
@@ -20,6 +20,7 @@ dependencies = [
     "python-dateutil>=2.9.0.post0",
     "sentencepiece>=0.2.0",
     "tokenizers>=0.15.0",
+    "torch>=2.7.1",
     "tqdm>=4.67.1",
     "transformers>=4.53.1",
 ]
requirements.txt
CHANGED
@@ -8,6 +8,8 @@ gradio_client
 huggingface-hub>=0.18.0
 matplotlib
 numpy
+torch>=2.0.0
+scikit-learn>=1.0.0
 pandas
 python-dateutil
 tqdm
src/evaluator/evaluate.py
ADDED
@@ -0,0 +1,182 @@
+import json
+import os
+from typing import Dict, Any
+from dataclasses import dataclass
+from enum import Enum
+
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from datasets import load_dataset
+
+from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
+from src.display.utils import Tasks
+
+class EvaluationStatus(Enum):
+    PENDING = "PENDING"
+    RUNNING = "RUNNING"
+    FINISHED = "FINISHED"
+    FAILED = "FAILED"
+
+@dataclass
+class EvaluationResult:
+    model: str
+    revision: str
+    precision: str
+    weight_type: str
+    results: Dict[str, float]
+    error: str = None
+
+def evaluate_tsac_sentiment(model, tokenizer, device):
+    """Evaluate model on TSAC sentiment analysis task"""
+    dataset = load_dataset("fbougares/tsac", split="test")
+
+    def preprocess(examples):
+        return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+    dataset = dataset.map(preprocess, batched=True)
+    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+    model.eval()
+    with torch.no_grad():
+        predictions = []
+        labels = []
+
+        for batch in dataset:
+            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+            label = batch['label'].to(device)
+
+            outputs = model(**inputs)
+            predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+            labels.extend(label.cpu().tolist())
+
+    accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+    return accuracy
+
+def evaluate_tunisian_corpus_coverage(model, tokenizer):
+    """Evaluate model's coverage on Tunisian Dialect Corpus"""
+    dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="test")
+
+    def preprocess(examples):
+        return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+    dataset = dataset.map(preprocess, batched=True)
+
+    # Calculate coverage based on tokenization
+    total_tokens = 0
+    covered_tokens = 0
+
+    for example in dataset:
+        tokens = tokenizer.tokenize(example['text'])
+        total_tokens += len(tokens)
+        covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+    coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+    return coverage
+
+def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
+    """Evaluate a single model on all tasks"""
+    try:
+        # Load model and tokenizer
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        model = AutoModelForSequenceClassification.from_pretrained(
+            model_name,
+            revision=revision,
+            torch_dtype=getattr(torch, precision),
+            trust_remote_code=True
+        ).to(device)
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
+
+        # Run evaluations
+        results = {}
+
+        # TSAC Sentiment
+        tsac_result = evaluate_tsac_sentiment(model, tokenizer, device)
+        results[Tasks.tsac_sentiment.value.benchmark] = tsac_result
+
+        # Tunisian Corpus Coverage
+        corpus_result = evaluate_tunisian_corpus_coverage(model, tokenizer)
+        results[Tasks.tunisian_corpus.value.benchmark] = corpus_result
+
+        return EvaluationResult(
+            model=model_name,
+            revision=revision,
+            precision=precision,
+            weight_type=weight_type,
+            results=results
+        )
+
+    except Exception as e:
+        return EvaluationResult(
+            model=model_name,
+            revision=revision,
+            precision=precision,
+            weight_type=weight_type,
+            results={},
+            error=str(e)
+        )
+
+def process_evaluation_queue():
+    """Process all pending evaluations in the queue"""
+    # Get all pending evaluations
+    queue_dir = os.path.join(EVAL_REQUESTS_PATH)
+    pending_files = [f for f in os.listdir(queue_dir) if f.endswith('.json')]
+
+    for file in pending_files:
+        file_path = os.path.join(queue_dir, file)
+        with open(file_path, 'r') as f:
+            eval_request = json.load(f)
+
+        if eval_request.get('status') != EvaluationStatus.PENDING.value:
+            continue
+
+        # Mark as running
+        eval_request['status'] = EvaluationStatus.RUNNING.value
+        with open(file_path, 'w') as f:
+            json.dump(eval_request, f, indent=2)
+
+        # Perform evaluation
+        result = evaluate_model(
+            model_name=eval_request['model'],
+            revision=eval_request['revision'],
+            precision=eval_request['precision'],
+            weight_type=eval_request['weight_type']
+        )
+
+        # Save results
+        if result.error:
+            eval_request['status'] = EvaluationStatus.FAILED.value
+            eval_request['error'] = result.error
+        else:
+            eval_request['status'] = EvaluationStatus.FINISHED.value
+            eval_request['results'] = result.results
+
+        with open(file_path, 'w') as f:
+            json.dump(eval_request, f, indent=2)
+
+        # Save to results dataset
+        result_file = os.path.join(EVAL_RESULTS_PATH, f"{result.model}_{result.precision}.json")
+        with open(result_file, 'w') as f:
+            json.dump({
+                'model': result.model,
+                'revision': result.revision,
+                'precision': result.precision,
+                'weight_type': result.weight_type,
+                'results': result.results
+            }, f, indent=2)
+
+        # Upload to Hugging Face
+        API.upload_file(
+            path_or_fileobj=result_file,
+            path_in_repo=os.path.basename(result_file),
+            repo_id=f"{OWNER}/results",
+            repo_type="dataset",
+            commit_message=f"Add evaluation results for {result.model}"
+        )
+
+def main():
+    process_evaluation_queue()
+
+if __name__ == "__main__":
+    main()
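For reference, process_evaluation_queue treats every *.json file under EVAL_REQUESTS_PATH as a request and reads the fields model, revision, precision, weight_type, and status; note that evaluate_model resolves the precision string with getattr(torch, precision), so it must name a torch dtype attribute such as "float16" or "float32". A minimal sketch of a request file a submitter would need to produce, with field names taken from the code above and all values illustrative:

# Hypothetical request written into EVAL_REQUESTS_PATH; the evaluator flips
# "status" to RUNNING, then to FINISHED (adding "results") or FAILED (adding "error").
import json
import os

from src.envs import EVAL_REQUESTS_PATH

request = {
    "model": "some-org/some-model",   # illustrative Hub model id
    "revision": "main",
    "precision": "float32",           # must be a valid torch dtype name
    "weight_type": "Original",        # copied as-is into the result file
    "status": "PENDING",              # EvaluationStatus.PENDING.value
}

os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
with open(os.path.join(EVAL_REQUESTS_PATH, "some-org_some-model.json"), "w") as f:
    json.dump(request, f, indent=2)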
src/evaluator/run_evaluator.py
ADDED
@@ -0,0 +1,27 @@
+import time
+import sys
+import signal
+from src.evaluator.evaluate import process_evaluation_queue
+
+def signal_handler(sig, frame):
+    print("\nEvaluator shutting down...")
+    sys.exit(0)
+
+def main():
+    # Register signal handler for graceful shutdown
+    signal.signal(signal.SIGINT, signal_handler)
+
+    print("Starting evaluator service...")
+
+    while True:
+        try:
+            process_evaluation_queue()
+            print("Evaluation queue processed. Sleeping for 5 minutes...")
+            time.sleep(300)  # Sleep for 5 minutes
+        except Exception as e:
+            print(f"Error in evaluation process: {e}")
+            print("Retrying in 5 minutes...")
+            time.sleep(300)
+
+if __name__ == "__main__":
+    main()
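As a usage note, the loop above is meant to run as its own long-lived process (for example via python -m src.evaluator.run_evaluator from the repository root, which matches the module path and __main__ guard). A hedged sketch of an in-process alternative, in case the Space cannot spawn a second process; it calls process_evaluation_queue directly because signal handlers can only be registered from the main thread:

# Hypothetical in-process alternative (not in this commit): poll the queue from
# a daemon thread inside the Gradio app process instead of a separate service.
# process_evaluation_queue is used directly because signal.signal() may only be
# called from the main thread, unlike run_evaluator.main().
import threading
import time

from src.evaluator.evaluate import process_evaluation_queue


def _evaluator_loop(interval_seconds: int = 300) -> None:
    while True:
        try:
            process_evaluation_queue()
        except Exception as exc:  # keep the loop alive across per-run failures
            print(f"Evaluator loop error: {exc}")
        time.sleep(interval_seconds)


threading.Thread(target=_evaluator_loop, daemon=True).start()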