Commit 28e88f2
1 Parent(s): 9d7aae7

implement evaluation and fix bugs
Files changed:
- app.py +20 -26
- pyproject.toml +1 -0
- src/envs.py +8 -6
- src/evaluator/evaluate.py +97 -53
- src/leaderboard/read_evals.py +2 -1
- src/submission/submit.py +128 -3
app.py
CHANGED

@@ -26,7 +26,9 @@ from src.display.utils import (
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
  from src.submission.submit import add_new_eval
- from src.evaluator.evaluate import
+ from src.evaluator.evaluate import process_evaluation_queue
+ import threading
+ import time


  def restart_space():

@@ -49,6 +51,23 @@ except Exception:
      restart_space()


+ # Start evaluator service in a separate thread
+ def run_evaluator():
+     print("Starting evaluator service...")
+     while True:
+         try:
+             process_evaluation_queue()
+             print("Evaluation queue processed. Sleeping for 5 minutes...")
+             time.sleep(300)  # Sleep for 5 minutes
+         except Exception as e:
+             print(f"Error in evaluation process: {e}")
+             print("Retrying in 5 minutes...")
+             time.sleep(300)
+
+ # Start evaluator in a separate thread
+ evaluator_thread = threading.Thread(target=run_evaluator, daemon=True)
+ evaluator_thread.start()
+
  LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

  (

@@ -125,31 +144,6 @@ with demo:
          gr.Markdown(LLM_BENCHMARKS_TEXT)
          gr.Markdown(EVALUATION_QUEUE_TEXT)

-     with gr.TabItem("🚀 Evaluate Model", elem_id="evaluate-tab", id=3):
-         with gr.Row():
-             model_name = gr.Textbox(label="Model Name")
-             revision = gr.Textbox(label="Revision", value="main")
-         with gr.Row():
-             precision = gr.Dropdown(
-                 choices=[p.value for p in Precision],
-                 label="Precision",
-                 value="fp32"
-             )
-             weight_type = gr.Dropdown(
-                 choices=[w.value for w in WeightType],
-                 label="Weight Type",
-                 value="pytorch"
-             )
-         evaluate_button = gr.Button("Evaluate Model")
-         status_output = gr.Textbox(label="Evaluation Status")
-
-         evaluate_button.click(
-             fn=evaluate_and_update,
-             inputs=[model_name, revision, precision, weight_type],
-             outputs=[status_output]
-         )
-         gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
      with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
          with gr.Column():
              with gr.Row():
pyproject.toml
CHANGED

@@ -18,6 +18,7 @@ dependencies = [
      "numpy>=2.3.1",
      "pandas>=2.3.0",
      "python-dateutil>=2.9.0.post0",
+     "scikit-learn>=1.7.0",
      "sentencepiece>=0.2.0",
      "tokenizers>=0.15.0",
      "torch>=2.7.1",
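scikit-learn is the only new dependency, and none of the files touched in this commit import it yet. If the intent is to compute the leaderboard metrics with it later, a minimal sketch of what that usage could look like (assumed, not present in the diff):

from sklearn.metrics import accuracy_score, f1_score

def summarize_predictions(labels, predictions):
    # Classification metrics for a finished evaluation run.
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average="macro"),
    }

# Toy example: three of the four predictions match the reference labels.
print(summarize_predictions([0, 1, 1, 0], [0, 1, 0, 0]))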
src/envs.py
CHANGED

@@ -14,12 +14,14 @@ QUEUE_REPO = f"{OWNER}/requests"
  RESULTS_REPO = f"{OWNER}/results"

  # If you setup a cache later, just change HF_HOME
- CACHE_PATH=os.getenv("HF_HOME", ".")
-
  # Local caches
- EVAL_REQUESTS_PATH =
- EVAL_RESULTS_PATH =
- EVAL_REQUESTS_PATH_BACKEND =
- EVAL_RESULTS_PATH_BACKEND =
+ EVAL_REQUESTS_PATH = "./eval-queue"
+ EVAL_RESULTS_PATH = "./eval-results"
+ EVAL_REQUESTS_PATH_BACKEND = "./eval-queue-bk"
+ EVAL_RESULTS_PATH_BACKEND = "./eval-results-bk"
+
+ # Create directories if they don't exist
+ for path in [EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND]:
+     os.makedirs(path, exist_ok=True)

  API = HfApi(token=TOKEN)
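The hard-coded relative paths replace the HF_HOME-based CACHE_PATH, and the four cache directories are now created as a side effect of importing the module. A hypothetical smoke test for that behaviour, assuming the repo's src package is importable from the working directory:

import os
from src.envs import (
    EVAL_REQUESTS_PATH,
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH_BACKEND,
    EVAL_RESULTS_PATH_BACKEND,
)

# Importing src.envs runs the os.makedirs loop, so all four paths should exist.
for path in (EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND):
    assert os.path.isdir(path), f"expected {path} to exist after import"
print("local cache directories are in place")

Note that the paths are now relative to the process working directory rather than to HF_HOME, so the Space has to be started from the repository root.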
src/evaluator/evaluate.py
CHANGED

@@ -3,7 +3,7 @@ import os
  from typing import Dict, Any
  from dataclasses import dataclass
  from enum import Enum
-
+ from datetime import datetime
  import torch
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
  from datasets import load_dataset

@@ -28,54 +28,63 @@ class EvaluationResult:

  def evaluate_tsac_sentiment(model, tokenizer, device):
      """Evaluate model on TSAC sentiment analysis task"""
-     for batch in dataset:
-         inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
-         label = batch['label'].to(device)
+     try:
+         dataset = load_dataset("fbougares/tsac", split="train")
+
+         def preprocess(examples):
+             return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+         dataset = dataset.map(preprocess, batched=True)
+         dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+         model.eval()
+         with torch.no_grad():
+             predictions = []
+             labels = []
+
+             for batch in dataset:
+                 inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+                 label = batch['label'].to(device)
+
+                 outputs = model(**inputs)
+                 predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+                 labels.extend(label.cpu().tolist())
+
+         accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+         return accuracy
+     except Exception as e:
+         print(f"Error in TSAC evaluation: {str(e)}")
+         return 0.0

  def evaluate_tunisian_corpus_coverage(model, tokenizer):
      """Evaluate model's coverage on Tunisian Dialect Corpus"""
+     try:
+         dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
+
+         def preprocess(examples):
+             return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+         dataset = dataset.map(preprocess, batched=True)
+
+         # Calculate coverage based on tokenization
+         total_tokens = 0
+         covered_tokens = 0
+
+         for example in dataset:
+             tokens = tokenizer.tokenize(example['text'])
+             total_tokens += len(tokens)
+             covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+         coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+         return coverage
+     except Exception as e:
+         print(f"Error in Tunisian Corpus evaluation: {str(e)}")
+         return 0.0

  def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
      """Evaluate a single model on all tasks"""
      try:
+         print(f"------------ evaluation model {model_name}")
          # Load model and tokenizer
          device = "cuda" if torch.cuda.is_available() else "cpu"

@@ -119,18 +128,23 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:

  def process_evaluation_queue():
      """Process all pending evaluations in the queue"""
-     # Get all pending evaluations
+     # Get all pending evaluations (including nested directories)
      queue_dir = os.path.join(EVAL_REQUESTS_PATH)
-     pending_files = [
+     pending_files = []
+
+     # Walk through the directory tree
+     for root, dirs, files in os.walk(queue_dir):
+         pending_files.extend([os.path.join(root, f) for f in files if f.endswith('.json')])

-     for
-         file_path = os.path.join(queue_dir, file)
+     for file_path in pending_files:
          with open(file_path, 'r') as f:
              eval_request = json.load(f)

          if eval_request.get('status') != EvaluationStatus.PENDING.value:
              continue

+         print(f"Processing evaluation request: {file_path}")
+
          # Mark as running
          eval_request['status'] = EvaluationStatus.RUNNING.value
          with open(file_path, 'w') as f:

@@ -156,27 +170,57 @@ def process_evaluation_queue():
              json.dump(eval_request, f, indent=2)

          # Save to results dataset
+         # Extract username from model path if it exists
+         username = result.model.split('/')[0] if '/' in result.model else ''
+         result_filename = f"{result.model.split('/')[-1]}_{result.precision}.json"
+
+         if username:
+             # Create user directory if it doesn't exist
+             user_dir = os.path.join(EVAL_RESULTS_PATH, username)
+             os.makedirs(user_dir, exist_ok=True)
+             result_file = os.path.join(user_dir, result_filename)
+         else:
+             result_file = os.path.join(EVAL_RESULTS_PATH, result_filename)
+
+         # First, update the request file with the results
+         request_file = os.path.join(os.path.dirname(file_path), os.path.basename(file_path))
+         with open(file_path, 'r') as f:
+             request_data = json.load(f)
+
+         # Update request file with results and status
+         request_data['results'] = result.results
+         request_data['status'] = EvaluationStatus.FINISHED.value
+
+         with open(file_path, 'w') as f:
+             json.dump(request_data, f, indent=2)
+
+         # Now create the results file
          with open(result_file, 'w') as f:
              json.dump({
                  'model': result.model,
                  'revision': result.revision,
                  'precision': result.precision,
                  'weight_type': result.weight_type,
-                 'results': result.results
+                 'results': result.results,
+                 'config': {
+                     'model_name': result.model,
+                     'model_dtype': result.precision,
+                     'model_type': result.weight_type,
+                     'architecture': 'Unknown',
+                     'license': request_data.get('license', '?'),
+                     'likes': request_data.get('likes', 0),
+                     'num_params': request_data.get('params', 0),
+                     'date': request_data.get('submitted_time', datetime.now().strftime('%Y-%m-%d')),
+                     'still_on_hub': True
+                 }
              }, f, indent=2)

          # Upload to Hugging Face
          API.upload_file(
              path_or_fileobj=result_file,
-             path_in_repo=os.path.
+             path_in_repo=result_filename if not username else os.path.join(username, result_filename),
              repo_id=f"{OWNER}/results",
              repo_type="dataset",
              commit_message=f"Add evaluation results for {result.model}"
          )

- def main():
-     process_evaluation_queue()
-
- if __name__ == "__main__":
-     main()
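Both evaluation functions iterate over the formatted dataset one row at a time, so the name batch in evaluate_tsac_sentiment is really a single example. A hedged sketch (not part of this commit) of how the same pass could be batched with a DataLoader and dynamic padding, assuming the dataset was tokenized and formatted exactly as above:

import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

def predict_in_batches(model, tokenizer, dataset, device, batch_size=32):
    # Batched inference over a tokenized dataset; each batch is padded dynamically.
    collator = DataCollatorWithPadding(tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collator)
    predictions, labels = [], []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            # DataCollatorWithPadding renames 'label' to 'labels'.
            labels.extend(batch.pop("labels").tolist())
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
    return predictions, labels

The accuracy line at the end of evaluate_tsac_sentiment would stay the same; only the loop changes.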
src/leaderboard/read_evals.py
CHANGED

@@ -36,8 +36,9 @@ class EvalResult:
      def init_from_json_file(self, json_filepath):
          """Inits the result from the specific model result file"""
          with open(json_filepath) as fp:
+             print(json_filepath)
              data = json.load(fp)
-
+             print(data)
          config = data.get("config")

          # Precision
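The two added print calls are debug output that will hit the Space's stdout for every result file read. A hedged alternative (not in this commit) using the standard logging module, so the output can be filtered by level once the bug is found:

import json
import logging

logger = logging.getLogger(__name__)

def load_result_file(json_filepath):
    # Read one result file, logging the path and parsed payload at debug level.
    with open(json_filepath) as fp:
        logger.debug("reading eval result from %s", json_filepath)
        data = json.load(fp)
    logger.debug("parsed payload: %s", data)
    return data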
src/submission/submit.py
CHANGED

@@ -10,6 +10,12 @@ from src.submission.check_validity import (
      get_model_size,
      is_model_on_hub,
  )
+ from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult
+ from src.display.utils import Tasks
+ import torch
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ from datasets import load_dataset
+ import time

  REQUESTED_MODELS = None
  USERS_TO_SUBMISSION_DATES = None

@@ -114,6 +120,125 @@ def add_new_eval(
      # Remove the local file
      os.remove(out_path)

-
-
-
+     # Run evaluation immediately
+     print(f"Evaluating model {model}...")
+     try:
+         # Load model and tokenizer
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         model_obj = AutoModelForSequenceClassification.from_pretrained(
+             model,
+             revision=revision,
+             torch_dtype=getattr(torch, precision),
+             trust_remote_code=True
+         ).to(device)
+
+         tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
+
+         # Evaluate on TSAC
+         print("Evaluating on TSAC sentiment analysis...")
+         tsac_dataset = load_dataset("fbougares/tsac", split="test")
+
+         def preprocess_tsac(examples):
+             return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+         tsac_dataset = tsac_dataset.map(preprocess_tsac, batched=True)
+         tsac_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+         model_obj.eval()
+         with torch.no_grad():
+             predictions = []
+             labels = []
+
+             for batch in tsac_dataset:
+                 inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+                 label = batch['label'].to(device)
+
+                 outputs = model_obj(**inputs)
+                 predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+                 labels.extend(label.cpu().tolist())
+
+         tsac_accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+
+         # Evaluate on ArabML
+         print("Evaluating on ArabML Tunisian Corpus...")
+         arabml_dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="test")
+
+         def preprocess_arabml(examples):
+             return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+         arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)
+
+         total_tokens = 0
+         covered_tokens = 0
+
+         for example in arabml_dataset:
+             tokens = tokenizer.tokenize(example['text'])
+             total_tokens += len(tokens)
+             covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+         arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+
+         # Store results
+         eval_results = {
+             Tasks.tsac_sentiment.value.benchmark: tsac_accuracy,
+             Tasks.tunisian_corpus.value.benchmark: arabml_coverage
+         }
+
+         print(f"Evaluation results: {eval_results}")
+
+         # Update eval_entry with results
+         eval_entry["status"] = EvaluationStatus.FINISHED.value
+         eval_entry["results"] = eval_results
+
+         # Save to results dataset
+         results_file = os.path.join(EVAL_RESULTS_PATH, f"{model}_{revision}_{precision}_{weight_type}.json")
+         with open(results_file, 'w') as f:
+             json.dump({
+                 'model': model,
+                 'revision': revision,
+                 'precision': precision,
+                 'weight_type': weight_type,
+                 'results': eval_results
+             }, f, indent=2)
+
+         # Upload results to Hugging Face
+         API.upload_file(
+             path_or_fileobj=results_file,
+             path_in_repo=os.path.basename(results_file),
+             repo_id=RESULTS_REPO,
+             repo_type="dataset",
+             commit_message=f"Add evaluation results for {model}"
+         )
+
+         # Remove the original eval request file
+         os.remove(out_path)
+
+         return styled_message(
+             f"Model evaluation completed!\n\n"
+             f"TSAC Sentiment Accuracy: {tsac_accuracy:.2%}\n"
+             f"ArabML Corpus Coverage: {arabml_coverage:.2%}"
+         )
+
+     except Exception as e:
+         print(f"Error during evaluation: {str(e)}")
+         eval_entry["status"] = EvaluationStatus.FAILED.value
+         eval_entry["error"] = str(e)
+
+         with open(out_path, "w") as f:
+             f.write(json.dumps(eval_entry))
+
+         API.upload_file(
+             path_or_fileobj=out_path,
+             path_in_repo=out_path.split("eval-queue/")[1],
+             repo_id=QUEUE_REPO,
+             repo_type="dataset",
+             commit_message=f"Add {model} evaluation error",
+         )
+
+         os.remove(out_path)
+
+         return styled_error(
+             f"Error during evaluation: {str(e)}\n\n"
+             "The evaluation will be retried automatically later."
+         )
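add_new_eval now repeats the TSAC and corpus-coverage passes inline even though the same logic lives in src/evaluator/evaluate.py, and it already imports evaluate_model. A hedged sketch of how the inline block could delegate instead, assuming evaluate_model keeps the signature shown above and returns an EvaluationResult whose results attribute maps benchmark names to scores:

# Sketch only, not part of the commit: reuse the shared evaluator instead of
# duplicating dataset loading and scoring inside add_new_eval.
from src.evaluator.evaluate import evaluate_model

def run_submission_eval(model: str, revision: str, precision: str, weight_type: str) -> dict:
    result = evaluate_model(model, revision, precision, weight_type)
    return result.results

Delegating would keep the submit-time path and the queue path (process_evaluation_queue) scoring models the same way.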