hamzabouajila committed
Commit 9d7aae7 · 1 Parent(s): 7850eab

add evaluator integration

app.py CHANGED
@@ -26,6 +26,7 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult, Tasks
 
 
 def restart_space():
@@ -87,6 +88,28 @@ def init_leaderboard(dataframe):
         interactive=False,
     )
 
+# Add model evaluation functionality
+def evaluate_and_update(model_name, revision, precision, weight_type):
+    """Evaluate a model and update the leaderboard"""
+    try:
+        # Run evaluation
+        eval_result = evaluate_model(model_name, revision, precision, weight_type)
+
+        # Add evaluation to queue
+        add_new_eval(
+            model_name=model_name,
+            revision=revision,
+            precision=precision,
+            weight_type=weight_type,
+            results=eval_result.results
+        )
+
+        # Update leaderboard
+        LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+        return "Evaluation started successfully! Check the leaderboard for updates."
+    except Exception as e:
+        return f"Error during evaluation: {str(e)}"
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -98,6 +121,33 @@ with demo:
         leaderboard = init_leaderboard(LEADERBOARD_DF)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(INTRODUCTION_TEXT)
+            gr.Markdown(LLM_BENCHMARKS_TEXT)
+            gr.Markdown(EVALUATION_QUEUE_TEXT)
+
+        with gr.TabItem("🚀 Evaluate Model", elem_id="evaluate-tab", id=3):
+            with gr.Row():
+                model_name = gr.Textbox(label="Model Name")
+                revision = gr.Textbox(label="Revision", value="main")
+            with gr.Row():
+                precision = gr.Dropdown(
+                    choices=[p.value for p in Precision],
+                    label="Precision",
+                    value="fp32"
+                )
+                weight_type = gr.Dropdown(
+                    choices=[w.value for w in WeightType],
+                    label="Weight Type",
+                    value="pytorch"
+                )
+            evaluate_button = gr.Button("Evaluate Model")
+            status_output = gr.Textbox(label="Evaluation Status")
+
+            evaluate_button.click(
+                fn=evaluate_and_update,
+                inputs=[model_name, revision, precision, weight_type],
+                outputs=[status_output]
+            )
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
pyproject.toml CHANGED
@@ -20,6 +20,7 @@ dependencies = [
     "python-dateutil>=2.9.0.post0",
     "sentencepiece>=0.2.0",
     "tokenizers>=0.15.0",
+    "torch>=2.7.1",
     "tqdm>=4.67.1",
     "transformers>=4.53.1",
 ]
requirements.txt CHANGED
@@ -8,6 +8,8 @@ gradio_client
 huggingface-hub>=0.18.0
 matplotlib
 numpy
+torch>=2.0.0
+scikit-learn>=1.0.0
 pandas
 python-dateutil
 tqdm
src/evaluator/evaluate.py ADDED
@@ -0,0 +1,182 @@
+import json
+import os
+from typing import Dict, Any
+from dataclasses import dataclass
+from enum import Enum
+
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from datasets import load_dataset
+
+from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
+from src.display.utils import Tasks
+
+class EvaluationStatus(Enum):
+    PENDING = "PENDING"
+    RUNNING = "RUNNING"
+    FINISHED = "FINISHED"
+    FAILED = "FAILED"
+
+@dataclass
+class EvaluationResult:
+    model: str
+    revision: str
+    precision: str
+    weight_type: str
+    results: Dict[str, float]
+    error: str = None
+
+def evaluate_tsac_sentiment(model, tokenizer, device):
+    """Evaluate model on TSAC sentiment analysis task"""
+    dataset = load_dataset("fbougares/tsac", split="test")
+
+    def preprocess(examples):
+        return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+    dataset = dataset.map(preprocess, batched=True)
+    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+    model.eval()
+    with torch.no_grad():
+        predictions = []
+        labels = []
+
+        for batch in dataset:
+            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+            label = batch['label'].to(device)
+
+            outputs = model(**inputs)
+            predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
+            labels.extend(label.cpu().tolist())
+
+    accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(predictions)
+    return accuracy
+
+def evaluate_tunisian_corpus_coverage(model, tokenizer):
+    """Evaluate model's coverage on Tunisian Dialect Corpus"""
+    dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="test")
+
+    def preprocess(examples):
+        return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
+
+    dataset = dataset.map(preprocess, batched=True)
+
+    # Calculate coverage based on tokenization
+    total_tokens = 0
+    covered_tokens = 0
+
+    for example in dataset:
+        tokens = tokenizer.tokenize(example['text'])
+        total_tokens += len(tokens)
+        covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+    coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+    return coverage
+
+def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
+    """Evaluate a single model on all tasks"""
+    try:
+        # Load model and tokenizer
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        model = AutoModelForSequenceClassification.from_pretrained(
+            model_name,
+            revision=revision,
+            torch_dtype=getattr(torch, precision),
+            trust_remote_code=True
+        ).to(device)
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
+
+        # Run evaluations
+        results = {}
+
+        # TSAC Sentiment
+        tsac_result = evaluate_tsac_sentiment(model, tokenizer, device)
+        results[Tasks.tsac_sentiment.value.benchmark] = tsac_result
+
+        # Tunisian Corpus Coverage
+        corpus_result = evaluate_tunisian_corpus_coverage(model, tokenizer)
+        results[Tasks.tunisian_corpus.value.benchmark] = corpus_result
+
+        return EvaluationResult(
+            model=model_name,
+            revision=revision,
+            precision=precision,
+            weight_type=weight_type,
+            results=results
+        )
+
+    except Exception as e:
+        return EvaluationResult(
+            model=model_name,
+            revision=revision,
+            precision=precision,
+            weight_type=weight_type,
+            results={},
+            error=str(e)
+        )
+
+def process_evaluation_queue():
+    """Process all pending evaluations in the queue"""
+    # Get all pending evaluations
+    queue_dir = os.path.join(EVAL_REQUESTS_PATH)
+    pending_files = [f for f in os.listdir(queue_dir) if f.endswith('.json')]
+
+    for file in pending_files:
+        file_path = os.path.join(queue_dir, file)
+        with open(file_path, 'r') as f:
+            eval_request = json.load(f)
+
+        if eval_request.get('status') != EvaluationStatus.PENDING.value:
+            continue
+
+        # Mark as running
+        eval_request['status'] = EvaluationStatus.RUNNING.value
+        with open(file_path, 'w') as f:
+            json.dump(eval_request, f, indent=2)
+
+        # Perform evaluation
+        result = evaluate_model(
+            model_name=eval_request['model'],
+            revision=eval_request['revision'],
+            precision=eval_request['precision'],
+            weight_type=eval_request['weight_type']
+        )
+
+        # Save results
+        if result.error:
+            eval_request['status'] = EvaluationStatus.FAILED.value
+            eval_request['error'] = result.error
+        else:
+            eval_request['status'] = EvaluationStatus.FINISHED.value
+            eval_request['results'] = result.results
+
+        with open(file_path, 'w') as f:
+            json.dump(eval_request, f, indent=2)
+
+        # Save to results dataset
+        result_file = os.path.join(EVAL_RESULTS_PATH, f"{result.model}_{result.precision}.json")
+        with open(result_file, 'w') as f:
+            json.dump({
+                'model': result.model,
+                'revision': result.revision,
+                'precision': result.precision,
+                'weight_type': result.weight_type,
+                'results': result.results
+            }, f, indent=2)
+
+        # Upload to Hugging Face
+        API.upload_file(
+            path_or_fileobj=result_file,
+            path_in_repo=os.path.basename(result_file),
+            repo_id=f"{OWNER}/results",
+            repo_type="dataset",
+            commit_message=f"Add evaluation results for {result.model}"
+        )
+
+def main():
+    process_evaluation_queue()
+
+if __name__ == "__main__":
+    main()
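
Two practical details of this evaluator are worth flagging. First, torch_dtype=getattr(torch, precision) expects torch dtype attribute names such as "float32" or "float16", while the precision dropdown in app.py defaults to "fp32", which torch does not define, so that call would raise AttributeError. Second, iterating the tokenized datasets.Dataset row by row feeds un-batched 1-D tensors into the classifier, and labels.extend(label.cpu().tolist()) fails on scalar labels. A minimal batched sketch under the same assumption the committed code already makes (the TSAC test split exposes 'text' and 'label' columns):

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import DataCollatorWithPadding

# Map UI precision labels to torch dtypes (hypothetical helper),
# e.g. from_pretrained(..., torch_dtype=PRECISION_DTYPES.get(precision, torch.float32)).
PRECISION_DTYPES = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}

def evaluate_tsac_sentiment_batched(model, tokenizer, device, batch_size=16):
    dataset = load_dataset("fbougares/tsac", split="test")
    dataset = dataset.map(
        lambda ex: tokenizer(ex["text"], truncation=True, max_length=512),
        batched=True,
        remove_columns=[c for c in dataset.column_names if c != "label"],
    )
    # DataCollatorWithPadding pads each batch dynamically and renames "label" to "labels".
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=DataCollatorWithPadding(tokenizer))
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in loader:
            labels = batch.pop("labels")
            outputs = model(**{k: v.to(device) for k, v in batch.items()})
            preds = outputs.logits.argmax(dim=-1).cpu()
            correct += (preds == labels).sum().item()
            total += labels.numel()
    return correct / total if total else 0.0
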
src/evaluator/run_evaluator.py ADDED
@@ -0,0 +1,27 @@
+import time
+import sys
+import signal
+from src.evaluator.evaluate import process_evaluation_queue
+
+def signal_handler(sig, frame):
+    print("\nEvaluator shutting down...")
+    sys.exit(0)
+
+def main():
+    # Register signal handler for graceful shutdown
+    signal.signal(signal.SIGINT, signal_handler)
+
+    print("Starting evaluator service...")
+
+    while True:
+        try:
+            process_evaluation_queue()
+            print("Evaluation queue processed. Sleeping for 5 minutes...")
+            time.sleep(300)  # Sleep for 5 minutes
+        except Exception as e:
+            print(f"Error in evaluation process: {e}")
+            print("Retrying in 5 minutes...")
+            time.sleep(300)
+
+if __name__ == "__main__":
+    main()
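
The commit does not add a launcher or Space entry point for this worker; presumably it is started as a module from the repository root so the src package imports resolve (an assumption):

#   python -m src.evaluator.run_evaluator

Each pass re-reads every .json request under EVAL_REQUESTS_PATH and skips anything whose status is not PENDING, so restarting the loop should not re-run finished evaluations.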