Spaces: Runtime error
Commit: f12b6ec
Parent(s): f54d576
Added traceback import to handle error traces
Fixed TSAC evaluation:
- Added proper DataLoader with batch processing
- Improved error handling and logging
- Better handling of model output formats

Fixed Tunisian Corpus evaluation:
- Removed truncation to handle long sequences
- Improved token counting using input IDs
- Better error handling with a full traceback
The main issues were:
- The traceback import needed for error traces was missing
- The TSAC evaluation wasn't using proper batch processing (a batching sketch follows this list)
- The Tunisian Corpus evaluation was truncating long sequences
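For readers following along, here is a minimal, hypothetical sketch of the batched-evaluation pattern the TSAC fix introduces (the actual change is in the src/evaluator/evaluate.py diff further down). It assumes a tokenized dataset whose samples carry input_ids, attention_mask, and target tensors and a transformers sequence-classification model; the helper names make_dataloader and predict_in_batches are invented for illustration.

import torch
from torch.utils.data import DataLoader

def make_dataloader(dataset, batch_size=16):
    """Batch a tokenized dataset for evaluation (assumes tensor-valued fields)."""
    def collate_fn(batch):
        # Stack per-sample tensors into batch tensors
        return {
            "input_ids": torch.stack([s["input_ids"] for s in batch]),
            "attention_mask": torch.stack([s["attention_mask"] for s in batch]),
            "target": torch.stack([torch.tensor(s["target"]) for s in batch]),
        }
    return DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

def predict_in_batches(model, dataloader, device):
    """Collect argmax predictions and targets batch by batch."""
    predictions, targets = [], []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
            targets.extend(batch["target"].tolist())
    return predictions, targets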
Try running the evaluation again. The improvements should:
- Handle long sequences in the Tunisian Corpus (a token-coverage sketch follows this list)
- Process the TSAC evaluation in batches
- Provide better error messages
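As a rough illustration of the coverage change, the sketch below counts how many tokens survive tokenization without falling back to the unknown token, with truncation disabled so long Tunisian Corpus sequences are counted in full. The compute_coverage helper and the texts argument are hypothetical; truncation=False and convert_ids_to_tokens mirror the evaluate.py diff below.

def compute_coverage(texts, tokenizer):
    """Fraction of tokens the tokenizer represents without the UNK fallback."""
    total_tokens = 0
    covered_tokens = 0
    for text in texts:
        # No truncation: very long sequences are counted end to end
        encoding = tokenizer(text, padding=False, truncation=False)
        tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
        total_tokens += len(tokens)
        covered_tokens += sum(1 for t in tokens if t != tokenizer.unk_token)
    return covered_tokens / total_tokens if total_tokens else 0.0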
- app.py +28 -6
- scripts/fix_results.py +69 -0
- scripts/setup_env.py +18 -0
- src/evaluator/evaluate.py +81 -114
- src/leaderboard/read_evals.py +44 -9
- src/populate.py +7 -1
app.py
CHANGED
@@ -67,18 +67,40 @@ def restart_space():
 
 ### Space initialisation
 try:
-    print(
+    print(f"\n=== Starting space initialization ===")
+    print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
+    print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
+    print(f"QUEUE_REPO: {QUEUE_REPO}")
+    print(f"RESULTS_REPO: {RESULTS_REPO}")
+    print(f"TOKEN: {bool(TOKEN)}")
+
+    print("\n=== Downloading request files ===")
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
-
-
-    try:
-        print(EVAL_RESULTS_PATH)
+
+    print("\n=== Downloading results files ===")
     snapshot_download(
         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
-
+
+    print("\n=== Loading leaderboard data ===")
+    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+    print(f"Leaderboard DataFrame shape: {LEADERBOARD_DF.shape if LEADERBOARD_DF is not None else 'None'}")
+
+    print("\n=== Loading evaluation queue data ===")
+    (
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    print(f"Finished eval queue shape: {finished_eval_queue_df.shape if finished_eval_queue_df is not None else 'None'}")
+    print(f"Running eval queue shape: {running_eval_queue_df.shape if running_eval_queue_df is not None else 'None'}")
+    print(f"Pending eval queue shape: {pending_eval_queue_df.shape if pending_eval_queue_df is not None else 'None'}")
+
+except Exception as e:
+    print(f"\n=== Error during space initialization ===")
+    print(f"Error: {str(e)}")
     restart_space()
 
 
scripts/fix_results.py
ADDED
@@ -0,0 +1,69 @@
+import json
+import os
+from dotenv import load_dotenv
+from huggingface_hub import HfApi
+
+# Load environment variables
+load_dotenv()
+
+# Configuration
+HF_TOKEN = os.getenv("HF_TOKEN")
+RESULTS_REPO = "hamzabouajila/results"
+
+# Read the original results file
+def read_results_file(file_path):
+    with open(file_path, 'r') as f:
+        return json.load(f)
+
+# Fix the results format
+def fix_results_format(results):
+    # Fix null accuracy
+    if results['results'].get('accuracy') is None:
+        results['results']['accuracy'] = 0.0  # Replace with actual accuracy if known
+
+    # Fix model_type format
+    results['model_type'] = results['model_type'].replace('\ud83d\udfe2 : ', '').strip()
+
+    # Convert params to integer if needed
+    if isinstance(results.get('params'), float):
+        results['params'] = int(results['params'] * 1000000)  # Convert to millions
+
+    return results
+
+# Upload to Hugging Face
+def upload_to_hf(results, file_name):
+    api = HfApi(token=HF_TOKEN)
+    try:
+        api.upload_file(
+            path_or_fileobj=file_name,
+            path_in_repo=os.path.basename(file_name),
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add evaluation results for {results['model']}"
+        )
+        print(f"Successfully uploaded to Hugging Face")
+        return True
+    except Exception as e:
+        print(f"Error uploading to Hugging Face: {str(e)}")
+        return False
+
+if __name__ == "__main__":
+    # Original file path
+    original_file = "/teamspace/studios/this_studio/TunisianLeaderBoard/eval-results/tunis-ai/TunBERT_eval_request_False_float16_Original.json"
+
+    # Read and fix the results
+    results = read_results_file(original_file)
+    fixed_results = fix_results_format(results)
+
+    # Save the fixed version
+    fixed_file = "/teamspace/studios/this_studio/TunisianLeaderBoard/eval-results/tunis-ai/TunBERT_eval_request_False_float16_Original_fixed.json"
+    with open(fixed_file, 'w') as f:
+        json.dump(fixed_results, f, indent=2)
+
+    print(f"Fixed results saved to: {fixed_file}")
+
+    # Try to upload to Hugging Face
+    if HF_TOKEN:
+        upload_to_hf(fixed_results, fixed_file)
+    else:
+        print("No HF_TOKEN found. Skipping Hugging Face upload.")
scripts/setup_env.py
ADDED
@@ -0,0 +1,18 @@
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Set up paths if not already set
+if not os.getenv("EVAL_REQUESTS_PATH"):
+    os.environ["EVAL_REQUESTS_PATH"] = "./eval-queue"
+    print("Set EVAL_REQUESTS_PATH to ./eval-queue")
+
+if not os.getenv("EVAL_RESULTS_PATH"):
+    os.environ["EVAL_RESULTS_PATH"] = "./eval-results"
+    print("Set EVAL_RESULTS_PATH to ./eval-results")
+
+# Verify paths
+print(f"EVAL_REQUESTS_PATH: {os.getenv('EVAL_REQUESTS_PATH')}")
+print(f"EVAL_RESULTS_PATH: {os.getenv('EVAL_RESULTS_PATH')}")
src/evaluator/evaluate.py
CHANGED
@@ -7,6 +7,7 @@ from datetime import datetime
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from datasets import load_dataset
+import traceback
 
 from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.display.utils import Tasks
@@ -66,7 +67,30 @@ def evaluate_tsac_sentiment(model, tokenizer, device):
     predictions = []
     targets = []
 
-
+    # Create DataLoader with batch size 16
+    from torch.utils.data import DataLoader
+
+    # Define a custom collate function
+    def collate_fn(batch):
+        # Stack tensors for input_ids and attention_mask
+        input_ids = torch.stack([sample['input_ids'] for sample in batch])
+        attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
+        # Stack targets
+        targets = torch.stack([torch.tensor(sample['target']) for sample in batch])
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'target': targets
+        }
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=16,
+        shuffle=False,
+        collate_fn=collate_fn
+    )
+
+    for i, batch in enumerate(dataloader):
         if i == 0:
             print("\nProcessing first batch...")
             print(f"Batch keys: {list(batch.keys())}")
@@ -139,7 +163,12 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
         def preprocess(examples):
             print("Tunisian Corpus preprocess exemples -------------",examples)
             # Use 'Tweet' field as per dataset structure
-            return tokenizer(
+            return tokenizer(
+                examples['Tweet'],
+                padding=False,  # We don't need padding for token coverage
+                truncation=False,  # Don't truncate long sequences
+                max_length=None  # Let tokenizer handle the length
+            )
 
         dataset = dataset.map(preprocess, batched=True)
 
@@ -148,7 +177,11 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
         covered_tokens = 0
 
         for example in dataset:
-
+            # Get the tokenized input IDs
+            input_ids = example['input_ids']
+
+            # Convert to tokens and count
+            tokens = tokenizer.convert_ids_to_tokens(input_ids)
             total_tokens += len(tokens)
             covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
 
@@ -157,7 +190,8 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
         return {"coverage": coverage}
     except Exception as e:
         print(f"Error in Tunisian Corpus evaluation: {str(e)}")
-
+        print(f"Full traceback: {traceback.format_exc()}")
+        raise e
 
 def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
     """Evaluate a single model on all tasks"""
@@ -305,16 +339,17 @@ def process_evaluation_queue():
 
         # Find all JSON files in the model directory
        json_files = [f for f in os.listdir(model_dir_path) if f.endswith('.json')]
-        print(f"Found {len(json_files)}
-
+        print(f"Found {len(json_files)} pending evaluation requests")
        for file in json_files:
            file_path = os.path.join(model_dir_path, file)
+            print(f" - {file_path}")
            try:
                with open(file_path, 'r') as f:
                    eval_entry = json.load(f)
 
-                # Check if this is a pending evaluation
-
+                # Check if this is a pending or running evaluation
+                status = eval_entry.get('status', '')
+                if status == EvaluationStatus.PENDING.value:
                    print(f"\n=== Found pending evaluation ===")
                    print(f"Model: {eval_entry['model']}")
                    print(f"Revision: {eval_entry['revision']}")
@@ -409,115 +444,47 @@
                        print("\nError file uploaded to Hugging Face")
                    except Exception as upload_error:
                        print(f"Error uploading error file: {str(upload_error)}")
+                elif status == EvaluationStatus.RUNNING.value:
+                    print(f"\n=== Found running evaluation ===")
+                    print(f"Model: {eval_entry['model']}")
+                    print(f"Revision: {eval_entry['revision']}")
+                    print(f"Precision: {eval_entry['precision']}")
+                    print(f"Weight type: {eval_entry['weight_type']}")
+
+                    try:
+                        # Check if we have results for this evaluation
+                        result_filename = os.path.basename(file_path)
+                        result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
+
+                        if os.path.exists(result_path):
+                            print(f"\nFound existing results file: {result_path}")
+                            # Update status to FINISHED
+                            eval_entry['status'] = EvaluationStatus.FINISHED.value
+                            with open(file_path, 'w') as f:
+                                json.dump(eval_entry, f, indent=2)
+                        else:
+                            print("\nNo results found. Restarting evaluation...")
+                            # Restart the evaluation
+                            eval_entry['status'] = EvaluationStatus.PENDING.value
+                            with open(file_path, 'w') as f:
+                                json.dump(eval_entry, f, indent=2)
+                    except Exception as check_error:
+                        print(f"\n=== Error checking running evaluation ===")
+                        print(f"Error: {str(check_error)}")
+                        print(f"Full traceback: {traceback.format_exc()}")
+
+                        # If we can't check the status, restart the evaluation
+                        eval_entry['status'] = EvaluationStatus.PENDING.value
+                        with open(file_path, 'w') as f:
+                            json.dump(eval_entry, f, indent=2)
            except Exception as e:
                print(f"Error processing file {file}: {str(e)}")
                print(f"Full traceback: {traceback.format_exc()}")
-                pending_files.append(os.path.join(EVAL_REQUESTS_PATH, file))
-
-    print(f"Found {len(pending_files)} pending evaluation requests")
-    for file_path in pending_files:
-        print(f" - {file_path}")
-
-    if not pending_files:
-        print("No pending evaluation requests found")
-        return
-
-    for file_path in pending_files:
-        try:
-            print(f"\n=== Processing evaluation request: {file_path} ===")
-
-            # Read the file atomically
-            try:
-                with open(file_path, 'r') as f:
-                    eval_request = json.load(f)
-                print(f"Loaded evaluation request: {json.dumps(eval_request, indent=2)}")
-            except Exception as e:
-                print(f"Error reading evaluation request: {str(e)}")
-                continue
-
-            # Skip non-pending evaluations
-            status = eval_request.get('status', 'UNKNOWN')
-            if status != EvaluationStatus.PENDING.value:
-                print(f"Skipping non-pending evaluation (status: {status})")
-                continue
-
-            # Update status to RUNNING
-            eval_request['status'] = EvaluationStatus.RUNNING.value
-            print(f"Updating status to RUNNING for {eval_request['model']}")
-
-            # Write the update atomically
-            try:
-                with open(file_path, 'w') as f:
-                    json.dump(eval_request, f, indent=2)
-                print("Successfully updated status to RUNNING")
-            except Exception as e:
-                print(f"Error updating status: {str(e)}")
-                continue
-
-            # Get model info from request
-            model_name = eval_request.get('model', '')
-            revision = eval_request.get('revision', '')
-            precision = eval_request.get('precision', '')
-            weight_type = eval_request.get('weight_type', '')
-
-            if not model_name:
-                print("Error: Missing model name in evaluation request")
                continue
-
-            print(f"\n=== Evaluating model: {model_name} ===")
-            print(f"Revision: {revision}")
-            print(f"Precision: {precision}")
-            print(f"Weight type: {weight_type}")
-
-            result = evaluate_model(model_name, revision, precision, weight_type)
-
-            # Update status and save results
-            if result.error:
-                print(f"\n=== Evaluation failed ===")
-                print(f"Error: {result.error}")
-                eval_request['status'] = EvaluationStatus.FAILED.value
-                eval_request['error'] = result.error
-            else:
-                print(f"\n=== Evaluation completed successfully ===")
-                print(f"Results: {result.results}")
-                eval_request['status'] = EvaluationStatus.FINISHED.value
-                eval_request['results'] = result.results
-
-            # Write the final update atomically
-            try:
-                with open(file_path, 'w') as f:
-                    json.dump(eval_request, f, indent=2)
-                print("Successfully saved evaluation results")
-            except Exception as e:
-                print(f"Error saving evaluation results: {str(e)}")
-                continue
-
-            # Move successful evaluations to results directory
-            if eval_request['status'] == EvaluationStatus.FINISHED.value:
-                try:
-                    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
-                    result_file = os.path.join(EVAL_RESULTS_PATH, os.path.basename(file_path))
-                    os.rename(file_path, result_file)
-                    print(f"Moved evaluation results to: {result_file}")
-                except Exception as e:
-                    print(f"Error moving results file: {str(e)}")
-
-        except Exception as e:
-            print(f"\n=== Error processing evaluation: {str(e)} ===")
-            print(f"Full traceback: {traceback.format_exc()}")
-            continue
 
-
-
-
-
-
-                    path_in_repo=result_filename if not username else os.path.join(username, result_filename),
-                    repo_id=f"{OWNER}/results",
-                    repo_type="dataset",
-                    commit_message=f"Add evaluation results for {result.model}"
-                )
-                print("Successfully uploaded results to Hugging Face")
-            except Exception as e:
-                print(f"Error uploading results to Hugging Face: {str(e)}")
+    print(f"\n=== Evaluation queue summary ===")
+    print(f"Total directories checked: {len(model_dirs)}")
+    print(f"Total files processed: {len(json_files)}")
+    print(f"\nEvaluation queue processed. Sleeping for 5 minutes...")
+    return
 
src/leaderboard/read_evals.py
CHANGED
@@ -61,7 +61,7 @@ class EvalResult:
             model_type=ModelType.from_str(data.get('model_type', 'Unknown')),
             weight_type=WeightType.from_str(data.get('weight_type', 'Original')),
             date=data.get('submitted_at', ''),
-            still_on_hub=is_model_on_hub(model_name)
+            still_on_hub=is_model_on_hub(model_name, revision="main")
         )
     except Exception as e:
         print(f"Error reading evaluation file {json_filepath}: {str(e)}")
@@ -85,7 +85,7 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, revision=config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -151,7 +151,7 @@ class EvalResult:
             AutoEvalColumnInstance.license.name: self.license,
             AutoEvalColumnInstance.likes.name: self.likes,
             AutoEvalColumnInstance.params.name: self.num_params,
-            AutoEvalColumnInstance.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumnInstance.still_on_hub.name: True if isinstance(self.still_on_hub, tuple) and self.still_on_hub[0] else False,
         }
 
         for task in Tasks:
@@ -188,24 +188,28 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for root, _, files in os.walk(results_path):
         # Only process .json files
         json_files = [f for f in files if f.endswith(".json")]
+        print(json_files)
         for file in json_files:
            model_result_filepaths.append(os.path.join(root, file))
+    print(model_result_filepaths)
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
        try:
            # Creation of result
            eval_result = EvalResult.init_from_json_file(model_result_filepath)
+            # print(eval_result)
            if eval_result is None:
                print(f"Skipping invalid evaluation file: {model_result_filepath}")
                continue
 
            eval_result.update_with_request_file(requests_path)
-
+            # print(eval_result)
            # Store results of same eval together
            if eval_result.eval_name not in eval_results:
                eval_results[eval_result.eval_name] = []
            eval_results[eval_result.eval_name].append(eval_result)
+            # print(eval_results)
 
        except Exception as e:
            print(f"Error processing evaluation file {model_result_filepath}: {str(e)}")
@@ -214,16 +218,47 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
        # Store results of same eval together
        eval_name = eval_result.eval_name
        if eval_name in eval_results.keys():
-
+            # If we already have results for this eval, append to list
+            eval_results[eval_name].append(eval_result)
        else:
-
+            # Initialize list for this eval name
+            eval_results[eval_name] = [eval_result]
+
+    # Process final results
+    final_results = {}
+    for eval_name, eval_list in eval_results.items():
+        # Create merged results from all evaluations, ensuring all required task keys are present
+        merged_results = {task.value.benchmark: None for task in Tasks}
+        for eval_result in eval_list:
+            merged_results.update({k: v for k, v in eval_result.results.items() if v is not None})
+
+        # Take the first eval_result as base and update with merged results
+        print("evaluation list : ", eval_list)
+        base_result = eval_list[0]
+        # print(base_result)
+        final_results[eval_name] = EvalResult(
+            eval_name=eval_name,
+            full_model=base_result.full_model,
+            org=base_result.org,
+            model=base_result.model,
+            revision=base_result.revision,
+            results=merged_results,
+            precision=base_result.precision,
+            model_type=base_result.model_type,
+            weight_type=base_result.weight_type,
+            date=base_result.date,
+            still_on_hub=base_result.still_on_hub
+        )
+        print(final_results)
 
     results = []
-    for v in
+    for v in final_results.values():
+        print("v : ",v)
+        print("Merged results: ", v.results)
        try:
-            v.to_dict()
+            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError as e:  # not all eval values present
-            print(e)
+            print("error in v",e)
            continue
    return results
src/populate.py
CHANGED
@@ -11,14 +11,20 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    print(raw_data)
     all_data_json = [v.to_dict() for v in raw_data]
+    print(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
+    print(df)
     if df.empty:
         print("No evaluation results found. Returning empty DataFrame with correct columns.")
         return pd.DataFrame(columns=cols)
     df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
+    print(df)
     df = df[cols].round(decimals=2)
-
+    print(df)
+    # df = df[has_no_nan_values(df, benchmark_cols)]
+    # print(df)
     return df
 
 