import json
import os
import time
import traceback
from datetime import datetime, timezone
from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
from src.submission.check_validity import (
already_submitted_models,
check_model_card,
get_model_size,
is_model_on_hub,
)
from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult
from src.display.utils import Tasks
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None
def create_eval_request(
model: str,
base_model: str,
revision: str,
precision: str,
weight_type: str,
model_type: str,
):
"""Create and upload an evaluation request"""
try:
# Create evaluation request file
request_data = {
'model': model,
'base_model': base_model,
'revision': revision,
'precision': precision,
'weight_type': weight_type,
'model_type': model_type,
'status': EvaluationStatus.PENDING.value,
'submitted_time': datetime.now(timezone.utc).isoformat()
}
# Create filename
username = model.split('/')[0] if '/' in model else None
request_filename = f"{username or 'unknown'}_{model.replace('/', '_')}_eval_request_{revision}_{precision}_{weight_type}.json"
request_path = os.path.join(EVAL_REQUESTS_PATH, request_filename)
# Write request file
with open(request_path, 'w') as f:
json.dump(request_data, f, indent=2)
print(f"Created evaluation request: {request_filename}")
# Upload to Hugging Face
API.upload_file(
path_or_fileobj=request_path,
path_in_repo=request_filename if not username else os.path.join(username, request_filename),
repo_id=QUEUE_REPO,
repo_type="dataset",
commit_message=f"Add evaluation request for {model}",
token=TOKEN
)
print(f"Uploaded evaluation request to {QUEUE_REPO}")
return styled_message(
"Evaluation request created! Please wait for the evaluation to complete."
)
except Exception as e:
print(f"Error creating evaluation request: {str(e)}")
return styled_error(f"Failed to create evaluation request: {str(e)}")
def add_new_eval(
model: str,
base_model: str,
revision: str,
precision: str,
weight_type: str,
model_type: str,
):
"""Validate model and create evaluation request"""
try:
print("\n=== Starting evaluation submission ===")
print(f"Submission time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
print(f"Model: {model}")
print(f"Base model: {base_model}")
print(f"Revision: {revision}")
print(f"Precision: {precision}")
print(f"Weight type: {weight_type}")
print(f"Model type: {model_type}")
print(f"Evaluation requests path: {EVAL_REQUESTS_PATH}")
print(f"Queue repo: {QUEUE_REPO}")
# Always refresh the cache before checking for duplicates
print("\n=== Checking for duplicate submissions ===")
global REQUESTED_MODELS
global USERS_TO_SUBMISSION_DATES
start_time = time.time()
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
print(f"Cache refresh completed in {time.time() - start_time:.2f} seconds")
print(f"Found {len(REQUESTED_MODELS)} existing submissions")
user_name = ""
model_path = model
if "/" in model:
user_name = model.split("/")[0]
model_path = model.split("/")[1]
print(f"\nUser name: {user_name}")
print(f"Model path: {model_path}")
precision = precision.split(" ")[0]
if revision == "":
revision = "main"
print("Using default revision: main")
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
# Check if model is already submitted
print("\n=== Checking for existing submission ===")
model_key = f"{model}_{revision}_{precision}"
if model_key in REQUESTED_MODELS:
print(f"Found existing submission with key: {model_key}")
# Get the status from the queue file
queue_file = REQUESTED_MODELS[model_key]
try:
with open(queue_file, 'r') as f:
queue_entry = json.load(f)
status = queue_entry.get('status')
print(f"Found existing submission with status: {status}")
if status is None:
print(f"Warning: No status found in queue file {queue_file}")
return styled_warning("Error checking model status. Please try again later.")
if status != EvaluationStatus.FAILED.value:
print(f"Model already submitted and in {status} status")
return styled_warning(f"This model has been already submitted and is in {status} status.")
except Exception as e:
print(f"Error reading queue file: {e}")
print(f"Full traceback: {traceback.format_exc()}")
return styled_warning("Error checking model status. Please try again later.")
except Exception as e:
print(f"Error during evaluation: {str(e)}")
raise
print("\n=== Validating model type ===")
if model_type is None or model_type == "":
print("Error: Model type is missing")
return styled_error("Please select a model type.")
print("\n=== Validating model existence ===")
if revision == "":
revision = "main"
print("Using default revision: main")
print("\n=== Validating model on Hugging Face ===")
try:
if weight_type in ["Delta", "Adapter"]:
print(f"Checking base model {base_model} on Hugging Face...")
base_model_on_hub, error, _ = is_model_on_hub(
model_name=base_model,
revision=revision,
token=TOKEN,
test_tokenizer=True
)
print(f"Base model check result: {base_model_on_hub}")
if not base_model_on_hub:
print(f"Error: Base model not found: {error}")
return styled_error(f'Base model "{base_model}" {error}')
if not weight_type == "Adapter":
print(f"Checking model {model} on Hugging Face...")
model_on_hub, error, _ = is_model_on_hub(
model_name=model,
revision=revision,
token=TOKEN,
test_tokenizer=True
)
print(f"Model check result: {model_on_hub}")
if not model_on_hub:
print(f"Error: Model not found: {error}")
return styled_error(f'Model "{model}" {error}')
except Exception as e:
print(f"Error checking model on Hugging Face: {e}")
print(f"Full traceback: {traceback.format_exc()}")
return styled_error(f"Failed to validate model on Hugging Face: {str(e)}")
print("\n=== Getting model info ===")
try:
model_info = API.model_info(repo_id=model, revision=revision)
print(f"Successfully retrieved model info for {model}")
except Exception as e:
print(f"Error getting model info: {e}")
print(f"Full traceback: {traceback.format_exc()}")
return styled_error("Could not get your model information. Please fill it up properly.")
print("\n=== Getting model size ===")
try:
model_size = get_model_size(model_info=model_info, precision=precision)
print(f"Model size: {model_size}")
except Exception as e:
print(f"Error getting model size: {e}")
print(f"Full traceback: {traceback.format_exc()}")
model_size = "?"
print("\n=== Validating model card and license ===")
try:
license = model_info.cardData["license"]
print(f"Model license: {license}")
except Exception as e:
print(f"Error getting model license: {e}")
print(f"Full traceback: {traceback.format_exc()}")
return styled_error("Please select a license for your model")
print("\n=== Checking model card ===")
try:
modelcard_OK, error_msg = check_model_card(model)
print(f"Model card check result: {modelcard_OK}")
if not modelcard_OK:
print(f"Model card error: {error_msg}")
return styled_error(error_msg)
except Exception as e:
print(f"Error checking model card: {e}")
print(f"Full traceback: {traceback.format_exc()}")
return styled_error("Failed to validate model card")
print("\n=== Creating evaluation entry ===")
eval_entry = {
"model": model,
"base_model": base_model,
"revision": revision,
"precision": precision,
"weight_type": weight_type,
"status": "PENDING",
"submitted_time": current_time,
"model_type": model_type,
"likes": model_info.likes,
"params": model_size,
"license": license,
"private": False,
}
print(f"\nEvaluation entry created: {json.dumps(eval_entry, indent=2)}")
print("\n=== Checking for duplicate submission ===")
model_key = f"{model}_{revision}_{precision}"
if model_key in REQUESTED_MODELS:
print(f"Found existing submission with key: {model_key}")
# Get the status from the queue file
queue_file = REQUESTED_MODELS[model_key]
try:
with open(queue_file, 'r') as f:
queue_entry = json.load(f)
status = queue_entry.get('status')
print(f"Found existing submission with status: {status}")
if status is None:
print(f"Warning: No status found in queue file {queue_file}")
return styled_warning("Error checking model status. Please try again later.")
if status != EvaluationStatus.FAILED.value:
print(f"Model already submitted and in {status} status")
return styled_warning(f"This model has been already submitted and is in {status} status.")
except Exception as e:
print(f"Error reading queue file: {e}")
print(f"Full traceback: {traceback.format_exc()}")
return styled_warning("Error checking model status. Please try again later.")
print("\n=== Creating evaluation file ===")
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
print(f"Creating output directory: {OUT_DIR}")
os.makedirs(OUT_DIR, exist_ok=True)
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
print(f"Output file path: {out_path}")
# Write evaluation entry to file
try:
with open(out_path, "w") as f:
f.write(json.dumps(eval_entry))
print("\nEvaluation file created successfully")
# Upload to Hugging Face
print("\n=== Uploading evaluation file ===")
API.upload_file(
path_or_fileobj=out_path,
path_in_repo=out_path.split("eval-queue/")[1],
repo_id=QUEUE_REPO,
repo_type="dataset",
commit_message=f"Add evaluation request for {model}",
token=TOKEN
)
print(f"\nEvaluation request uploaded successfully to {QUEUE_REPO}")
# Clean up local file
os.remove(out_path)
print("\nLocal evaluation file removed")
return styled_message(
"Evaluation request created successfully! Please wait for the evaluation to complete."
)
except Exception as e:
print(f"Error during file operations: {str(e)}")
print(f"Full traceback: {traceback.format_exc()}")
return styled_error(f"Failed to create evaluation request: {str(e)}")
dataloader = DataLoader(tsac_dataset, batch_size=32, shuffle=False)
model_obj.eval()
with torch.no_grad():
predictions = []
targets = []
for batch in dataloader:
inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
target = batch['target'].to(device)
# Log the first batch details
if len(predictions) == 0: # Only log for the first batch
print(f"\nFirst batch example:")
print(f"Input keys: {list(inputs.keys())}")
print(f"Target shape: {target.shape}")
outputs = model_obj(**inputs)
print(f"\nModel output type: {type(outputs)}")
# Try to get logits from different possible formats
if isinstance(outputs, dict):
print(f"Output keys: {list(outputs.keys())}")
# Try different common keys
if 'logits' in outputs:
logits = outputs['logits']
elif 'prediction_logits' in outputs:
logits = outputs['prediction_logits']
else:
raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
elif isinstance(outputs, tuple):
print(f"Output tuple length: {len(outputs)}")
# Try different positions in the tuple
if len(outputs) > 0:
logits = outputs[0]
else:
raise ValueError("Empty output tuple")
else:
# If it's a single tensor, assume it's the logits
logits = outputs
print(f"Logits shape: {logits.shape}")
# For sequence classification, we typically use the [CLS] token's prediction
# Get the first token's prediction (CLS token)
cls_logits = logits[:, 0, :] # Shape: [batch_size, num_classes]
predictions.extend(cls_logits.argmax(dim=-1).cpu().tolist())
targets.extend(target.cpu().tolist())
accuracy = sum(p == t for p, t in zip(predictions, targets)) / len(predictions) if predictions else 0.0
eval_entry['results'] = {'accuracy': accuracy}
# Update the queue file with results
with open(out_path, "w") as f:
f.write(json.dumps(eval_entry))
# Evaluate on ArabML
print("Evaluating on ArabML Tunisian Corpus...")
arabml_dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train", trust_remote_code=True)
def preprocess_arabml(examples):
return tokenizer(examples['Tweet'], padding=True, truncation=True, max_length=512)
arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)
# Vocabulary coverage: fraction of corpus tokens that the tokenizer maps to
# something other than its unknown token.
total_tokens = 0
covered_tokens = 0
for example in arabml_dataset:
tokens = tokenizer.tokenize(example['Tweet'])
total_tokens += len(tokens)
covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
# Store results