#!/usr/bin/env python3
"""
Food101 evaluation script for model evaluation with MLflow tracking.
This script evaluates models on the Food101 dataset with MLflow experiment tracking.
"""
from typing import List, Dict, Tuple, Any, Optional, Union
from pathlib import Path
import mlflow
from datetime import datetime
import pandas as pd
import glob
import random
import dagshub
from src.models.food_classification_model import FoodClassificationModel
from src.labels import LABELS, index_to_label
dagshub.init(repo_owner="HubertWojcik10", repo_name="TikkaMasalAI", mlflow=True)
mlflow.autolog()
class Food101Evaluator:
"""Model evaluator for Food101 dataset with MLflow tracking."""
def __init__(
self,
model: FoodClassificationModel,
experiment_name: str = "food101_evaluation",
sample_limit: int = 50,
random_seed: int = 42,
        run_name: Optional[str] = None,
):
"""
Initialize the Food101 evaluator.
Args:
model: FoodClassificationModel instance to use for evaluation (required)
experiment_name: Name of the MLflow experiment
sample_limit: Maximum number of samples to evaluate
random_seed: Random seed for reproducible sampling
run_name: Custom name for the MLflow run (optional)
"""
self.DATASET_NAME = "Food101"
self.experiment_name = experiment_name
self.sample_limit = sample_limit
self.model = model
self.random_seed = random_seed
self.custom_run_name = run_name
self.model_name = self.model.__class__.__name__
self.data_dir = (
Path(__file__).parent.parent.parent / "data" / "raw" / "food101" / "data"
)
def load_validation_data(self) -> List[Tuple[bytes, int]]:
"""
Load validation data from parquet files with random sampling.
Returns:
List of tuples: (image_bytes, true_index)
"""
random.seed(self.random_seed)
validation_files = glob.glob(f"{self.data_dir}/validation-*.parquet")
print(f"Found {len(validation_files)} validation files")
# Load all samples first
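        # Each shard is expected to follow the Hugging Face Food101 parquet
        # layout: an "image" struct column holding raw bytes and an integer
        # "label" column with the class index.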
all_samples = []
for file_path in validation_files:
print(f"Loading from {Path(file_path).name}...")
df = pd.read_parquet(file_path)
for _, row in df.iterrows():
image_data = row["image"]["bytes"]
true_index = row["label"]
all_samples.append((image_data, true_index))
print(f"Total available samples: {len(all_samples)}")
# Randomly sample the requested number of samples
if len(all_samples) <= self.sample_limit:
selected_samples = all_samples
print(f"Using all {len(selected_samples)} available samples")
else:
selected_samples = random.sample(all_samples, self.sample_limit)
print(
f"Randomly selected {len(selected_samples)} samples from {len(all_samples)} available"
)
print(f"Random seed used: {self.random_seed}")
return selected_samples
def calculate_accuracy(
self, predictions: List[Union[int, str]], ground_truths: List[int]
) -> float:
"""
Calculate exact accuracy for Food101 dataset.
Args:
predictions: List of predicted indices or label names
ground_truths: List of true labels
Returns:
Accuracy score as float
"""
if not predictions or not ground_truths:
return 0.0
# Check for exact matches
exact_matches = 0
for pred, true in zip(predictions, ground_truths):
if pred == true:
exact_matches += 1
return exact_matches / len(predictions)
def evaluate_model(
self, samples: List[Tuple[bytes, int]], verbose: bool = True
) -> Dict[str, Any]:
"""
Evaluate the model on the provided samples.
Args:
samples: List of (image_bytes, true_index) tuples
verbose: Whether to print detailed results
Returns:
Dictionary with evaluation metrics
"""
print(f"\nEvaluating model on {len(samples)} samples...")
predictions = []
ground_truths = []
prediction_examples = []
correct_predictions = 0
for i, (image_bytes, true_index) in enumerate(samples):
try:
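                # classify() is assumed to return an integer Food101 class
                # index comparable to the integer label from the parquet data.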
predicted_index = self.model.classify(image_bytes)
predictions.append(predicted_index)
ground_truths.append(true_index)
# Check if prediction is correct using dataset-specific logic
is_correct = predicted_index == true_index
if is_correct:
correct_predictions += 1
# Convert index to label name for display and logging
predicted_label_name = index_to_label(predicted_index)
# Store first 10 examples for MLflow
if i < 10:
prediction_examples.append(
{
"sample_id": i + 1,
"true_label": LABELS[true_index],
"predicted_label": predicted_label_name,
"predicted_index": predicted_index,
"true_index": true_index,
"is_correct": is_correct,
}
)
if verbose and i < 10: # Print first 10 predictions
status = "βœ“" if is_correct else "βœ—"
print(
f"Sample {i+1:2d}: {status} True='{LABELS[true_index]:25s}' (idx: {true_index}) | Predicted='{predicted_label_name}' (idx: {predicted_index})"
)
except Exception as e:
print(f"Error processing sample {i+1}: {e}")
predictions.append("ERROR")
ground_truths.append(true_index)
# Calculate metrics
total_samples = len(samples)
successful_predictions = len([p for p in predictions if p != "ERROR"])
# Calculate accuracy using dataset-specific method
accuracy = self.calculate_accuracy(predictions, ground_truths)
success_rate = (
successful_predictions / total_samples if total_samples > 0 else 0
)
results = {
"total_samples": total_samples,
"successful_predictions": successful_predictions,
"correct_predictions": correct_predictions,
"success_rate": success_rate,
"accuracy": accuracy,
"prediction_examples": prediction_examples,
}
return results
def log_mlflow_metrics(self, results: Dict[str, Any]) -> None:
"""
Log evaluation metrics to MLflow.
Args:
results: The results from the evaluation.
"""
mlflow.log_metric("total_samples", results["total_samples"])
mlflow.log_metric("successful_predictions", results["successful_predictions"])
mlflow.log_metric("success_rate", results["success_rate"])
mlflow.log_metric("correct_predictions", results["correct_predictions"])
mlflow.log_metric("accuracy", results["accuracy"])
def log_mlflow_artifacts(self, results: Dict[str, Any]) -> None:
"""
Log evaluation artifacts to MLflow.
Args:
results: The results from the evaluation.
"""
examples_data = []
for example in results["prediction_examples"]:
status = "βœ“" if example.get("is_correct", False) else "βœ—"
examples_data.append(
f"Sample {example['sample_id']}: {status} {example['true_label']} -> {example['predicted_label']}"
)
examples_text = "\n".join(examples_data)
examples_file = f"{self.DATASET_NAME.lower()}_evaluation_examples.txt"
with open(examples_file, "w", encoding="utf-8", newline="") as f:
f.write(examples_text)
mlflow.log_artifact(examples_file)
        model_source = getattr(self.model, "model_path", "N/A")
summary = f"""{self.model_name} {self.DATASET_NAME} Evaluation Summary
========================================={'=' * len(self.DATASET_NAME)}
Model: {self.model_name} ({model_source})
Dataset: {self.DATASET_NAME} validation set
Samples: {results['total_samples']}
Success Rate: {results['success_rate']:.2%}
Accuracy: {results['accuracy']:.2%}
Correct Predictions: {results['correct_predictions']}
"""
summary_file = f"{self.DATASET_NAME.lower()}_evaluation_summary.txt"
with open(summary_file, "w", encoding="utf-8", newline="") as f:
f.write(summary)
mlflow.log_artifact(summary_file)
# Clean up temporary files
Path(examples_file).unlink(missing_ok=True)
Path(summary_file).unlink(missing_ok=True)
def run_evaluation(self) -> None:
"""Run the complete evaluation pipeline with MLflow tracking."""
print("=" * 60)
print(f"{self.model_name} {self.DATASET_NAME} Evaluation with MLflow")
print("=" * 60)
mlflow.set_experiment(self.experiment_name)
# Create descriptive run name
if self.custom_run_name:
run_name = self.custom_run_name
else:
timestamp = datetime.now().strftime("%m%d_%H%M")
run_name = f"{self.model_name}_Food101_n{self.sample_limit}_seed{self.random_seed}_{timestamp}"
with mlflow.start_run(run_name=run_name):
# Add useful tags for filtering and organization
mlflow.set_tag("model_type", self.model_name)
mlflow.set_tag("dataset", self.DATASET_NAME)
mlflow.set_tag("sample_size", str(self.sample_limit))
mlflow.set_tag("evaluation_type", "validation")
mlflow.log_param("model_name", self.model_name)
mlflow.log_param("model_class", self.model.__class__.__name__)
mlflow.log_param("dataset", self.DATASET_NAME)
mlflow.log_param("sample_limit", self.sample_limit)
mlflow.log_param("random_seed", self.random_seed)
mlflow.log_param("evaluation_date", datetime.now().isoformat())
# Log model-specific parameters if available
if hasattr(self.model, "model_path"):
mlflow.log_param(
"model_source", getattr(self.model, "model_path", "Unknown")
)
if hasattr(self.model, "preprocessor_path"):
mlflow.log_param(
"preprocessor_path",
getattr(self.model, "preprocessor_path", "Unknown"),
)
samples = self.load_validation_data()
if not samples:
print(
f"No validation samples loaded. Check the {self.DATASET_NAME} dataset connection."
)
mlflow.log_param("status", "failed - no data")
return
mlflow.log_param("samples_loaded", len(samples))
results = self.evaluate_model(samples, verbose=True)
self.log_mlflow_metrics(results)
self.log_mlflow_artifacts(results)
self._print_results(results)
def _print_results(self, results: Dict[str, Any]) -> None:
"""Print evaluation results to console."""
print("\n" + "=" * 60)
print("EVALUATION RESULTS")
print("=" * 60)
print(f"Total samples processed: {results['total_samples']}")
print(f"Successful predictions: {results['successful_predictions']}")
print(f"Success rate: {results['success_rate']:.2%}")
print(f"Correct predictions: {results['correct_predictions']}")
print(f"Accuracy: {results['accuracy']:.2%}")
print(f"\nMLflow run ID: {mlflow.active_run().info.run_id}")