import argparse
import json

from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

from geo_bot import GeoBot
from benchmark import MapGuesserBenchmark
from config import MODELS_CONFIG, DATA_PATHS, SUCCESS_THRESHOLD_KM
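# Note: the langchain chat-model classes above look unused, but agent_mode()
# resolves them by name via globals(), so they must remain imported.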


def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
    """
    Runs the AI agent in a benchmark loop over multiple samples,
    using multi-step exploration for each one.
    """
    print(
        f"Starting Agent Mode (as a benchmark): model={model_name}, steps={steps}, samples={samples}"
    )
    try:
        with open(DATA_PATHS["golden_labels"], "r", encoding="utf-8") as f:
            golden_labels = json.load(f).get("samples", [])
    except FileNotFoundError:
        print(f"Error: Golden labels file not found at {DATA_PATHS['golden_labels']}.")
        return

    if not golden_labels:
        print("Error: No samples found in golden_labels.json.")
        return

    num_to_test = min(samples, len(golden_labels))
    test_samples = golden_labels[:num_to_test]
    print(f"Will run on {len(test_samples)} samples.")
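    # Resolve the configured model class by name; when invoked via main(),
    # argparse already restricts --model to MODELS_CONFIG keys, so the lookup is safe.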
    config = MODELS_CONFIG.get(model_name)
    model_class = globals()[config["class"]]
    model_instance_name = config["model_name"]

    benchmark_helper = MapGuesserBenchmark(headless=True)
    all_results = []
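    # A single browser session is reused for every sample: each iteration loads
    # a stored location, cleans the page, then runs the multi-step agent loop.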
    with GeoBot(
        model=model_class, model_name=model_instance_name, headless=headless
    ) as bot:
        for i, sample in enumerate(test_samples):
            print(
                f"\n--- Running Sample {i + 1}/{len(test_samples)} (ID: {sample.get('id')}) ---"
            )

            # Correct sequence: load location data -> clean environment -> run agent loop
            if not bot.controller.load_location_from_data(sample):
                print(
                    f"Failed to load location for sample {sample.get('id')}. Skipping."
                )
                continue

            bot.controller.setup_clean_environment()
            final_guess = bot.run_agent_loop(max_steps=steps)
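            # Score the final guess against this sample's ground-truth coordinates.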
            true_coords = {"lat": sample.get("lat"), "lng": sample.get("lng")}
            distance_km = None
            is_success = False

            if final_guess:
                distance_km = benchmark_helper.calculate_distance(
                    true_coords, final_guess
                )
                if distance_km is not None:
                    is_success = distance_km <= SUCCESS_THRESHOLD_KM

                print(f"\nResult for Sample ID: {sample.get('id')}")
                print(
                    f" Ground Truth: Lat={true_coords['lat']:.4f}, Lon={true_coords['lng']:.4f}"
                )
                print(
                    f" Final Guess: Lat={final_guess[0]:.4f}, Lon={final_guess[1]:.4f}"
                )
                dist_str = f"{distance_km:.1f} km" if distance_km is not None else "N/A"
                print(f" Distance: {dist_str}, Success: {is_success}")
            else:
                print("Agent did not make a final guess for this sample.")
            all_results.append(
                {
                    "sample_id": sample.get("id"),
                    "model": bot.model_name,
                    "true_coordinates": true_coords,
                    "predicted_coordinates": final_guess,
                    "distance_km": distance_km,
                    "success": is_success,
                }
            )
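    # Aggregate the per-sample results into per-model summary statistics.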
    summary = benchmark_helper.generate_summary(all_results)
    if summary:
        print("\n\n--- Agent Benchmark Complete! Summary ---")
        for model, stats in summary.items():
            print(f"Model: {model}")
            print(f" Success Rate: {stats['success_rate'] * 100:.1f}%")
            print(f" Avg Distance: {stats['average_distance_km']:.1f} km")

    print("\nAgent Mode finished.")


def benchmark_mode(models: list, samples: int, headless: bool):
    """Runs the benchmark on pre-collected data."""
    print(f"Starting Benchmark Mode: models={models}, samples={samples}")

    benchmark = MapGuesserBenchmark(headless=headless)
    summary = benchmark.run_benchmark(models=models, max_samples=samples)

    if summary:
        print("\n--- Benchmark Complete! Summary ---")
        for model, stats in summary.items():
            print(f"Model: {model}")
            print(f" Success Rate: {stats['success_rate'] * 100:.1f}%")
            print(f" Avg Distance: {stats['average_distance_km']:.1f} km")


def main():
    parser = argparse.ArgumentParser(description="MapCrunch AI Agent & Benchmark")
    parser.add_argument(
        "--mode",
        choices=["agent", "benchmark"],
        default="agent",
        help="Operation mode.",
    )
    parser.add_argument(
        "--model",
        choices=list(MODELS_CONFIG.keys()),
        default="gpt-4o",
        help="Model to use.",
    )
    parser.add_argument(
        "--steps", type=int, default=10, help="[Agent] Number of exploration steps."
    )
    parser.add_argument(
        "--samples",
        type=int,
        default=50,
        help="Number of samples to process for the selected mode.",
    )
    parser.add_argument(
        "--headless", action="store_true", help="Run browser in headless mode."
    )
    parser.add_argument(
        "--models",
        nargs="+",
        choices=list(MODELS_CONFIG.keys()),
        help="[Benchmark] Models to benchmark.",
    )

    args = parser.parse_args()
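    # Dispatch: agent mode uses the single --model; benchmark mode prefers the
    # --models list and falls back to --model when it is not given.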
    if args.mode == "agent":
        agent_mode(
            model_name=args.model,
            steps=args.steps,
            headless=args.headless,
            samples=args.samples,
        )
    elif args.mode == "benchmark":
        benchmark_mode(
            models=args.models or [args.model],
            samples=args.samples,
            headless=args.headless,
        )


if __name__ == "__main__":
    main()
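
# Example invocations (assuming this script is saved as main.py and the data
# files and MODELS_CONFIG entries referenced above exist):
#   python main.py --mode agent --model gpt-4o --steps 10 --samples 5
#   python main.py --mode benchmark --samples 50 --headless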