Spaces:

ColamanAI
/

Map-anything-seg

Running

File size: 86,570 Bytes

b74998d

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.


import gc
import os
import shutil
import sys
import time
from datetime import datetime
from pathlib import Path
from collections import defaultdict
from typing import List, Dict, Tuple

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import cv2
import gradio as gr
import numpy as np
import spaces
import torch
import trimesh
from PIL import Image
from pillow_heif import register_heif_opener
from sklearn.cluster import DBSCAN

from mapanything.utils.geometry import depthmap_to_world_frame, points_to_normals
from mapanything.utils.hf_utils.css_and_html import (
    get_gradio_theme,
    GRADIO_CSS,
)
from mapanything.utils.hf_utils.hf_helpers import initialize_mapanything_model, initialize_mapanything_local
from mapanything.utils.hf_utils.viz import predictions_to_glb
from mapanything.utils.image import load_images, rgb

register_heif_opener()
sys.path.append("mapanything/")

# ============================================================================
# 全局配置
# ============================================================================

# MapAnything Configuration
high_level_config = {
    "path": "configs/train.yaml",
    "hf_model_name": "facebook/map-anything",
    "model_str": "mapanything",
    "config_overrides": [
        "machine=aws",
        "model=mapanything",
        "model/task=images_only",
        "model.encoder.uses_torch_hub=false",
    ],
    "checkpoint_name": "model.safetensors",
    "config_name": "config.json",
    "trained_with_amp": True,
    "trained_with_amp_dtype": "bf16",
    "data_norm_type": "dinov2",
    "patch_size": 14,
    "resolution": 518,
}

# GroundingDINO 配置 - 从 HuggingFace 加载
GROUNDING_DINO_MODEL_ID = "IDEA-Research/grounding-dino-tiny"
GROUNDING_DINO_BOX_THRESHOLD = 0.25
GROUNDING_DINO_TEXT_THRESHOLD = 0.2

# SAM 配置 - 使用 HuggingFace 的 SAM 模型
SAM_MODEL_ID = "facebook/sam-vit-huge"  # 或使用 "facebook/sam-vit-base" 更快更小

DEFAULT_TEXT_PROMPT = "window . table . sofa . tv . book . door"

# 通用物体列表（GroundingDINO 会检测图像中存在的物体）
COMMON_OBJECTS_PROMPT = (
    "person . face . hand . "
    "chair . sofa . couch . bed . table . desk . cabinet . shelf . drawer . "
    "door . window . wall . floor . ceiling . curtain . "
    "tv . monitor . screen . computer . laptop . keyboard . mouse . "
    "phone . tablet . remote . "
    "lamp . light . chandelier . "
    "book . magazine . paper . pen . pencil . "
    "bottle . cup . glass . mug . plate . bowl . fork . knife . spoon . "
    "vase . plant . flower . pot . "
    "clock . picture . frame . mirror . "
    "pillow . cushion . blanket . towel . "
    "bag . backpack . suitcase . "
    "box . basket . container . "
    "shoe . hat . coat . "
    "toy . ball . "
    "car . bicycle . motorcycle . bus . truck . "
    "tree . grass . sky . cloud . sun . "
    "dog . cat . bird . "
    "building . house . bridge . road . street . "
    "sign . pole . bench"
)

# V8: DBSCAN聚类配置
# 根据物体类型设置不同的聚类半径（eps）
DBSCAN_EPS_CONFIG = {
    'sofa': 1.5,      # 沙发：1.5米半径（大物体，同一个沙发的检测可能相距较远）
    'bed': 1.5,
    'couch': 1.5,
    'desk': 0.8,      # 桌子：0.8米半径（中等物体）
    'table': 0.8,
    'chair': 0.6,     # 椅子：0.6米（较小）
    'cabinet': 0.8,
    'window': 0.5,    # 窗户：0.5米（位置固定，聚类严格）
    'door': 0.6,
    'tv': 0.6,
    'default': 1.0    # 默认：1米
}

DBSCAN_MIN_SAMPLES = 1  # 最小样本数（设为1意味着单个检测也能成为一个簇）

ENABLE_VISUAL_FEATURES = False

# 分割质量控制
MIN_DETECTION_CONFIDENCE = 0.35        # 最低检测置信度（过滤误检测）
MIN_MASK_AREA = 100                    # 最小mask面积（像素）

# 匹配分数计算配置（用于备用匹配算法）
MATCH_3D_DISTANCE_THRESHOLD = 2.5      # 3D距离阈值（米）

# 全局模型变量
model = None
grounding_dino_model = None
grounding_dino_processor = None
sam_predictor = None


# ============================================================================
# 分割模型加载
# ============================================================================

def load_grounding_dino_model(device):
    """加载 GroundingDINO 模型 - 从 HuggingFace"""
    global grounding_dino_model, grounding_dino_processor
    
    if grounding_dino_model is not None:
        print("✅ GroundingDINO 已加载")
        return
    
    try:
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
        
        print(f"📥 从 HuggingFace 加载 GroundingDINO: {GROUNDING_DINO_MODEL_ID}")
        grounding_dino_processor = AutoProcessor.from_pretrained(GROUNDING_DINO_MODEL_ID)
        grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
            GROUNDING_DINO_MODEL_ID
        ).to(device).eval()
        
        print("✅ GroundingDINO 加载成功")
        
    except Exception as e:
        print(f"❌ GroundingDINO 加载失败: {e}")
        import traceback
        traceback.print_exc()


def load_sam_model(device):
    """加载 SAM 模型 - 从 HuggingFace"""
    global sam_predictor
    
    if sam_predictor is not None:
        print("✅ SAM 已加载")
        return
    
    try:
        from transformers import SamModel, SamProcessor
        
        print(f"📥 从 HuggingFace 加载 SAM: {SAM_MODEL_ID}")
        sam_model = SamModel.from_pretrained(SAM_MODEL_ID).to(device).eval()
        sam_processor = SamProcessor.from_pretrained(SAM_MODEL_ID)
        
        # 将模型和处理器存储为全局变量
        sam_predictor = {'model': sam_model, 'processor': sam_processor}
        print("✅ SAM 加载成功")
            
    except Exception as e:
        print(f"❌ SAM 加载失败: {e}")
        print("   SAM 功能将被禁用，将使用边界框作为mask")
        import traceback
        traceback.print_exc()


# ============================================================================
# 分割功能
# ============================================================================


def generate_distinct_colors(n):
    """生成 N 个视觉上区分度高的颜色（RGB，0-255）"""
    import colorsys
    if n == 0:
        return []
    
    colors = []
    for i in range(n):
        hue = i / max(n, 1)
        rgb = colorsys.hsv_to_rgb(hue, 0.9, 0.95)
        rgb_color = tuple(int(c * 255) for c in rgb)
        colors.append(rgb_color)
    
    return colors


def run_grounding_dino_detection(image_np, text_prompt, device):
    """使用 GroundingDINO 进行检测"""
    if grounding_dino_model is None or grounding_dino_processor is None:
        print("⚠️ GroundingDINO 未加载")
        return []
    
    try:
        print(f"🔍 GroundingDINO 检测: {text_prompt}")
        
        # 转换为 PIL Image
        if image_np.dtype == np.uint8:
            pil_image = Image.fromarray(image_np)
        else:
            pil_image = Image.fromarray((image_np * 255).astype(np.uint8))
        
        # 预处理
        inputs = grounding_dino_processor(images=pil_image, text=text_prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # 推理
        with torch.no_grad():
            outputs = grounding_dino_model(**inputs)
        
        # 后处理
        results = grounding_dino_processor.post_process_grounded_object_detection(
            outputs,
            inputs["input_ids"],
            threshold=GROUNDING_DINO_BOX_THRESHOLD,
            text_threshold=GROUNDING_DINO_TEXT_THRESHOLD,
            target_sizes=[pil_image.size[::-1]]
        )[0]
        
        # 转换为统一格式
        detections = []
        boxes = results["boxes"].cpu().numpy()
        scores = results["scores"].cpu().numpy()
        labels = results["labels"]
        
        print(f"✅ 检测到 {len(boxes)} 个物体")
        
        for box, score, label in zip(boxes, scores, labels):
            detection = {
                'bbox': box.tolist(),  # [x1, y1, x2, y2]
                'label': label,
                'confidence': float(score)
            }
            detections.append(detection)
            print(f"   - {label}: {score:.2f}")
        
        return detections
        
    except Exception as e:
        print(f"❌ GroundingDINO 检测失败: {e}")
        import traceback
        traceback.print_exc()
        return []


def run_sam_refinement(image_np, boxes):
    """使用 SAM 精确分割 - HuggingFace Transformers 版本"""
    if sam_predictor is None:
        print("⚠️ SAM 未加载，使用 bbox 作为 mask")
        # 使用 bbox 创建简单的矩形 mask
        masks = []
        h, w = image_np.shape[:2]
        for box in boxes:
            x1, y1, x2, y2 = map(int, box)
            mask = np.zeros((h, w), dtype=bool)
            mask[y1:y2, x1:x2] = True
            masks.append(mask)
        return masks
    
    try:
        print(f"🎯 SAM 精确分割 {len(boxes)} 个区域...")
        
        from PIL import Image
        sam_model = sam_predictor['model']
        sam_processor = sam_predictor['processor']
        device = sam_model.device
        
        # 转换为 PIL Image
        if image_np.dtype == np.uint8:
            pil_image = Image.fromarray(image_np)
        else:
            pil_image = Image.fromarray((image_np * 255).astype(np.uint8))
        
        masks = []
        for box in boxes:
            x1, y1, x2, y2 = map(int, box)
            input_boxes = [[[x1, y1, x2, y2]]]  # SAM 需要的格式
            
            # 处理输入
            inputs = sam_processor(pil_image, input_boxes=input_boxes, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            # 推理
            with torch.no_grad():
                outputs = sam_model(**inputs)
            
            # 后处理获取mask
            pred_masks = sam_processor.image_processor.post_process_masks(
                outputs.pred_masks.cpu(),
                inputs["original_sizes"].cpu(),
                inputs["reshaped_input_sizes"].cpu()
            )[0][0][0]  # 取第一个mask
            
            masks.append(pred_masks.numpy() > 0.5)
        
        print(f"✅ SAM 分割完成")
        return masks
        
    except Exception as e:
        print(f"❌ SAM 分割失败: {e}")
        import traceback
        traceback.print_exc()
        # Fallback to bbox masks
        masks = []
        h, w = image_np.shape[:2]
        for box in boxes:
            x1, y1, x2, y2 = map(int, box)
            mask = np.zeros((h, w), dtype=bool)
            mask[y1:y2, x1:x2] = True
            masks.append(mask)
        return masks


def normalize_label(label):
    """规范化标签，提取主要类别

    

    例如: 'sofa bed' -> 'sofa', 'desk cabinet' -> 'desk', 'table desk' -> 'table'

          'windows' -> 'window', 'chairs' -> 'chair' (单复数转换)

    """
    label = label.strip().lower()
    
    # 优先级顺序（从高到低）
    priority_labels = ['sofa', 'bed', 'table', 'desk', 'chair', 'cabinet', 'window', 'door']
    
    # 查找标签中是否包含优先级类别
    for priority in priority_labels:
        if priority in label:
            return priority
    
    # 如果没有匹配，返回第一个词
    first_word = label.split()[0] if label else label
    
    # 处理常见复数形式 -> 单数
    if first_word.endswith('s') and len(first_word) > 1:
        singular = first_word[:-1]  # 去掉末尾的 's'
        # 特殊复数规则
        if first_word.endswith('sses'):  # glasses -> glass
            singular = first_word[:-2]
        elif first_word.endswith('ies'):  # cherries -> cherry
            singular = first_word[:-3] + 'y'
        elif first_word.endswith('ves'):  # shelves -> shelf
            singular = first_word[:-3] + 'f'
        
        # 返回单数形式
        return singular
    
    return first_word


def labels_match(label1, label2):
    """判断两个标签是否匹配（支持模糊匹配）

    

    例如: 'sofa' 和 'sofa bed' 匹配

          'desk' 和 'table desk' 匹配

    """
    norm1 = normalize_label(label1)
    norm2 = normalize_label(label2)
    return norm1 == norm2


def compute_object_3d_center(points, mask):
    """计算物体的 3D 中心点"""
    masked_points = points[mask]
    if len(masked_points) == 0:
        return None
    return np.median(masked_points, axis=0)


def compute_3d_bbox_iou(center1, size1, center2, size2):
    """计算两个3D边界框的IoU"""
    try:
        # 计算边界框范围 [min, max]
        min1 = center1 - size1 / 2
        max1 = center1 + size1 / 2
        min2 = center2 - size2 / 2
        max2 = center2 + size2 / 2
        
        # 计算交集
        inter_min = np.maximum(min1, min2)
        inter_max = np.minimum(max1, max2)
        inter_size = np.maximum(0, inter_max - inter_min)
        inter_volume = np.prod(inter_size)
        
        # 计算并集
        volume1 = np.prod(size1)
        volume2 = np.prod(size2)
        union_volume = volume1 + volume2 - inter_volume
        
        if union_volume == 0:
            return 0.0
        
        return inter_volume / union_volume
    except:
        return 0.0


def compute_2d_mask_iou(mask1, mask2):
    """计算两个2D mask的IoU"""
    try:
        intersection = np.logical_and(mask1, mask2).sum()
        union = np.logical_or(mask1, mask2).sum()
        if union == 0:
            return 0.0
        return intersection / union
    except:
        return 0.0


def extract_visual_features(image, mask, encoder):
    """提取mask区域的视觉特征（使用DINOv2）

    

    Args:

        image: [H, W, 3] float32 in [0, 1] or uint8 in [0, 255]

        mask: [H, W] bool

        encoder: DINOv2 encoder model

    

    Returns:

        feature vector (1D numpy array) or None if failed

    """
    try:
        # 将mask区域裁剪出来
        coords = np.argwhere(mask)
        if len(coords) == 0:
            return None
        
        y_min, x_min = coords.min(axis=0)
        y_max, x_max = coords.max(axis=0)
        
        # 确保裁剪区域有效
        if y_max <= y_min or x_max <= x_min:
            return None
        
        # 裁剪并resize到224x224
        cropped = image[y_min:y_max+1, x_min:x_max+1]
        
        # 确保是 uint8 格式
        if cropped.dtype == np.float32 or cropped.dtype == np.float64:
            if cropped.max() <= 1.0:
                cropped = (cropped * 255).astype(np.uint8)
            else:
                cropped = cropped.astype(np.uint8)
        
        from PIL import Image
        import torchvision.transforms as T
        
        pil_img = Image.fromarray(cropped)
        pil_img = pil_img.resize((224, 224), Image.BILINEAR)
        
        # 转换为tensor
        transform = T.Compose([
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        
        # 获取encoder的设备
        try:
            device = next(encoder.parameters()).device
        except:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        img_tensor = transform(pil_img).unsqueeze(0).to(device)  # [1, 3, 224, 224]
        
        # 提取特征 - 使用 encoder 的前向传播
        with torch.no_grad():
            # 不同的encoder可能有不同的调用方式
            if hasattr(encoder, 'forward_features'):
                # 如果有 forward_features 方法（标准 DINOv2）
                features = encoder.forward_features(img_tensor)
            else:
                # 否则直接调用（DINOv2Encoder 只需要 input tensor）
                features = encoder(img_tensor)
            
            # 如果 features 不是 tensor，尝试转换
            if not isinstance(features, torch.Tensor):
                if isinstance(features, dict):
                    # 如果返回字典，尝试获取 'x' 或 'last_hidden_state'
                    features = features.get('x', features.get('last_hidden_state', None))
                    if features is None:
                        return None
                elif hasattr(features, 'data'):
                    # 如果是某种包装对象，尝试获取 data 属性
                    features = features.data
                else:
                    # 无法处理，返回 None
                    return None
            
            # 确保 features 是 tensor
            if not isinstance(features, torch.Tensor):
                return None
            
            # 确保是 4D tensor: [B, C, H, W] 或 3D: [B, N, C] 或 2D: [B, C]
            if len(features.shape) == 4:
                # [B, C, H, W] -> Global average pooling
                features = features.mean(dim=[2, 3])  # [B, C]
            elif len(features.shape) == 3:
                # [B, N, C] -> 取平均 or 取 CLS token
                features = features.mean(dim=1)  # [B, C]
            elif len(features.shape) == 2:
                # [B, C] -> 已经是我们需要的格式
                pass
            else:
                # 不支持的 shape
                return None
            
            # L2 normalize
            features = features / (features.norm(dim=1, keepdim=True) + 1e-8)
        
        return features.cpu().numpy()[0]
        
    except Exception as e:
        import traceback
        print(f"     ⚠️ 特征提取失败: {type(e).__name__}: {e}")
        print(f"        调用栈:\n{traceback.format_exc()}")  # 显示完整堆栈
        return None


def compute_feature_similarity(feat1, feat2):
    """计算特征相似度（余弦相似度）"""
    if feat1 is None or feat2 is None:
        return 0.0
    try:
        return np.dot(feat1, feat2)
    except:
        return 0.0


def compute_match_score(obj1, obj2, weights={'distance': 0.5, 'iou_3d': 0.25, 'iou_2d': 0.15, 'feature': 0.1}):
    """计算综合匹配分数（0-1）

    

    动态调整权重：如果某个准则不可用，将其权重重新分配给其他准则

    """
    scores = {}
    available_criteria = []
    
    # 1. 3D距离分数（距离越近，分数越高）
    if obj1.get('center_3d') is not None and obj2.get('center_3d') is not None:
        distance = np.linalg.norm(obj1['center_3d'] - obj2['center_3d'])
        scores['distance'] = max(0, 1 - distance / MATCH_3D_DISTANCE_THRESHOLD)
        available_criteria.append('distance')
    else:
        scores['distance'] = 0.0
    
    # 2. 3D IoU分数
    if obj1.get('bbox_3d') is not None and obj2.get('bbox_3d') is not None:
        scores['iou_3d'] = compute_3d_bbox_iou(
            obj1['bbox_3d']['center'], obj1['bbox_3d']['size'],
            obj2['bbox_3d']['center'], obj2['bbox_3d']['size']
        )
        available_criteria.append('iou_3d')
    else:
        scores['iou_3d'] = 0.0
    
    # 3. 2D IoU分数
    if obj1.get('mask_2d') is not None and obj2.get('mask_2d') is not None:
        scores['iou_2d'] = compute_2d_mask_iou(obj1['mask_2d'], obj2['mask_2d'])
        available_criteria.append('iou_2d')
    else:
        scores['iou_2d'] = 0.0
    
    # 4. 视觉特征相似度
    if obj1.get('visual_feature') is not None and obj2.get('visual_feature') is not None:
        scores['feature'] = compute_feature_similarity(obj1['visual_feature'], obj2['visual_feature'])
        available_criteria.append('feature')
    else:
        scores['feature'] = 0.0
    
    # 动态调整权重：只使用可用的准则
    if len(available_criteria) == 0:
        return 0.0, scores
    
    # 重新归一化权重
    total_available_weight = sum(weights[k] for k in available_criteria)
    if total_available_weight == 0:
        return 0.0, scores
    
    adjusted_weights = {k: weights[k] / total_available_weight for k in available_criteria}
    
    # 加权求和
    total_score = sum(scores[k] * adjusted_weights[k] for k in available_criteria)
    
    return total_score, scores


def compute_adaptive_eps(centers, base_eps):
    """自适应计算eps值

    

    根据物体的3D位置分布自动调整eps：

    - 如果物体很分散，增大eps（避免过度分割）

    - 如果物体很集中，使用默认eps

    """
    if len(centers) <= 1:
        return base_eps
    
    # 计算所有点之间的距离
    from scipy.spatial.distance import pdist
    distances = pdist(centers)
    
    if len(distances) == 0:
        return base_eps
    
    # 使用中位数距离作为参考
    median_dist = np.median(distances)
    
    # 自适应策略：如果中位数距离很大，说明物体分散，增大eps
    # 如果中位数距离很小，说明物体集中，保持或减小eps
    if median_dist > base_eps * 2:
        # 物体非常分散，大幅增大eps（可能是同一物体的多视图检测）
        adaptive_eps = min(median_dist * 0.6, base_eps * 2.5)
    elif median_dist > base_eps:
        # 物体较分散，适度增大eps
        adaptive_eps = median_dist * 0.5
    else:
        # 物体集中，使用默认eps
        adaptive_eps = base_eps
    
    return adaptive_eps


def match_objects_across_views(all_view_detections):
    """跨视图匹配相同物体（V8增强版：自适应DBSCAN聚类）

    

    V8增强版改进：

    - 自适应eps：根据物体分布自动调整聚类半径

    - 智能合并：聚类后再检查是否有明显重复的簇

    - 置信度加权：使用置信度加权计算簇中心

    

    Args:

        all_view_detections: List[List[Dict]], 每个视图的检测结果

        

    Returns:

        object_id_map: Dict[view_idx][det_idx] = global_object_id

        unique_objects: List[Dict] - 唯一物体列表

    """
    print("\n🔗 V8增强版: 自适应DBSCAN聚类匹配物体...")
    
    # 收集所有检测，按标签分组
    objects_by_label = defaultdict(list)
    
    for view_idx, detections in enumerate(all_view_detections):
        for det_idx, det in enumerate(detections):
            # 只处理有3D中心的物体
            if det.get('center_3d') is None:
                continue
            
            norm_label = normalize_label(det['label'])
            objects_by_label[norm_label].append({
                'view_idx': view_idx,
                'det_idx': det_idx,
                'label': det['label'],
                'norm_label': norm_label,
                'center_3d': det['center_3d'],
                'confidence': det['confidence'],
                'bbox_3d': det.get('bbox_3d'),
            })
    
    if len(objects_by_label) == 0:
        return {}, []
    
    # V8: 对每种物体类别分别进行DBSCAN聚类
    object_id_map = defaultdict(dict)
    unique_objects = []
    next_global_id = 0
    
    for norm_label, objects in objects_by_label.items():
        print(f"\n   📦 处理 {norm_label}: {len(objects)} 个检测")
        
        # 如果只有1个检测，直接作为1个物体
        if len(objects) == 1:
            obj = objects[0]
            unique_objects.append({
                'global_id': next_global_id,
                'label': obj['label'],
                'views': [(obj['view_idx'], obj['det_idx'])],
                'center_3d': obj['center_3d'],
            })
            object_id_map[obj['view_idx']][obj['det_idx']] = next_global_id
            next_global_id += 1
            print(f"      → 1个簇（单独检测）")
            continue
        
        # 提取3D中心点坐标
        centers = np.array([obj['center_3d'] for obj in objects])
        
        # 获取该类型的基础聚类半径
        base_eps = DBSCAN_EPS_CONFIG.get(norm_label, DBSCAN_EPS_CONFIG.get('default', 1.0))
        
        # 🔥 V8增强：自适应计算eps
        eps = compute_adaptive_eps(centers, base_eps)
        
        # DBSCAN聚类
        clustering = DBSCAN(eps=eps, min_samples=DBSCAN_MIN_SAMPLES, metric='euclidean')
        cluster_labels = clustering.fit_predict(centers)
        
        # 统计簇
        n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
        n_noise = list(cluster_labels).count(-1)
        
        if eps != base_eps:
            print(f"      → {n_clusters} 个簇 (基础eps={base_eps}m → 自适应eps={eps:.2f}m)")
        else:
            print(f"      → {n_clusters} 个簇 (eps={eps}m)")
        if n_noise > 0:
            print(f"      ⚠️  {n_noise} 个噪声点（孤立检测）")
        
        # 调试：显示每个簇的详细信息
        for cluster_id in sorted(set(cluster_labels)):
            if cluster_id == -1:
                continue
            cluster_objs = [objects[i] for i, label in enumerate(cluster_labels) if label == cluster_id]
            cluster_centers = [obj['center_3d'] for obj in cluster_objs]
            cluster_views = [f"V{obj['view_idx']+1}" for obj in cluster_objs]
            
            # 计算簇内最大距离
            max_dist = 0
            if len(cluster_centers) > 1:
                from scipy.spatial.distance import pdist
                distances = pdist(np.array(cluster_centers))
                max_dist = distances.max() if len(distances) > 0 else 0
            
            print(f"         簇 {cluster_id}: {len(cluster_objs)} 个检测 (来自视图: {', '.join(cluster_views)}, 最大簇内距离: {max_dist:.2f}m)")
        
        # 为每个簇创建一个全局物体
        cluster_to_global_id = {}
        
        for cluster_id in set(cluster_labels):
            if cluster_id == -1:
                # 噪声点，每个单独成为一个物体
                for i, label in enumerate(cluster_labels):
                    if label == -1:
                        obj = objects[i]
                        unique_objects.append({
                            'global_id': next_global_id,
                            'label': obj['label'],
                            'views': [(obj['view_idx'], obj['det_idx'])],
                            'center_3d': obj['center_3d'],
                        })
                        object_id_map[obj['view_idx']][obj['det_idx']] = next_global_id
                        next_global_id += 1
            else:
                # 正常簇
                cluster_objects = [objects[i] for i, label in enumerate(cluster_labels) if label == cluster_id]
                
                # 计算簇的中心（加权平均，权重为置信度）
                total_conf = sum(o['confidence'] for o in cluster_objects)
                weighted_center = sum(o['center_3d'] * o['confidence'] for o in cluster_objects) / total_conf
                
                # 创建全局物体
                unique_objects.append({
                    'global_id': next_global_id,
                    'label': cluster_objects[0]['label'],
                    'views': [(o['view_idx'], o['det_idx']) for o in cluster_objects],
                    'center_3d': weighted_center,
                })
                
                # 映射所有检测到这个全局ID
                for obj in cluster_objects:
                    object_id_map[obj['view_idx']][obj['det_idx']] = next_global_id
                
                print(f"         簇 {cluster_id}: {len(cluster_objects)} 个检测合并")
                
                next_global_id += 1
    
    print(f"\n   📊 总结:")
    print(f"      总检测数: {sum(len(objs) for objs in objects_by_label.values())}")
    print(f"      唯一物体: {len(unique_objects)}")
    
    # 打印匹配结果（按规范化标签统计）
    label_counts = defaultdict(int)
    original_labels = defaultdict(set)
    for obj in unique_objects:
        norm_label = normalize_label(obj['label'])
        label_counts[norm_label] += 1
        original_labels[norm_label].add(obj['label'])
    
    print(f"\n   📊 物体类别统计（规范化后）:")
    for norm_label, count in sorted(label_counts.items()):
        orig_labels = original_labels[norm_label]
        if len(orig_labels) > 1:
            print(f"   {norm_label} (原标签: {', '.join(sorted(orig_labels))}): {count} 个")
        else:
            print(f"   {norm_label}: {count} 个")
    
    return object_id_map, unique_objects


def create_multi_view_segmented_mesh(processed_data, all_view_detections, all_view_masks, 

                                     object_id_map, unique_objects, target_dir, use_sam=True):
    """创建多视图融合的分割 mesh（使用 utils3d.image_mesh）"""
    try:
        print("\n🎨 生成多视图分割 mesh...")
        
        # 按物体类别（label）分配颜色，使用规范化标签避免组合标签问题
        # 获取所有不同的规范化类别
        unique_normalized_labels = sorted(set(normalize_label(obj['label']) for obj in unique_objects))
        label_colors = {}
        colors = generate_distinct_colors(len(unique_normalized_labels))
        
        # 为规范化标签分配颜色
        for i, norm_label in enumerate(unique_normalized_labels):
            label_colors[norm_label] = colors[i]
        
        # 为每个唯一物体分配基于规范化类别的颜色
        for obj in unique_objects:
            norm_label = normalize_label(obj['label'])
            obj['color'] = label_colors[norm_label]
            obj['normalized_label'] = norm_label  # 保存规范化标签
        
        # 打印类别-颜色映射（按规范化标签）
        print(f"   物体类别颜色映射（规范化标签）：")
        for norm_label, color in sorted(label_colors.items()):
            count = sum(1 for obj in unique_objects if normalize_label(obj['label']) == norm_label)
            # 显示所有原始标签
            original_labels = set(obj['label'] for obj in unique_objects if normalize_label(obj['label']) == norm_label)
            if len(original_labels) > 1:
                print(f"   {norm_label} (包含: {', '.join(sorted(original_labels))}) × {count} → RGB{color}")
            else:
                print(f"   {norm_label} × {count} → RGB{color}")
        
        # 导入 utils3d
        import utils3d
        
        all_meshes = []
        
        # 为每个视图生成 mesh
        for view_idx in range(len(processed_data)):
            view_data = processed_data[view_idx]
            image = view_data["image"]
            points3d = view_data["points3d"]
            mask = view_data.get("mask")
            normal = view_data.get("normal")
            
            detections = all_view_detections[view_idx]
            masks = all_view_masks[view_idx]
            
            if len(detections) == 0:
                continue
            
            # 确保图像在 [0, 255] 范围
            if image.dtype != np.uint8:
                if image.max() <= 1.0:
                    image = (image * 255).astype(np.uint8)
                else:
                    image = image.astype(np.uint8)
            
            # 创建彩色图像（使用置信度优先策略避免颜色混乱）
            colored_image = image.copy()
            confidence_map = np.zeros((image.shape[0], image.shape[1]), dtype=np.float32)  # 记录每个像素的置信度
            
            # 收集所有检测及其信息（应用质量过滤）
            detections_info = []
            filtered_count = 0
            for det_idx, (det, seg_mask) in enumerate(zip(detections, masks)):
                # 过滤低置信度检测
                if det['confidence'] < MIN_DETECTION_CONFIDENCE:
                    filtered_count += 1
                    continue
                
                # 过滤过小的mask
                mask_area = seg_mask.sum()
                if mask_area < MIN_MASK_AREA:
                    filtered_count += 1
                    continue
                
                global_id = object_id_map[view_idx].get(det_idx)
                if global_id is None:
                    continue
                
                unique_obj = next((obj for obj in unique_objects if obj['global_id'] == global_id), None)
                if unique_obj is None:
                    continue
                
                detections_info.append({
                    'mask': seg_mask,
                    'color': unique_obj['color'],
                    'confidence': det['confidence'],
                    'label': det['label'],
                    'area': mask_area
                })
            
            if filtered_count > 0:
                print(f"   视图 {view_idx + 1}: 过滤了 {filtered_count} 个低质量检测")
            
            # 按置信度排序（从低到高），这样高置信度的会最后写入
            detections_info.sort(key=lambda x: x['confidence'])
            
            # 应用颜色（置信度高的优先）
            for info in detections_info:
                seg_mask = info['mask']
                color = info['color']
                conf = info['confidence']
                
                # 只在当前置信度更高的地方覆盖
                update_mask = seg_mask & (conf > confidence_map)
                colored_image[update_mask] = color
                confidence_map[update_mask] = conf
            
            # 使用 utils3d.image_mesh 生成 mesh
            height, width = image.shape[:2]
            
            if normal is None:
                faces, vertices, vertex_colors, vertex_uvs = utils3d.numpy.image_mesh(
                    points3d,
                    colored_image.astype(np.float32) / 255,
                    utils3d.numpy.image_uv(width=width, height=height),
                    mask=mask if mask is not None else np.ones((height, width), dtype=bool),
                    tri=True
                )
                vertex_normals = None
            else:
                faces, vertices, vertex_colors, vertex_uvs, vertex_normals = utils3d.numpy.image_mesh(
                    points3d,
                    colored_image.astype(np.float32) / 255,
                    utils3d.numpy.image_uv(width=width, height=height),
                    normal,
                    mask=mask if mask is not None else np.ones((height, width), dtype=bool),
                    tri=True
                )
            
            # 坐标变换
            vertices = vertices * np.array([1, -1, -1], dtype=np.float32)
            if vertex_normals is not None:
                vertex_normals = vertex_normals * np.array([1, -1, -1], dtype=np.float32)
            
            # 创建 mesh
            view_mesh = trimesh.Trimesh(
                vertices=vertices,
                faces=faces,
                vertex_normals=vertex_normals,
                vertex_colors=(vertex_colors * 255).astype(np.uint8),
                process=False
            )
            
            all_meshes.append(view_mesh)
            print(f"   视图 {view_idx + 1}: {len(vertices):,} 顶点, {len(faces):,} 面")
        
        if len(all_meshes) == 0:
            print("⚠️ 未生成任何 mesh")
            return None
        
        # 融合所有 mesh
        print("   融合所有视图...")
        combined_mesh = trimesh.util.concatenate(all_meshes)
        
        # 保存
        glb_path = os.path.join(target_dir, 'multi_view_segmented_mesh.glb')
        combined_mesh.export(glb_path)
        
        print(f"✅ 多视图分割 mesh 已保存: {glb_path}")
        print(f"   总计: {len(combined_mesh.vertices):,} 顶点, {len(combined_mesh.faces):,} 面")
        print(f"   {len(unique_objects)} 个唯一物体")
        
        return glb_path
        
    except Exception as e:
        print(f"❌ 生成多视图 mesh 失败: {e}")
        import traceback
        traceback.print_exc()
        return None


def create_segmented_pointcloud(processed_data, detections, masks, target_dir, use_sam=True):
    """创建分割点云（单视图，仅用于兼容）"""
    if len(detections) == 0:
        return None
    
    try:
        print(f"🎨 生成分割点云...")
        
        # 使用第一个视图
        first_view = processed_data[0]
        image = first_view["image"]
        points3d = first_view["points3d"]
        normal = first_view.get("normal")
        mask = first_view.get("mask")
        
        # 确保图像在 [0, 255] 范围
        if image.dtype != np.uint8:
            if image.max() <= 1.0:
                image = (image * 255).astype(np.uint8)
            else:
                image = image.astype(np.uint8)
        
        # 生成颜色
        distinct_colors = generate_distinct_colors(len(detections))
        
        # 创建彩色图像
        colored_image = image.copy()
        
        for i, (det, seg_mask) in enumerate(zip(detections, masks)):
            color = distinct_colors[i]
            colored_image[seg_mask] = color
            print(f"   {det['label']} → RGB{color}")
        
        # 生成点云（使用 MapAnything 的方法）
        height, width = image.shape[:2]
        
        # 简单方法：直接从 points3d 生成顶点颜色
        vertices = points3d.reshape(-1, 3)
        colors = (colored_image.astype(np.float32) / 255.0).reshape(-1, 3)
        
        if mask is not None:
            valid_mask = mask.reshape(-1)
            vertices = vertices[valid_mask]
            colors = colors[valid_mask]
        
        # 坐标变换
        vertices = vertices * np.array([1, -1, -1], dtype=np.float32)
        
        # 创建点云
        pointcloud = trimesh.PointCloud(
            vertices=vertices,
            colors=(colors * 255).astype(np.uint8)
        )
        
        # 保存
        seg_glb_path = os.path.join(target_dir, 'segmented_pointcloud.glb')
        pointcloud.export(seg_glb_path)
        
        print(f"✅ 分割点云已保存: {seg_glb_path}")
        return seg_glb_path
        
    except Exception as e:
        print(f"❌ 生成分割点云失败: {e}")
        import traceback
        traceback.print_exc()
        return None


# ============================================================================
# 核心模型推理
# ============================================================================

@spaces.GPU(duration=120)
def run_model(

    target_dir,

    apply_mask=True,

    mask_edges=True,

    filter_black_bg=False,

    filter_white_bg=False,

    enable_segmentation=False,

    text_prompt=DEFAULT_TEXT_PROMPT,

    use_sam=True,

):
    """

    Run the MapAnything model + GroundingDINO + SAM segmentation

    """
    global model, grounding_dino_model, sam_predictor
    import torch

    print(f"处理图像: {target_dir}")

    # 设备检查
    device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    # 初始化 MapAnything 模型 - 从 HuggingFace
    if model is None:
        print("📥 从 HuggingFace 加载 MapAnything...")
        model = initialize_mapanything_model(high_level_config, device)
        print("✅ MapAnything 加载成功")
    else:
        model = model.to(device)

    model.eval()

    # 加载分割模型
    if enable_segmentation:
        load_grounding_dino_model(device)
        if use_sam:
            load_sam_model(device)

    # 加载图像
    print("加载图像...")
    image_folder_path = os.path.join(target_dir, "images")
    views = load_images(image_folder_path)

    print(f"加载了 {len(views)} 张图像")
    if len(views) == 0:
        raise ValueError("未找到图像")

    # 运行 MapAnything 推理
    print("运行 3D 重建...")
    outputs = model.infer(
        views, apply_mask=apply_mask, mask_edges=True, memory_efficient_inference=False
    )

    # 转换预测结果
    predictions = {}
    extrinsic_list = []
    intrinsic_list = []
    world_points_list = []
    depth_maps_list = []
    images_list = []
    final_mask_list = []
    confidences = []
    
    for pred in outputs:
        depthmap_torch = pred["depth_z"][0].squeeze(-1)
        intrinsics_torch = pred["intrinsics"][0]
        camera_pose_torch = pred["camera_poses"][0]
        conf = pred["conf"][0].squeeze(-1)
        
        pts3d_computed, valid_mask = depthmap_to_world_frame(
            depthmap_torch, intrinsics_torch, camera_pose_torch
        )

        if "mask" in pred:
            mask = pred["mask"][0].squeeze(-1).cpu().numpy().astype(bool)
        else:
            mask = np.ones_like(depthmap_torch.cpu().numpy(), dtype=bool)

        mask = mask & valid_mask.cpu().numpy()
        image = pred["img_no_norm"][0].cpu().numpy()

        extrinsic_list.append(camera_pose_torch.cpu().numpy())
        intrinsic_list.append(intrinsics_torch.cpu().numpy())
        world_points_list.append(pts3d_computed.cpu().numpy())
        depth_maps_list.append(depthmap_torch.cpu().numpy())
        images_list.append(image)
        final_mask_list.append(mask)
        confidences.append(conf.cpu().numpy())

    predictions["extrinsic"] = np.stack(extrinsic_list, axis=0)
    predictions["intrinsic"] = np.stack(intrinsic_list, axis=0)
    predictions["world_points"] = np.stack(world_points_list, axis=0)
    predictions["conf"] = np.stack(confidences, axis=0)
    
    depth_maps = np.stack(depth_maps_list, axis=0)
    if len(depth_maps.shape) == 3:
        depth_maps = depth_maps[..., np.newaxis]
    predictions["depth"] = depth_maps
    
    predictions["images"] = np.stack(images_list, axis=0)
    predictions["final_mask"] = np.stack(final_mask_list, axis=0)

    # 处理可视化数据
    processed_data = process_predictions_for_visualization(
        predictions, views, high_level_config, filter_black_bg, filter_white_bg
    )

    # 多视图分割处理
    segmented_glb = None
    if enable_segmentation and grounding_dino_model is not None:
        print("\n🎯 开始多视图分割...")
        print(f"🔍 使用检测提示: {text_prompt[:100]}...")
        
        all_view_detections = []
        all_view_masks = []
        
        # 对每个视图进行分割
        for view_idx, ref_image in enumerate(images_list):
            print(f"\n📸 处理视图 {view_idx + 1}/{len(images_list)}...")
            
            if ref_image.dtype != np.uint8:
                ref_image_np = (ref_image * 255).astype(np.uint8)
            else:
                ref_image_np = ref_image
            
            # GroundingDINO 检测
            detections = run_grounding_dino_detection(ref_image_np, text_prompt, device)
            
            if len(detections) > 0:
                # SAM 精确分割
                boxes = [d['bbox'] for d in detections]
                masks = run_sam_refinement(ref_image_np, boxes) if use_sam else []
                
                # 获取3D点云和encoder（用于特征提取）
                points3d = world_points_list[view_idx]
                encoder = model.encoder if hasattr(model, 'encoder') else None
                
                # V5: 为每个检测物体提取多种特征
                for det_idx, (det, mask) in enumerate(zip(detections, masks)):
                    # 1. 计算3D中心点
                    center_3d = compute_object_3d_center(points3d, mask)
                    det['center_3d'] = center_3d
                    
                    # 2. 计算3D边界框
                    if center_3d is not None:
                        masked_points = points3d[mask]
                        if len(masked_points) > 0:
                            bbox_min = masked_points.min(axis=0)
                            bbox_max = masked_points.max(axis=0)
                            bbox_size = bbox_max - bbox_min
                            det['bbox_3d'] = {
                                'center': center_3d,
                                'size': bbox_size,
                                'min': bbox_min,
                                'max': bbox_max
                            }
                    
                    # 3. 存储2D mask（用于IoU计算）
                    det['mask_2d'] = mask
                    
                    # 4. 提取视觉特征（DINOv2）- 可选
                    if ENABLE_VISUAL_FEATURES and encoder is not None:
                        visual_feat = extract_visual_features(ref_image, mask, encoder)
                        det['visual_feature'] = visual_feat
                    else:
                        det['visual_feature'] = None
                
                all_view_detections.append(detections)
                all_view_masks.append(masks)
            else:
                all_view_detections.append([])
                all_view_masks.append([])
        
        # 跨视图匹配物体
        if any(len(dets) > 0 for dets in all_view_detections):
            object_id_map, unique_objects = match_objects_across_views(all_view_detections)
            
            # 生成多视图分割 mesh
            segmented_glb = create_multi_view_segmented_mesh(
                processed_data, all_view_detections, all_view_masks,
                object_id_map, unique_objects, target_dir, use_sam
            )

    # 清理
    torch.cuda.empty_cache()

    return predictions, processed_data, segmented_glb


# ============================================================================
# 从 gradio_app.py 复制的其他函数
# ============================================================================

def update_view_selectors(processed_data):
    """Update view selector dropdowns based on available views"""
    if processed_data is None or len(processed_data) == 0:
        choices = ["View 1"]
    else:
        num_views = len(processed_data)
        choices = [f"View {i + 1}" for i in range(num_views)]

    return (
        gr.Dropdown(choices=choices, value=choices[0]),
        gr.Dropdown(choices=choices, value=choices[0]),
        gr.Dropdown(choices=choices, value=choices[0]),
    )


def get_view_data_by_index(processed_data, view_index):
    """Get view data by index, handling bounds"""
    if processed_data is None or len(processed_data) == 0:
        return None

    view_keys = list(processed_data.keys())
    if view_index < 0 or view_index >= len(view_keys):
        view_index = 0

    return processed_data[view_keys[view_index]]


def update_depth_view(processed_data, view_index):
    """Update depth view for a specific view index"""
    view_data = get_view_data_by_index(processed_data, view_index)
    if view_data is None or view_data["depth"] is None:
        return None

    return colorize_depth(view_data["depth"], mask=view_data.get("mask"))


def update_normal_view(processed_data, view_index):
    """Update normal view for a specific view index"""
    view_data = get_view_data_by_index(processed_data, view_index)
    if view_data is None or view_data["normal"] is None:
        return None

    return colorize_normal(view_data["normal"], mask=view_data.get("mask"))


def update_measure_view(processed_data, view_index):
    """Update measure view for a specific view index with mask overlay"""
    view_data = get_view_data_by_index(processed_data, view_index)
    if view_data is None:
        return None, []

    image = view_data["image"].copy()

    if image.dtype != np.uint8:
        if image.max() <= 1.0:
            image = (image * 255).astype(np.uint8)
        else:
            image = image.astype(np.uint8)

    if view_data["mask"] is not None:
        mask = view_data["mask"]
        invalid_mask = ~mask

        if invalid_mask.any():
            overlay_color = np.array([255, 220, 220], dtype=np.uint8)
            alpha = 0.5
            for c in range(3):
                image[:, :, c] = np.where(
                    invalid_mask,
                    (1 - alpha) * image[:, :, c] + alpha * overlay_color[c],
                    image[:, :, c],
                ).astype(np.uint8)

    return image, []


def navigate_depth_view(processed_data, current_selector_value, direction):
    """Navigate depth view"""
    if processed_data is None or len(processed_data) == 0:
        return "View 1", None

    try:
        current_view = int(current_selector_value.split()[1]) - 1
    except:
        current_view = 0

    num_views = len(processed_data)
    new_view = (current_view + direction) % num_views

    new_selector_value = f"View {new_view + 1}"
    depth_vis = update_depth_view(processed_data, new_view)

    return new_selector_value, depth_vis


def navigate_normal_view(processed_data, current_selector_value, direction):
    """Navigate normal view"""
    if processed_data is None or len(processed_data) == 0:
        return "View 1", None

    try:
        current_view = int(current_selector_value.split()[1]) - 1
    except:
        current_view = 0

    num_views = len(processed_data)
    new_view = (current_view + direction) % num_views

    new_selector_value = f"View {new_view + 1}"
    normal_vis = update_normal_view(processed_data, new_view)

    return new_selector_value, normal_vis


def navigate_measure_view(processed_data, current_selector_value, direction):
    """Navigate measure view"""
    if processed_data is None or len(processed_data) == 0:
        return "View 1", None, []

    try:
        current_view = int(current_selector_value.split()[1]) - 1
    except:
        current_view = 0

    num_views = len(processed_data)
    new_view = (current_view + direction) % num_views

    new_selector_value = f"View {new_view + 1}"
    measure_image, measure_points = update_measure_view(processed_data, new_view)

    return new_selector_value, measure_image, measure_points


def populate_visualization_tabs(processed_data):
    """Populate the depth, normal, and measure tabs with processed data"""
    if processed_data is None or len(processed_data) == 0:
        return None, None, None, []

    depth_vis = update_depth_view(processed_data, 0)
    normal_vis = update_normal_view(processed_data, 0)
    measure_img, _ = update_measure_view(processed_data, 0)

    return depth_vis, normal_vis, measure_img, []


def handle_uploads(input_video, input_images, s_time_interval=1.0):
    """Handle uploaded video/images"""
    start_time = time.time()
    gc.collect()
    torch.cuda.empty_cache()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    target_dir = f"input_images_{timestamp}"
    target_dir_images = os.path.join(target_dir, "images")

    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)
    os.makedirs(target_dir_images)

    image_paths = []

    # Handle images
    if input_images is not None:
        for file_data in input_images:
            if isinstance(file_data, dict) and "name" in file_data:
                file_path = file_data["name"]
            else:
                file_path = file_data

            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext in [".heic", ".heif"]:
                try:
                    with Image.open(file_path) as img:
                        if img.mode not in ("RGB", "L"):
                            img = img.convert("RGB")
                        base_name = os.path.splitext(os.path.basename(file_path))[0]
                        dst_path = os.path.join(target_dir_images, f"{base_name}.jpg")
                        img.save(dst_path, "JPEG", quality=95)
                        image_paths.append(dst_path)
                except Exception as e:
                    print(f"Error converting HEIC: {e}")
                    dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
                    shutil.copy(file_path, dst_path)
                    image_paths.append(dst_path)
            else:
                dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
                shutil.copy(file_path, dst_path)
                image_paths.append(dst_path)

    # Handle video
    if input_video is not None:
        if isinstance(input_video, dict) and "name" in input_video:
            video_path = input_video["name"]
        else:
            video_path = input_video

        vs = cv2.VideoCapture(video_path)
        fps = vs.get(cv2.CAP_PROP_FPS)
        frame_interval = int(fps * s_time_interval)

        count = 0
        video_frame_num = 0
        while True:
            gotit, frame = vs.read()
            if not gotit:
                break
            count += 1
            if count % frame_interval == 0:
                image_path = os.path.join(target_dir_images, f"{video_frame_num:06}.png")
                cv2.imwrite(image_path, frame)
                image_paths.append(image_path)
                video_frame_num += 1

    image_paths = sorted(image_paths)

    end_time = time.time()
    print(f"Files copied to {target_dir_images}; took {end_time - start_time:.3f} seconds")
    return target_dir, image_paths


def update_gallery_on_upload(input_video, input_images, s_time_interval=1.0):
    """Update gallery on upload"""
    if not input_video and not input_images:
        return None, None, None, None, None
    target_dir, image_paths = handle_uploads(input_video, input_images, s_time_interval)
    return (
        None,
        None,
        target_dir,
        image_paths,
        "上传完成，点击「重建」开始 3D 处理",
    )


@spaces.GPU(duration=120)
def gradio_demo(

    target_dir,

    frame_filter="All",

    show_cam=True,

    filter_black_bg=False,

    filter_white_bg=False,

    conf_thres=3.0,

    apply_mask=True,

    show_mesh=True,

    enable_segmentation=False,

    text_prompt=DEFAULT_TEXT_PROMPT,

    use_sam=True,

):
    """Perform reconstruction"""
    if not os.path.isdir(target_dir) or target_dir == "None":
        return None, None, "请先上传文件", None, None, None, None, None, None, None, None

    start_time = time.time()
    gc.collect()
    torch.cuda.empty_cache()

    target_dir_images = os.path.join(target_dir, "images")
    all_files = sorted(os.listdir(target_dir_images)) if os.path.isdir(target_dir_images) else []
    all_files_display = [f"{i}: {filename}" for i, filename in enumerate(all_files)]
    frame_filter_choices = ["All"] + all_files_display

    print("运行 MapAnything 模型...")
    with torch.no_grad():
        predictions, processed_data, segmented_glb = run_model(
            target_dir, apply_mask, True, filter_black_bg, filter_white_bg,
            enable_segmentation, text_prompt, use_sam
        )

    # 保存预测结果
    prediction_save_path = os.path.join(target_dir, "predictions.npz")
    np.savez(prediction_save_path, **predictions)

    if frame_filter is None:
        frame_filter = "All"

    # 生成原始 GLB
    glbfile = os.path.join(
        target_dir,
        f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_mesh{show_mesh}.glb",
    )

    glbscene = predictions_to_glb(
        predictions,
        filter_by_frames=frame_filter,
        show_cam=show_cam,
        mask_black_bg=filter_black_bg,
        mask_white_bg=filter_white_bg,
        as_mesh=show_mesh,
        conf_percentile=conf_thres,
    )
    glbscene.export(file_obj=glbfile)

    # 清理
    del predictions
    gc.collect()
    torch.cuda.empty_cache()

    end_time = time.time()
    print(f"总耗时: {end_time - start_time:.2f}秒")
    log_msg = f"✅ 重建成功 ({len(all_files)} 帧)"

    # 填充可视化标签
    depth_vis, normal_vis, measure_img, measure_pts = populate_visualization_tabs(processed_data)

    # 更新视图选择器
    depth_selector, normal_selector, measure_selector = update_view_selectors(processed_data)

    return (
        glbfile,
        segmented_glb,
        log_msg,
        gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True),
        processed_data,
        depth_vis,
        normal_vis,
        measure_img,
        "",
        depth_selector,
        normal_selector,
        measure_selector,
    )


def colorize_depth(depth_map, mask=None):
    """Convert depth map to colorized visualization"""
    if depth_map is None:
        return None

    depth_normalized = depth_map.copy()
    valid_mask = depth_normalized > 0

    if mask is not None:
        valid_mask = valid_mask & mask

    if valid_mask.sum() > 0:
        valid_depths = depth_normalized[valid_mask]
        p5 = np.percentile(valid_depths, 5)
        p95 = np.percentile(valid_depths, 95)
        depth_normalized[valid_mask] = (depth_normalized[valid_mask] - p5) / (p95 - p5)

    import matplotlib.pyplot as plt
    colormap = plt.cm.turbo_r
    colored = colormap(depth_normalized)
    colored = (colored[:, :, :3] * 255).astype(np.uint8)
    colored[~valid_mask] = [255, 255, 255]

    return colored


def colorize_normal(normal_map, mask=None):
    """Convert normal map to colorized visualization"""
    if normal_map is None:
        return None

    normal_vis = normal_map.copy()

    if mask is not None:
        invalid_mask = ~mask
        normal_vis[invalid_mask] = [0, 0, 0]

    normal_vis = (normal_vis + 1.0) / 2.0
    normal_vis = (normal_vis * 255).astype(np.uint8)

    return normal_vis


def process_predictions_for_visualization(

    predictions, views, high_level_config, filter_black_bg=False, filter_white_bg=False

):
    """Extract depth, normal, and 3D points from predictions for visualization"""
    processed_data = {}

    for view_idx, view in enumerate(views):
        image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])
        pred_pts3d = predictions["world_points"][view_idx]

        view_data = {
            "image": image[0],
            "points3d": pred_pts3d,
            "depth": None,
            "normal": None,
            "mask": None,
        }

        mask = predictions["final_mask"][view_idx].copy()

        if filter_black_bg:
            view_colors = image[0] * 255 if image[0].max() <= 1.0 else image[0]
            black_bg_mask = view_colors.sum(axis=2) >= 16
            mask = mask & black_bg_mask

        if filter_white_bg:
            view_colors = image[0] * 255 if image[0].max() <= 1.0 else image[0]
            white_bg_mask = ~(
                (view_colors[:, :, 0] > 240)
                & (view_colors[:, :, 1] > 240)
                & (view_colors[:, :, 2] > 240)
            )
            mask = mask & white_bg_mask

        view_data["mask"] = mask
        view_data["depth"] = predictions["depth"][view_idx].squeeze()

        normals, _ = points_to_normals(pred_pts3d, mask=view_data["mask"])
        view_data["normal"] = normals

        processed_data[view_idx] = view_data

    return processed_data


def reset_measure(processed_data):
    """Reset measure points"""
    if processed_data is None or len(processed_data) == 0:
        return None, [], ""
    first_view = list(processed_data.values())[0]
    return first_view["image"], [], ""


def measure(processed_data, measure_points, current_view_selector, event: gr.SelectData):
    """Handle measurement on images"""
    try:
        if processed_data is None or len(processed_data) == 0:
            return None, [], "没有可用数据"

        try:
            current_view_index = int(current_view_selector.split()[1]) - 1
        except:
            current_view_index = 0

        if current_view_index < 0 or current_view_index >= len(processed_data):
            current_view_index = 0

        view_keys = list(processed_data.keys())
        current_view = processed_data[view_keys[current_view_index]]

        if current_view is None:
            return None, [], "没有视图数据"

        point2d = event.index[0], event.index[1]

        if (
            current_view["mask"] is not None
            and 0 <= point2d[1] < current_view["mask"].shape[0]
            and 0 <= point2d[0] < current_view["mask"].shape[1]
        ):
            if not current_view["mask"][point2d[1], point2d[0]]:
                masked_image, _ = update_measure_view(processed_data, current_view_index)
                return (
                    masked_image,
                    measure_points,
                    '<span style="color: red; font-weight: bold;">无法在遮罩区域测量（显示为灰色）</span>',
                )

        measure_points.append(point2d)

        image, _ = update_measure_view(processed_data, current_view_index)
        if image is None:
            return None, [], "没有可用图像"

        image = image.copy()
        points3d = current_view["points3d"]

        if image.dtype != np.uint8:
            if image.max() <= 1.0:
                image = (image * 255).astype(np.uint8)
            else:
                image = image.astype(np.uint8)

        for p in measure_points:
            if 0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]:
                image = cv2.circle(image, p, radius=5, color=(255, 0, 0), thickness=2)

        depth_text = ""
        for i, p in enumerate(measure_points):
            if (
                current_view["depth"] is not None
                and 0 <= p[1] < current_view["depth"].shape[0]
                and 0 <= p[0] < current_view["depth"].shape[1]
            ):
                d = current_view["depth"][p[1], p[0]]
                depth_text += f"- **P{i + 1} 深度: {d:.2f}m.**\n"
            else:
                if (
                    points3d is not None
                    and 0 <= p[1] < points3d.shape[0]
                    and 0 <= p[0] < points3d.shape[1]
                ):
                    z = points3d[p[1], p[0], 2]
                    depth_text += f"- **P{i + 1} Z坐标: {z:.2f}m.**\n"

        if len(measure_points) == 2:
            point1, point2 = measure_points
            if (
                0 <= point1[0] < image.shape[1]
                and 0 <= point1[1] < image.shape[0]
                and 0 <= point2[0] < image.shape[1]
                and 0 <= point2[1] < image.shape[0]
            ):
                image = cv2.line(image, point1, point2, color=(255, 0, 0), thickness=2)

            distance_text = "- **距离: 无法计算**"
            if (
                points3d is not None
                and 0 <= point1[1] < points3d.shape[0]
                and 0 <= point1[0] < points3d.shape[1]
                and 0 <= point2[1] < points3d.shape[0]
                and 0 <= point2[0] < points3d.shape[1]
            ):
                try:
                    p1_3d = points3d[point1[1], point1[0]]
                    p2_3d = points3d[point2[1], point2[0]]
                    distance = np.linalg.norm(p1_3d - p2_3d)
                    distance_text = f"- **距离: {distance:.2f}m**"
                except Exception as e:
                    distance_text = f"- **距离计算错误: {e}**"

            measure_points = []
            text = depth_text + distance_text
            return [image, measure_points, text]
        else:
            return [image, measure_points, depth_text]

    except Exception as e:
        print(f"测量错误: {e}")
        return None, [], f"测量错误: {e}"


def clear_fields():
    """Clear 3D viewer"""
    return None, None


def update_log():
    """Display log message"""
    return "加载和重建中..."


def update_visualization(

    target_dir,

    frame_filter,

    show_cam,

    is_example,

    conf_thres=None,

    filter_black_bg=False,

    filter_white_bg=False,

    show_mesh=True,

):
    """Update visualization"""
    if is_example == "True":
        return gr.update(), "没有可用的重建。请先点击重建按钮。"

    if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
        return gr.update(), "没有可用的重建。请先点击重建按钮。"

    predictions_path = os.path.join(target_dir, "predictions.npz")
    if not os.path.exists(predictions_path):
        return gr.update(), f"没有可用的重建。请先运行「重建」。"

    loaded = np.load(predictions_path, allow_pickle=True)
    predictions = {key: loaded[key] for key in loaded.keys()}

    glbfile = os.path.join(
        target_dir,
        f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_mesh{show_mesh}_black{filter_black_bg}_white{filter_white_bg}.glb",
    )

    glbscene = predictions_to_glb(
        predictions,
        filter_by_frames=frame_filter,
        show_cam=show_cam,
        mask_black_bg=filter_black_bg,
        mask_white_bg=filter_white_bg,
        as_mesh=show_mesh,
        conf_percentile=conf_thres,
    )
    glbscene.export(file_obj=glbfile)

    return glbfile, "可视化已更新。"


def update_all_views_on_filter_change(

    target_dir,

    filter_black_bg,

    filter_white_bg,

    processed_data,

    depth_view_selector,

    normal_view_selector,

    measure_view_selector,

):
    """Update all views on filter change"""
    if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
        return processed_data, None, None, None, []

    predictions_path = os.path.join(target_dir, "predictions.npz")
    if not os.path.exists(predictions_path):
        return processed_data, None, None, None, []

    try:
        loaded = np.load(predictions_path, allow_pickle=True)
        predictions = {key: loaded[key] for key in loaded.keys()}

        image_folder_path = os.path.join(target_dir, "images")
        views = load_images(image_folder_path)

        new_processed_data = process_predictions_for_visualization(
            predictions, views, high_level_config, filter_black_bg, filter_white_bg
        )

        try:
            depth_view_idx = int(depth_view_selector.split()[1]) - 1 if depth_view_selector else 0
        except:
            depth_view_idx = 0

        try:
            normal_view_idx = int(normal_view_selector.split()[1]) - 1 if normal_view_selector else 0
        except:
            normal_view_idx = 0

        try:
            measure_view_idx = int(measure_view_selector.split()[1]) - 1 if measure_view_selector else 0
        except:
            measure_view_idx = 0

        depth_vis = update_depth_view(new_processed_data, depth_view_idx)
        normal_vis = update_normal_view(new_processed_data, normal_view_idx)
        measure_img, _ = update_measure_view(new_processed_data, measure_view_idx)

        return new_processed_data, depth_vis, normal_vis, measure_img, []

    except Exception as e:
        print(f"更新视图失败: {e}")
        return processed_data, None, None, None, []


# ============================================================================
# 示例场景
# ============================================================================

def get_scene_info(examples_dir):
    """Get information about scenes in the examples directory"""
    import glob

    scenes = []
    if not os.path.exists(examples_dir):
        return scenes

    for scene_folder in sorted(os.listdir(examples_dir)):
        scene_path = os.path.join(examples_dir, scene_folder)
        if os.path.isdir(scene_path):
            image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff", "*.tif"]
            image_files = []
            for ext in image_extensions:
                image_files.extend(glob.glob(os.path.join(scene_path, ext)))
                image_files.extend(glob.glob(os.path.join(scene_path, ext.upper())))

            if image_files:
                image_files = sorted(image_files)
                first_image = image_files[0]
                num_images = len(image_files)

                scenes.append(
                    {
                        "name": scene_folder,
                        "path": scene_path,
                        "thumbnail": first_image,
                        "num_images": num_images,
                        "image_files": image_files,
                    }
                )

    return scenes


def load_example_scene(scene_name, examples_dir="examples"):
    """Load a scene from examples directory"""
    scenes = get_scene_info(examples_dir)

    selected_scene = None
    for scene in scenes:
        if scene["name"] == scene_name:
            selected_scene = scene
            break

    if selected_scene is None:
        return None, None, None, None, "场景未找到"

    target_dir, image_paths = handle_uploads(None, selected_scene["image_files"])

    return (
        None,
        None,
        target_dir,
        image_paths,
        f"已加载场景 '{scene_name}' ({selected_scene['num_images']} 张图像)。点击「重建」开始 3D 处理。",
    )


# ============================================================================
# Gradio UI
# ============================================================================

theme = get_gradio_theme()

# 自定义CSS防止UI抖动
CUSTOM_CSS = GRADIO_CSS + """

/* 防止组件撑开布局 */

.gradio-container {

    max-width: 100% !important;

}



/* 固定Gallery高度 */

.gallery-container {

    max-height: 350px !important;

    overflow-y: auto !important;

}



/* 固定File组件高度 */

.file-preview {

    max-height: 200px !important;

    overflow-y: auto !important;

}



/* 固定Video组件高度 */

.video-container {

    max-height: 300px !important;

}



/* 防止Textbox无限扩展 */

.textbox-container {

    max-height: 100px !important;

}



/* 保持Tabs内容区域稳定 */

.tab-content {

    min-height: 550px !important;

}

"""

with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V8 - 3D重建与物体分割") as demo:
    is_example = gr.Textbox(label="is_example", visible=False, value="None")
    processed_data_state = gr.State(value=None)
    measure_points_state = gr.State(value=[])

    # 顶部标题
    gr.HTML("""

    <div style="text-align: center; margin: 20px 0;">

        <h2 style="color: #1976D2; margin-bottom: 10px;">MapAnything V8 - 3D重建与物体分割</h2>

        <p style="color: #666; font-size: 16px;">基于DBSCAN聚类的智能物体识别 | 多视图融合 | 自适应参数调整</p>

    </div>

    """)

    target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")

    with gr.Row(equal_height=False):
        # 左侧：输入区域
        with gr.Column(scale=1, min_width=300):
            gr.Markdown("### 📤 输入")
            
            with gr.Tabs():
                with gr.Tab("📷 图片"):
                    input_images = gr.File(
                        file_count="multiple", 
                        label="上传多张图片（推荐3-10张）", 
                        interactive=True,
                        height=200
                    )
                
                with gr.Tab("🎥 视频"):
                    input_video = gr.Video(
                        label="上传视频", 
                        interactive=True,
                        height=300
                    )
                    s_time_interval = gr.Slider(
                        minimum=0.1, maximum=5.0, value=1.0, step=0.1,
                        label="帧采样间隔（秒）", interactive=True
                    )
            
            image_gallery = gr.Gallery(
                label="图片预览", columns=3, height=350,
                show_download_button=True, object_fit="contain", preview=True
            )
            
            with gr.Row():
                submit_btn = gr.Button("🚀 开始重建", variant="primary", scale=2)
                clear_btn = gr.ClearButton(
                    [input_video, input_images, target_dir_output, image_gallery],
                    value="🗑️ 清空", scale=1
                )

        # 右侧：输出区域
        with gr.Column(scale=2, min_width=600):
            gr.Markdown("### 🎯 输出")

            with gr.Tabs():
                with gr.Tab("🏗️ 原始3D"):
                    reconstruction_output = gr.Model3D(
                        height=550, zoom_speed=0.5, pan_speed=0.5,
                        clear_color=[0.0, 0.0, 0.0, 0.0]
                    )
                
                with gr.Tab("🎨 分割3D"):
                    segmented_output = gr.Model3D(
                        height=550, zoom_speed=0.5, pan_speed=0.5,
                        clear_color=[0.0, 0.0, 0.0, 0.0]
                    )
                
                with gr.Tab("📊 深度图"):
                    with gr.Row(elem_classes=["navigation-row"]):
                        prev_depth_btn = gr.Button("◀", size="sm", scale=1)
                        depth_view_selector = gr.Dropdown(
                            choices=["View 1"], value="View 1",
                            label="视图", scale=3, interactive=True
                        )
                        next_depth_btn = gr.Button("▶", size="sm", scale=1)
                    depth_map = gr.Image(
                        type="numpy", label="", format="png", interactive=False,
                        height=500
                    )
                
                with gr.Tab("🧭 法线图"):
                    with gr.Row(elem_classes=["navigation-row"]):
                        prev_normal_btn = gr.Button("◀", size="sm", scale=1)
                        normal_view_selector = gr.Dropdown(
                            choices=["View 1"], value="View 1",
                            label="视图", scale=3, interactive=True
                        )
                        next_normal_btn = gr.Button("▶", size="sm", scale=1)
                    normal_map = gr.Image(
                        type="numpy", label="", format="png", interactive=False,
                        height=500
                    )
                
                with gr.Tab("📏 测量"):
                    gr.Markdown("**点击图片两次进行距离测量**")
                    with gr.Row(elem_classes=["navigation-row"]):
                        prev_measure_btn = gr.Button("◀", size="sm", scale=1)
                        measure_view_selector = gr.Dropdown(
                            choices=["View 1"], value="View 1",
                            label="视图", scale=3, interactive=True
                        )
                        next_measure_btn = gr.Button("▶", size="sm", scale=1)
                    measure_image = gr.Image(
                        type="numpy", show_label=False,
                        format="webp", interactive=False, sources=[],
                        height=500
                    )
                    measure_text = gr.Markdown("")
            
            log_output = gr.Textbox(
                value="📌 请上传图片或视频，然后点击「开始重建」",
                label="状态信息",
                interactive=False,
                lines=1,
                max_lines=1
            )

    # 高级选项（可折叠）
    with gr.Accordion("⚙️ 高级选项", open=False):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1, min_width=300):
                gr.Markdown("#### 可视化参数")
                frame_filter = gr.Dropdown(
                    choices=["All"], value="All", label="显示帧"
                )
                conf_thres = gr.Slider(
                    minimum=0, maximum=100, value=0, step=0.1,
                    label="置信度阈值（百分位）"
                )
                show_cam = gr.Checkbox(label="显示相机", value=True)
                show_mesh = gr.Checkbox(label="显示网格", value=True)
                filter_black_bg = gr.Checkbox(label="过滤黑色背景", value=False)
                filter_white_bg = gr.Checkbox(label="过滤白色背景", value=False)
            
            with gr.Column(scale=1, min_width=300):
                gr.Markdown("#### 重建参数")
                apply_mask_checkbox = gr.Checkbox(
                    label="应用深度掩码", value=True
                )
                
                gr.Markdown("#### 分割参数")
                enable_segmentation = gr.Checkbox(
                    label="启用语义分割", value=False
                )
                use_sam_checkbox = gr.Checkbox(
                    label="使用SAM精确分割", value=True
                )
                
                text_prompt = gr.Textbox(
                    value=DEFAULT_TEXT_PROMPT,
                    label="检测物体（用 . 分隔）",
                    placeholder="例如: chair . table . sofa",
                    lines=2,
                    max_lines=2
                )
                
                with gr.Row():
                    detect_all_btn = gr.Button("🔍 检测所有", size="sm")
                    restore_default_btn = gr.Button("↻ 默认", size="sm")

    # 示例场景（可折叠）
    with gr.Accordion("🖼️ 示例场景", open=False):
        scenes = get_scene_info("examples")
        if scenes:
            for i in range(0, len(scenes), 4):
                with gr.Row(equal_height=True):
                    for j in range(4):
                        scene_idx = i + j
                        if scene_idx < len(scenes):
                            scene = scenes[scene_idx]
                            with gr.Column(scale=1, min_width=150):
                                scene_img = gr.Image(
                                    value=scene["thumbnail"], 
                                    height=150,
                                    interactive=False, 
                                    show_label=False, 
                                    sources=[],
                                    container=False
                                )
                                gr.Markdown(
                                    f"**{scene['name']}** ({scene['num_images']}张)",
                                    elem_classes=["text-center"]
                                )
                                scene_img.select(
                                    fn=lambda name=scene["name"]: load_example_scene(name),
                                    outputs=[
                                        reconstruction_output, segmented_output,
                                        target_dir_output, image_gallery, log_output
                                    ]
                                )

    # === 事件绑定 ===
    
    # 分割选项按钮
    detect_all_btn.click(
        fn=lambda: COMMON_OBJECTS_PROMPT,
        outputs=[text_prompt]
    )
    restore_default_btn.click(
        fn=lambda: DEFAULT_TEXT_PROMPT,
        outputs=[text_prompt]
    )
    
    # 上传文件自动更新
    input_video.change(
        fn=update_gallery_on_upload,
        inputs=[input_video, input_images, s_time_interval],
        outputs=[reconstruction_output, segmented_output, target_dir_output, image_gallery, log_output]
    )
    input_images.change(
        fn=update_gallery_on_upload,
        inputs=[input_video, input_images, s_time_interval],
        outputs=[reconstruction_output, segmented_output, target_dir_output, image_gallery, log_output]
    )
    
    # 重建按钮
    submit_btn.click(
        fn=clear_fields,
        outputs=[reconstruction_output, segmented_output]
    ).then(
        fn=update_log,
        outputs=[log_output]
    ).then(
        fn=gradio_demo,
        inputs=[
            target_dir_output, frame_filter, show_cam,
            filter_black_bg, filter_white_bg, conf_thres,
            apply_mask_checkbox, show_mesh,
            enable_segmentation, text_prompt, use_sam_checkbox
        ],
        outputs=[
            reconstruction_output, segmented_output, log_output, frame_filter,
            processed_data_state, depth_map, normal_map, measure_image,
            measure_text, depth_view_selector, normal_view_selector, measure_view_selector
        ]
    ).then(
        fn=lambda: "False",
        outputs=[is_example]
    )
    
    # 清空按钮
    clear_btn.add([reconstruction_output, segmented_output, log_output])
    
    # 可视化参数实时更新
    for component in [frame_filter, show_cam, conf_thres, show_mesh]:
        component.change(
            fn=update_visualization,
            inputs=[
                target_dir_output, frame_filter, show_cam, is_example,
                conf_thres, filter_black_bg, filter_white_bg, show_mesh
            ],
            outputs=[reconstruction_output, log_output]
        )
    
    # 背景过滤器更新所有视图
    for bg_filter in [filter_black_bg, filter_white_bg]:
        bg_filter.change(
            fn=update_all_views_on_filter_change,
            inputs=[
                target_dir_output, filter_black_bg, filter_white_bg, processed_data_state,
                depth_view_selector, normal_view_selector, measure_view_selector
            ],
            outputs=[processed_data_state, depth_map, normal_map, measure_image, measure_points_state]
        )
    
    # 深度图导航
    prev_depth_btn.click(
        fn=lambda pd, cs: navigate_depth_view(pd, cs, -1),
        inputs=[processed_data_state, depth_view_selector],
        outputs=[depth_view_selector, depth_map]
    )
    next_depth_btn.click(
        fn=lambda pd, cs: navigate_depth_view(pd, cs, 1),
        inputs=[processed_data_state, depth_view_selector],
        outputs=[depth_view_selector, depth_map]
    )
    depth_view_selector.change(
        fn=lambda pd, sv: update_depth_view(pd, int(sv.split()[1]) - 1) if sv else None,
        inputs=[processed_data_state, depth_view_selector],
        outputs=[depth_map]
    )
    
    # 法线图导航
    prev_normal_btn.click(
        fn=lambda pd, cs: navigate_normal_view(pd, cs, -1),
        inputs=[processed_data_state, normal_view_selector],
        outputs=[normal_view_selector, normal_map]
    )
    next_normal_btn.click(
        fn=lambda pd, cs: navigate_normal_view(pd, cs, 1),
        inputs=[processed_data_state, normal_view_selector],
        outputs=[normal_view_selector, normal_map]
    )
    normal_view_selector.change(
        fn=lambda pd, sv: update_normal_view(pd, int(sv.split()[1]) - 1) if sv else None,
        inputs=[processed_data_state, normal_view_selector],
        outputs=[normal_map]
    )
    
    # 测量功能
    measure_image.select(
        fn=measure,
        inputs=[processed_data_state, measure_points_state, measure_view_selector],
        outputs=[measure_image, measure_points_state, measure_text]
    )
    prev_measure_btn.click(
        fn=lambda pd, cs: navigate_measure_view(pd, cs, -1),
        inputs=[processed_data_state, measure_view_selector],
        outputs=[measure_view_selector, measure_image, measure_points_state]
    )
    next_measure_btn.click(
        fn=lambda pd, cs: navigate_measure_view(pd, cs, 1),
        inputs=[processed_data_state, measure_view_selector],
        outputs=[measure_view_selector, measure_image, measure_points_state]
    )
    measure_view_selector.change(
        fn=lambda pd, sv: update_measure_view(pd, int(sv.split()[1]) - 1) if sv else (None, []),
        inputs=[processed_data_state, measure_view_selector],
        outputs=[measure_image, measure_points_state]
    )

# 启动信息
print("\n" + "="*60)
print("🚀 MapAnything V8 - 3D重建与物体分割")
print("="*60)
print("📊 核心技术: 自适应DBSCAN聚类 + 多视图融合")
print(f"🔧 质量控制: 置信度≥{MIN_DETECTION_CONFIDENCE} | 面积≥{MIN_MASK_AREA}px")
print(f"🎯 聚类半径: 沙发{DBSCAN_EPS_CONFIG['sofa']}m | 桌子{DBSCAN_EPS_CONFIG['table']}m | 窗户{DBSCAN_EPS_CONFIG['window']}m | 默认{DBSCAN_EPS_CONFIG['default']}m")
print("="*60 + "\n")

demo.queue(max_size=20).launch(show_error=True, share=True, ssr_mode=False)