ColamanAI committed
Commit b74998d · verified · 1 parent: c8b23fc

Upload 169 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +57 -0
  2. app.py +19 -0
  3. mapanything/__init__.py +0 -0
  4. mapanything/__pycache__/__init__.cpython-312.pyc +0 -0
  5. mapanything/datasets/__init__.py +177 -0
  6. mapanything/datasets/base/__init__.py +0 -0
  7. mapanything/datasets/base/base_dataset.py +697 -0
  8. mapanything/datasets/base/batched_sampler.py +431 -0
  9. mapanything/datasets/base/easy_dataset.py +478 -0
  10. mapanything/datasets/utils/__init__.py +0 -0
  11. mapanything/datasets/utils/data_splits.py +1734 -0
  12. mapanything/datasets/wai/__init__.py +0 -0
  13. mapanything/datasets/wai/ase.py +294 -0
  14. mapanything/datasets/wai/blendedmvs.py +313 -0
  15. mapanything/datasets/wai/dl3dv.py +356 -0
  16. mapanything/datasets/wai/dynamicreplica.py +297 -0
  17. mapanything/datasets/wai/eth3d.py +277 -0
  18. mapanything/datasets/wai/megadepth.py +314 -0
  19. mapanything/datasets/wai/mpsd.py +311 -0
  20. mapanything/datasets/wai/mvs_synth.py +308 -0
  21. mapanything/datasets/wai/paralleldomain4d.py +309 -0
  22. mapanything/datasets/wai/sailvos3d.py +308 -0
  23. mapanything/datasets/wai/scannetpp.py +307 -0
  24. mapanything/datasets/wai/spring.py +316 -0
  25. mapanything/datasets/wai/tav2_wb.py +328 -0
  26. mapanything/datasets/wai/unrealstereo4k.py +309 -0
  27. mapanything/models/__init__.py +190 -0
  28. mapanything/models/__pycache__/__init__.cpython-312.pyc +0 -0
  29. mapanything/models/external/README.md +5 -0
  30. mapanything/models/external/__init__.py +0 -0
  31. mapanything/models/external/anycalib/__init__.py +100 -0
  32. mapanything/models/external/dinov2/__init__.py +6 -0
  33. mapanything/models/external/dinov2/hub/__init__.py +4 -0
  34. mapanything/models/external/dinov2/hub/backbones.py +183 -0
  35. mapanything/models/external/dinov2/hub/utils.py +42 -0
  36. mapanything/models/external/dinov2/layers/__init__.py +14 -0
  37. mapanything/models/external/dinov2/layers/attention.py +90 -0
  38. mapanything/models/external/dinov2/layers/block.py +290 -0
  39. mapanything/models/external/dinov2/layers/dino_head.py +67 -0
  40. mapanything/models/external/dinov2/layers/drop_path.py +36 -0
  41. mapanything/models/external/dinov2/layers/layer_scale.py +26 -0
  42. mapanything/models/external/dinov2/layers/mlp.py +40 -0
  43. mapanything/models/external/dinov2/layers/patch_embed.py +100 -0
  44. mapanything/models/external/dinov2/layers/swiglu_ffn.py +71 -0
  45. mapanything/models/external/dinov2/models/__init__.py +44 -0
  46. mapanything/models/external/dinov2/models/vision_transformer.py +448 -0
  47. mapanything/models/external/dinov2/utils/__init__.py +4 -0
  48. mapanything/models/external/dinov2/utils/cluster.py +102 -0
  49. mapanything/models/external/dinov2/utils/config.py +74 -0
  50. mapanything/models/external/dinov2/utils/dtype.py +38 -0
.gitignore ADDED
@@ -0,0 +1,57 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # Virtual Environment
+ venv/
+ ENV/
+ env/
+ .venv
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+ .DS_Store
+
+ # HuggingFace Space temporary files
+ input_images_*/
+ *.glb
+ *.npz
+ flagged/
+
+ # Local model cache (switched to HuggingFace)
+ models/
+
+ # Logs
+ *.log
+ logs/
+
+ # Test files
+ .pytest_cache/
+ .coverage
+ htmlcov/
+
+ # System files
+ Thumbs.db
app.py ADDED
@@ -0,0 +1,19 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ HuggingFace Space entry point.
+ Directly imports and runs gradio_app_v8.
+ """
+
+ import sys
+ from pathlib import Path
+
+ # Add the scripts directory to the Python path
+ scripts_dir = Path(__file__).parent / "scripts"
+ sys.path.insert(0, str(scripts_dir))
+
+ # Import and run the main application
+ if __name__ == "__main__":
+     # Importing gradio_app_v8 launches the demo automatically
+     import gradio_app_v8
+
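For context, app.py relies on a side-effect import: per the comment above, importing gradio_app_v8 starts the demo. A minimal sketch of the pattern that scripts/gradio_app_v8.py would need to follow for this to work; the interface contents below are hypothetical placeholders, not the project's actual UI:

    # Hypothetical sketch of scripts/gradio_app_v8.py (placeholder UI, assumed names)
    import gradio as gr

    def reconstruct(files):
        # Placeholder for the actual MapAnything inference pipeline
        return None

    demo = gr.Interface(
        fn=reconstruct,
        inputs=gr.File(file_count="multiple"),
        outputs=gr.Model3D(),
    )

    # Launching at module scope is what makes `import gradio_app_v8` start the app
    demo.launch()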
mapanything/__init__.py ADDED
File without changes
mapanything/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (154 Bytes).
mapanything/datasets/__init__.py ADDED
@@ -0,0 +1,177 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ MapAnything Datasets
+ """
+
+ import torch
+
+ from mapanything.datasets.wai.ase import ASEWAI  # noqa
+ from mapanything.datasets.wai.blendedmvs import BlendedMVSWAI  # noqa
+ from mapanything.datasets.wai.dl3dv import DL3DVWAI  # noqa
+ from mapanything.datasets.wai.dynamicreplica import DynamicReplicaWAI  # noqa
+ from mapanything.datasets.wai.eth3d import ETH3DWAI  # noqa
+ from mapanything.datasets.wai.megadepth import MegaDepthWAI  # noqa
+ from mapanything.datasets.wai.mpsd import MPSDWAI  # noqa
+ from mapanything.datasets.wai.mvs_synth import MVSSynthWAI  # noqa
+ from mapanything.datasets.wai.paralleldomain4d import ParallelDomain4DWAI  # noqa
+ from mapanything.datasets.wai.sailvos3d import SAILVOS3DWAI  # noqa
+ from mapanything.datasets.wai.scannetpp import ScanNetPPWAI  # noqa
+ from mapanything.datasets.wai.spring import SpringWAI  # noqa
+ from mapanything.datasets.wai.tav2_wb import TartanAirV2WBWAI  # noqa
+ from mapanything.datasets.wai.unrealstereo4k import UnrealStereo4KWAI  # noqa
+ from mapanything.utils.train_tools import get_rank, get_world_size
+
+
+ def get_test_data_loader(
+     dataset, batch_size, num_workers=8, shuffle=False, drop_last=False, pin_mem=True
+ ):
+     "Get a simple PyTorch dataloader corresponding to the testing dataset"
+     # PyTorch dataset
+     if isinstance(dataset, str):
+         dataset = eval(dataset)
+
+     world_size = get_world_size()
+     rank = get_rank()
+
+     if torch.distributed.is_initialized():
+         sampler = torch.utils.data.DistributedSampler(
+             dataset,
+             num_replicas=world_size,
+             rank=rank,
+             shuffle=shuffle,
+             drop_last=drop_last,
+         )
+     elif shuffle:
+         sampler = torch.utils.data.RandomSampler(dataset)
+     else:
+         sampler = torch.utils.data.SequentialSampler(dataset)
+
+     data_loader = torch.utils.data.DataLoader(
+         dataset,
+         sampler=sampler,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         pin_memory=pin_mem,
+         drop_last=drop_last,
+     )
+
+     return data_loader
+
+
+ def get_test_many_ar_data_loader(
+     dataset, batch_size, num_workers=8, drop_last=False, pin_mem=True
+ ):
+     "Get a PyTorch dataloader for a testing dataset that supports many aspect ratios"
+     # PyTorch dataset
+     if isinstance(dataset, str):
+         dataset = eval(dataset)
+
+     world_size = get_world_size()
+     rank = get_rank()
+
+     # Get BatchedMultiFeatureRandomSampler
+     sampler = dataset.make_sampler(
+         batch_size,
+         shuffle=True,
+         world_size=world_size,
+         rank=rank,
+         drop_last=drop_last,
+         use_dynamic_sampler=False,
+     )
+
+     # Initialize the data loader
+     data_loader = torch.utils.data.DataLoader(
+         dataset,
+         sampler=sampler,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         pin_memory=pin_mem,
+         drop_last=drop_last,
+     )
+
+     return data_loader
+
+
+ class DynamicBatchDatasetWrapper:
+     """
+     Wrapper dataset that handles DynamicBatchedMultiFeatureRandomSampler output.
+
+     The dynamic sampler returns batches (lists of tuples) instead of individual samples.
+     This wrapper ensures that the underlying dataset's __getitem__ method gets called
+     with individual tuples as expected.
+     """
+
+     def __init__(self, dataset):
+         self.dataset = dataset
+
+     def __getitem__(self, batch_indices):
+         """
+         Handle a batch of indices from DynamicBatchedMultiFeatureRandomSampler.
+
+         Args:
+             batch_indices: List of tuples like [(sample_idx, feat_idx_1, feat_idx_2, ...), ...]
+
+         Returns:
+             List of samples from the underlying dataset
+         """
+         if isinstance(batch_indices, (list, tuple)) and len(batch_indices) > 0:
+             # If it's a batch (list of tuples), process each item
+             if isinstance(batch_indices[0], (list, tuple)):
+                 return [self.dataset[idx] for idx in batch_indices]
+             else:
+                 # Single tuple, call the dataset directly
+                 return self.dataset[batch_indices]
+         else:
+             # Fallback for a single index
+             return self.dataset[batch_indices]
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getattr__(self, name):
+         # Delegate all other attributes to the wrapped dataset
+         return getattr(self.dataset, name)
+
+
+ def get_train_data_loader(
+     dataset,
+     max_num_of_imgs_per_gpu,
+     num_workers=8,
+     shuffle=True,
+     drop_last=True,
+     pin_mem=True,
+ ):
+     "Get a dynamic PyTorch dataloader corresponding to the training dataset"
+     # PyTorch dataset
+     if isinstance(dataset, str):
+         dataset = eval(dataset)
+
+     world_size = get_world_size()
+     rank = get_rank()
+
+     # Get DynamicBatchedMultiFeatureRandomSampler
+     batch_sampler = dataset.make_sampler(
+         shuffle=shuffle,
+         world_size=world_size,
+         rank=rank,
+         drop_last=drop_last,
+         max_num_of_images_per_gpu=max_num_of_imgs_per_gpu,
+         use_dynamic_sampler=True,
+     )
+
+     # Wrap the dataset to handle the batch format from the dynamic sampler
+     wrapped_dataset = DynamicBatchDatasetWrapper(dataset)
+
+     # Initialize the dynamic data loader
+     data_loader = torch.utils.data.DataLoader(
+         wrapped_dataset,
+         batch_sampler=batch_sampler,
+         num_workers=num_workers,
+         pin_memory=pin_mem,
+     )
+
+     return data_loader
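Taken together, these factories give an evaluation loader, a multi-aspect-ratio evaluation loader, and a dynamic-batch training loader. A minimal usage sketch for the training path; the dataset expression string below is illustrative (the constructor arguments are made up), but the "N @ Dataset(...)" form matches the eval-based dispatch and the EasyDataset resizing operator defined later in this diff:

    from mapanything.datasets import get_train_data_loader

    # Illustrative only: the exact constructor arguments depend on the dataset configs
    loader = get_train_data_loader(
        dataset="1000 @ BlendedMVSWAI(num_views=4, split='train', resolution=(518, 392), "
        "transform='imgnorm', data_norm_type='identity')",
        max_num_of_imgs_per_gpu=16,
        num_workers=8,
    )
    for batch in loader:
        # Each element is a list of per-view dicts assembled by DynamicBatchDatasetWrapper
        break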
mapanything/datasets/base/__init__.py ADDED
File without changes
mapanything/datasets/base/base_dataset.py ADDED
@@ -0,0 +1,697 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ Base class for MapAnything datasets.
+ """
+
+ from typing import List, Tuple, Union
+
+ import numpy as np
+ import PIL
+ import torch
+ import torchvision.transforms as tvf
+ from scipy.spatial.transform import Rotation
+
+ from mapanything.datasets.base.easy_dataset import EasyDataset
+ from mapanything.utils.cropping import (
+     bbox_from_intrinsics_in_out,
+     camera_matrix_of_crop,
+     crop_image_and_other_optional_info,
+     rescale_image_and_other_optional_info,
+ )
+ from mapanything.utils.geometry import (
+     depthmap_to_camera_coordinates,
+     get_absolute_pointmaps_and_rays_info,
+ )
+ from uniception.models.encoders.image_normalizations import IMAGE_NORMALIZATION_DICT
+
+
+ class BaseDataset(EasyDataset):
+     """
+     Define all basic options.
+
+     Usage:
+         class MyDataset(BaseDataset):
+             def _get_views(self, idx):
+                 views = []
+                 views.append(dict(img=, ...))
+                 return views
+     """
+
+     def __init__(
+         self,
+         num_views: int,
+         variable_num_views: bool = False,
+         split: str = None,
+         covisibility_thres: float = None,
+         resolution: Union[int, Tuple[int, int], List[Tuple[int, int]]] = None,
+         principal_point_centered: bool = False,
+         transform: str = None,
+         data_norm_type: str = None,
+         aug_crop: int = 0,
+         seed: int = None,
+         max_num_retries: int = 5,
+     ):
+         """
+         PyTorch dataset for multi-view images sampled from scenes, where the images form a single connected component.
+
+         Args:
+             num_views (int): Number of views.
+             variable_num_views (bool): If True, the number of views can vary from batch to batch. The maximum number of views is num_views and the minimum is 2.
+                 On by default for the N-view train dataloader (hydra config).
+             split (str): 'train', 'val', 'test', etc.
+             covisibility_thres (float): Covisibility (%) threshold used to determine whether another image is a neighbor or not
+             resolution (int or tuple or list of tuples): Resolution of the images
+             principal_point_centered (bool): If True, the principal point is centered in the image.
+             transform (str): Transform to apply to the images. Options:
+                 - 'colorjitter+grayscale+gaublur':
+                     tvf.Compose([
+                         tvf.RandomApply([tvf.ColorJitter(0.3, 0.4, 0.2, 0.1)], p=0.75),
+                         tvf.RandomGrayscale(p=0.05),
+                         tvf.RandomApply([tvf.GaussianBlur(5, sigma=(0.1, 1.0))], p=0.05),
+                     ]) after ImgNorm
+                 - 'colorjitter': tvf.ColorJitter(0.5, 0.5, 0.5, 0.1) after ImgNorm
+                 - 'imgnorm': ImgNorm only
+             data_norm_type (str): Image normalization type.
+                 For options, see the UniCeption image normalization dict.
+             aug_crop (int): Augment crop. If an int greater than 0, indicates the number of pixels to increase in the target resolution.
+             seed (int): Seed for the random number generator.
+             max_num_retries (int): Maximum number of retries for loading a different sample from the dataset, if the provided idx fails.
+         """
+         self.num_views = num_views
+         self.variable_num_views = variable_num_views
+         self.num_views_min = 2
+         self.split = split
+         self.covisibility_thres = covisibility_thres
+         self._set_resolutions(resolution)
+         self.principal_point_centered = principal_point_centered
+
+         # Update the number of views if necessary and make it a list if variable_num_views is True
+         if self.variable_num_views and self.num_views > self.num_views_min:
+             self.num_views = list(range(self.num_views_min, self.num_views + 1))
+
+         # Initialize the image normalization type
+         if data_norm_type in IMAGE_NORMALIZATION_DICT.keys():
+             self.data_norm_type = data_norm_type
+             image_norm = IMAGE_NORMALIZATION_DICT[data_norm_type]
+             ImgNorm = tvf.Compose(
+                 [
+                     tvf.ToTensor(),
+                     tvf.Normalize(mean=image_norm.mean, std=image_norm.std),
+                 ]
+             )
+         elif data_norm_type == "identity":
+             self.data_norm_type = data_norm_type
+             ImgNorm = tvf.Compose([tvf.ToTensor()])
+         else:
+             raise ValueError(
+                 f"Unknown data_norm_type: {data_norm_type}. Available options: identity or {list(IMAGE_NORMALIZATION_DICT.keys())}"
+             )
+
+         # Initialize torchvision transforms
+         if transform == "imgnorm":
+             self.transform = ImgNorm
+         elif transform == "colorjitter":
+             self.transform = tvf.Compose([tvf.ColorJitter(0.5, 0.5, 0.5, 0.1), ImgNorm])
+         elif transform == "colorjitter+grayscale+gaublur":
+             self.transform = tvf.Compose(
+                 [
+                     tvf.RandomApply([tvf.ColorJitter(0.3, 0.4, 0.2, 0.1)], p=0.75),
+                     tvf.RandomGrayscale(p=0.05),
+                     tvf.RandomApply([tvf.GaussianBlur(5, sigma=(0.1, 1.0))], p=0.05),
+                     ImgNorm,
+                 ]
+             )
+         else:
+             raise ValueError(
+                 'Unknown transform. Available options: "imgnorm", "colorjitter", "colorjitter+grayscale+gaublur"'
+             )
+
+         # Initialize the augmentation parameters
+         self.aug_crop = aug_crop
+
+         # Initialize the seed for the random number generator
+         self.seed = seed
+         self._seed_offset = 0
+
+         # Initialize the maximum number of retries for loading a different sample from the dataset, if the first idx fails
+         self.max_num_retries = max_num_retries
+
+         # Initialize the dataset type flags
+         self.is_metric_scale = False  # by default a dataset is not metric scale, subclasses can overwrite this
+         self.is_synthetic = False  # by default a dataset is not synthetic, subclasses can overwrite this
+
+     def _load_data(self):
+         self.scenes = []
+         self.num_of_scenes = len(self.scenes)
+
+     def __len__(self):
+         "The length of the dataset is determined by the number of scenes in the dataset split"
+         return self.num_of_scenes
+
+     def get_stats(self):
+         "Get the number of scenes in the dataset split"
+         return f"{self.num_of_scenes} scenes"
+
+     def __repr__(self):
+         resolutions_str = "[" + ";".join(f"{w}x{h}" for w, h in self._resolutions) + "]"
+         return (
+             f"""{type(self).__name__}({self.get_stats()},
+             {self.num_views=}
+             {self.split=},
+             {self.seed=},
+             resolutions={resolutions_str},
+             {self.transform=})""".replace("self.", "")
+             .replace("\n", "")
+             .replace(" ", "")
+         )
+
+     def _get_views(self, idx, num_views_to_sample, resolution):
+         raise NotImplementedError()
+
+     def _set_seed_offset(self, idx):
+         """
+         Set the seed offset. This is directly added to self.seed when setting the random seed.
+         """
+         self._seed_offset = idx
+
+     def _set_resolutions(self, resolutions):
+         assert resolutions is not None, "undefined resolution"
+
+         if isinstance(resolutions, int):
+             resolutions = [resolutions]
+         elif isinstance(resolutions, tuple):
+             resolutions = [resolutions]
+         elif isinstance(resolutions, list):
+             assert all(isinstance(res, tuple) for res in resolutions), (
+                 f"Bad type for {resolutions=}, should be int or tuple of ints or list of tuples of ints"
+             )
+         else:
+             raise ValueError(
+                 f"Bad type for {resolutions=}, should be int or tuple of ints or list of tuples of ints"
+             )
+
+         self._resolutions = []
+         for resolution in resolutions:
+             if isinstance(resolution, int):
+                 width = height = resolution
+             else:
+                 width, height = resolution
+             assert isinstance(width, int), (
+                 f"Bad type for {width=} {type(width)=}, should be int"
+             )
+             assert isinstance(height, int), (
+                 f"Bad type for {height=} {type(height)=}, should be int"
+             )
+             self._resolutions.append((width, height))
+
+     def _crop_resize_if_necessary(
+         self,
+         image,
+         resolution,
+         depthmap,
+         intrinsics,
+         additional_quantities=None,
+     ):
+         """
+         Process an image by downsampling and cropping as needed to match the target resolution.
+
+         This method performs the following operations:
+         1. Converts the image to PIL.Image if necessary
+         2. Crops the image centered on the principal point if requested
+         3. Downsamples the image using high-quality Lanczos filtering
+         4. Performs final cropping to match the target resolution
+
+         Args:
+             image (numpy.ndarray or PIL.Image.Image): Input image to be processed
+             resolution (tuple): Target resolution as (width, height)
+             depthmap (numpy.ndarray): Depth map corresponding to the image
+             intrinsics (numpy.ndarray): Camera intrinsics matrix (3x3)
+             additional_quantities (dict, optional): Additional image-related data to be processed
+                 alongside the main image with nearest interpolation. Defaults to None.
+
+         Returns:
+             tuple: Processed image, depthmap, and updated intrinsics matrix.
+                 If additional_quantities is provided, it returns those as well.
+         """
+         if not isinstance(image, PIL.Image.Image):
+             image = PIL.Image.fromarray(image)
+
+         # Cropping centered on the principal point if necessary
+         if self.principal_point_centered:
+             W, H = image.size
+             cx, cy = intrinsics[:2, 2].round().astype(int)
+             if cx < 0 or cx >= W or cy < 0 or cy >= H:
+                 # Skip centered cropping if the principal point is outside the image bounds
+                 pass
+             else:
+                 min_margin_x = min(cx, W - cx)
+                 min_margin_y = min(cy, H - cy)
+                 left, top = cx - min_margin_x, cy - min_margin_y
+                 right, bottom = cx + min_margin_x, cy + min_margin_y
+                 crop_bbox = (left, top, right, bottom)
+                 # Only perform the centered crop if the crop_bbox is larger than the target resolution
+                 crop_width = right - left
+                 crop_height = bottom - top
+                 if crop_width > resolution[0] and crop_height > resolution[1]:
+                     image, depthmap, intrinsics, additional_quantities = (
+                         crop_image_and_other_optional_info(
+                             image=image,
+                             crop_bbox=crop_bbox,
+                             depthmap=depthmap,
+                             camera_intrinsics=intrinsics,
+                             additional_quantities=additional_quantities,
+                         )
+                     )
+
+         # Get the target resolution for re-scaling
+         target_rescale_resolution = np.array(resolution)
+         if self.aug_crop > 1:
+             target_rescale_resolution += self._rng.integers(0, self.aug_crop)
+
+         # High-quality Lanczos down-scaling if necessary
+         image, depthmap, intrinsics, additional_quantities = (
+             rescale_image_and_other_optional_info(
+                 image=image,
+                 output_resolution=target_rescale_resolution,
+                 depthmap=depthmap,
+                 camera_intrinsics=intrinsics,
+                 additional_quantities_to_be_resized_with_nearest=additional_quantities,
+             )
+         )
+
+         # Actual cropping (if necessary)
+         new_intrinsics = camera_matrix_of_crop(
+             input_camera_matrix=intrinsics,
+             input_resolution=image.size,
+             output_resolution=resolution,
+             offset_factor=0.5,
+         )
+         crop_bbox = bbox_from_intrinsics_in_out(
+             input_camera_matrix=intrinsics,
+             output_camera_matrix=new_intrinsics,
+             output_resolution=resolution,
+         )
+         image, depthmap, new_intrinsics, additional_quantities = (
+             crop_image_and_other_optional_info(
+                 image=image,
+                 crop_bbox=crop_bbox,
+                 depthmap=depthmap,
+                 camera_intrinsics=intrinsics,
+                 additional_quantities=additional_quantities,
+             )
+         )
+
+         # Return the output
+         if additional_quantities is not None:
+             return image, depthmap, new_intrinsics, additional_quantities
+         else:
+             return image, depthmap, new_intrinsics
+
+     def _random_walk_sampling(
+         self,
+         scene_pairwise_covisibility,
+         num_of_samples,
+         max_retries=4,
+         use_bidirectional_covis=True,
+     ):
+         """
+         Randomly samples S indices from an N x N covisibility matrix by forming adjacency edges such that the resulting subgraph (given by the indices) is connected.
+         If the current node has no new unvisited neighbors, backtracking occurs.
+         Retries with different starting indices if the desired number of samples is not reached, excluding previously visited components.
+
+         Args:
+             scene_pairwise_covisibility : np.ndarray (mmap)
+                 N x N covisibility matrix for the scene, where N is the number of views in the scene.
+             num_of_samples : int
+                 The desired number of nodes to sample (num_of_samples < N).
+             max_retries : int
+                 The maximum number of retries with different starting indices.
+             use_bidirectional_covis : bool
+                 Whether to compute bidirectional covisibility by averaging row and column values.
+                 If False, uses only row access (faster for large memory-mapped arrays).
+                 Defaults to True.
+
+         Returns:
+             np.ndarray
+                 An array of sampled indices forming a connected subgraph.
+         """
+         excluded_nodes = set()
+         best_walk = []  # To keep track of the best walk found
+         for _ in range(max_retries):
+             visited = set()
+             walk = []  # List to store the random walk sampling order
+             stack = []  # Stack for backtracking
+
+             # Choose a random starting index that is not in the excluded set
+             all_nodes = set(range(len(scene_pairwise_covisibility)))
+             available_nodes = list(all_nodes - excluded_nodes)
+             if not available_nodes:
+                 break  # No more nodes to try
+             start = self._rng.choice(available_nodes)
+             walk.append(start)
+             visited.add(start)
+             stack.append(start)
+
+             # Continue until we have sampled S indices or all expandable nodes are exhausted
+             while len(walk) < num_of_samples and stack:
+                 current = stack[-1]
+                 # Get the pairwise covisibility for the current node
+                 if use_bidirectional_covis:
+                     # Use bidirectional covisibility (slower for large memory-mapped arrays)
+                     pairwise_covisibility = (
+                         scene_pairwise_covisibility[current, :]
+                         + scene_pairwise_covisibility[:, current].T
+                     ) / 2
+                 else:
+                     # Use only row access (faster for large memory-mapped arrays)
+                     pairwise_covisibility = scene_pairwise_covisibility[current, :]
+                 # Normalize the covisibility using the self covisibility
+                 pairwise_covisibility = pairwise_covisibility / (
+                     pairwise_covisibility[current] + 1e-8
+                 )
+                 # Assign an overlap score of zero to self-pairs
+                 pairwise_covisibility[current] = 0
+                 # Threshold the covisibility to get the adjacency list for the current node
+                 adjacency_list_for_current = (
+                     pairwise_covisibility > self.covisibility_thres
+                 ).astype(int)
+                 adjacency_list_for_current = np.flatnonzero(adjacency_list_for_current)
+                 # Get all unvisited neighbors
+                 candidates = [
+                     idx for idx in adjacency_list_for_current if idx not in visited
+                 ]  # Remove visited nodes
+                 if candidates:
+                     # Randomly select one of the unvisited overlapping neighbors
+                     next_node = self._rng.choice(candidates)
+                     walk.append(next_node)
+                     visited.add(next_node)
+                     stack.append(next_node)
+                 else:
+                     # If no unvisited neighbor is available, backtrack
+                     stack.pop()
+
+             # Update the best walk if the current walk is larger
+             if len(walk) > len(best_walk):
+                 best_walk = walk
+
+             # If we have enough samples, return the result
+             if len(walk) >= num_of_samples:
+                 return np.array(walk)
+
+             # Add all visited nodes to the excluded set
+             excluded_nodes.update(visited)
+
+         # If all retries are exhausted and we still don't have enough samples, return the best walk found
+         return np.array(best_walk)
+
+     def _sample_view_indices(
+         self,
+         num_views_to_sample,
+         num_views_in_scene,
+         scene_pairwise_covisibility,
+         use_bidirectional_covis=True,
+     ):
+         """
+         Sample view indices from a scene based on the adjacency list and the number of views to sample.
+
+         Args:
+             num_views_to_sample (int): Number of views to sample.
+             num_views_in_scene (int): Total number of views available in the scene.
+             scene_pairwise_covisibility (np.ndarray): N x N covisibility matrix for the scene, where N is the number of views in the scene.
+             use_bidirectional_covis (bool): Whether to compute bidirectional covisibility by averaging row and column values.
+                 If False, uses only row access (faster for large memory-mapped arrays).
+
+         Returns:
+             numpy.ndarray: Array of sampled view indices.
+         """
+         if num_views_to_sample == num_views_in_scene:
+             # Select all views in the scene
+             view_indices = self._rng.permutation(num_views_in_scene)
+         elif num_views_to_sample > num_views_in_scene:
+             # Select all views in the scene and repeat them to get the desired number of views
+             view_indices = self._rng.choice(
+                 num_views_in_scene, size=num_views_to_sample, replace=True
+             )
+         else:
+             # Select a subset of single-component connected views in the scene using random walk sampling
+             view_indices = self._random_walk_sampling(
+                 scene_pairwise_covisibility,
+                 num_views_to_sample,
+                 use_bidirectional_covis=use_bidirectional_covis,
+             )
+             # If the required number of views can't be obtained even with 4 retries, repeat existing indices to get the desired number of views
+             if len(view_indices) < num_views_to_sample:
+                 view_indices = self._rng.choice(
+                     view_indices, size=num_views_to_sample, replace=True
+                 )
+
+         return view_indices
+
+     def _getitem_fn(self, idx):
+         if isinstance(idx, tuple):
+             # The idx is a tuple if specifying the aspect-ratio or/and the number of views
+             if isinstance(self.num_views, int):
+                 idx, ar_idx = idx
+             else:
+                 idx, ar_idx, num_views_to_sample_idx = idx
+         else:
+             assert len(self._resolutions) == 1
+             assert isinstance(self.num_views, int)
+             ar_idx = 0
+
+         # Set up the rng
+         if self.seed:  # reseed for each _getitem_fn
+             # Leads to deterministic sampling where repeating self.seed and self._seed_offset yields the same multi-view set again
+             # Scenes will be repeated if the size of the dataset is artificially increased using "N @" or "N *"
+             # When scenes are repeated, self._seed_offset is increased to ensure new multi-view sets
+             # This is useful for evaluation if the number of dataset scenes is < N, yet we want unique multi-view sets each iter
+             self._rng = np.random.default_rng(seed=self.seed + self._seed_offset + idx)
+         elif not hasattr(self, "_rng"):
+             seed = torch.initial_seed()  # this is different for each dataloader process
+             self._rng = np.random.default_rng(seed=seed)
+
+         # Get the views for the given index and check that the number of views is correct
+         resolution = self._resolutions[ar_idx]
+         if isinstance(self.num_views, int):
+             num_views_to_sample = self.num_views
+         else:
+             num_views_to_sample = self.num_views[num_views_to_sample_idx]
+         views = self._get_views(idx, num_views_to_sample, resolution)
+         if isinstance(self.num_views, int):
+             assert len(views) == self.num_views
+         else:
+             assert len(views) in self.num_views
+
+         for v, view in enumerate(views):
+             # Store the index and other metadata
+             view["idx"] = (idx, ar_idx, v)
+             view["is_metric_scale"] = self.is_metric_scale
+             view["is_synthetic"] = self.is_synthetic
+
+             # Check the depth, intrinsics, and pose data (also other data if present)
+             assert "camera_intrinsics" in view
+             assert "camera_pose" in view
+             assert np.isfinite(view["camera_pose"]).all(), (
+                 f"NaN or infinite values in camera pose for view {view_name(view)}"
+             )
+             assert np.isfinite(view["depthmap"]).all(), (
+                 f"NaN or infinite values in depthmap for view {view_name(view)}"
+             )
+             assert "valid_mask" not in view
+             assert "pts3d" not in view, (
+                 f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
+             )
+             if "prior_depth_z" in view:
+                 assert np.isfinite(view["prior_depth_z"]).all(), (
+                     f"NaN or infinite values in prior_depth_z for view {view_name(view)}"
+                 )
+             if "non_ambiguous_mask" in view:
+                 assert np.isfinite(view["non_ambiguous_mask"]).all(), (
+                     f"NaN or infinite values in non_ambiguous_mask for view {view_name(view)}"
+                 )
+
+             # Encode the image
+             width, height = view["img"].size
+             view["true_shape"] = np.int32((height, width))
+             view["img"] = self.transform(view["img"])
+             view["data_norm_type"] = self.data_norm_type
+
+             # Compute the pointmaps, raymap and depth along ray
+             (
+                 pts3d,
+                 valid_mask,
+                 ray_origins_world,
+                 ray_directions_world,
+                 depth_along_ray,
+                 ray_directions_cam,
+                 pts3d_cam,
+             ) = get_absolute_pointmaps_and_rays_info(**view)
+             view["pts3d"] = pts3d
+             view["valid_mask"] = valid_mask & np.isfinite(pts3d).all(axis=-1)
+             view["depth_along_ray"] = depth_along_ray
+             view["ray_directions_cam"] = ray_directions_cam
+             view["pts3d_cam"] = pts3d_cam
+
+             # Compute the prior depth along ray if present
+             if "prior_depth_z" in view:
+                 prior_pts3d, _ = depthmap_to_camera_coordinates(
+                     view["prior_depth_z"], view["camera_intrinsics"]
+                 )
+                 view["prior_depth_along_ray"] = np.linalg.norm(prior_pts3d, axis=-1)
+                 view["prior_depth_along_ray"] = view["prior_depth_along_ray"][..., None]
+                 del view["prior_depth_z"]
+
+             # Convert the ambiguous mask dtype to match the valid mask dtype
+             if "non_ambiguous_mask" in view:
+                 view["non_ambiguous_mask"] = view["non_ambiguous_mask"].astype(
+                     view["valid_mask"].dtype
+                 )
+             else:
+                 ambiguous_mask = view["depthmap"] < 0
+                 view["non_ambiguous_mask"] = ~ambiguous_mask
+                 view["non_ambiguous_mask"] = view["non_ambiguous_mask"].astype(
+                     view["valid_mask"].dtype
+                 )
+
+             # Check all datatypes
+             for key, val in view.items():
+                 res, err_msg = is_good_type(val)
+                 assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"
+
+             # Check shapes
+             assert view["depthmap"].shape == view["img"].shape[1:]
+             assert view["depthmap"].shape == view["pts3d"].shape[:2]
+             assert view["depthmap"].shape == view["valid_mask"].shape
+             assert view["depthmap"].shape == view["depth_along_ray"].shape[:2]
+             assert view["depthmap"].shape == view["ray_directions_cam"].shape[:2]
+             assert view["depthmap"].shape == view["pts3d_cam"].shape[:2]
+             if "prior_depth_along_ray" in view:
+                 assert view["depthmap"].shape == view["prior_depth_along_ray"].shape[:2]
+             if "non_ambiguous_mask" in view:
+                 assert view["depthmap"].shape == view["non_ambiguous_mask"].shape
+
+             # Expand the last dimension of the depthmap
+             view["depthmap"] = view["depthmap"][..., None]
+
+             # Append the RNG state to the views; this allows checking whether the RNG is in the same state each time
+             view["rng"] = int.from_bytes(self._rng.bytes(4), "big")
+
+             # Compute and store the quaternions and translation for the camera poses
+             # Notation is (x, y, z, w) for quaternions
+             # This also ensures that the camera poses have a positive determinant (right-handed coordinate system)
+             view["camera_pose_quats"] = (
+                 Rotation.from_matrix(view["camera_pose"][:3, :3])
+                 .as_quat()
+                 .astype(view["camera_pose"].dtype)
+             )
+             view["camera_pose_trans"] = view["camera_pose"][:3, 3].astype(
+                 view["camera_pose"].dtype
+             )
+
+             # Check the pointmaps, rays, depth along ray, and camera pose quaternions and translation to ensure they are finite
+             assert np.isfinite(view["pts3d"]).all(), (
+                 f"NaN in pts3d for view {view_name(view)}"
+             )
+             assert np.isfinite(view["valid_mask"]).all(), (
+                 f"NaN in valid_mask for view {view_name(view)}"
+             )
+             assert np.isfinite(view["depth_along_ray"]).all(), (
+                 f"NaN in depth_along_ray for view {view_name(view)}"
+             )
+             assert np.isfinite(view["ray_directions_cam"]).all(), (
+                 f"NaN in ray_directions_cam for view {view_name(view)}"
+             )
+             assert np.isfinite(view["pts3d_cam"]).all(), (
+                 f"NaN in pts3d_cam for view {view_name(view)}"
+             )
+             assert np.isfinite(view["camera_pose_quats"]).all(), (
+                 f"NaN in camera_pose_quats for view {view_name(view)}"
+             )
+             assert np.isfinite(view["camera_pose_trans"]).all(), (
+                 f"NaN in camera_pose_trans for view {view_name(view)}"
+             )
+             if "prior_depth_along_ray" in view:
+                 assert np.isfinite(view["prior_depth_along_ray"]).all(), (
+                     f"NaN in prior_depth_along_ray for view {view_name(view)}"
+                 )
+
+         return views
+
+     def __getitem__(self, idx):
+         if self.max_num_retries == 0:
+             return self._getitem_fn(idx)
+
+         num_retries = 0
+         while num_retries <= self.max_num_retries:
+             try:
+                 return self._getitem_fn(idx)
+             except Exception as e:
+                 scene_idx = idx[0] if isinstance(idx, tuple) else idx
+                 print(
+                     f"Error in {type(self).__name__}.__getitem__ for scene_idx={scene_idx}: {e}"
+                 )
+
+                 if num_retries >= self.max_num_retries:
+                     print(
+                         f"Max retries ({self.max_num_retries}) reached, raising the exception"
+                     )
+                     raise e
+
+                 # Retry with a different scene index
+                 num_retries += 1
+                 if isinstance(idx, tuple):
+                     # The scene index is the first element of the tuple
+                     idx_list = list(idx)
+                     idx_list[0] = np.random.randint(0, len(self))
+                     idx = tuple(idx_list)
+                 else:
+                     # The scene index is idx
+                     idx = np.random.randint(0, len(self))
+                 scene_idx = idx[0] if isinstance(idx, tuple) else idx
+                 print(
+                     f"Retrying with scene_idx={scene_idx} ({num_retries} of {self.max_num_retries})"
+                 )
+
+
+ def is_good_type(v):
+     """
+     Check if a value has an acceptable data type for processing in the dataset.
+
+     Args:
+         v: The value to check.
+
+     Returns:
+         tuple: A tuple containing:
+             - bool: True if the type is acceptable, False otherwise.
+             - str or None: Error message if the type is not acceptable, None otherwise.
+     """
+     if isinstance(v, (str, int, tuple)):
+         return True, None
+     if v.dtype not in (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8):
+         return False, f"bad {v.dtype=}"
+     return True, None
+
+
+ def view_name(view, batch_index=None):
+     """
+     Generate a string identifier for a view based on its dataset, label, and instance.
+
+     Args:
+         view (dict): Dictionary containing view information with 'dataset', 'label', and 'instance' keys.
+         batch_index (int, optional): Index to select from batched data. Defaults to None.
+
+     Returns:
+         str: A formatted string in the form "dataset/label/instance".
+     """
+
+     def sel(x):
+         return x[batch_index] if batch_index not in (None, slice(None)) else x
+
+     db = sel(view["dataset"])
+     label = sel(view["label"])
+     instance = sel(view["instance"])
+     return f"{db}/{label}/{instance}"
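The asserts in _getitem_fn above pin down the minimal contract for _get_views: each view dict must carry img, depthmap, camera_intrinsics, and camera_pose (plus dataset/label/instance for view_name), and must not pre-compute pts3d or valid_mask. A schematic subclass with purely synthetic data for illustration; the field values and the camera-to-world pose convention are assumptions here, and the real loaders are the WAI dataset classes listed in this commit:

    import numpy as np
    import PIL.Image

    from mapanything.datasets.base.base_dataset import BaseDataset

    class ToyDataset(BaseDataset):
        """Illustrative subclass; not part of the actual codebase."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self._load_data()

        def _load_data(self):
            self.scenes = ["scene_000", "scene_001"]
            self.num_of_scenes = len(self.scenes)

        def _get_views(self, idx, num_views_to_sample, resolution):
            views = []
            W, H = resolution
            for _ in range(num_views_to_sample):
                views.append(
                    dict(
                        img=PIL.Image.new("RGB", (W, H)),
                        depthmap=np.ones((H, W), dtype=np.float32),
                        camera_intrinsics=np.array(
                            [[W, 0, W / 2], [0, W, H / 2], [0, 0, 1]], dtype=np.float32
                        ),
                        camera_pose=np.eye(4, dtype=np.float32),  # assumed cam-to-world
                        dataset="Toy",
                        label=self.scenes[idx],
                        instance="0",
                    )
                )
            return views

    ds = ToyDataset(num_views=2, resolution=(64, 48), transform="imgnorm", data_norm_type="identity")
    # ds[(0, 0)] would then sample a 2-view set for scene 0 at aspect-ratio index 0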
mapanything/datasets/base/batched_sampler.py ADDED
@@ -0,0 +1,431 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ Utilities for random sampling under a single or multiple constraints
+
+ References: DUSt3R
+ """
+
+ import numpy as np
+ import torch
+
+
+ def round_by(total, multiple, up=False):
+     """
+     Round a number to the nearest multiple of another number.
+
+     Args:
+         total (int): The number to round
+         multiple (int): The multiple to round to
+         up (bool, optional): Whether to round up. Defaults to False.
+
+     Returns:
+         int: The rounded number
+     """
+     if up:
+         total = total + multiple - 1
+     return (total // multiple) * multiple
+
+
+ class BatchedRandomSampler:
+     """
+     Random sampling under a constraint: each sample in the batch has the same feature,
+     which is chosen randomly from a known pool of 'features' for each batch.
+
+     For instance, the 'feature' could be the image aspect-ratio.
+
+     The index returned is a tuple (sample_idx, feat_idx).
+     This sampler ensures that each series of `batch_size` indices has the same `feat_idx`.
+     """
+
+     def __init__(
+         self, dataset, batch_size, pool_size, world_size=1, rank=0, drop_last=True
+     ):
+         """
+         Args:
+             dataset: Dataset to sample from
+             batch_size: Number of samples per batch
+             pool_size: Integer representing the size of the feature pool
+             world_size: Number of distributed processes
+             rank: Rank of the current process
+             drop_last: Whether to drop the last incomplete batch
+         """
+         self.batch_size = batch_size
+         self.pool_size = pool_size
+
+         self.len_dataset = N = len(dataset)
+         self.total_size = round_by(N, batch_size * world_size) if drop_last else N
+         assert world_size == 1 or drop_last, (
+             "must drop the last batch in distributed mode"
+         )
+
+         # Distributed sampler
+         self.world_size = world_size
+         self.rank = rank
+         self.epoch = None
+
+     def __len__(self):
+         """
+         Get the length of the sampler.
+
+         Returns:
+             int: The number of samples in the sampler for the current process
+         """
+         return self.total_size // self.world_size
+
+     def set_epoch(self, epoch):
+         """
+         Set the epoch for this sampler.
+
+         This should be called before each epoch to ensure proper shuffling of the data.
+
+         Args:
+             epoch (int): The current epoch number
+         """
+         self.epoch = epoch
+
+     def __iter__(self):
+         """
+         Iterator over the indices.
+
+         This method generates random indices for each batch, ensuring that all samples
+         within a batch have the same feature index for the given feature pool.
+
+         Yields:
+             tuple: A tuple containing (sample_idx, feat_idx)
+         """
+         # Prepare RNG
+         if self.epoch is None:
+             assert self.world_size == 1 and self.rank == 0, (
+                 "use set_epoch() if distributed mode is used"
+             )
+             seed = int(torch.empty((), dtype=torch.int64).random_().item())
+         else:
+             seed = self.epoch + 777
+         rng = np.random.default_rng(seed=seed)
+
+         # Random indices (will restart from 0 if not drop_last)
+         sample_idxs = np.arange(self.total_size)
+         rng.shuffle(sample_idxs)
+
+         # Random feat_idxs (same across each batch)
+         n_batches = (self.total_size + self.batch_size - 1) // self.batch_size
+         feat_idxs = rng.integers(self.pool_size, size=n_batches)
+         feat_idxs = np.broadcast_to(feat_idxs[:, None], (n_batches, self.batch_size))
+         feat_idxs = feat_idxs.ravel()[: self.total_size]
+
+         # Put them together
+         idxs = np.c_[sample_idxs, feat_idxs]  # shape = (total_size, 2)
+
+         # Distributed sampler: we select a subset of batches
+         # Make sure the slice for each node is aligned with batch_size
+         size_per_proc = self.batch_size * (
+             (self.total_size + self.world_size * self.batch_size - 1)
+             // (self.world_size * self.batch_size)
+         )
+         idxs = idxs[self.rank * size_per_proc : (self.rank + 1) * size_per_proc]
+
+         yield from (tuple(idx) for idx in idxs)
+
+
+ class BatchedMultiFeatureRandomSampler:
+     """
+     Random sampling under multiple constraints: each sample in the batch has the same features,
+     which are chosen randomly from known pools of 'features' for each batch.
+
+     For instance, the 'features' could be the image aspect-ratio and scene type.
+
+     The index returned is a tuple (sample_idx, feat_idx_1, feat_idx_2, ...).
+     This sampler ensures that each series of `batch_size` indices has the same feature indices.
+     """
+
+     def __init__(
+         self, dataset, batch_size, pool_sizes, world_size=1, rank=0, drop_last=True
+     ):
+         """
+         Args:
+             dataset: Dataset to sample from
+             batch_size: Number of samples per batch
+             pool_sizes: List of integers representing the size of each feature pool
+             world_size: Number of distributed processes
+             rank: Rank of the current process
+             drop_last: Whether to drop the last incomplete batch
+         """
+         self.batch_size = batch_size
+         self.pool_sizes = pool_sizes if isinstance(pool_sizes, list) else [pool_sizes]
+
+         self.len_dataset = N = len(dataset)
+         self.total_size = round_by(N, batch_size * world_size) if drop_last else N
+         assert world_size == 1 or drop_last, (
+             "must drop the last batch in distributed mode"
+         )
+
+         # Distributed sampler
+         self.world_size = world_size
+         self.rank = rank
+         self.epoch = None
+
+     def __len__(self):
+         """
+         Get the length of the sampler.
+
+         Returns:
+             int: The number of samples in the sampler for the current process
+         """
+         return self.total_size // self.world_size
+
+     def set_epoch(self, epoch):
+         """
+         Set the epoch for this sampler.
+
+         This should be called before each epoch to ensure proper shuffling of the data.
+
+         Args:
+             epoch (int): The current epoch number
+         """
+         self.epoch = epoch
+
+     def __iter__(self):
+         """
+         Iterator over the indices.
+
+         This method generates random indices for each batch, ensuring that all samples
+         within a batch have the same feature indices for multiple features.
+
+         Yields:
+             tuple: A tuple containing (sample_idx, feat_idx_1, feat_idx_2, ...)
+         """
+         # Prepare RNG
+         if self.epoch is None:
+             assert self.world_size == 1 and self.rank == 0, (
+                 "use set_epoch() if distributed mode is used"
+             )
+             seed = int(torch.empty((), dtype=torch.int64).random_().item())
+         else:
+             seed = self.epoch + 777
+         rng = np.random.default_rng(seed=seed)
+
+         # Random indices (will restart from 0 if not drop_last)
+         sample_idxs = np.arange(self.total_size)
+         rng.shuffle(sample_idxs)
+
+         # Random feat_idxs (same across each batch)
+         n_batches = (self.total_size + self.batch_size - 1) // self.batch_size
+
+         # Generate feature indices for each feature pool
+         all_feat_idxs = []
+         for pool_size in self.pool_sizes:
+             feat_idxs = rng.integers(pool_size, size=n_batches)
+             feat_idxs = np.broadcast_to(
+                 feat_idxs[:, None], (n_batches, self.batch_size)
+             )
+             feat_idxs = feat_idxs.ravel()[: self.total_size]
+             all_feat_idxs.append(feat_idxs)
+
+         # Put them together
+         idxs = np.column_stack(
+             [sample_idxs] + all_feat_idxs
+         )  # shape = (total_size, 1 + len(pool_sizes))
+
+         # Distributed sampler: we select a subset of batches
+         # Make sure the slice for each node is aligned with batch_size
+         size_per_proc = self.batch_size * (
+             (self.total_size + self.world_size * self.batch_size - 1)
+             // (self.world_size * self.batch_size)
+         )
+         idxs = idxs[self.rank * size_per_proc : (self.rank + 1) * size_per_proc]
+
+         yield from (tuple(idx) for idx in idxs)
+
+
+ class DynamicBatchedMultiFeatureRandomSampler:
+     """
+     Random sampling under multiple constraints with dynamic batch size:
+     each sample in the batch has the same features, which are chosen randomly
+     from known pools of 'features' for each batch.
+
+     The batch size is dynamically determined based on a specified feature index,
+     using a direct mapping from feature values to batch sizes.
+
+     For instance, if one of the features is the number of images in a multi-view set,
+     you can specify different batch sizes for different numbers of images to optimize
+     GPU memory usage. This is achieved by using the feature_to_batch_size_map parameter
+     to directly specify what batch size to use for each feature value.
+
+     The returned index is a list of tuples [(sample_idx, feat_idx_1, feat_idx_2, ...), ...].
+     """
+
+     def __init__(
+         self,
+         dataset,
+         pool_sizes,
+         scaling_feature_idx=0,
+         feature_to_batch_size_map=None,
+         world_size=1,
+         rank=0,
+         drop_last=True,
+     ):
+         """
+         Args:
+             dataset: Dataset to sample from
+             pool_sizes: List of integers representing the size of each feature pool
+             scaling_feature_idx: Index of the feature to use for determining batch size (0-based index into pool_sizes)
+             feature_to_batch_size_map: Optional function or dict that maps feature values directly to batch sizes.
+                 For example, if the feature represents the number of views, this maps the number of views
+                 to an appropriate batch size that can fit in GPU memory.
+                 If None, uses a default batch size of 1 for all feature values.
+             world_size: Number of distributed processes
+             rank: Rank of the current process
+             drop_last: Whether to drop the last incomplete batch
+         """
+         self.pool_sizes = pool_sizes if isinstance(pool_sizes, list) else [pool_sizes]
+         self.scaling_feature_idx = scaling_feature_idx
+
+         # Ensure scaling_feature_idx is valid
+         if scaling_feature_idx < 0 or scaling_feature_idx >= len(self.pool_sizes):
+             raise ValueError(
+                 f"scaling_feature_idx must be between 0 and {len(self.pool_sizes) - 1}"
+             )
+
+         # Set up the mapping from feature values to batch sizes
+         self.feature_to_batch_size_map = feature_to_batch_size_map
+         if self.feature_to_batch_size_map is None:
+             # Default: batch size of 1 for all feature values
+             self.feature_to_batch_size_map = {
+                 i: 1 for i in range(self.pool_sizes[scaling_feature_idx])
+             }
+
+         self.len_dataset = N = len(dataset)
+
+         # We don't know the exact batch size yet, so we use a large number for total_size
+         # This will be adjusted during iteration
+         self.total_size = N
+
+         # Distributed sampler
+         self.world_size = world_size
+         self.rank = rank
+         self.epoch = None
+         self.drop_last = drop_last
+
+     def __len__(self):
+         """
+         Get the approximate length of the sampler.
+
+         Since the batch size varies, this is an estimate based on the largest batch size
+         in the mapping, which provides a lower bound on the number of batches.
+
+         Returns:
+             int: The estimated minimum number of samples in the sampler for the current process
+         """
+         # Find the largest batch size in the mapping
+         if callable(self.feature_to_batch_size_map):
+             # If it's a function, sample some values to find the maximum
+             batch_sizes = [
+                 self.feature_to_batch_size_map(i)
+                 for i in range(self.pool_sizes[self.scaling_feature_idx])
+             ]
+             max_batch_size = max(batch_sizes)
+         else:
+             # If it's a dict or similar, find the maximum directly
+             max_batch_size = max(self.feature_to_batch_size_map.values())
+
+         # Ensure a minimum batch size of 1
+         max_batch_size = max(1, max_batch_size)
+
+         # Estimate the total batches using the largest batch size
+         # This gives a lower bound on the number of batches
+         total_batches = self.total_size // max_batch_size
+         if not self.drop_last and self.total_size % max_batch_size > 0:
+             total_batches += 1
+
+         # Distribute among processes
+         return total_batches // self.world_size
+
+     def set_epoch(self, epoch):
+         """
+         Set the epoch for this sampler.
+
+         This should be called before each epoch to ensure proper shuffling of the data.
+
+         Args:
+             epoch (int): The current epoch number
+         """
+         self.epoch = epoch
+
+     def __iter__(self):
+         """
+         Iterator over the indices with dynamic batch sizes.
+
+         This method generates random indices for each batch, ensuring that all samples
+         within a batch have the same feature indices for multiple features.
+         The batch size is determined directly from the feature_to_batch_size_map.
+
+         The iterator enforces the length returned by __len__() by stopping after
+         exactly that many batches have been yielded for this process.
+
+         Yields:
+             list of tuples: A batch of tuples, each containing (sample_idx, feat_idx_1, feat_idx_2, ...)
+         """
+         # Prepare RNG
+         if self.epoch is None:
+             assert self.world_size == 1 and self.rank == 0, (
+                 "use set_epoch() if distributed mode is used"
+             )
+             seed = int(torch.empty((), dtype=torch.int64).random_().item())
+         else:
+             seed = self.epoch + 777
+         rng = np.random.default_rng(seed=seed)
+
+         # Random indices for the entire dataset
+         sample_idxs = np.arange(self.total_size)
+         rng.shuffle(sample_idxs)
+
+         # Get the target number of batches for this process (enforce strict length)
+         target_batches_for_process = len(self)
+         batches_yielded_for_process = 0
+
+         # Process indices in batches with dynamic sizing
+         idx = 0
+         batch_idx = 0  # Track the batch index for even distribution
+         while idx < len(sample_idxs) and (
+             batches_yielded_for_process < target_batches_for_process
+         ):
+             # Randomly select feature indices for this batch
+             feat_idxs = [rng.integers(pool_size) for pool_size in self.pool_sizes]
+
+             # Get the scaling feature value
+             scaling_feat = feat_idxs[self.scaling_feature_idx]
+
+             # Get the batch size directly from the mapping
+             if callable(self.feature_to_batch_size_map):
+                 batch_size = self.feature_to_batch_size_map(scaling_feat)
+             else:
+                 batch_size = self.feature_to_batch_size_map.get(scaling_feat, 1)
+
+             # Ensure a minimum batch size of 1
+             batch_size = max(1, batch_size)
+
+             # Ensure we don't go beyond the available samples
+             remaining = len(sample_idxs) - idx
+             if remaining < batch_size:
+                 if self.drop_last:
+                     break
+                 batch_size = remaining
+
+             # Create a batch with consistent feature indices
+             batch = []
+             for i in range(batch_size):
+                 if idx + i < len(sample_idxs):
+                     sample_idx = sample_idxs[idx + i]
+                     batch.append(tuple([sample_idx] + feat_idxs))
+
+             # Distribute batches among processes in round-robin fashion
+             if len(batch) > 0 and (batch_idx % self.world_size == self.rank):
+                 yield batch
+                 batches_yielded_for_process += 1
+
+             batch_idx += 1  # Increment the batch index
+             idx += batch_size
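To see the dynamic batching in isolation, here is a small self-contained run of the sampler; the pool sizes and the views-to-batch-size map below are invented for illustration (the idea being that fewer views per sample allows a larger batch within the same GPU memory budget):

    from mapanything.datasets.base.batched_sampler import (
        DynamicBatchedMultiFeatureRandomSampler,
    )

    dummy_dataset = range(100)  # only len() is used by the sampler

    sampler = DynamicBatchedMultiFeatureRandomSampler(
        dummy_dataset,
        pool_sizes=[4, 3],      # feature 0: number-of-views pool, feature 1: aspect-ratio pool
        scaling_feature_idx=0,  # the batch size is driven by the number-of-views feature
        feature_to_batch_size_map={0: 16, 1: 8, 2: 5, 3: 4},
    )
    sampler.set_epoch(0)

    for batch in sampler:
        # All tuples in a batch share the same (feat_idx_0, feat_idx_1)
        print(len(batch), batch[:2])
        break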
mapanything/datasets/base/easy_dataset.py ADDED
@@ -0,0 +1,478 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ Base dataset class that enables easy resizing and combining
+
+ References: DUSt3R
+ """
+
+ import numpy as np
+
+ from mapanything.datasets.base.batched_sampler import (
+     BatchedMultiFeatureRandomSampler,
+     DynamicBatchedMultiFeatureRandomSampler,
+ )
+
+
+ class EasyDataset:
+     """
+     Dataset that can be easily resized and combined.
+
+     Examples:
+     ---------
+     2 * dataset ==> Duplicate each element 2x
+
+     10 @ dataset ==> Set the size to 10 (random sampling, duplicates if necessary)
+
+     Dataset1 + Dataset2 ==> Concatenate datasets
+     """
+
+     def __add__(self, other):
+         """
+         Concatenate this dataset with another dataset.
+
+         Args:
+             other (EasyDataset): Another dataset to concatenate with this one
+
+         Returns:
+             CatDataset: A new dataset that is the concatenation of this dataset and the other
+         """
+         return CatDataset([self, other])
+
+     def __rmul__(self, factor):
+         """
+         Multiply the dataset by a factor, duplicating each element.
+
+         Args:
+             factor (int): Number of times to duplicate each element
+
+         Returns:
+             MulDataset: A new dataset with each element duplicated 'factor' times
+         """
+         return MulDataset(factor, self)
+
+     def __rmatmul__(self, factor):
+         """
+         Resize the dataset to a specific size using random sampling.
+
+         Args:
+             factor (int): The new size of the dataset
+
+         Returns:
+             ResizedDataset: A new dataset with the specified size
+         """
+         return ResizedDataset(factor, self)
+
+     def set_epoch(self, epoch):
+         """
+         Set the current epoch for all constituent datasets.
+
+         Args:
+             epoch (int): The current epoch number
+         """
+         pass  # nothing to do by default
+
+     def make_sampler(
+         self,
+         batch_size=None,
+         shuffle=True,
+         world_size=1,
+         rank=0,
+         drop_last=True,
+         max_num_of_images_per_gpu=None,
+         use_dynamic_sampler=True,
+     ):
+         """
+         Create a sampler for this dataset.
+
+         Args:
+             batch_size (int, optional): Number of samples per batch (used for the non-dynamic sampler). Defaults to None.
+             shuffle (bool, optional): Whether to shuffle the dataset. Defaults to True.
+             world_size (int, optional): Number of distributed processes. Defaults to 1.
+             rank (int, optional): Rank of the current process. Defaults to 0.
+             drop_last (bool, optional): Whether to drop the last incomplete batch. Defaults to True.
+             max_num_of_images_per_gpu (int, optional): Maximum number of images per GPU for dynamic batching. Defaults to None.
+             use_dynamic_sampler (bool, optional): Whether to use the dynamic sampler. Defaults to True.
+
+         Returns:
+             DynamicBatchedMultiFeatureRandomSampler or BatchedMultiFeatureRandomSampler: A sampler for this dataset
+
+         Raises:
+             NotImplementedError: If shuffle is False
+             ValueError: If num_views has an invalid type or required parameters are missing
+         """
+         if not shuffle:
+             raise NotImplementedError()  # shuffle=False is not supported yet
+
+         if isinstance(self.num_views, int):
+             num_of_aspect_ratios = len(self._resolutions)
+             feature_pool_sizes = [num_of_aspect_ratios]
+             scaling_feature_idx = 0  # Use aspect ratio as scaling feature
+         elif isinstance(self.num_views, list):
+             num_of_aspect_ratios = len(self._resolutions)
+             num_of_num_views = len(self.num_views)
+             feature_pool_sizes = [num_of_aspect_ratios, num_of_num_views]
+             scaling_feature_idx = 1  # Use num_views as scaling feature
+         else:
+             raise ValueError(
+                 f"Bad type for {self.num_views=}, should be int or list of ints"
+             )
+
+         if use_dynamic_sampler:
+             if max_num_of_images_per_gpu is None:
+                 raise ValueError(
+                     "max_num_of_images_per_gpu must be provided when using dynamic sampler"
+                 )
+
+             # Create feature-to-batch-size mapping
+             if isinstance(self.num_views, list):
+                 # Map each num_views index to a batch size: max(1, max_num_of_images_per_gpu // num_views)
+                 feature_to_batch_size_map = {}
+                 for num_views_idx, num_views in enumerate(self.num_views):
+                     batch_size_for_multi_view_sets = max(
+                         1, max_num_of_images_per_gpu // num_views
+                     )
+                     feature_to_batch_size_map[num_views_idx] = (
+                         batch_size_for_multi_view_sets
+                     )
+             else:
+                 # For fixed num_views, use a simple single-entry mapping
+                 feature_to_batch_size_map = {
+                     0: max(1, max_num_of_images_per_gpu // self.num_views)
+                 }
+
+             return DynamicBatchedMultiFeatureRandomSampler(
+                 self,
+                 pool_sizes=feature_pool_sizes,
+                 scaling_feature_idx=scaling_feature_idx,
+                 feature_to_batch_size_map=feature_to_batch_size_map,
+                 world_size=world_size,
+                 rank=rank,
+                 drop_last=drop_last,
+             )
+         else:
+             if batch_size is None:
+                 raise ValueError(
+                     "batch_size must be provided when not using dynamic sampler"
+                 )
+
+             return BatchedMultiFeatureRandomSampler(
+                 self,
+                 batch_size,
+                 feature_pool_sizes,
+                 world_size=world_size,
+                 rank=rank,
+                 drop_last=drop_last,
+             )
+
+
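For concreteness, a small standalone sketch (the numbers are hypothetical, not from the repo) of what make_sampler derives when num_views is a list: one feature pool per aspect-ratio bucket, one per view count, and a batch-size map that shrinks as the view count grows.

# Hypothetical configuration: 3 aspect-ratio buckets, variable view counts
resolutions = [(518, 518), (518, 392), (518, 294)]
num_views = [2, 4, 8]
max_num_of_images_per_gpu = 16

pool_sizes = [len(resolutions), len(num_views)]  # [3, 3]
scaling_feature_idx = 1  # the num_views feature drives the batch size

feature_to_batch_size_map = {
    idx: max(1, max_num_of_images_per_gpu // nv) for idx, nv in enumerate(num_views)
}
print(feature_to_batch_size_map)  # {0: 8, 1: 4, 2: 2}
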
+ class MulDataset(EasyDataset):
+     """Artificially augmenting the size of a dataset."""
+
+     multiplicator: int
+
+     def __init__(self, multiplicator, dataset):
+         """
+         Initialize a dataset that artificially augments the size of another dataset.
+
+         Args:
+             multiplicator (int): Factor by which to multiply the dataset size
+             dataset (EasyDataset): The dataset to augment
+         """
+         assert isinstance(multiplicator, int) and multiplicator > 0
+         self.multiplicator = multiplicator
+         self.dataset = dataset
+
+     def __len__(self):
+         """
+         Get the length of the dataset.
+
+         Returns:
+             int: The number of samples in the dataset
+         """
+         return self.multiplicator * len(self.dataset)
+
+     def __repr__(self):
+         """
+         Get a string representation of the dataset.
+
+         Returns:
+             str: String representation showing the multiplication factor and the original dataset
+         """
+         return f"{self.multiplicator}*{repr(self.dataset)}"
+
+     def __getitem__(self, idx):
+         """
+         Get an item from the dataset.
+
+         Args:
+             idx: Index or tuple of indices to retrieve
+
+         Returns:
+             The item at the specified index from the original dataset
+         """
+         if isinstance(idx, tuple):
+             other = idx[1:]
+             idx = idx[0]
+             new_idx = (idx // self.multiplicator, *other)
+             return self.dataset[new_idx]
+         else:
+             return self.dataset[idx // self.multiplicator]
+
+     @property
+     def _resolutions(self):
+         """
+         Get the resolutions of the dataset.
+
+         Returns:
+             The resolutions from the original dataset
+         """
+         return self.dataset._resolutions
+
+     @property
+     def num_views(self):
+         """
+         Get the number of views used for the dataset.
+
+         Returns:
+             int or list: The number of views parameter from the original dataset
+         """
+         return self.dataset.num_views
+
+
+ class ResizedDataset(EasyDataset):
+     """Artificially changing the size of a dataset."""
+
+     new_size: int
+
+     def __init__(self, new_size, dataset):
+         """
+         Initialize a dataset with an artificially changed size.
+
+         Args:
+             new_size (int): The new size of the dataset
+             dataset (EasyDataset): The original dataset
+         """
+         assert isinstance(new_size, int) and new_size > 0
+         self.new_size = new_size
+         self.dataset = dataset
+
+     def __len__(self):
+         """
+         Get the length of the dataset.
+
+         Returns:
+             int: The new size of the dataset
+         """
+         return self.new_size
+
+     def __repr__(self):
+         """
+         Get a string representation of the dataset.
+
+         Returns:
+             str: String representation showing the new size and the original dataset
+         """
+         size_str = str(self.new_size)
+         for i in range((len(size_str) - 1) // 3):
+             sep = -4 * i - 3
+             size_str = size_str[:sep] + "_" + size_str[sep:]
+         return f"{size_str} @ {repr(self.dataset)}"
+
+     def set_epoch(self, epoch):
+         """
+         Set the current epoch and generate a new random mapping of indices.
+
+         This method must be called before using __getitem__.
+
+         Args:
+             epoch (int): The current epoch number
+         """
+         # This random shuffle only depends on the epoch
+         rng = np.random.default_rng(seed=epoch + 777)
+
+         # Shuffle all indices
+         perm = rng.permutation(len(self.dataset))
+
+         # Calculate how many repetitions we need
+         num_repetitions = 1 + (len(self) - 1) // len(self.dataset)
+
+         # Repeat the shuffled permutation until the target size is covered
+         shuffled_idxs = np.concatenate([perm] * num_repetitions)
+         self._idxs_mapping = shuffled_idxs[: self.new_size]
+
+         # Generate the seed offset for each repetition
+         # This is needed to ensure we see unique samples when we repeat a scene
+         seed_offset_per_repetition = [
+             np.full(len(self.dataset), i) for i in range(num_repetitions)
+         ]
+         seed_offset_idxs = np.concatenate(seed_offset_per_repetition)
+         self._idxs_seed_offset = seed_offset_idxs[: self.new_size]
+
+         assert len(self._idxs_mapping) == self.new_size
+         assert len(self._idxs_seed_offset) == self.new_size
+
+     def __getitem__(self, idx):
+         """
+         Get an item from the dataset.
+
+         Args:
+             idx: Index or tuple of indices to retrieve
+
+         Returns:
+             The item at the mapped index from the original dataset
+
+         Raises:
+             AssertionError: If set_epoch has not been called
+         """
+         assert hasattr(self, "_idxs_mapping"), (
+             "You need to call dataset.set_epoch() to use ResizedDataset.__getitem__()"
+         )
+         if isinstance(idx, tuple):
+             other = idx[1:]
+             idx = idx[0]
+             self.dataset._set_seed_offset(self._idxs_seed_offset[idx])
+             new_idx = (self._idxs_mapping[idx], *other)
+             return self.dataset[new_idx]
+         else:
+             self.dataset._set_seed_offset(self._idxs_seed_offset[idx])
+             return self.dataset[self._idxs_mapping[idx]]
+
+     @property
+     def _resolutions(self):
+         """
+         Get the resolutions of the dataset.
+
+         Returns:
+             The resolutions from the original dataset
+         """
+         return self.dataset._resolutions
+
+     @property
+     def num_views(self):
+         """
+         Get the number of views used for the dataset.
+
+         Returns:
+             int or list: The number of views parameter from the original dataset
+         """
+         return self.dataset.num_views
+
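To make the epoch-seeded mapping above concrete, a standalone sketch (the sizes are made up for illustration) of what set_epoch computes for a dataset of 4 samples resized to 10:

import numpy as np

dataset_len, new_size, epoch = 4, 10, 0
rng = np.random.default_rng(seed=epoch + 777)
perm = rng.permutation(dataset_len)

# Repeat the epoch-specific permutation until it covers the requested size
num_repetitions = 1 + (new_size - 1) // dataset_len  # 3
idxs_mapping = np.concatenate([perm] * num_repetitions)[:new_size]

# Each repetition gets a distinct seed offset, so a repeated scene is
# sampled differently on each revisit
idxs_seed_offset = np.concatenate(
    [np.full(dataset_len, i) for i in range(num_repetitions)]
)[:new_size]
print(idxs_mapping, idxs_seed_offset)  # offsets: [0 0 0 0 1 1 1 1 2 2]
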
+ class CatDataset(EasyDataset):
+     """Concatenation of several datasets"""
+
+     def __init__(self, datasets):
+         """
+         Initialize a dataset that is a concatenation of several datasets.
+
+         Args:
+             datasets (list): List of EasyDataset instances to concatenate
+         """
+         for dataset in datasets:
+             assert isinstance(dataset, EasyDataset)
+         self.datasets = datasets
+         self._cum_sizes = np.cumsum([len(dataset) for dataset in datasets])
+
+     def __len__(self):
+         """
+         Get the length of the concatenated dataset.
+
+         Returns:
+             int: Total number of samples across all datasets
+         """
+         return self._cum_sizes[-1]
+
+     def __repr__(self):
+         """
+         Get a string representation of the concatenated dataset.
+
+         Returns:
+             str: String representation showing all concatenated datasets joined by '+'
+         """
+         # Remove uselessly long transform
+         return " + ".join(
+             repr(dataset).replace(
+                 ",transform=Compose( ToTensor() Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))",
+                 "",
+             )
+             for dataset in self.datasets
+         )
+
+     def set_epoch(self, epoch):
+         """
+         Set the current epoch for all constituent datasets.
+
+         Args:
+             epoch (int): The current epoch number
+         """
+         for dataset in self.datasets:
+             dataset.set_epoch(epoch)
+
+     def __getitem__(self, idx):
+         """
+         Get an item from the concatenated dataset.
+
+         Args:
+             idx: Index or tuple of indices to retrieve
+
+         Returns:
+             The item at the specified index from the appropriate constituent dataset
+
+         Raises:
+             IndexError: If the index is out of range
+         """
+         other = None
+         if isinstance(idx, tuple):
+             other = idx[1:]
+             idx = idx[0]
+
+         if not (0 <= idx < len(self)):
+             raise IndexError()
+
+         db_idx = np.searchsorted(self._cum_sizes, idx, "right")
+         dataset = self.datasets[db_idx]
+         new_idx = idx - (self._cum_sizes[db_idx - 1] if db_idx > 0 else 0)
+
+         if other is not None:
+             new_idx = (new_idx, *other)
+         return dataset[new_idx]
+
+     @property
+     def _resolutions(self):
+         """
+         Get the resolutions of the dataset.
+
+         Returns:
+             The resolutions from the first dataset (all datasets must have the same resolutions)
+
+         Raises:
+             AssertionError: If datasets have different resolutions
+         """
+         resolutions = self.datasets[0]._resolutions
+         for dataset in self.datasets[1:]:
+             assert tuple(dataset._resolutions) == tuple(resolutions), (
+                 "All datasets must have the same resolutions"
+             )
+         return resolutions
+
+     @property
+     def num_views(self):
+         """
+         Get the number of views used for the dataset.
+
+         Returns:
+             int or list: The number of views parameter from the first dataset
+
+         Raises:
+             AssertionError: If datasets have different num_views
+         """
+         num_views = self.datasets[0].num_views
+         for dataset in self.datasets[1:]:
+             assert dataset.num_views == num_views, (
+                 "All datasets must have the same num_views and variable_num_views parameters"
+             )
+         return num_views
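
Putting the three wrappers together, a hedged usage sketch (ToyDataset is a hypothetical stand-in; only the operator semantics come from the classes above):

# A toy EasyDataset subclass just to exercise the operators; a real dataset
# would also provide _resolutions, num_views, _set_seed_offset, etc.
class ToyDataset(EasyDataset):
    def __init__(self, n):
        self.n = n
    def __len__(self):
        return self.n
    def __repr__(self):
        return f"ToyDataset({self.n})"
    def __getitem__(self, idx):
        return ("toy", self.n, idx)

combined = 2 * ToyDataset(5) + 100 @ ToyDataset(3)
print(len(combined))   # 2*5 + 100 = 110
print(repr(combined))  # 2*ToyDataset(5) + 100 @ ToyDataset(3)
print(combined[7])     # routed to the MulDataset side: ('toy', 5, 3)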
mapanything/datasets/utils/__init__.py ADDED
File without changes
mapanything/datasets/utils/data_splits.py ADDED
@@ -0,0 +1,1734 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ Module containing dataset split information
+ """
+
+
+ class BlendedMVSSplits:
+     """
+     This class contains the information about the BlendedMVS dataset splits.
+     """
+
+     def __init__(self):
+         """
+         The splits are generated using the following logic:
+             # Get all seqls and seqhs using self.blendedmvs_info.all_sequences
+             all_sequences = self.blendedmvs_info.all_sequences
+             all_seqls = [int(seq[8:], 16) for seq in all_sequences]
+             all_seqhs = [int(seq[:8], 16) for seq in all_sequences]
+             # Split the seqls (& corresponding seqhs) using the DUSt3R train/val split logic
+             if split is None:
+                 selection = slice(None)
+             elif split in ["train", "overfit"]:
+                 # select 90% of all scenes
+                 selection = [(seql % 10) > 0 for seql in all_seqls]
+             elif split == "val":
+                 # select 10% of all scenes
+                 selection = [(seql % 10) == 0 for seql in all_seqls]
+             else:
+                 raise ValueError(f"Unknown split {split}, must be None, train, val or overfit")
+             # Filter sequences based on the selection
+             selected_seqls = [seql for seql, sel in zip(all_seqls, selection) if sel]
+             selected_seqhs = [seqh for seqh, sel in zip(all_seqhs, selection) if sel]
+             # Put them back into sequence names f"{seqh:08x}{seql:016x}"
+             sequence_names = [f"{seqh:08x}{seql:016x}" for seqh, seql in zip(selected_seqhs, selected_seqls)]
+             # Remove invalid sequence names which don't exist in self.blendedmvs_info.sequences
+             valid_sequences = set(self.blendedmvs_info.sequences)
+             valid_sequence_names = [name for name in sequence_names if name in valid_sequences]
+         """
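# Illustrative aside (not part of this file): the hex-based split rule from
# the docstring above, applied to two sequence names taken from the list below.
for seq in ["5a2a95f032a1c655cfe3de62", "5a3f4aba5889373fbbc5d3b5"]:
    seql = int(seq[8:], 16)  # low 16 hex digits of the sequence name
    print(seq, "->", "val" if seql % 10 == 0 else "train")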
+         # All the 502 sequences in the dataset (totals to 115k images)
+         self.all_scenes = [
45
+ "000000000000000000000000",
46
+ "00000000000000000000000a",
47
+ "00000000000000000000000b",
48
+ "00000000000000000000000c",
49
+ "00000000000000000000000d",
50
+ "00000000000000000000000e",
51
+ "00000000000000000000000f",
52
+ "000000000000000000000001",
53
+ "00000000000000000000001a",
54
+ "00000000000000000000001b",
55
+ "00000000000000000000001d",
56
+ "000000000000000000000002",
57
+ "000000000000000000000003",
58
+ "000000000000000000000004",
59
+ "000000000000000000000005",
60
+ "5a2a95f032a1c655cfe3de62",
61
+ "5a2af22b32a1c655cfe46013",
62
+ "5a2ba6de32a1c655cfe51b79",
63
+ "5a3b9731e24cd76dad1a5f1b",
64
+ "5a3ca9cb270f0e3f14d0eddb",
65
+ "5a3cb4e4270f0e3f14d12f43",
66
+ "5a03e732454a8a7ec672776c",
67
+ "5a3f4aba5889373fbbc5d3b5",
68
+ "5a4a38dad38c8a075495b5d2",
69
+ "5a5a1e48d62c7a12d5d00e47",
70
+ "5a6b1c418d100c2f8fdc4411",
71
+ "5a6feeb54a7fbc3f874f9db7",
72
+ "5a7cb1d6fe5c0d6fb53e64fb",
73
+ "5a7d3db14989e929563eb153",
74
+ "5a8aa0fab18050187cbe060e",
75
+ "5a9e5df65baeef72b4a021cd",
76
+ "5a48ba95c7dab83a7d7b44ed",
77
+ "5a48c4e9c7dab83a7d7b5cc7",
78
+ "5a48d4b2c7dab83a7d7b9851",
79
+ "5a69c47d0d5d0a7f3b2e9752",
80
+ "5a77b46b318efe6c6736e68a",
81
+ "5a355c271b63f53d5970f362",
82
+ "5a489fb1c7dab83a7d7b1070",
83
+ "5a533e8034d7582116e34209",
84
+ "5a562fc7425d0f5186314725",
85
+ "5a572fd9fc597b0478a81d14",
86
+ "5a588a8193ac3d233f77fbca",
87
+ "5a618c72784780334bc1972d",
88
+ "5a752d42acc41e2423f17674",
89
+ "5a969eea91dfc339a9a3ad2c",
90
+ "5a8315f624b8e938486e0bd8",
91
+ "5a57542f333d180827dfc132",
92
+ "5a0271884e62597cdee0d0eb",
93
+ "5a6400933d809f1d8200af15",
94
+ "5a6464143d809f1d8208c43c",
95
+ "5a563183425d0f5186314855",
96
+ "5aa0f9d7a9efce63548c69a1",
97
+ "5aa0f478a9efce63548c1cb4",
98
+ "5aa7db90bfdd572271e95246",
99
+ "5aa235f64a17b335eeaf9609",
100
+ "5aa515e613d42d091d29d300",
101
+ "5aa1196ea9efce63548ed649",
102
+ "5aaadd4cbc13235570d178a7",
103
+ "5ab6af12ac4291329b1072ab",
104
+ "5ab7e00aac4291329b15864d",
105
+ "5ab8b8e029f5351f7f2ccf59",
106
+ "5ab74bf2ac4291329b11e879",
107
+ "5ab85f1dac4291329b17cb50",
108
+ "5ab8713ba3799a1d138bd69a",
109
+ "5abc2506b53b042ead637d86",
110
+ "5acc7459a7853c4b5ebbef59",
111
+ "5acf8ca0f3d8a750097e4b15",
112
+ "5adc6bd52430a05ecb2ffb85",
113
+ "5ae2e9c5fe405c5076abc6b2",
114
+ "5af02e904c8216544b4ab5a2",
115
+ "5af28cea59bc705737003253",
116
+ "5af545d0559359053d25dcf5",
117
+ "5afacb69ab00705d0cefdd5b",
118
+ "5b2c67b5e0878c381608b8d8",
119
+ "5b3b2b9e8d46a939f933fdc0",
120
+ "5b3b353d8d46a939f93524b9",
121
+ "5b6e716d67b396324c2d77cb",
122
+ "5b6eff8b67b396324c5b2672",
123
+ "5b7a3890fc8fcf6781e2593a",
124
+ "5b21e18c58e2823a67a10dd8",
125
+ "5b60fa0c764f146feef84df0",
126
+ "5b69cc0cb44b61786eb959bf",
127
+ "5b78e57afc8fcf6781d0c3ba",
128
+ "5b192eb2170cf166458ff886",
129
+ "5b558a928bbfb62204e77ba2",
130
+ "5b864d850d072a699b32f4ae",
131
+ "5b908d3dc6ab78485f3d24a9",
132
+ "5b950c71608de421b1e7318f",
133
+ "5b4933abf2b5f44e95de482a",
134
+ "5b08286b2775267d5b0634ba",
135
+ "5b37189a35304b6f75e7583e",
136
+ "5b271079e0878c3816dacca4",
137
+ "5b22269758e2823a67a3bd03",
138
+ "5b62647143840965efc0dbde",
139
+ "5ba19a8a360c7c30c1c169df",
140
+ "5ba75d79d76ffa2c86cf2f05",
141
+ "5bb7a08aea1cfa39f1a947ab",
142
+ "5bb8a49aea1cfa39f1aa7f75",
143
+ "5bbb6eb2ea1cfa39f1af7e0c",
144
+ "5bc5f0e896b66a2cd8f9bd36",
145
+ "5bccd6beca24970bce448134",
146
+ "5bce7ac9ca24970bce4934b6",
147
+ "5bcf979a6d5f586b95c258cd",
148
+ "5bd43b4ba6b28b1ee86b92dd",
149
+ "5be3a5fb8cfdd56947f6b67c",
150
+ "5be3ae47f44e235bdbbc9771",
151
+ "5be4ab93870d330ff2dce134",
152
+ "5be47bf9b18881428d8fbc1d",
153
+ "5be883a4f98cee15019d5b83",
154
+ "5bea87f4abd34c35e1860ab5",
155
+ "5beb6e66abd34c35e18e66b9",
156
+ "5bf3a82cd439231948877aed",
157
+ "5bf7d63575c26f32dbf7413b",
158
+ "5bf17c0fd439231948355385",
159
+ "5bf26cbbd43923194854b270",
160
+ "5bf03590d4392319481971dc",
161
+ "5bf18642c50e6f7f8bdbd492",
162
+ "5bf21799d43923194842c001",
163
+ "5bfc9d5aec61ca1dd69132a2",
164
+ "5bfd0f32ec61ca1dd69dc77b",
165
+ "5bfe5ae0fe0ea555e6a969ca",
166
+ "5bff3c5cfe0ea555e6bcbf3a",
167
+ "5c0d13b795da9479e12e2ee9",
168
+ "5c1af2e2bee9a723c963d019",
169
+ "5c1b1500bee9a723c96c3e78",
170
+ "5c1dbf200843bc542d8ef8c4",
171
+ "5c1f33f1d33e1f2e4aa6dda4",
172
+ "5c2b3ed5e611832e8aed46bf",
173
+ "5c20ca3a0843bc542d94e3e2",
174
+ "5c062d84a96e33018ff6f0a6",
175
+ "5c189f2326173c3a09ed7ef3",
176
+ "5c1892f726173c3a09ea9aeb",
177
+ "5c34300a73a8df509add216d",
178
+ "5c34529873a8df509ae57b58",
179
+ "000000000000000000000006",
180
+ "000000000000000000000007",
181
+ "000000000000000000000008",
182
+ "000000000000000000000009",
183
+ "000000000000000000000010",
184
+ "000000000000000000000011",
185
+ "000000000000000000000012",
186
+ "000000000000000000000015",
187
+ "000000000000000000000016",
188
+ "000000000000000000000017",
189
+ "000000000000000000000018",
190
+ "000000000000000000000019",
191
+ "56d73ba74bd29b8c35abade2",
192
+ "56f34064e296120e10484dc4",
193
+ "57a4a7bb6b9272286e26dc18",
194
+ "57f8d9bbe73f6760f10e916a",
195
+ "58a0a2f33d0b4542479a11b1",
196
+ "58a0dd1a3d0b4542479a28f3",
197
+ "58a1a7914a4d262a170b1101",
198
+ "58a1bc804a4d262a170b2f01",
199
+ "58a1d9d14a4d262a170b58fe",
200
+ "58a01dea38486e3c98475871",
201
+ "58a1f5d74a4d262a170b65fc",
202
+ "58a2a09e156b87103d3d668c",
203
+ "58a2d9c3156b87103d3da90f",
204
+ "58a3ccb0156b87103d3e4332",
205
+ "58a3f2f8156b87103d3e5838",
206
+ "58a3f6c0156b87103d3e5971",
207
+ "58a3fc95156b87103d3e5d9b",
208
+ "58a07ce53d0b45424799fdde",
209
+ "58a07f233d0b45424799ffe7",
210
+ "58a44df2156b87103d3ee239",
211
+ "58a164f73d0b4542479a7a8e",
212
+ "58a0365e38486e3c984783eb",
213
+ "58a439cf156b87103d3ec885",
214
+ "58a464aa156b87103d3eec04",
215
+ "58a4452f156b87103d3ed55b",
216
+ "58a160983d0b4542479a7347",
217
+ "58a186444a4d262a170ae3ae",
218
+ "58a285424a4d262a170baf3e",
219
+ "58a41819156b87103d3e92a5",
220
+ "58a44463156b87103d3ed45e",
221
+ "58a47552156b87103d3f00a4",
222
+ "58c4bb4f4a69c55606122be4",
223
+ "58c6451e4a69c556061894f1",
224
+ "58ca7014affdfd07c70a95ce",
225
+ "58cf4771d0f5fb221defe6da",
226
+ "58d36897f387231e6c929903",
227
+ "58eaf1513353456af3a1682a",
228
+ "58f7f7299f5b5647873cb110",
229
+ "58f73e7c9f5b56478738929f",
230
+ "59a8f851597729752c31e7e0",
231
+ "59a452bf9b460239aa5d1c72",
232
+ "59a9619a825418241fb88191",
233
+ "59acd2f4b891807f439c8992",
234
+ "59bf97fe7e7b31545da34439",
235
+ "59c1c3e2fd6e3d4ead9f1013",
236
+ "59d2657f82ca7774b1ec081d",
237
+ "59da1fb88a126011d0394ae9",
238
+ "59e75a2ca9e91f2c5526005d",
239
+ "59e864b2a9e91f2c5529325f",
240
+ "59ecfd02e225f6492d20fcc9",
241
+ "59f37f74b45be2233001ba18",
242
+ "59f70ab1e5c5d366af29bf3e",
243
+ "59f87d0bfa6280566fb38c9a",
244
+ "59f363a8b45be22330016cad",
245
+ "564a27b26d07883f460d8ab0",
246
+ "565fb1dead14d4154dae2b94",
247
+ "567a0fb0a825d2fb79ac9a20",
248
+ "569b92eb826bcba945ca002b",
249
+ "576fefa017ce5a16397e87fd",
250
+ "584a7333fe3cb463906c9fe6",
251
+ "584aa8e9fe3cb463906cc7d0",
252
+ "584ad76bfe3cb463906ce6dc",
253
+ "584af003fe3cb463906d0e9b",
254
+ "584b9a747072670e72bfc49d",
255
+ "584b671f7072670e72bfaaf8",
256
+ "584b81747072670e72bfbbfd",
257
+ "584ba35f7072670e72bfca4d",
258
+ "584ba5977072670e72bfcc2d",
259
+ "584bc53c7072670e72bfe85f",
260
+ "584bc3997072670e72bfe58d",
261
+ "584bc4407072670e72bfe665",
262
+ "584bd5587072670e72bffe39",
263
+ "584bdadf7072670e72c0005c",
264
+ "584be5ed7072670e72c007b3",
265
+ "584c9ad27072670e72c060c5",
266
+ "584c9cc67072670e72c063a1",
267
+ "584c58b77072670e72c03990",
268
+ "584cea557072670e72c07fb4",
269
+ "584d19d47072670e72c0c6c0",
270
+ "584dfe467072670e72c1665a",
271
+ "584e875c7072670e72c1ec94",
272
+ "584e05667072670e72c17167",
273
+ "584f94e87072670e72c2d3f7",
274
+ "584fdffd7072670e72c32dc7",
275
+ "584fe07f7072670e72c32e59",
276
+ "585a2a71b338a62ad50138dc",
277
+ "585a206ab338a62ad501298f",
278
+ "585a217cb338a62ad5012b38",
279
+ "585b34afb338a62ad501e836",
280
+ "585bb25fc49c8507c3ce7812",
281
+ "585bbe55c49c8507c3ce81cd",
282
+ "585d6c8a2a57cc11d4920a1e",
283
+ "585e54c72a57cc11d492f71a",
284
+ "585e34302a57cc11d492be30",
285
+ "585ee0632a57cc11d4933608",
286
+ "585f9661712e2761468dabca",
287
+ "585ffe9a712e2761468df643",
288
+ "586a37ec9d1b5e34c28184fc",
289
+ "586a515a9d1b5e34c281b431",
290
+ "586a94939d1b5e34c2823b5d",
291
+ "586abc689d1b5e34c2826360",
292
+ "586b0e219d1b5e34c2828862",
293
+ "586b3db89d1b5e34c282cd52",
294
+ "586b4c459d1b5e34c282e66d",
295
+ "586b7d7d9d1b5e34c283359e",
296
+ "586b8f149d1b5e34c283497c",
297
+ "586b8f629d1b5e34c28349d6",
298
+ "586c4c4d9d1b5e34c28391a1",
299
+ "586c5b5b9d1b5e34c2839a5b",
300
+ "586c9fdf9d1b5e34c283b657",
301
+ "586c48329d1b5e34c2838e80",
302
+ "586caab99d1b5e34c283c213",
303
+ "586cd0779d1b5e34c28403a7",
304
+ "586d6d249d1b5e34c284b80e",
305
+ "586d8a029d1b5e34c284c948",
306
+ "586d55af9d1b5e34c284a999",
307
+ "586d07869d1b5e34c2842e5b",
308
+ "586d27489d1b5e34c28453af",
309
+ "586df9849d1b5e34c28506de",
310
+ "586e279c9d1b5e34c2852180",
311
+ "587bc5ec2366dd5d06e262c1",
312
+ "587c1abf2366dd5d06e28901",
313
+ "587c03f12366dd5d06e27722",
314
+ "587c19da2366dd5d06e2877b",
315
+ "587c31b92366dd5d06e2a9dc",
316
+ "587c87d02366dd5d06e2f989",
317
+ "587c97a52366dd5d06e30a96",
318
+ "587c45192366dd5d06e2c0eb",
319
+ "587cec702366dd5d06e37862",
320
+ "587cef0a2366dd5d06e379e3",
321
+ "587db5872366dd5d06e3e0af",
322
+ "587e2b1d2366dd5d06e41af0",
323
+ "587e2ea62366dd5d06e41f2e",
324
+ "587e5cb52366dd5d06e4486e",
325
+ "587eb1822366dd5d06e45f29",
326
+ "587f365d2366dd5d06e4906e",
327
+ "588a9c5fec4d5a1c088ec350",
328
+ "588a34cfec4d5a1c088ea8d1",
329
+ "588ab5bdec4d5a1c088ed60f",
330
+ "588aff9d90414422fbe7885a",
331
+ "588b20d290414422fbe79f40",
332
+ "588c08d590414422fbe8200b",
333
+ "588c203d90414422fbe8319e",
334
+ "588c989a90414422fbe86d96",
335
+ "588ca09d90414422fbe871a1",
336
+ "588cce2190414422fbe88520",
337
+ "588cd5ef90414422fbe8875c",
338
+ "588cf0ad90414422fbe8a20f",
339
+ "588e0d8c90414422fbe8f8b2",
340
+ "588e01c490414422fbe8ee2a",
341
+ "588e35e690414422fbe90a53",
342
+ "588f017e90414422fbe9b74b",
343
+ "588f095190414422fbe9c1ee",
344
+ "589aca717dc3d323d55671c4",
345
+ "589af2c97dc3d323d55691e8",
346
+ "589b49ea7dc3d323d556d9b4",
347
+ "589b04287dc3d323d556a185",
348
+ "589bf6a57dc3d323d55743ab",
349
+ "589c3c497dc3d323d5578468",
350
+ "589c3c577dc3d323d5578480",
351
+ "589c300f7dc3d323d5577926",
352
+ "589c24527dc3d323d5577126",
353
+ "589c35457dc3d323d5577d8d",
354
+ "589ca6a6b896147a1b73aff7",
355
+ "589d1e1fb896147a1b73ee5b",
356
+ "589d5c58b896147a1b742256",
357
+ "589d95538fa2cf375df3317b",
358
+ "589df0ffb504a864ad63521a",
359
+ "589ea316b504a864ad639a2b",
360
+ "589ec97cb504a864ad63adc3",
361
+ "589f214338486e3c9846f123",
362
+ "589fdfe738486e3c984736cf",
363
+ "590c2d70336bb52a190be886",
364
+ "590f91851225725be9e25d4e",
365
+ "591a467a6109e14d4f09b776",
366
+ "591cf3033162411cf9047f37",
367
+ "591ea44850991c70dc99a207",
368
+ "599aa591d5b41f366fed0d58",
369
+ "5643df56138263b51db1b5f3",
370
+ "5644bdac138263b51db9f669",
371
+ "5692a4c2adafac1f14201821",
372
+ "5850d4f97072670e72c425d6",
373
+ "5854c405804be105852330fe",
374
+ "5855a4fc804be1058523bd75",
375
+ "5856ac15804be105852419d8",
376
+ "5856ae8b804be10585241bae",
377
+ "5856b460804be10585242059",
378
+ "5857aa5ab338a62ad5ff4dbe",
379
+ "5857acf8b338a62ad5ff5107",
380
+ "5858db6cb338a62ad500103b",
381
+ "5858dbcab338a62ad5001081",
382
+ "5859d84fb338a62ad500e5cf",
383
+ "5861d8ea712e2761468f3cb3",
384
+ "5863edf8712e27614690cce0",
385
+ "5864a935712e2761469111b4",
386
+ "5864b076712e27614691197e",
387
+ "5864da88712e276146913d8b",
388
+ "5865f4a8712e27614691e39b",
389
+ "5867a434833dfe3f7b88edaf",
390
+ "5868cd15833dfe3f7b89bfa3",
391
+ "5880b3692366dd5d06e5d534",
392
+ "5880e3422366dd5d06e5ff8e",
393
+ "5880f0ef2366dd5d06e6166e",
394
+ "5881d2bfb6844814c136a119",
395
+ "5881f11d8ce2c2754d0714c3",
396
+ "5881fee18ce2c2754d0723f8",
397
+ "5882cda2b116682b4adebd25",
398
+ "5882d58fb116682b4adec7db",
399
+ "5884c256932ba84fbed70bf5",
400
+ "5884cc13932ba84fbed71ec4",
401
+ "5885bc5296fa095e0671a7f0",
402
+ "5886d14cb791366d617a362c",
403
+ "5888becfc02346100f4b0b21",
404
+ "5888e408c02346100f4b1a29",
405
+ "5889da66ec4d5a1c088e5187",
406
+ "5889e344ec4d5a1c088e59be",
407
+ "5889e754ec4d5a1c088e60ba",
408
+ "5890c16b90414422fbeb0262",
409
+ "5891d8ae9a8c0314c5cd30ab",
410
+ "5891d0479a8c0314c5cd2abd",
411
+ "5891ecf19a8c0314c5cd490a",
412
+ "5892c0cd9a8c0314c5cdc977",
413
+ "5894ab309a8c0314c5cee57d",
414
+ "5895a6a89a8c0314c5cfca7c",
415
+ "5895b8c29a8c0314c5cfd051",
416
+ "5895d38f9a8c0314c5cfe50c",
417
+ "5895f2329a8c0314c5d00117",
418
+ "5896bb989a8c0314c5d086b6",
419
+ "5896ebf39a8c0314c5d0a8c4",
420
+ "5898b1bac9dccc22987b7f74",
421
+ "5898b6ffc9dccc22987b8a03",
422
+ "5898b31cc9dccc22987b82ec",
423
+ "5898bbaac9dccc22987b8eba",
424
+ "5899cfa6b76d7a3780a4cb64",
425
+ "5899e5dcb76d7a3780a4ecc1",
426
+ "5947b62af1b45630bd0c2a02",
427
+ "57102be2877e1421026358af",
428
+ "57153d4031bb9900425bde85",
429
+ "57177cd7fb8d93461afc4527",
430
+ "58497cdf97b73e0b090c4273",
431
+ "58500b007072670e72c35588",
432
+ "58510bf97072670e72c46ddf",
433
+ "58522bd56789802282f2ecb3",
434
+ "58524a2e0e7012308944bcf3",
435
+ "58524a080e7012308944bcbf",
436
+ "58524c1d0e7012308944bfda",
437
+ "58524f170e7012308944c200",
438
+ "58529a4e0e70123089454c6f",
439
+ "58551bdf804be1058523556d",
440
+ "58568c9a804be10585240b03",
441
+ "58574b35804be105852455fd",
442
+ "58577c60b338a62ad5ff1564",
443
+ "58592d69b338a62ad5007a74",
444
+ "58598db2b338a62ad500bc38",
445
+ "58625f42712e2761468fb44c",
446
+ "58651bcc712e2761469166dc",
447
+ "58660e79712e27614691fe3d",
448
+ "58669aad712e27614692834c",
449
+ "58669c02712e27614692851a",
450
+ "58676c36833dfe3f7b88b7f2",
451
+ "58678b2d833dfe3f7b88e244",
452
+ "58790c82ce911104a3467c88",
453
+ "58800b0b2366dd5d06e5312d",
454
+ "58805eac2366dd5d06e56460",
455
+ "58806e422366dd5d06e57bb6",
456
+ "58831d060db9bf59bf8ab98b",
457
+ "58851ebb932ba84fbed7abad",
458
+ "58871dc3b791366d617a55ff",
459
+ "58873cabb791366d617a65a7",
460
+ "58873d44b791366d617a65dd",
461
+ "58888b3dc02346100f4af665",
462
+ "58897f62c02346100f4b8ee6",
463
+ "58933bac9a8c0314c5ce3508",
464
+ "58938e6d9a8c0314c5ce726f",
465
+ "58951cb49a8c0314c5cf4d5e",
466
+ "58970fd09a8c0314c5d0e383",
467
+ "58977ef09a8c0314c5d17b26",
468
+ "59056e6760bb961de55f3501",
469
+ "59071f2e5a6dbd3af4130f98",
470
+ "59102c811225725be9e64149",
471
+ "59338e76772c3e6384afbb15",
472
+ "59350ca084b7f26bf5ce6eb8",
473
+ "59397e493a87372f2c9e882b",
474
+ "59521e0b9096412211c2aa9d",
475
+ "59817e4a1bd4b175e7038d19",
476
+ "567884f58d2828b95e3c8eba",
477
+ "585559d9804be10585238ddf",
478
+ "585834cdb338a62ad5ffab4d",
479
+ "586082d8712e2761468e2877",
480
+ "586133c2712e2761468ecfe3",
481
+ "586281d2712e2761468fcaa2",
482
+ "586316e5712e276146903c4d",
483
+ "586326ad712e276146904571",
484
+ "586375c9712e276146907429",
485
+ "586389c9712e276146908da6",
486
+ "586496fa712e2761469108e7",
487
+ "586669c6712e27614692597a",
488
+ "586913a49d1b5e34c2808b02",
489
+ "586922da9d1b5e34c2809ff3",
490
+ "588185d8dfb7a15588a114a3",
491
+ "588305ed0db9bf59bf8a8c80",
492
+ "588315c60db9bf59bf8aa928",
493
+ "588332ee0db9bf59bf8ae9c3",
494
+ "588457b8932ba84fbed69942",
495
+ "588519d5932ba84fbed7a04a",
496
+ "588824d1b791366d617adeef",
497
+ "588857f6c02346100f4ac09f",
498
+ "589145ef90414422fbeb2e08",
499
+ "589433fa9a8c0314c5ce9656",
500
+ "589765d39a8c0314c5d16b12",
501
+ "5851165f7072670e72c4860d",
502
+ "5859341ab338a62ad500848d",
503
+ "5862388b712e2761468f84aa",
504
+ "5863915b712e276146909135",
505
+ "5866445b712e27614692383e",
506
+ "5866500d712e2761469240fd",
507
+ "5867785a833dfe3f7b88c764",
508
+ "5867969c833dfe3f7b88e8bc",
509
+ "5868040c833dfe3f7b8934f7",
510
+ "5880675a2366dd5d06e570ca",
511
+ "5882372c8ce2c2754d076af0",
512
+ "5883535e932ba84fbed5ad07",
513
+ "5888358cb791366d617af69d",
514
+ "5890330d90414422fbeaa0cb",
515
+ "5897076e9a8c0314c5d0d31b",
516
+ "5940564ec2d9527ab869f7e2",
517
+ "5947719bf1b45630bd096665",
518
+ "5948194ff1b45630bd0f47e3",
519
+ "5950206a41b158666ac50506",
520
+ "5983012d1bd4b175e70c985a",
521
+ "58586810b338a62ad5ffc20c",
522
+ "58592046b338a62ad5006b33",
523
+ "58592854b338a62ad500750a",
524
+ "58596531b338a62ad500aace",
525
+ "58818685dfb7a15588a11626",
526
+ "58829563f42b1d3ee3ec835f",
527
+ "58894345c02346100f4b51ca",
528
+ "585289980e7012308945276a",
529
+ "585369770e7012308945c709",
530
+ "585373640e7012308945cab9",
531
+ "588230658ce2c2754d076728",
532
+ "589388059a8c0314c5ce718b",
533
+ "595979485ec6a95e86a58c8d",
534
+ "5841206219d291325678ca90",
535
+ "58563650804be1058523da55",
536
+ "58564084804be1058523e116",
537
+ "58636467712e27614690661f",
538
+ "58647495712e27614690f36d",
539
+ "58654563712e276146918643",
540
+ "58664251712e276146923738",
541
+ "588084032366dd5d06e59e82",
542
+ "588159582366dd5d06e66877",
543
+ "5890279190414422fbea9734",
544
+ "5890523090414422fbeab3f0",
545
+ "5890641690414422fbeabbe7",
546
+ "585203546789802282f2aaf5",
+         ]
+
+         # Final sequences to be used after filtering (some of the sequences have incorrect/low quality depth)
+         # Generally water bodies like lakes have incorrect depth
+         # Filtered out sequences:
+         #     "5692a4c2adafac1f14201821"  # Incorrect Depth
+         #     "5864a935712e2761469111b4"  # Noisy Depth and artifacts near horizon
+         #     "59f87d0bfa6280566fb38c9a"  # Object-centric, noise with background and sometimes in front of object
+         #     "58a44463156b87103d3ed45e"  # Very noisy depth in background
+         #     "5c2b3ed5e611832e8aed46bf"  # Depth occluded by artifacts
+         #     "5bf03590d4392319481971dc"  # Depth occluded by artifacts
+         #     "00000000000000000000001a"  # Largely incomplete depth
+         #     "00000000000000000000000c"  # Imprecise depth for buildings
+         #     "000000000000000000000000"  # Incorrect depth for planar terrain
+         self.scenes = [
562
+ "00000000000000000000000a",
563
+ "00000000000000000000000b",
564
+ "00000000000000000000000d",
565
+ "00000000000000000000000e",
566
+ "00000000000000000000000f",
567
+ "000000000000000000000001",
568
+ "00000000000000000000001b",
569
+ "00000000000000000000001d",
570
+ "000000000000000000000002",
571
+ "000000000000000000000003",
572
+ "000000000000000000000004",
573
+ "000000000000000000000005",
574
+ "5a2a95f032a1c655cfe3de62",
575
+ "5a2af22b32a1c655cfe46013",
576
+ "5a2ba6de32a1c655cfe51b79",
577
+ "5a3b9731e24cd76dad1a5f1b",
578
+ "5a3ca9cb270f0e3f14d0eddb",
579
+ "5a3cb4e4270f0e3f14d12f43",
580
+ "5a03e732454a8a7ec672776c",
581
+ "5a3f4aba5889373fbbc5d3b5",
582
+ "5a4a38dad38c8a075495b5d2",
583
+ "5a5a1e48d62c7a12d5d00e47",
584
+ "5a6b1c418d100c2f8fdc4411",
585
+ "5a6feeb54a7fbc3f874f9db7",
586
+ "5a7cb1d6fe5c0d6fb53e64fb",
587
+ "5a7d3db14989e929563eb153",
588
+ "5a8aa0fab18050187cbe060e",
589
+ "5a9e5df65baeef72b4a021cd",
590
+ "5a48ba95c7dab83a7d7b44ed",
591
+ "5a48c4e9c7dab83a7d7b5cc7",
592
+ "5a48d4b2c7dab83a7d7b9851",
593
+ "5a69c47d0d5d0a7f3b2e9752",
594
+ "5a77b46b318efe6c6736e68a",
595
+ "5a355c271b63f53d5970f362",
596
+ "5a489fb1c7dab83a7d7b1070",
597
+ "5a533e8034d7582116e34209",
598
+ "5a562fc7425d0f5186314725",
599
+ "5a572fd9fc597b0478a81d14",
600
+ "5a588a8193ac3d233f77fbca",
601
+ "5a618c72784780334bc1972d",
602
+ "5a752d42acc41e2423f17674",
603
+ "5a969eea91dfc339a9a3ad2c",
604
+ "5a8315f624b8e938486e0bd8",
605
+ "5a57542f333d180827dfc132",
606
+ "5a0271884e62597cdee0d0eb",
607
+ "5a6400933d809f1d8200af15",
608
+ "5a6464143d809f1d8208c43c",
609
+ "5a563183425d0f5186314855",
610
+ "5aa0f9d7a9efce63548c69a1",
611
+ "5aa0f478a9efce63548c1cb4",
612
+ "5aa7db90bfdd572271e95246",
613
+ "5aa235f64a17b335eeaf9609",
614
+ "5aa515e613d42d091d29d300",
615
+ "5aa1196ea9efce63548ed649",
616
+ "5aaadd4cbc13235570d178a7",
617
+ "5ab6af12ac4291329b1072ab",
618
+ "5ab7e00aac4291329b15864d",
619
+ "5ab8b8e029f5351f7f2ccf59",
620
+ "5ab74bf2ac4291329b11e879",
621
+ "5ab85f1dac4291329b17cb50",
622
+ "5ab8713ba3799a1d138bd69a",
623
+ "5abc2506b53b042ead637d86",
624
+ "5acc7459a7853c4b5ebbef59",
625
+ "5acf8ca0f3d8a750097e4b15",
626
+ "5adc6bd52430a05ecb2ffb85",
627
+ "5ae2e9c5fe405c5076abc6b2",
628
+ "5af02e904c8216544b4ab5a2",
629
+ "5af28cea59bc705737003253",
630
+ "5af545d0559359053d25dcf5",
631
+ "5afacb69ab00705d0cefdd5b",
632
+ "5b2c67b5e0878c381608b8d8",
633
+ "5b3b2b9e8d46a939f933fdc0",
634
+ "5b3b353d8d46a939f93524b9",
635
+ "5b6e716d67b396324c2d77cb",
636
+ "5b6eff8b67b396324c5b2672",
637
+ "5b7a3890fc8fcf6781e2593a",
638
+ "5b21e18c58e2823a67a10dd8",
639
+ "5b60fa0c764f146feef84df0",
640
+ "5b69cc0cb44b61786eb959bf",
641
+ "5b78e57afc8fcf6781d0c3ba",
642
+ "5b192eb2170cf166458ff886",
643
+ "5b558a928bbfb62204e77ba2",
644
+ "5b864d850d072a699b32f4ae",
645
+ "5b908d3dc6ab78485f3d24a9",
646
+ "5b950c71608de421b1e7318f",
647
+ "5b4933abf2b5f44e95de482a",
648
+ "5b08286b2775267d5b0634ba",
649
+ "5b37189a35304b6f75e7583e",
650
+ "5b271079e0878c3816dacca4",
651
+ "5b22269758e2823a67a3bd03",
652
+ "5b62647143840965efc0dbde",
653
+ "5ba19a8a360c7c30c1c169df",
654
+ "5ba75d79d76ffa2c86cf2f05",
655
+ "5bb7a08aea1cfa39f1a947ab",
656
+ "5bb8a49aea1cfa39f1aa7f75",
657
+ "5bbb6eb2ea1cfa39f1af7e0c",
658
+ "5bc5f0e896b66a2cd8f9bd36",
659
+ "5bccd6beca24970bce448134",
660
+ "5bce7ac9ca24970bce4934b6",
661
+ "5bcf979a6d5f586b95c258cd",
662
+ "5bd43b4ba6b28b1ee86b92dd",
663
+ "5be3a5fb8cfdd56947f6b67c",
664
+ "5be3ae47f44e235bdbbc9771",
665
+ "5be4ab93870d330ff2dce134",
666
+ "5be47bf9b18881428d8fbc1d",
667
+ "5be883a4f98cee15019d5b83",
668
+ "5bea87f4abd34c35e1860ab5",
669
+ "5beb6e66abd34c35e18e66b9",
670
+ "5bf3a82cd439231948877aed",
671
+ "5bf7d63575c26f32dbf7413b",
672
+ "5bf17c0fd439231948355385",
673
+ "5bf26cbbd43923194854b270",
674
+ "5bf18642c50e6f7f8bdbd492",
675
+ "5bf21799d43923194842c001",
676
+ "5bfc9d5aec61ca1dd69132a2",
677
+ "5bfd0f32ec61ca1dd69dc77b",
678
+ "5bfe5ae0fe0ea555e6a969ca",
679
+ "5bff3c5cfe0ea555e6bcbf3a",
680
+ "5c0d13b795da9479e12e2ee9",
681
+ "5c1af2e2bee9a723c963d019",
682
+ "5c1b1500bee9a723c96c3e78",
683
+ "5c1dbf200843bc542d8ef8c4",
684
+ "5c1f33f1d33e1f2e4aa6dda4",
685
+ "5c20ca3a0843bc542d94e3e2",
686
+ "5c062d84a96e33018ff6f0a6",
687
+ "5c189f2326173c3a09ed7ef3",
688
+ "5c1892f726173c3a09ea9aeb",
689
+ "5c34300a73a8df509add216d",
690
+ "5c34529873a8df509ae57b58",
691
+ "000000000000000000000006",
692
+ "000000000000000000000007",
693
+ "000000000000000000000008",
694
+ "000000000000000000000009",
695
+ "000000000000000000000010",
696
+ "000000000000000000000011",
697
+ "000000000000000000000012",
698
+ "000000000000000000000015",
699
+ "000000000000000000000016",
700
+ "000000000000000000000017",
701
+ "000000000000000000000018",
702
+ "000000000000000000000019",
703
+ "56d73ba74bd29b8c35abade2",
704
+ "56f34064e296120e10484dc4",
705
+ "57a4a7bb6b9272286e26dc18",
706
+ "57f8d9bbe73f6760f10e916a",
707
+ "58a0a2f33d0b4542479a11b1",
708
+ "58a0dd1a3d0b4542479a28f3",
709
+ "58a1a7914a4d262a170b1101",
710
+ "58a1bc804a4d262a170b2f01",
711
+ "58a1d9d14a4d262a170b58fe",
712
+ "58a01dea38486e3c98475871",
713
+ "58a1f5d74a4d262a170b65fc",
714
+ "58a2a09e156b87103d3d668c",
715
+ "58a2d9c3156b87103d3da90f",
716
+ "58a3ccb0156b87103d3e4332",
717
+ "58a3f2f8156b87103d3e5838",
718
+ "58a3f6c0156b87103d3e5971",
719
+ "58a3fc95156b87103d3e5d9b",
720
+ "58a07ce53d0b45424799fdde",
721
+ "58a07f233d0b45424799ffe7",
722
+ "58a44df2156b87103d3ee239",
723
+ "58a164f73d0b4542479a7a8e",
724
+ "58a0365e38486e3c984783eb",
725
+ "58a439cf156b87103d3ec885",
726
+ "58a464aa156b87103d3eec04",
727
+ "58a4452f156b87103d3ed55b",
728
+ "58a160983d0b4542479a7347",
729
+ "58a186444a4d262a170ae3ae",
730
+ "58a285424a4d262a170baf3e",
731
+ "58a41819156b87103d3e92a5",
732
+ "58a47552156b87103d3f00a4",
733
+ "58c4bb4f4a69c55606122be4",
734
+ "58c6451e4a69c556061894f1",
735
+ "58ca7014affdfd07c70a95ce",
736
+ "58cf4771d0f5fb221defe6da",
737
+ "58d36897f387231e6c929903",
738
+ "58eaf1513353456af3a1682a",
739
+ "58f7f7299f5b5647873cb110",
740
+ "58f73e7c9f5b56478738929f",
741
+ "59a8f851597729752c31e7e0",
742
+ "59a452bf9b460239aa5d1c72",
743
+ "59a9619a825418241fb88191",
744
+ "59acd2f4b891807f439c8992",
745
+ "59bf97fe7e7b31545da34439",
746
+ "59c1c3e2fd6e3d4ead9f1013",
747
+ "59d2657f82ca7774b1ec081d",
748
+ "59da1fb88a126011d0394ae9",
749
+ "59e75a2ca9e91f2c5526005d",
750
+ "59e864b2a9e91f2c5529325f",
751
+ "59ecfd02e225f6492d20fcc9",
752
+ "59f37f74b45be2233001ba18",
753
+ "59f70ab1e5c5d366af29bf3e",
754
+ "59f363a8b45be22330016cad",
755
+ "564a27b26d07883f460d8ab0",
756
+ "565fb1dead14d4154dae2b94",
757
+ "567a0fb0a825d2fb79ac9a20",
758
+ "569b92eb826bcba945ca002b",
759
+ "576fefa017ce5a16397e87fd",
760
+ "584a7333fe3cb463906c9fe6",
761
+ "584aa8e9fe3cb463906cc7d0",
762
+ "584ad76bfe3cb463906ce6dc",
763
+ "584af003fe3cb463906d0e9b",
764
+ "584b9a747072670e72bfc49d",
765
+ "584b671f7072670e72bfaaf8",
766
+ "584b81747072670e72bfbbfd",
767
+ "584ba35f7072670e72bfca4d",
768
+ "584ba5977072670e72bfcc2d",
769
+ "584bc53c7072670e72bfe85f",
770
+ "584bc3997072670e72bfe58d",
771
+ "584bc4407072670e72bfe665",
772
+ "584bd5587072670e72bffe39",
773
+ "584bdadf7072670e72c0005c",
774
+ "584be5ed7072670e72c007b3",
775
+ "584c9ad27072670e72c060c5",
776
+ "584c9cc67072670e72c063a1",
777
+ "584c58b77072670e72c03990",
778
+ "584cea557072670e72c07fb4",
779
+ "584d19d47072670e72c0c6c0",
780
+ "584dfe467072670e72c1665a",
781
+ "584e875c7072670e72c1ec94",
782
+ "584e05667072670e72c17167",
783
+ "584f94e87072670e72c2d3f7",
784
+ "584fdffd7072670e72c32dc7",
785
+ "584fe07f7072670e72c32e59",
786
+ "585a2a71b338a62ad50138dc",
787
+ "585a206ab338a62ad501298f",
788
+ "585a217cb338a62ad5012b38",
789
+ "585b34afb338a62ad501e836",
790
+ "585bb25fc49c8507c3ce7812",
791
+ "585bbe55c49c8507c3ce81cd",
792
+ "585d6c8a2a57cc11d4920a1e",
793
+ "585e54c72a57cc11d492f71a",
794
+ "585e34302a57cc11d492be30",
795
+ "585ee0632a57cc11d4933608",
796
+ "585f9661712e2761468dabca",
797
+ "585ffe9a712e2761468df643",
798
+ "586a37ec9d1b5e34c28184fc",
799
+ "586a515a9d1b5e34c281b431",
800
+ "586a94939d1b5e34c2823b5d",
801
+ "586abc689d1b5e34c2826360",
802
+ "586b0e219d1b5e34c2828862",
803
+ "586b3db89d1b5e34c282cd52",
804
+ "586b4c459d1b5e34c282e66d",
805
+ "586b7d7d9d1b5e34c283359e",
806
+ "586b8f149d1b5e34c283497c",
807
+ "586b8f629d1b5e34c28349d6",
808
+ "586c4c4d9d1b5e34c28391a1",
809
+ "586c5b5b9d1b5e34c2839a5b",
810
+ "586c9fdf9d1b5e34c283b657",
811
+ "586c48329d1b5e34c2838e80",
812
+ "586caab99d1b5e34c283c213",
813
+ "586cd0779d1b5e34c28403a7",
814
+ "586d6d249d1b5e34c284b80e",
815
+ "586d8a029d1b5e34c284c948",
816
+ "586d55af9d1b5e34c284a999",
817
+ "586d07869d1b5e34c2842e5b",
818
+ "586d27489d1b5e34c28453af",
819
+ "586df9849d1b5e34c28506de",
820
+ "586e279c9d1b5e34c2852180",
821
+ "587bc5ec2366dd5d06e262c1",
822
+ "587c1abf2366dd5d06e28901",
823
+ "587c03f12366dd5d06e27722",
824
+ "587c19da2366dd5d06e2877b",
825
+ "587c31b92366dd5d06e2a9dc",
826
+ "587c87d02366dd5d06e2f989",
827
+ "587c97a52366dd5d06e30a96",
828
+ "587c45192366dd5d06e2c0eb",
829
+ "587cec702366dd5d06e37862",
830
+ "587cef0a2366dd5d06e379e3",
831
+ "587db5872366dd5d06e3e0af",
832
+ "587e2b1d2366dd5d06e41af0",
833
+ "587e2ea62366dd5d06e41f2e",
834
+ "587e5cb52366dd5d06e4486e",
835
+ "587eb1822366dd5d06e45f29",
836
+ "587f365d2366dd5d06e4906e",
837
+ "588a9c5fec4d5a1c088ec350",
838
+ "588a34cfec4d5a1c088ea8d1",
839
+ "588ab5bdec4d5a1c088ed60f",
840
+ "588aff9d90414422fbe7885a",
841
+ "588b20d290414422fbe79f40",
842
+ "588c08d590414422fbe8200b",
843
+ "588c203d90414422fbe8319e",
844
+ "588c989a90414422fbe86d96",
845
+ "588ca09d90414422fbe871a1",
846
+ "588cce2190414422fbe88520",
847
+ "588cd5ef90414422fbe8875c",
848
+ "588cf0ad90414422fbe8a20f",
849
+ "588e0d8c90414422fbe8f8b2",
850
+ "588e01c490414422fbe8ee2a",
851
+ "588e35e690414422fbe90a53",
852
+ "588f017e90414422fbe9b74b",
853
+ "588f095190414422fbe9c1ee",
854
+ "589aca717dc3d323d55671c4",
855
+ "589af2c97dc3d323d55691e8",
856
+ "589b49ea7dc3d323d556d9b4",
857
+ "589b04287dc3d323d556a185",
858
+ "589bf6a57dc3d323d55743ab",
859
+ "589c3c497dc3d323d5578468",
860
+ "589c3c577dc3d323d5578480",
861
+ "589c300f7dc3d323d5577926",
862
+ "589c24527dc3d323d5577126",
863
+ "589c35457dc3d323d5577d8d",
864
+ "589ca6a6b896147a1b73aff7",
865
+ "589d1e1fb896147a1b73ee5b",
866
+ "589d5c58b896147a1b742256",
867
+ "589d95538fa2cf375df3317b",
868
+ "589df0ffb504a864ad63521a",
869
+ "589ea316b504a864ad639a2b",
870
+ "589ec97cb504a864ad63adc3",
871
+ "589f214338486e3c9846f123",
872
+ "589fdfe738486e3c984736cf",
873
+ "590c2d70336bb52a190be886",
874
+ "590f91851225725be9e25d4e",
875
+ "591a467a6109e14d4f09b776",
876
+ "591cf3033162411cf9047f37",
877
+ "591ea44850991c70dc99a207",
878
+ "599aa591d5b41f366fed0d58",
879
+ "5643df56138263b51db1b5f3",
880
+ "5644bdac138263b51db9f669",
881
+ "5850d4f97072670e72c425d6",
882
+ "5854c405804be105852330fe",
883
+ "5855a4fc804be1058523bd75",
884
+ "5856ac15804be105852419d8",
885
+ "5856ae8b804be10585241bae",
886
+ "5856b460804be10585242059",
887
+ "5857aa5ab338a62ad5ff4dbe",
888
+ "5857acf8b338a62ad5ff5107",
889
+ "5858db6cb338a62ad500103b",
890
+ "5858dbcab338a62ad5001081",
891
+ "5859d84fb338a62ad500e5cf",
892
+ "5861d8ea712e2761468f3cb3",
893
+ "5863edf8712e27614690cce0",
894
+ "5864b076712e27614691197e",
895
+ "5864da88712e276146913d8b",
896
+ "5865f4a8712e27614691e39b",
897
+ "5867a434833dfe3f7b88edaf",
898
+ "5868cd15833dfe3f7b89bfa3",
899
+ "5880b3692366dd5d06e5d534",
900
+ "5880e3422366dd5d06e5ff8e",
901
+ "5880f0ef2366dd5d06e6166e",
902
+ "5881d2bfb6844814c136a119",
903
+ "5881f11d8ce2c2754d0714c3",
904
+ "5881fee18ce2c2754d0723f8",
905
+ "5882cda2b116682b4adebd25",
906
+ "5882d58fb116682b4adec7db",
907
+ "5884c256932ba84fbed70bf5",
908
+ "5884cc13932ba84fbed71ec4",
909
+ "5885bc5296fa095e0671a7f0",
910
+ "5886d14cb791366d617a362c",
911
+ "5888becfc02346100f4b0b21",
912
+ "5888e408c02346100f4b1a29",
913
+ "5889da66ec4d5a1c088e5187",
914
+ "5889e344ec4d5a1c088e59be",
915
+ "5889e754ec4d5a1c088e60ba",
916
+ "5890c16b90414422fbeb0262",
917
+ "5891d8ae9a8c0314c5cd30ab",
918
+ "5891d0479a8c0314c5cd2abd",
919
+ "5891ecf19a8c0314c5cd490a",
920
+ "5892c0cd9a8c0314c5cdc977",
921
+ "5894ab309a8c0314c5cee57d",
922
+ "5895a6a89a8c0314c5cfca7c",
923
+ "5895b8c29a8c0314c5cfd051",
924
+ "5895d38f9a8c0314c5cfe50c",
925
+ "5895f2329a8c0314c5d00117",
926
+ "5896bb989a8c0314c5d086b6",
927
+ "5896ebf39a8c0314c5d0a8c4",
928
+ "5898b1bac9dccc22987b7f74",
929
+ "5898b6ffc9dccc22987b8a03",
930
+ "5898b31cc9dccc22987b82ec",
931
+ "5898bbaac9dccc22987b8eba",
932
+ "5899cfa6b76d7a3780a4cb64",
933
+ "5899e5dcb76d7a3780a4ecc1",
934
+ "5947b62af1b45630bd0c2a02",
935
+ "57102be2877e1421026358af",
936
+ "57153d4031bb9900425bde85",
937
+ "57177cd7fb8d93461afc4527",
938
+ "58497cdf97b73e0b090c4273",
939
+ "58500b007072670e72c35588",
940
+ "58510bf97072670e72c46ddf",
941
+ "58522bd56789802282f2ecb3",
942
+ "58524a2e0e7012308944bcf3",
943
+ "58524a080e7012308944bcbf",
944
+ "58524c1d0e7012308944bfda",
945
+ "58524f170e7012308944c200",
946
+ "58529a4e0e70123089454c6f",
947
+ "58551bdf804be1058523556d",
948
+ "58568c9a804be10585240b03",
949
+ "58574b35804be105852455fd",
950
+ "58577c60b338a62ad5ff1564",
951
+ "58592d69b338a62ad5007a74",
952
+ "58598db2b338a62ad500bc38",
953
+ "58625f42712e2761468fb44c",
954
+ "58651bcc712e2761469166dc",
955
+ "58660e79712e27614691fe3d",
956
+ "58669aad712e27614692834c",
957
+ "58669c02712e27614692851a",
958
+ "58676c36833dfe3f7b88b7f2",
959
+ "58678b2d833dfe3f7b88e244",
960
+ "58790c82ce911104a3467c88",
961
+ "58800b0b2366dd5d06e5312d",
962
+ "58805eac2366dd5d06e56460",
963
+ "58806e422366dd5d06e57bb6",
964
+ "58831d060db9bf59bf8ab98b",
965
+ "58851ebb932ba84fbed7abad",
966
+ "58871dc3b791366d617a55ff",
967
+ "58873cabb791366d617a65a7",
968
+ "58873d44b791366d617a65dd",
969
+ "58888b3dc02346100f4af665",
970
+ "58897f62c02346100f4b8ee6",
971
+ "58933bac9a8c0314c5ce3508",
972
+ "58938e6d9a8c0314c5ce726f",
973
+ "58951cb49a8c0314c5cf4d5e",
974
+ "58970fd09a8c0314c5d0e383",
975
+ "58977ef09a8c0314c5d17b26",
976
+ "59056e6760bb961de55f3501",
977
+ "59071f2e5a6dbd3af4130f98",
978
+ "59102c811225725be9e64149",
979
+ "59338e76772c3e6384afbb15",
980
+ "59350ca084b7f26bf5ce6eb8",
981
+ "59397e493a87372f2c9e882b",
982
+ "59521e0b9096412211c2aa9d",
983
+ "59817e4a1bd4b175e7038d19",
984
+ "567884f58d2828b95e3c8eba",
985
+ "585559d9804be10585238ddf",
986
+ "585834cdb338a62ad5ffab4d",
987
+ "586082d8712e2761468e2877",
988
+ "586133c2712e2761468ecfe3",
989
+ "586281d2712e2761468fcaa2",
990
+ "586316e5712e276146903c4d",
991
+ "586326ad712e276146904571",
992
+ "586375c9712e276146907429",
993
+ "586389c9712e276146908da6",
994
+ "586496fa712e2761469108e7",
995
+ "586669c6712e27614692597a",
996
+ "586913a49d1b5e34c2808b02",
997
+ "586922da9d1b5e34c2809ff3",
998
+ "588185d8dfb7a15588a114a3",
999
+ "588305ed0db9bf59bf8a8c80",
1000
+ "588315c60db9bf59bf8aa928",
1001
+ "588332ee0db9bf59bf8ae9c3",
1002
+ "588457b8932ba84fbed69942",
1003
+ "588519d5932ba84fbed7a04a",
1004
+ "588824d1b791366d617adeef",
1005
+ "588857f6c02346100f4ac09f",
1006
+ "589145ef90414422fbeb2e08",
1007
+ "589433fa9a8c0314c5ce9656",
1008
+ "589765d39a8c0314c5d16b12",
1009
+ "5851165f7072670e72c4860d",
1010
+ "5859341ab338a62ad500848d",
1011
+ "5862388b712e2761468f84aa",
1012
+ "5863915b712e276146909135",
1013
+ "5866445b712e27614692383e",
1014
+ "5866500d712e2761469240fd",
1015
+ "5867785a833dfe3f7b88c764",
1016
+ "5867969c833dfe3f7b88e8bc",
1017
+ "5868040c833dfe3f7b8934f7",
1018
+ "5880675a2366dd5d06e570ca",
1019
+ "5882372c8ce2c2754d076af0",
1020
+ "5883535e932ba84fbed5ad07",
1021
+ "5888358cb791366d617af69d",
1022
+ "5890330d90414422fbeaa0cb",
1023
+ "5897076e9a8c0314c5d0d31b",
1024
+ "5940564ec2d9527ab869f7e2",
1025
+ "5947719bf1b45630bd096665",
1026
+ "5948194ff1b45630bd0f47e3",
1027
+ "5950206a41b158666ac50506",
1028
+ "5983012d1bd4b175e70c985a",
1029
+ "58586810b338a62ad5ffc20c",
1030
+ "58592046b338a62ad5006b33",
1031
+ "58592854b338a62ad500750a",
1032
+ "58596531b338a62ad500aace",
1033
+ "58818685dfb7a15588a11626",
1034
+ "58829563f42b1d3ee3ec835f",
1035
+ "58894345c02346100f4b51ca",
1036
+ "585289980e7012308945276a",
1037
+ "585369770e7012308945c709",
1038
+ "585373640e7012308945cab9",
1039
+ "588230658ce2c2754d076728",
1040
+ "589388059a8c0314c5ce718b",
1041
+ "595979485ec6a95e86a58c8d",
1042
+ "5841206219d291325678ca90",
1043
+ "58563650804be1058523da55",
1044
+ "58564084804be1058523e116",
1045
+ "58636467712e27614690661f",
1046
+ "58647495712e27614690f36d",
1047
+ "58654563712e276146918643",
1048
+ "58664251712e276146923738",
1049
+ "588084032366dd5d06e59e82",
1050
+ "588159582366dd5d06e66877",
1051
+ "5890279190414422fbea9734",
1052
+ "5890523090414422fbeab3f0",
1053
+ "5890641690414422fbeabbe7",
1054
+ "585203546789802282f2aaf5",
+         ]
+
+         # Train set sequences after filtering
+         self.train_split_scenes = [
1059
+ "00000000000000000000000b",
1060
+ "00000000000000000000000d",
1061
+ "00000000000000000000000e",
1062
+ "00000000000000000000000f",
1063
+ "000000000000000000000001",
1064
+ "00000000000000000000001b",
1065
+ "00000000000000000000001d",
1066
+ "000000000000000000000002",
1067
+ "000000000000000000000003",
1068
+ "000000000000000000000004",
1069
+ "000000000000000000000005",
1070
+ "5a2a95f032a1c655cfe3de62",
1071
+ "5a2af22b32a1c655cfe46013",
1072
+ "5a2ba6de32a1c655cfe51b79",
1073
+ "5a3b9731e24cd76dad1a5f1b",
1074
+ "5a3ca9cb270f0e3f14d0eddb",
1075
+ "5a3cb4e4270f0e3f14d12f43",
1076
+ "5a03e732454a8a7ec672776c",
1077
+ "5a3f4aba5889373fbbc5d3b5",
1078
+ "5a5a1e48d62c7a12d5d00e47",
1079
+ "5a6b1c418d100c2f8fdc4411",
1080
+ "5a6feeb54a7fbc3f874f9db7",
1081
+ "5a7cb1d6fe5c0d6fb53e64fb",
1082
+ "5a7d3db14989e929563eb153",
1083
+ "5a8aa0fab18050187cbe060e",
1084
+ "5a9e5df65baeef72b4a021cd",
1085
+ "5a48ba95c7dab83a7d7b44ed",
1086
+ "5a48c4e9c7dab83a7d7b5cc7",
1087
+ "5a48d4b2c7dab83a7d7b9851",
1088
+ "5a69c47d0d5d0a7f3b2e9752",
1089
+ "5a77b46b318efe6c6736e68a",
1090
+ "5a355c271b63f53d5970f362",
1091
+ "5a533e8034d7582116e34209",
1092
+ "5a562fc7425d0f5186314725",
1093
+ "5a618c72784780334bc1972d",
1094
+ "5a752d42acc41e2423f17674",
1095
+ "5a969eea91dfc339a9a3ad2c",
1096
+ "5a8315f624b8e938486e0bd8",
1097
+ "5a57542f333d180827dfc132",
1098
+ "5a0271884e62597cdee0d0eb",
1099
+ "5a6400933d809f1d8200af15",
1100
+ "5a6464143d809f1d8208c43c",
1101
+ "5a563183425d0f5186314855",
1102
+ "5aa0f9d7a9efce63548c69a1",
1103
+ "5aa7db90bfdd572271e95246",
1104
+ "5aa235f64a17b335eeaf9609",
1105
+ "5aa515e613d42d091d29d300",
1106
+ "5aa1196ea9efce63548ed649",
1107
+ "5aaadd4cbc13235570d178a7",
1108
+ "5ab6af12ac4291329b1072ab",
1109
+ "5ab7e00aac4291329b15864d",
1110
+ "5ab8b8e029f5351f7f2ccf59",
1111
+ "5ab74bf2ac4291329b11e879",
1112
+ "5ab85f1dac4291329b17cb50",
1113
+ "5ab8713ba3799a1d138bd69a",
1114
+ "5abc2506b53b042ead637d86",
1115
+ "5acc7459a7853c4b5ebbef59",
1116
+ "5acf8ca0f3d8a750097e4b15",
1117
+ "5adc6bd52430a05ecb2ffb85",
1118
+ "5af02e904c8216544b4ab5a2",
1119
+ "5af28cea59bc705737003253",
1120
+ "5af545d0559359053d25dcf5",
1121
+ "5afacb69ab00705d0cefdd5b",
1122
+ "5b3b2b9e8d46a939f933fdc0",
1123
+ "5b3b353d8d46a939f93524b9",
1124
+ "5b6e716d67b396324c2d77cb",
1125
+ "5b6eff8b67b396324c5b2672",
1126
+ "5b7a3890fc8fcf6781e2593a",
1127
+ "5b60fa0c764f146feef84df0",
1128
+ "5b69cc0cb44b61786eb959bf",
1129
+ "5b78e57afc8fcf6781d0c3ba",
1130
+ "5b192eb2170cf166458ff886",
1131
+ "5b558a928bbfb62204e77ba2",
1132
+ "5b908d3dc6ab78485f3d24a9",
1133
+ "5b950c71608de421b1e7318f",
1134
+ "5b08286b2775267d5b0634ba",
1135
+ "5b271079e0878c3816dacca4",
1136
+ "5b22269758e2823a67a3bd03",
1137
+ "5b62647143840965efc0dbde",
1138
+ "5ba19a8a360c7c30c1c169df",
1139
+ "5ba75d79d76ffa2c86cf2f05",
1140
+ "5bb7a08aea1cfa39f1a947ab",
1141
+ "5bb8a49aea1cfa39f1aa7f75",
1142
+ "5bbb6eb2ea1cfa39f1af7e0c",
1143
+ "5bce7ac9ca24970bce4934b6",
1144
+ "5bcf979a6d5f586b95c258cd",
1145
+ "5bd43b4ba6b28b1ee86b92dd",
1146
+ "5be3a5fb8cfdd56947f6b67c",
1147
+ "5be3ae47f44e235bdbbc9771",
1148
+ "5be4ab93870d330ff2dce134",
1149
+ "5be47bf9b18881428d8fbc1d",
1150
+ "5be883a4f98cee15019d5b83",
1151
+ "5bea87f4abd34c35e1860ab5",
1152
+ "5beb6e66abd34c35e18e66b9",
1153
+ "5bf3a82cd439231948877aed",
1154
+ "5bf7d63575c26f32dbf7413b",
1155
+ "5bf17c0fd439231948355385",
1156
+ "5bf21799d43923194842c001",
1157
+ "5bfd0f32ec61ca1dd69dc77b",
1158
+ "5bfe5ae0fe0ea555e6a969ca",
1159
+ "5c0d13b795da9479e12e2ee9",
1160
+ "5c1af2e2bee9a723c963d019",
1161
+ "5c1b1500bee9a723c96c3e78",
1162
+ "5c1dbf200843bc542d8ef8c4",
1163
+ "5c20ca3a0843bc542d94e3e2",
1164
+ "5c062d84a96e33018ff6f0a6",
1165
+ "5c189f2326173c3a09ed7ef3",
1166
+ "5c1892f726173c3a09ea9aeb",
1167
+ "5c34300a73a8df509add216d",
1168
+ "000000000000000000000006",
1169
+ "000000000000000000000007",
1170
+ "000000000000000000000008",
1171
+ "000000000000000000000009",
1172
+ "000000000000000000000010",
1173
+ "000000000000000000000011",
1174
+ "000000000000000000000012",
1175
+ "000000000000000000000015",
1176
+ "000000000000000000000016",
1177
+ "000000000000000000000017",
1178
+ "000000000000000000000018",
1179
+ "000000000000000000000019",
1180
+ "56d73ba74bd29b8c35abade2",
1181
+ "56f34064e296120e10484dc4",
1182
+ "57a4a7bb6b9272286e26dc18",
1183
+ "57f8d9bbe73f6760f10e916a",
1184
+ "58a0a2f33d0b4542479a11b1",
1185
+ "58a0dd1a3d0b4542479a28f3",
1186
+ "58a1a7914a4d262a170b1101",
1187
+ "58a1bc804a4d262a170b2f01",
1188
+ "58a1d9d14a4d262a170b58fe",
1189
+ "58a01dea38486e3c98475871",
1190
+ "58a1f5d74a4d262a170b65fc",
1191
+ "58a2a09e156b87103d3d668c",
1192
+ "58a2d9c3156b87103d3da90f",
1193
+ "58a3ccb0156b87103d3e4332",
1194
+ "58a3f2f8156b87103d3e5838",
1195
+ "58a3f6c0156b87103d3e5971",
1196
+ "58a3fc95156b87103d3e5d9b",
1197
+ "58a07ce53d0b45424799fdde",
1198
+ "58a07f233d0b45424799ffe7",
1199
+ "58a44df2156b87103d3ee239",
1200
+ "58a164f73d0b4542479a7a8e",
1201
+ "58a0365e38486e3c984783eb",
1202
+ "58a439cf156b87103d3ec885",
1203
+ "58a464aa156b87103d3eec04",
1204
+ "58a4452f156b87103d3ed55b",
1205
+ "58a160983d0b4542479a7347",
1206
+ "58a285424a4d262a170baf3e",
1207
+ "58a41819156b87103d3e92a5",
1208
+ "58a47552156b87103d3f00a4",
1209
+ "58c4bb4f4a69c55606122be4",
1210
+ "58c6451e4a69c556061894f1",
1211
+ "58ca7014affdfd07c70a95ce",
1212
+ "58cf4771d0f5fb221defe6da",
1213
+ "58d36897f387231e6c929903",
1214
+ "58eaf1513353456af3a1682a",
1215
+ "58f73e7c9f5b56478738929f",
1216
+ "59a8f851597729752c31e7e0",
1217
+ "59a452bf9b460239aa5d1c72",
1218
+ "59a9619a825418241fb88191",
1219
+ "59bf97fe7e7b31545da34439",
1220
+ "59c1c3e2fd6e3d4ead9f1013",
1221
+ "59d2657f82ca7774b1ec081d",
1222
+ "59da1fb88a126011d0394ae9",
1223
+ "59e75a2ca9e91f2c5526005d",
1224
+ "59e864b2a9e91f2c5529325f",
1225
+ "59ecfd02e225f6492d20fcc9",
1226
+ "59f37f74b45be2233001ba18",
1227
+ "59f70ab1e5c5d366af29bf3e",
1228
+ "59f363a8b45be22330016cad",
1229
+ "564a27b26d07883f460d8ab0",
1230
+ "565fb1dead14d4154dae2b94",
1231
+ "569b92eb826bcba945ca002b",
1232
+ "576fefa017ce5a16397e87fd",
1233
+ "584a7333fe3cb463906c9fe6",
1234
+ "584aa8e9fe3cb463906cc7d0",
1235
+ "584af003fe3cb463906d0e9b",
1236
+ "584b9a747072670e72bfc49d",
1237
+ "584b671f7072670e72bfaaf8",
1238
+ "584b81747072670e72bfbbfd",
1239
+ "584ba35f7072670e72bfca4d",
1240
+ "584ba5977072670e72bfcc2d",
1241
+ "584bc53c7072670e72bfe85f",
1242
+ "584bc3997072670e72bfe58d",
1243
+ "584bc4407072670e72bfe665",
1244
+ "584bd5587072670e72bffe39",
1245
+ "584bdadf7072670e72c0005c",
1246
+ "584be5ed7072670e72c007b3",
1247
+ "584c9ad27072670e72c060c5",
1248
+ "584c9cc67072670e72c063a1",
1249
+ "584cea557072670e72c07fb4",
1250
+ "584d19d47072670e72c0c6c0",
1251
+ "584dfe467072670e72c1665a",
1252
+ "584e875c7072670e72c1ec94",
1253
+ "584e05667072670e72c17167",
1254
+ "584f94e87072670e72c2d3f7",
1255
+ "584fdffd7072670e72c32dc7",
1256
+ "584fe07f7072670e72c32e59",
1257
+ "585a2a71b338a62ad50138dc",
1258
+ "585a206ab338a62ad501298f",
1259
+ "585a217cb338a62ad5012b38",
1260
+ "585b34afb338a62ad501e836",
1261
+ "585bb25fc49c8507c3ce7812",
1262
+ "585bbe55c49c8507c3ce81cd",
1263
+ "585d6c8a2a57cc11d4920a1e",
1264
+ "585e54c72a57cc11d492f71a",
1265
+ "585e34302a57cc11d492be30",
1266
+ "585ee0632a57cc11d4933608",
1267
+ "585f9661712e2761468dabca",
1268
+ "585ffe9a712e2761468df643",
1269
+ "586a37ec9d1b5e34c28184fc",
1270
+ "586a515a9d1b5e34c281b431",
1271
+ "586a94939d1b5e34c2823b5d",
1272
+ "586abc689d1b5e34c2826360",
1273
+ "586b0e219d1b5e34c2828862",
1274
+ "586b3db89d1b5e34c282cd52",
1275
+ "586b4c459d1b5e34c282e66d",
1276
+ "586b7d7d9d1b5e34c283359e",
1277
+ "586b8f149d1b5e34c283497c",
1278
+ "586b8f629d1b5e34c28349d6",
1279
+ "586c4c4d9d1b5e34c28391a1",
1280
+ "586c5b5b9d1b5e34c2839a5b",
1281
+ "586c9fdf9d1b5e34c283b657",
1282
+ "586caab99d1b5e34c283c213",
1283
+ "586cd0779d1b5e34c28403a7",
1284
+ "586d6d249d1b5e34c284b80e",
1285
+ "586d8a029d1b5e34c284c948",
1286
+ "586d55af9d1b5e34c284a999",
1287
+ "586d07869d1b5e34c2842e5b",
1288
+ "586d27489d1b5e34c28453af",
1289
+ "586e279c9d1b5e34c2852180",
1290
+ "587bc5ec2366dd5d06e262c1",
1291
+ "587c1abf2366dd5d06e28901",
1292
+ "587c03f12366dd5d06e27722",
1293
+ "587c19da2366dd5d06e2877b",
1294
+ "587c31b92366dd5d06e2a9dc",
1295
+ "587c87d02366dd5d06e2f989",
1296
+ "587c97a52366dd5d06e30a96",
1297
+ "587c45192366dd5d06e2c0eb",
1298
+ "587cec702366dd5d06e37862",
1299
+ "587cef0a2366dd5d06e379e3",
1300
+ "587db5872366dd5d06e3e0af",
1301
+ "587e2b1d2366dd5d06e41af0",
1302
+ "587e2ea62366dd5d06e41f2e",
1303
+ "587e5cb52366dd5d06e4486e",
1304
+ "587eb1822366dd5d06e45f29",
1305
+ "587f365d2366dd5d06e4906e",
1306
+ "588a9c5fec4d5a1c088ec350",
1307
+ "588a34cfec4d5a1c088ea8d1",
1308
+ "588ab5bdec4d5a1c088ed60f",
1309
+ "588aff9d90414422fbe7885a",
1310
+ "588b20d290414422fbe79f40",
1311
+ "588c08d590414422fbe8200b",
1312
+ "588c203d90414422fbe8319e",
1313
+ "588c989a90414422fbe86d96",
1314
+ "588ca09d90414422fbe871a1",
1315
+ "588cce2190414422fbe88520",
1316
+ "588cd5ef90414422fbe8875c",
1317
+ "588cf0ad90414422fbe8a20f",
1318
+ "588e01c490414422fbe8ee2a",
1319
+ "588e35e690414422fbe90a53",
1320
+ "588f017e90414422fbe9b74b",
1321
+ "588f095190414422fbe9c1ee",
1322
+ "589aca717dc3d323d55671c4",
1323
+ "589af2c97dc3d323d55691e8",
1324
+ "589b49ea7dc3d323d556d9b4",
1325
+ "589b04287dc3d323d556a185",
1326
+ "589bf6a57dc3d323d55743ab",
1327
+ "589c3c497dc3d323d5578468",
1328
+ "589c3c577dc3d323d5578480",
1329
+ "589c24527dc3d323d5577126",
1330
+ "589c35457dc3d323d5577d8d",
1331
+ "589ca6a6b896147a1b73aff7",
1332
+ "589d1e1fb896147a1b73ee5b",
1333
+ "589d5c58b896147a1b742256",
1334
+ "589d95538fa2cf375df3317b",
1335
+ "589df0ffb504a864ad63521a",
1336
+ "589ea316b504a864ad639a2b",
1337
+ "589ec97cb504a864ad63adc3",
1338
+ "589f214338486e3c9846f123",
1339
+ "589fdfe738486e3c984736cf",
1340
+ "590c2d70336bb52a190be886",
1341
+ "591a467a6109e14d4f09b776",
1342
+ "591cf3033162411cf9047f37",
1343
+ "591ea44850991c70dc99a207",
1344
+ "599aa591d5b41f366fed0d58",
1345
+ "5643df56138263b51db1b5f3",
1346
+ "5644bdac138263b51db9f669",
1347
+ "5850d4f97072670e72c425d6",
1348
+ "5854c405804be105852330fe",
1349
+ "5855a4fc804be1058523bd75",
1350
+ "5856ac15804be105852419d8",
1351
+ "5856ae8b804be10585241bae",
1352
+ "5856b460804be10585242059",
1353
+ "5857aa5ab338a62ad5ff4dbe",
1354
+ "5857acf8b338a62ad5ff5107",
1355
+ "5858db6cb338a62ad500103b",
1356
+ "5858dbcab338a62ad5001081",
1357
+ "5859d84fb338a62ad500e5cf",
1358
+ "5861d8ea712e2761468f3cb3",
1359
+ "5863edf8712e27614690cce0",
1360
+ "5864b076712e27614691197e",
1361
+ "5864da88712e276146913d8b",
1362
+ "5865f4a8712e27614691e39b",
1363
+ "5867a434833dfe3f7b88edaf",
1364
+ "5868cd15833dfe3f7b89bfa3",
1365
+ "5880b3692366dd5d06e5d534",
1366
+ "5880e3422366dd5d06e5ff8e",
1367
+ "5880f0ef2366dd5d06e6166e",
1368
+ "5881d2bfb6844814c136a119",
1369
+ "5881f11d8ce2c2754d0714c3",
1370
+ "5881fee18ce2c2754d0723f8",
1371
+ "5882cda2b116682b4adebd25",
1372
+ "5882d58fb116682b4adec7db",
1373
+ "5884c256932ba84fbed70bf5",
1374
+ "5884cc13932ba84fbed71ec4",
1375
+ "5885bc5296fa095e0671a7f0",
1376
+ "5886d14cb791366d617a362c",
1377
+ "5888becfc02346100f4b0b21",
1378
+ "5888e408c02346100f4b1a29",
1379
+ "5889da66ec4d5a1c088e5187",
1380
+ "5889e754ec4d5a1c088e60ba",
1381
+ "5890c16b90414422fbeb0262",
1382
+ "5891d8ae9a8c0314c5cd30ab",
1383
+ "5891d0479a8c0314c5cd2abd",
1384
+ "5891ecf19a8c0314c5cd490a",
1385
+ "5892c0cd9a8c0314c5cdc977",
1386
+ "5894ab309a8c0314c5cee57d",
1387
+ "5895a6a89a8c0314c5cfca7c",
1388
+ "5895b8c29a8c0314c5cfd051",
1389
+ "5895d38f9a8c0314c5cfe50c",
1390
+ "5895f2329a8c0314c5d00117",
1391
+ "5896bb989a8c0314c5d086b6",
1392
+ "5896ebf39a8c0314c5d0a8c4",
1393
+ "5898b1bac9dccc22987b7f74",
1394
+ "5898b6ffc9dccc22987b8a03",
1395
+ "5898bbaac9dccc22987b8eba",
1396
+ "5899cfa6b76d7a3780a4cb64",
1397
+ "5899e5dcb76d7a3780a4ecc1",
1398
+ "57102be2877e1421026358af",
1399
+ "57153d4031bb9900425bde85",
1400
+ "57177cd7fb8d93461afc4527",
1401
+ "58497cdf97b73e0b090c4273",
1402
+ "58500b007072670e72c35588",
1403
+ "58510bf97072670e72c46ddf",
1404
+ "58522bd56789802282f2ecb3",
1405
+ "58524a2e0e7012308944bcf3",
1406
+ "58524a080e7012308944bcbf",
1407
+ "58524c1d0e7012308944bfda",
1408
+ "58524f170e7012308944c200",
1409
+ "58529a4e0e70123089454c6f",
1410
+ "58551bdf804be1058523556d",
1411
+ "58568c9a804be10585240b03",
1412
+ "58574b35804be105852455fd",
1413
+ "58577c60b338a62ad5ff1564",
1414
+ "58592d69b338a62ad5007a74",
1415
+ "58625f42712e2761468fb44c",
1416
+ "58651bcc712e2761469166dc",
1417
+ "58660e79712e27614691fe3d",
1418
+ "58669aad712e27614692834c",
1419
+ "58676c36833dfe3f7b88b7f2",
1420
+ "58678b2d833dfe3f7b88e244",
1421
+ "58800b0b2366dd5d06e5312d",
1422
+ "58805eac2366dd5d06e56460",
1423
+ "58806e422366dd5d06e57bb6",
1424
+ "58831d060db9bf59bf8ab98b",
1425
+ "58851ebb932ba84fbed7abad",
1426
+ "58871dc3b791366d617a55ff",
1427
+ "58873cabb791366d617a65a7",
1428
+ "58873d44b791366d617a65dd",
1429
+ "58888b3dc02346100f4af665",
1430
+ "58933bac9a8c0314c5ce3508",
1431
+ "58938e6d9a8c0314c5ce726f",
1432
+ "58951cb49a8c0314c5cf4d5e",
1433
+ "58970fd09a8c0314c5d0e383",
1434
+ "58977ef09a8c0314c5d17b26",
1435
+ "59056e6760bb961de55f3501",
1436
+ "59071f2e5a6dbd3af4130f98",
1437
+ "59102c811225725be9e64149",
1438
+ "59338e76772c3e6384afbb15",
1439
+ "59350ca084b7f26bf5ce6eb8",
1440
+ "59397e493a87372f2c9e882b",
1441
+ "59521e0b9096412211c2aa9d",
1442
+ "59817e4a1bd4b175e7038d19",
1443
+ "567884f58d2828b95e3c8eba",
1444
+ "585559d9804be10585238ddf",
1445
+ "585834cdb338a62ad5ffab4d",
1446
+ "586082d8712e2761468e2877",
1447
+ "586133c2712e2761468ecfe3",
1448
+ "586281d2712e2761468fcaa2",
1449
+ "586316e5712e276146903c4d",
1450
+ "586326ad712e276146904571",
1451
+ "586375c9712e276146907429",
1452
+ "586389c9712e276146908da6",
1453
+ "586496fa712e2761469108e7",
1454
+ "586669c6712e27614692597a",
1455
+ "586913a49d1b5e34c2808b02",
1456
+ "586922da9d1b5e34c2809ff3",
1457
+ "588185d8dfb7a15588a114a3",
1458
+ "588315c60db9bf59bf8aa928",
1459
+ "588332ee0db9bf59bf8ae9c3",
1460
+ "588519d5932ba84fbed7a04a",
1461
+ "588824d1b791366d617adeef",
1462
+ "588857f6c02346100f4ac09f",
1463
+ "589145ef90414422fbeb2e08",
1464
+ "589433fa9a8c0314c5ce9656",
1465
+ "589765d39a8c0314c5d16b12",
1466
+ "5851165f7072670e72c4860d",
1467
+ "5859341ab338a62ad500848d",
1468
+ "5863915b712e276146909135",
1469
+ "5866445b712e27614692383e",
1470
+ "5866500d712e2761469240fd",
1471
+ "5867785a833dfe3f7b88c764",
1472
+ "5867969c833dfe3f7b88e8bc",
1473
+ "5868040c833dfe3f7b8934f7",
1474
+ "5882372c8ce2c2754d076af0",
1475
+ "5883535e932ba84fbed5ad07",
1476
+ "5888358cb791366d617af69d",
1477
+ "5890330d90414422fbeaa0cb",
1478
+ "5897076e9a8c0314c5d0d31b",
1479
+ "5940564ec2d9527ab869f7e2",
1480
+ "5947719bf1b45630bd096665",
1481
+ "5948194ff1b45630bd0f47e3",
1482
+ "5950206a41b158666ac50506",
1483
+ "5983012d1bd4b175e70c985a",
1484
+ "58586810b338a62ad5ffc20c",
1485
+ "58592046b338a62ad5006b33",
1486
+ "58592854b338a62ad500750a",
1487
+ "58596531b338a62ad500aace",
1488
+ "58818685dfb7a15588a11626",
1489
+ "58829563f42b1d3ee3ec835f",
1490
+ "58894345c02346100f4b51ca",
1491
+ "585289980e7012308945276a",
1492
+ "585369770e7012308945c709",
1493
+ "585373640e7012308945cab9",
1494
+ "588230658ce2c2754d076728",
1495
+ "589388059a8c0314c5ce718b",
1496
+ "595979485ec6a95e86a58c8d",
1497
+ "5841206219d291325678ca90",
1498
+ "58563650804be1058523da55",
1499
+ "58564084804be1058523e116",
1500
+ "58636467712e27614690661f",
1501
+ "58647495712e27614690f36d",
1502
+ "58654563712e276146918643",
1503
+ "58664251712e276146923738",
1504
+ "588084032366dd5d06e59e82",
1505
+ "588159582366dd5d06e66877",
1506
+ "5890279190414422fbea9734",
1507
+ "5890641690414422fbeabbe7",
1508
+ "585203546789802282f2aaf5",
1509
+ ]
1510
+
1511
+ # Validation set sequences after filtering
1512
+ self.val_split_scenes = [
1513
+ "00000000000000000000000a",
1514
+ "5a4a38dad38c8a075495b5d2",
1515
+ "5a489fb1c7dab83a7d7b1070",
1516
+ "5a572fd9fc597b0478a81d14",
1517
+ "5a588a8193ac3d233f77fbca",
1518
+ "5aa0f478a9efce63548c1cb4",
1519
+ "5ae2e9c5fe405c5076abc6b2",
1520
+ "5b2c67b5e0878c381608b8d8",
1521
+ "5b21e18c58e2823a67a10dd8",
1522
+ "5b864d850d072a699b32f4ae",
1523
+ "5b4933abf2b5f44e95de482a",
1524
+ "5b37189a35304b6f75e7583e",
1525
+ "5bc5f0e896b66a2cd8f9bd36",
1526
+ "5bccd6beca24970bce448134",
1527
+ "5bf26cbbd43923194854b270",
1528
+ "5bf18642c50e6f7f8bdbd492",
1529
+ "5bfc9d5aec61ca1dd69132a2",
1530
+ "5bff3c5cfe0ea555e6bcbf3a",
1531
+ "5c1f33f1d33e1f2e4aa6dda4",
1532
+ "5c34529873a8df509ae57b58",
1533
+ "58a186444a4d262a170ae3ae",
1534
+ "58f7f7299f5b5647873cb110",
1535
+ "59acd2f4b891807f439c8992",
1536
+ "567a0fb0a825d2fb79ac9a20",
1537
+ "584ad76bfe3cb463906ce6dc",
1538
+ "584c58b77072670e72c03990",
1539
+ "586c48329d1b5e34c2838e80",
1540
+ "586df9849d1b5e34c28506de",
1541
+ "588e0d8c90414422fbe8f8b2",
1542
+ "589c300f7dc3d323d5577926",
1543
+ "590f91851225725be9e25d4e",
1544
+ "5889e344ec4d5a1c088e59be",
1545
+ "5898b31cc9dccc22987b82ec",
1546
+ "5947b62af1b45630bd0c2a02",
1547
+ "58598db2b338a62ad500bc38",
1548
+ "58669c02712e27614692851a",
1549
+ "58790c82ce911104a3467c88",
1550
+ "58897f62c02346100f4b8ee6",
1551
+ "588305ed0db9bf59bf8a8c80",
1552
+ "588457b8932ba84fbed69942",
1553
+ "5862388b712e2761468f84aa",
1554
+ "5880675a2366dd5d06e570ca",
1555
+ "5890523090414422fbeab3f0",
1556
+ ]
1557
+
1558
+
1559
+ class TartanAirV2Splits:
1560
+ """
1561
+ This class contains the information about the splits of the TartanAir V2 dataset.
1562
+ """
1563
+
1564
+ def __init__(self):
1565
+ """
1566
+ Splits of environments with unique geometry, selected based on the TartanVO & UFM splits.
1567
+ """
1568
+ # Apart from the 2 splits below, all other TAv2 scenes are in the train split
1569
+ # Val split
1570
+ self.val_split_scenes = ["EndofTheWorld", "HongKong", "WesternDesertTown"]
1571
+
1572
+ # Test split
1573
+ self.test_split_scenes = [
1574
+ "DesertGasStation",
1575
+ "OldScandinavia",
1576
+ "PolarSciFi",
1577
+ "Sewerage",
1578
+ "Supermarket",
1579
+ ]
1580
+
1581
+
1582
+ class MegaDepthSplits:
1583
+ """
1584
+ This class contains the information about the splits of the MegaDepth dataset.
1585
+ """
1586
+
1587
+ def __init__(self):
1588
+ """
1589
+ Validation split is based on scenes used in DUSt3R.
1590
+ """
1591
+ self.val_split_scenes = ["0015_0", "0015_1", "0022_0"]
1592
+
1593
+
1594
+ class SpringSplits:
1595
+ """
1596
+ This class contains the information about the splits of the Spring dataset.
1597
+ """
1598
+
1599
+ def __init__(self):
1600
+ self.val_split_scenes = ["0013", "0023", "0037"]
1601
+
1602
+
1603
+ class MPSDSplits:
1604
+ """
1605
+ This class contains the information about the splits of the MPSD dataset.
1606
+ """
1607
+
1608
+ def __init__(self):
1609
+ """
1610
+ The train & validation split numpy files containing the folder names are generated during preprocessing of the MPSD dataset.
1611
+ Load these numpy files to get the list of scenes in the train & validation splits.
1612
+ A 95% (train) / 5% (validation) split is used.
1613
+ """
1614
+ self.train_split_scenes = "load_numpy_file_with_train_scenes"
1615
+ self.val_split_scenes = "load_numpy_file_with_val_scenes"
1616
+
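Note: a minimal sketch of how these placeholder strings could be resolved at load time, assuming the `np.load(..., allow_pickle=True)` pattern and the `<dataset>_scene_list_<split>.npy` naming used by the WAI dataset classes later in this commit; the exact MPSD file names are not shown in this diff and are hypothetical here.

import os

import numpy as np


def load_mpsd_splits(dataset_metadata_dir):
    # Hypothetical file names, mirroring e.g. "ase_scene_list_train.npy" below
    train_path = os.path.join(
        dataset_metadata_dir, "train", "mpsd_scene_list_train.npy"
    )
    val_path = os.path.join(dataset_metadata_dir, "val", "mpsd_scene_list_val.npy")
    # allow_pickle=True is required since the arrays hold Python strings
    train_split_scenes = list(np.load(train_path, allow_pickle=True))
    val_split_scenes = list(np.load(val_path, allow_pickle=True))
    return train_split_scenes, val_split_scenes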
1617
+
1618
+ class ScanNetPPSplits:
1619
+ """
1620
+ This class contains the information about the splits of the ScanNetPP dataset.
1621
+ """
1622
+
1623
+ def __init__(self):
1624
+ """
1625
+ The validation & test splits only contain scenes from ScanNet++ V2, to prevent data leakage with respect to other methods such as DUSt3R during benchmarking.
1626
+
1627
+ The following logic was used to generate the splits (a self-contained sketch also follows this class below):
1628
+ # Select 80%, 10%, 10% of the scenes for train, val, test respectively from ScanNet++ V2 (~300 scene subset; excluding V1 scenes)
1629
+ snpp_v2_test_scenes = np.random.choice(
1630
+ snpp_v2_processed_scenes, size=int(0.1 * len(snpp_v2_processed_scenes)), replace=False
1631
+ )
1632
+ remaining_scenes = [scene for scene in snpp_v2_processed_scenes if scene not in snpp_v2_test_scenes]
1633
+ snpp_v2_val_scenes = np.random.choice(
1634
+ remaining_scenes, size=int(0.1 * len(snpp_v2_processed_scenes)), replace=False
1635
+ )
1636
+ snpp_v2_train_scenes = [
1637
+ scene for scene in remaining_scenes if scene not in snpp_v2_val_scenes and scene not in snpp_v2_test_scenes
1638
+ ]
1639
+ """
1640
+ # Validation Scenes
1641
+ self.val_split_scenes = [
1642
+ "1c7a683c92",
1643
+ "2a1b555966",
1644
+ "3a43c7b8d2",
1645
+ "4aef651da7",
1646
+ "06bc6d1b24",
1647
+ "7f22d5ef1b",
1648
+ "7f77abce34",
1649
+ "8ea517a2fc",
1650
+ "29c7afafed",
1651
+ "41eb967018",
1652
+ "77b40ce601",
1653
+ "086f09d6e3",
1654
+ "307e3262f1",
1655
+ "639f2c4d5a",
1656
+ "894dbd41f1",
1657
+ "898a7dfd0c",
1658
+ "2779f8f9e2",
1659
+ "151178afd7",
1660
+ "182932a4f3",
1661
+ "635852d56e",
1662
+ "9906136b57",
1663
+ "af112b8903",
1664
+ "b0f057c684",
1665
+ "b37177e6c8",
1666
+ "b119249da7",
1667
+ "be8367fcbe",
1668
+ "c8fc01c453",
1669
+ "e1fb8626c8",
1670
+ "e2caaaf5b5",
1671
+ "fe3fc057a1",
1672
+ ]
1673
+
1674
+ # Test Scenes
1675
+ self.test_split_scenes = [
1676
+ "0e900bcc5c",
1677
+ "0eba3981c9",
1678
+ "1cbb105c6a",
1679
+ "3c8d535d49",
1680
+ "5d902f1593",
1681
+ "6bd39ac392",
1682
+ "6c14d5fd01",
1683
+ "7c31a42404",
1684
+ "9bfbc75700",
1685
+ "13b4efaf62",
1686
+ "062e5a23a6",
1687
+ "95b9971d01",
1688
+ "246fe09e98",
1689
+ "637a27d04b",
1690
+ "725b8f0cba",
1691
+ "413085a827",
1692
+ "696317583f",
1693
+ "a4c043ac48",
1694
+ "a9e4791c7e",
1695
+ "b0b004c40f",
1696
+ "c3bc5e82c5",
1697
+ "c31ebd4b22",
1698
+ "cba701332a",
1699
+ "cc5ea8026c",
1700
+ "cec8312f4e",
1701
+ "e3b3b0d0c7",
1702
+ "e667e09fe6",
1703
+ "eaa6c90310",
1704
+ "f9397af4cb",
1705
+ "fb893ffaf3",
1706
+ ]
1707
+
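Note: as referenced in the docstring above, here is a self-contained version of that split-generation logic. The scene list and the fixed seed are stand-ins (neither is specified in this diff); everything else follows the quoted snippet.

import numpy as np

np.random.seed(0)  # assumption: the actual seed used is not recorded in this diff

# Hypothetical stand-in for the ~300-scene ScanNet++ V2 subset (excluding V1 scenes)
snpp_v2_processed_scenes = [f"scene_{i:04d}" for i in range(300)]

# Select 80% / 10% / 10% of the scenes for train / val / test respectively
split_size = int(0.1 * len(snpp_v2_processed_scenes))
snpp_v2_test_scenes = np.random.choice(
    snpp_v2_processed_scenes, size=split_size, replace=False
)
test_set = set(snpp_v2_test_scenes)
remaining_scenes = [s for s in snpp_v2_processed_scenes if s not in test_set]
snpp_v2_val_scenes = np.random.choice(remaining_scenes, size=split_size, replace=False)
val_set = set(snpp_v2_val_scenes)
snpp_v2_train_scenes = [s for s in remaining_scenes if s not in val_set]
assert len(snpp_v2_train_scenes) + 2 * split_size == len(snpp_v2_processed_scenes)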
1708
+
1709
+ class DL3DV10KSplits:
1710
+ """
1711
+ This class contains the information about the splits of the DL3DV-10K dataset.
1712
+ We use the official benchmark split as the val split.
1713
+ """
1714
+
1715
+ def __init__(self):
1716
+ """
1717
+ Validation split is based on DL3DV-Benchmark.
1718
+ """
1719
+ self.val_split_scenes = [
1720
+ "load https://huggingface.co/datasets/DL3DV/DL3DV-Benchmark/raw/main/benchmark-meta.csv \
1721
+ & https://raw.githubusercontent.com/DL3DV-10K/Dataset/main/cache/DL3DV-valid.csv"
1722
+ ]
1723
+
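Note: resolving this placeholder amounts to downloading the two CSVs named in the string above and taking the union of their scene identifiers. A minimal sketch with pandas; the column name "hash" is an assumption, since the CSV schemas are not part of this diff.

import pandas as pd

benchmark_url = "https://huggingface.co/datasets/DL3DV/DL3DV-Benchmark/raw/main/benchmark-meta.csv"
valid_url = "https://raw.githubusercontent.com/DL3DV-10K/Dataset/main/cache/DL3DV-valid.csv"

benchmark_df = pd.read_csv(benchmark_url)
valid_df = pd.read_csv(valid_url)
# "hash" is a hypothetical column holding the scene identifiers
val_split_scenes = sorted(set(benchmark_df["hash"]) | set(valid_df["hash"]))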
1724
+
1725
+ class ETH3DSplits:
1726
+ """
1727
+ This class contains the information about the splits of the ETH3D dataset.
1728
+ """
1729
+
1730
+ def __init__(self):
1731
+ """
1732
+ All scenes are in the test split.
1733
+ """
1734
+ self.test_split_scenes = "all"
mapanything/datasets/wai/__init__.py ADDED
File without changes
mapanything/datasets/wai/ase.py ADDED
@@ -0,0 +1,294 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ ASE Dataset using WAI format data.
8
+ """
9
+
10
+ import os
11
+
12
+ import numpy as np
13
+
14
+ from mapanything.datasets.base.base_dataset import BaseDataset
15
+ from mapanything.utils.wai.core import load_data, load_frame
16
+
17
+
18
+ class ASEWAI(BaseDataset):
19
+ """
20
+ ASE dataset containing a large diversity of synthetic indoor scenes.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ *args,
26
+ ROOT,
27
+ dataset_metadata_dir,
28
+ split,
29
+ overfit_num_sets=None,
30
+ sample_specific_scene: bool = False,
31
+ specific_scene_name: str = None,
32
+ **kwargs,
33
+ ):
34
+ """
35
+ Initialize the dataset attributes.
36
+ Args:
37
+ ROOT: Root directory of the dataset.
38
+ dataset_metadata_dir: Path to the dataset metadata directory.
39
+ split: Dataset split (train, val, test).
40
+ overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
41
+ sample_specific_scene: Whether to sample a specific scene from the dataset.
42
+ specific_scene_name: Name of the specific scene to sample.
43
+ """
44
+ # Initialize the dataset attributes
45
+ super().__init__(*args, **kwargs)
46
+ self.ROOT = ROOT
47
+ self.dataset_metadata_dir = dataset_metadata_dir
48
+ self.split = split
49
+ self.overfit_num_sets = overfit_num_sets
50
+ self.sample_specific_scene = sample_specific_scene
51
+ self.specific_scene_name = specific_scene_name
52
+ self._load_data()
53
+
54
+ # Define the dataset type flags
55
+ self.is_metric_scale = True
56
+ self.is_synthetic = True
57
+
58
+ def _load_data(self):
59
+ "Load the precomputed dataset metadata"
60
+ # Load the dataset metadata corresponding to the split
61
+ split_metadata_path = os.path.join(
62
+ self.dataset_metadata_dir,
63
+ self.split,
64
+ f"ase_scene_list_{self.split}.npy",
65
+ )
66
+ split_scene_list = np.load(split_metadata_path, allow_pickle=True)
67
+
68
+ # Get the list of all scenes
69
+ if not self.sample_specific_scene:
70
+ self.scenes = list(split_scene_list)
71
+ else:
72
+ self.scenes = [self.specific_scene_name]
73
+ self.num_of_scenes = len(self.scenes)
74
+
75
+ def _get_views(self, sampled_idx, num_views_to_sample, resolution):
76
+ # Get the scene name of the sampled index
77
+ scene_index = sampled_idx
78
+ scene_name = self.scenes[scene_index]
79
+
80
+ # Get the metadata corresponding to the scene
81
+ scene_root = os.path.join(self.ROOT, scene_name)
82
+ scene_meta = load_data(
83
+ os.path.join(scene_root, "scene_meta.json"), "scene_meta"
84
+ )
85
+ scene_file_names = list(scene_meta["frame_names"].keys())
86
+ num_views_in_scene = len(scene_file_names)
87
+
88
+ # Load the scene pairwise covisibility mmap
89
+ covisibility_version_key = "v0"
90
+ covisibility_map_dir = os.path.join(
91
+ scene_root, "covisibility", covisibility_version_key
92
+ )
93
+ # Assumes the only .npy file in the directory is the covisibility map
94
+ covisibility_map_name = next(
95
+ f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
96
+ )
97
+ covisibility_map_path = os.path.join(
98
+ scene_root, "covisibility", covisibility_version_key, covisibility_map_name
99
+ )
100
+ pairwise_covisibility = load_data(covisibility_map_path, "mmap")
101
+
102
+ # Get the indices of the N views in the scene
103
+ view_indices = self._sample_view_indices(
104
+ num_views_to_sample, num_views_in_scene, pairwise_covisibility
105
+ )
106
+
107
+ # Get the views corresponding to the selected view indices
108
+ views = []
109
+ for view_index in view_indices:
110
+ # Load the data corresponding to the view
111
+ view_file_name = scene_file_names[view_index]
112
+ view_data = load_frame(
113
+ scene_root,
114
+ view_file_name,
115
+ modalities=["image", "depth"],
116
+ scene_meta=scene_meta,
117
+ )
118
+
119
+ # Convert necessary data to numpy
120
+ image = view_data["image"].permute(1, 2, 0).numpy()
121
+ image = (image * 255).astype(np.uint8)
122
+ depthmap = view_data["depth"].numpy().astype(np.float32)
123
+ intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
124
+ c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
125
+
126
+ # Ensure that the depthmap has all valid values
127
+ depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
128
+
129
+ # Resize the data to match the desired resolution
130
+ image, depthmap, intrinsics = self._crop_resize_if_necessary(
131
+ image=image,
132
+ resolution=resolution,
133
+ depthmap=depthmap,
134
+ intrinsics=intrinsics,
135
+ additional_quantities=None,
136
+ )
137
+
138
+ # Append the view dictionary to the list of views
139
+ views.append(
140
+ dict(
141
+ img=image,
142
+ depthmap=depthmap,
143
+ camera_pose=c2w_pose, # cam2world
144
+ camera_intrinsics=intrinsics,
145
+ dataset="ASE",
146
+ label=scene_name,
147
+ instance=os.path.join("images", str(view_file_name)),
148
+ )
149
+ )
150
+
151
+ return views
152
+
153
+
154
+ def get_parser():
155
+ import argparse
156
+
157
+ parser = argparse.ArgumentParser()
158
+ parser.add_argument("-rd", "--root_dir", default="/fsx/xrtech/data/ase", type=str)
159
+ parser.add_argument(
160
+ "-dmd",
161
+ "--dataset_metadata_dir",
162
+ default="/fsx/nkeetha/mapanything_dataset_metadata",
163
+ type=str,
164
+ )
165
+ parser.add_argument(
166
+ "-nv",
167
+ "--num_of_views",
168
+ default=2,
169
+ type=int,
170
+ )
171
+ parser.add_argument("--viz", action="store_true")
172
+
173
+ return parser
174
+
175
+
176
+ if __name__ == "__main__":
177
+ import rerun as rr
178
+ from tqdm import tqdm
179
+
180
+ from mapanything.datasets.base.base_dataset import view_name
181
+ from mapanything.utils.image import rgb
182
+ from mapanything.utils.viz import script_add_rerun_args
183
+
184
+ parser = get_parser()
185
+ script_add_rerun_args(
186
+ parser
187
+ ) # Options: --headless, --connect, --serve, --addr, --save, --stdout
188
+ args = parser.parse_args()
189
+
190
+ dataset = ASEWAI(
191
+ num_views=args.num_of_views,
192
+ split="train",
193
+ covisibility_thres=0.25,
194
+ ROOT=args.root_dir,
195
+ dataset_metadata_dir=args.dataset_metadata_dir,
196
+ resolution=(518, 518),
197
+ aug_crop=16,
198
+ transform="colorjitter+grayscale+gaublur",
199
+ data_norm_type="dinov2",
200
+ )
201
+ # dataset = ASEWAI(
202
+ # num_views=args.num_of_views,
203
+ # split="val",
204
+ # covisibility_thres=0.25,
205
+ # ROOT=args.root_dir,
206
+ # dataset_metadata_dir=args.dataset_metadata_dir,
207
+ # resolution=(518, 518),
208
+ # seed=777,
209
+ # transform="imgnorm",
210
+ # data_norm_type="dinov2",
211
+ # )
212
+ print(dataset.get_stats())
213
+
214
+ if args.viz:
215
+ rr.script_setup(args, "ASE_Dataloader")
216
+ rr.set_time("stable_time", sequence=0)
217
+ rr.log("world", rr.ViewCoordinates.RDF, static=True)
218
+
219
+ sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)
220
+
221
+ for num, idx in enumerate(tqdm(sampled_indices)):
222
+ views = dataset[idx]
223
+ assert len(views) == args.num_of_views
224
+ sample_name = f"{idx}"
225
+ for view_idx in range(args.num_of_views):
226
+ sample_name += f" {view_name(views[view_idx])}"
227
+ print(sample_name)
228
+ for view_idx in range(args.num_of_views):
229
+ image = rgb(
230
+ views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
231
+ )
232
+ depthmap = views[view_idx]["depthmap"]
233
+ pose = views[view_idx]["camera_pose"]
234
+ intrinsics = views[view_idx]["camera_intrinsics"]
235
+ pts3d = views[view_idx]["pts3d"]
236
+ valid_mask = views[view_idx]["valid_mask"]
237
+ if "non_ambiguous_mask" in views[view_idx]:
238
+ non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
239
+ else:
240
+ non_ambiguous_mask = None
241
+ if "prior_depth_along_ray" in views[view_idx]:
242
+ prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
243
+ else:
244
+ prior_depth_along_ray = None
245
+ if args.viz:
246
+ rr.set_time("stable_time", sequence=num)
247
+ base_name = f"world/view_{view_idx}"
248
+ pts_name = f"world/view_{view_idx}_pointcloud"
249
+ # Log camera info and loaded data
250
+ height, width = image.shape[0], image.shape[1]
251
+ rr.log(
252
+ base_name,
253
+ rr.Transform3D(
254
+ translation=pose[:3, 3],
255
+ mat3x3=pose[:3, :3],
256
+ ),
257
+ )
258
+ rr.log(
259
+ f"{base_name}/pinhole",
260
+ rr.Pinhole(
261
+ image_from_camera=intrinsics,
262
+ height=height,
263
+ width=width,
264
+ camera_xyz=rr.ViewCoordinates.RDF,
265
+ ),
266
+ )
267
+ rr.log(
268
+ f"{base_name}/pinhole/rgb",
269
+ rr.Image(image),
270
+ )
271
+ rr.log(
272
+ f"{base_name}/pinhole/depth",
273
+ rr.DepthImage(depthmap),
274
+ )
275
+ if prior_depth_along_ray is not None:
276
+ rr.log(
277
+ f"prior_depth_along_ray_{view_idx}",
278
+ rr.DepthImage(prior_depth_along_ray),
279
+ )
280
+ if non_ambiguous_mask is not None:
281
+ rr.log(
282
+ f"{base_name}/pinhole/non_ambiguous_mask",
283
+ rr.SegmentationImage(non_ambiguous_mask.astype(int)),
284
+ )
285
+ # Log points in 3D
286
+ filtered_pts = pts3d[valid_mask]
287
+ filtered_pts_col = image[valid_mask]
288
+ rr.log(
289
+ pts_name,
290
+ rr.Points3D(
291
+ positions=filtered_pts.reshape(-1, 3),
292
+ colors=filtered_pts_col.reshape(-1, 3),
293
+ ),
294
+ )
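Note on the covisibility-driven view selection used in `_get_views` above: `_sample_view_indices` is implemented in `BaseDataset` (base_dataset.py in this commit, not shown on this page), so the sketch below is only one plausible greedy strategy gated by `covisibility_thres`, not the actual base-class code.

import numpy as np


def sample_covisible_views(pairwise_covisibility, num_views, thres=0.25, rng=None):
    """Greedily grow a set of views that stay covisible with the selection."""
    rng = rng if rng is not None else np.random.default_rng()
    num_total = pairwise_covisibility.shape[0]
    selected = [int(rng.integers(num_total))]  # random seed view
    while len(selected) < num_views:
        # Best covisibility of every view to the already-selected set
        covis_to_selected = pairwise_covisibility[selected].max(axis=0)
        candidates = np.setdiff1d(np.flatnonzero(covis_to_selected > thres), selected)
        if candidates.size == 0:
            break  # no sufficiently covisible view left; caller must handle this
        selected.append(int(rng.choice(candidates)))
    return np.asarray(selected)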
mapanything/datasets/wai/blendedmvs.py ADDED
@@ -0,0 +1,313 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ BlendedMVS Dataset using WAI format data.
8
+ """
9
+
10
+ import os
11
+
12
+ import cv2
13
+ import numpy as np
14
+
15
+ from mapanything.datasets.base.base_dataset import BaseDataset
16
+ from mapanything.utils.wai.core import load_data, load_frame
17
+
18
+
19
+ class BlendedMVSWAI(BaseDataset):
20
+ """
21
+ BlendedMVS dataset containing object-centric and bird's-eye-view scenes.
22
+ """
23
+
24
+ def __init__(
25
+ self,
26
+ *args,
27
+ ROOT,
28
+ dataset_metadata_dir,
29
+ split,
30
+ overfit_num_sets=None,
31
+ sample_specific_scene: bool = False,
32
+ specific_scene_name: str = None,
33
+ **kwargs,
34
+ ):
35
+ """
36
+ Initialize the dataset attributes.
37
+ Args:
38
+ ROOT: Root directory of the dataset.
39
+ dataset_metadata_dir: Path to the dataset metadata directory.
40
+ split: Dataset split (train, val, test).
41
+ overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
42
+ sample_specific_scene: Whether to sample a specific scene from the dataset.
43
+ specific_scene_name: Name of the specific scene to sample.
44
+ """
45
+ # Initialize the dataset attributes
46
+ super().__init__(*args, **kwargs)
47
+ self.ROOT = ROOT
48
+ self.dataset_metadata_dir = dataset_metadata_dir
49
+ self.split = split
50
+ self.overfit_num_sets = overfit_num_sets
51
+ self.sample_specific_scene = sample_specific_scene
52
+ self.specific_scene_name = specific_scene_name
53
+ self._load_data()
54
+
55
+ # Define the dataset type flags
56
+ self.is_metric_scale = False
57
+ self.is_synthetic = False
58
+
59
+ def _load_data(self):
60
+ "Load the precomputed dataset metadata"
61
+ # Load the dataset metadata corresponding to the split
62
+ split_metadata_path = os.path.join(
63
+ self.dataset_metadata_dir,
64
+ self.split,
65
+ f"blendedmvs_scene_list_{self.split}.npy",
66
+ )
67
+ split_scene_list = np.load(split_metadata_path, allow_pickle=True)
68
+
69
+ # Get the list of all scenes
70
+ if not self.sample_specific_scene:
71
+ self.scenes = list(split_scene_list)
72
+ else:
73
+ self.scenes = [self.specific_scene_name]
74
+ self.num_of_scenes = len(self.scenes)
75
+
76
+ def _get_views(self, sampled_idx, num_views_to_sample, resolution):
77
+ # Get the scene name of the sampled index
78
+ scene_index = sampled_idx
79
+ scene_name = self.scenes[scene_index]
80
+
81
+ # Get the metadata corresponding to the scene
82
+ scene_root = os.path.join(self.ROOT, scene_name)
83
+ scene_meta = load_data(
84
+ os.path.join(scene_root, "scene_meta.json"), "scene_meta"
85
+ )
86
+ scene_file_names = list(scene_meta["frame_names"].keys())
87
+ num_views_in_scene = len(scene_file_names)
88
+
89
+ # Load the scene pairwise covisibility mmap
90
+ covisibility_version_key = "v0"
91
+ covisibility_map_dir = os.path.join(
92
+ scene_root, "covisibility", covisibility_version_key
93
+ )
94
+ # Assumes the only .npy file in the directory is the covisibility map
95
+ covisibility_map_name = next(
96
+ f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
97
+ )
98
+ covisibility_map_path = os.path.join(
99
+ scene_root, "covisibility", covisibility_version_key, covisibility_map_name
100
+ )
101
+ pairwise_covisibility = load_data(covisibility_map_path, "mmap")
102
+
103
+ # Get the indices of the N views in the scene
104
+ view_indices = self._sample_view_indices(
105
+ num_views_to_sample, num_views_in_scene, pairwise_covisibility
106
+ )
107
+
108
+ # Get the views corresponding to the selected view indices
109
+ views = []
110
+ for view_index in view_indices:
111
+ # Load the data corresponding to the view
112
+ view_file_name = scene_file_names[view_index]
113
+ view_data = load_frame(
114
+ scene_root,
115
+ view_file_name,
116
+ modalities=["image", "depth", "pred_mask/moge2"],
117
+ scene_meta=scene_meta,
118
+ )
119
+
120
+ # Convert necessary data to numpy
121
+ image = view_data["image"].permute(1, 2, 0).numpy()
122
+ image = (image * 255).astype(np.uint8)
123
+ depthmap = view_data["depth"].numpy().astype(np.float32)
124
+ intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
125
+ c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
126
+
127
+ # Ensure that the depthmap has all valid values
128
+ depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
129
+
130
+ # Get the non_ambiguous_mask and ensure it matches image resolution
131
+ non_ambiguous_mask = view_data["pred_mask/moge2"].numpy().astype(int)
132
+ non_ambiguous_mask = cv2.resize(
133
+ non_ambiguous_mask,
134
+ (image.shape[1], image.shape[0]),
135
+ interpolation=cv2.INTER_NEAREST,
136
+ )
137
+
138
+ # Mask out the GT depth using the non_ambiguous_mask
139
+ depthmap = np.where(non_ambiguous_mask, depthmap, 0)
140
+
141
+ # Resize the data to match the desired resolution
142
+ additional_quantities_to_resize = [non_ambiguous_mask]
143
+ image, depthmap, intrinsics, additional_quantities_to_resize = (
144
+ self._crop_resize_if_necessary(
145
+ image=image,
146
+ resolution=resolution,
147
+ depthmap=depthmap,
148
+ intrinsics=intrinsics,
149
+ additional_quantities=additional_quantities_to_resize,
150
+ )
151
+ )
152
+ non_ambiguous_mask = additional_quantities_to_resize[0]
153
+
154
+ # Append the view dictionary to the list of views
155
+ views.append(
156
+ dict(
157
+ img=image,
158
+ depthmap=depthmap,
159
+ camera_pose=c2w_pose, # cam2world
160
+ camera_intrinsics=intrinsics,
161
+ non_ambiguous_mask=non_ambiguous_mask,
162
+ dataset="BlendedMVS",
163
+ label=scene_name,
164
+ instance=os.path.join("images", str(view_file_name)),
165
+ )
166
+ )
167
+
168
+ return views
169
+
170
+
171
+ def get_parser():
172
+ import argparse
173
+
174
+ parser = argparse.ArgumentParser()
175
+ parser.add_argument(
176
+ "-rd", "--root_dir", default="/fsx/xrtech/data/blendedmvs", type=str
177
+ )
178
+ parser.add_argument(
179
+ "-dmd",
180
+ "--dataset_metadata_dir",
181
+ default="/fsx/nkeetha/mapanything_dataset_metadata",
182
+ type=str,
183
+ )
184
+ parser.add_argument(
185
+ "-nv",
186
+ "--num_of_views",
187
+ default=2,
188
+ type=int,
189
+ )
190
+ parser.add_argument("--viz", action="store_true")
191
+
192
+ return parser
193
+
194
+
195
+ if __name__ == "__main__":
196
+ import rerun as rr
197
+ from tqdm import tqdm
198
+
199
+ from mapanything.datasets.base.base_dataset import view_name
200
+ from mapanything.utils.image import rgb
201
+ from mapanything.utils.viz import script_add_rerun_args
202
+
203
+ parser = get_parser()
204
+ script_add_rerun_args(
205
+ parser
206
+ ) # Options: --headless, --connect, --serve, --addr, --save, --stdout
207
+ args = parser.parse_args()
208
+
209
+ dataset = BlendedMVSWAI(
210
+ num_views=args.num_of_views,
211
+ split="train",
212
+ covisibility_thres=0.25,
213
+ ROOT=args.root_dir,
214
+ dataset_metadata_dir=args.dataset_metadata_dir,
215
+ resolution=(518, 392),
216
+ aug_crop=16,
217
+ transform="colorjitter+grayscale+gaublur",
218
+ data_norm_type="dinov2",
219
+ )
220
+ # dataset = BlendedMVSWAI(
221
+ # num_views=args.num_of_views,
222
+ # split="val",
223
+ # covisibility_thres=0.25,
224
+ # ROOT=args.root_dir,
225
+ # dataset_metadata_dir=args.dataset_metadata_dir,
226
+ # resolution=(518, 392),
227
+ # seed=777,
228
+ # transform="imgnorm",
229
+ # data_norm_type="dinov2",
230
+ # )
231
+ print(dataset.get_stats())
232
+
233
+ if args.viz:
234
+ rr.script_setup(args, "BlendedMVS_Dataloader")
235
+ rr.set_time("stable_time", sequence=0)
236
+ rr.log("world", rr.ViewCoordinates.RDF, static=True)
237
+
238
+ sampled_indices = np.random.choice(len(dataset), size=10, replace=False)
239
+
240
+ for num, idx in enumerate(tqdm(sampled_indices)):
241
+ views = dataset[idx]
242
+ assert len(views) == args.num_of_views
243
+ sample_name = f"{idx}"
244
+ for view_idx in range(args.num_of_views):
245
+ sample_name += f" {view_name(views[view_idx])}"
246
+ print(sample_name)
247
+ for view_idx in range(args.num_of_views):
248
+ image = rgb(
249
+ views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
250
+ )
251
+ depthmap = views[view_idx]["depthmap"]
252
+ pose = views[view_idx]["camera_pose"]
253
+ intrinsics = views[view_idx]["camera_intrinsics"]
254
+ pts3d = views[view_idx]["pts3d"]
255
+ valid_mask = views[view_idx]["valid_mask"]
256
+ if "non_ambiguous_mask" in views[view_idx]:
257
+ non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
258
+ else:
259
+ non_ambiguous_mask = None
260
+ if "prior_depth_along_ray" in views[view_idx]:
261
+ prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
262
+ else:
263
+ prior_depth_along_ray = None
264
+ if args.viz:
265
+ rr.set_time("stable_time", sequence=num)
266
+ base_name = f"world/view_{view_idx}"
267
+ pts_name = f"world/view_{view_idx}_pointcloud"
268
+ # Log camera info and loaded data
269
+ height, width = image.shape[0], image.shape[1]
270
+ rr.log(
271
+ base_name,
272
+ rr.Transform3D(
273
+ translation=pose[:3, 3],
274
+ mat3x3=pose[:3, :3],
275
+ ),
276
+ )
277
+ rr.log(
278
+ f"{base_name}/pinhole",
279
+ rr.Pinhole(
280
+ image_from_camera=intrinsics,
281
+ height=height,
282
+ width=width,
283
+ camera_xyz=rr.ViewCoordinates.RDF,
284
+ ),
285
+ )
286
+ rr.log(
287
+ f"{base_name}/pinhole/rgb",
288
+ rr.Image(image),
289
+ )
290
+ rr.log(
291
+ f"{base_name}/pinhole/depth",
292
+ rr.DepthImage(depthmap),
293
+ )
294
+ if prior_depth_along_ray is not None:
295
+ rr.log(
296
+ f"prior_depth_along_ray_{view_idx}",
297
+ rr.DepthImage(prior_depth_along_ray),
298
+ )
299
+ if non_ambiguous_mask is not None:
300
+ rr.log(
301
+ f"{base_name}/pinhole/non_ambiguous_mask",
302
+ rr.SegmentationImage(non_ambiguous_mask.astype(int)),
303
+ )
304
+ # Log points in 3D
305
+ filtered_pts = pts3d[valid_mask]
306
+ filtered_pts_col = image[valid_mask]
307
+ rr.log(
308
+ pts_name,
309
+ rr.Points3D(
310
+ positions=filtered_pts.reshape(-1, 3),
311
+ colors=filtered_pts_col.reshape(-1, 3),
312
+ ),
313
+ )
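Note: the MoGe-mask depth filtering in `_get_views` above reduces to a nearest-neighbor resize followed by a masked zero-out. A minimal standalone sketch with synthetic inputs (the shapes are illustrative only):

import cv2
import numpy as np

image = np.zeros((392, 518, 3), dtype=np.uint8)  # placeholder RGB frame
depthmap = np.random.rand(392, 518).astype(np.float32)  # placeholder GT depth
pred_mask = (np.random.rand(196, 259) > 0.5).astype(np.uint8)  # lower-res mask

# INTER_NEAREST keeps the mask binary; note cv2.resize takes (width, height)
non_ambiguous_mask = cv2.resize(
    pred_mask, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST
)
# Zero out GT depth wherever the mask flags the geometry as ambiguous;
# zero depth is treated as invalid downstream
depthmap = np.where(non_ambiguous_mask, depthmap, 0)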
mapanything/datasets/wai/dl3dv.py ADDED
@@ -0,0 +1,356 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ DL3DV Dataset using WAI format data.
8
+ """
9
+
10
+ import os
11
+
12
+ import cv2
13
+ import numpy as np
14
+
15
+ from mapanything.datasets.base.base_dataset import BaseDataset
16
+ from mapanything.utils.cropping import (
17
+ rescale_image_and_other_optional_info,
18
+ resize_with_nearest_interpolation_to_match_aspect_ratio,
19
+ )
20
+ from mapanything.utils.wai.core import load_data, load_frame
21
+
22
+
23
+ class DL3DVWAI(BaseDataset):
24
+ """
25
+ DL3DV dataset containing over 10k in-the-wild and indoor scenes.
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ *args,
31
+ ROOT,
32
+ dataset_metadata_dir,
33
+ split,
34
+ overfit_num_sets=None,
35
+ sample_specific_scene: bool = False,
36
+ specific_scene_name: str = None,
37
+ mvs_confidence_filter_thres: float = 0.25,
38
+ **kwargs,
39
+ ):
40
+ """
41
+ Initialize the dataset attributes.
42
+ Args:
43
+ ROOT: Root directory of the dataset.
44
+ dataset_metadata_dir: Path to the dataset metadata directory.
45
+ split: Dataset split (train, val, test).
46
+ overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
47
+ sample_specific_scene: Whether to sample a specific scene from the dataset.
48
+ specific_scene_name: Name of the specific scene to sample.
49
+ mvs_confidence_filter_thres: Confidence threshold to filter MVS depth. Defaults to 0.25.
50
+ """
51
+ # Initialize the dataset attributes
52
+ super().__init__(*args, **kwargs)
53
+ self.ROOT = ROOT
54
+ self.dataset_metadata_dir = dataset_metadata_dir
55
+ self.split = split
56
+ self.overfit_num_sets = overfit_num_sets
57
+ self.sample_specific_scene = sample_specific_scene
58
+ self.specific_scene_name = specific_scene_name
59
+ self.mvs_confidence_filter_thres = mvs_confidence_filter_thres
60
+ self._load_data()
61
+
62
+ # Define the dataset type flags
63
+ self.is_metric_scale = False
64
+ self.is_synthetic = False
65
+
66
+ def _load_data(self):
67
+ "Load the precomputed dataset metadata"
68
+ # Load the dataset metadata corresponding to the split
69
+ split_metadata_path = os.path.join(
70
+ self.dataset_metadata_dir,
71
+ self.split,
72
+ f"dl3dv_scene_list_{self.split}.npy",
73
+ )
74
+ split_scene_list = np.load(split_metadata_path, allow_pickle=True)
75
+
76
+ # Get the list of all scenes
77
+ if not self.sample_specific_scene:
78
+ self.scenes = list(split_scene_list)
79
+ else:
80
+ self.scenes = [self.specific_scene_name]
81
+ self.num_of_scenes = len(self.scenes)
82
+
83
+ def _get_views(self, sampled_idx, num_views_to_sample, resolution):
84
+ # Get the scene name of the sampled index
85
+ scene_index = sampled_idx
86
+ scene_name = self.scenes[scene_index]
87
+
88
+ # Get the metadata corresponding to the scene
89
+ scene_root = os.path.join(self.ROOT, scene_name)
90
+ scene_meta = load_data(
91
+ os.path.join(scene_root, "scene_meta.json"), "scene_meta"
92
+ )
93
+ scene_file_names = list(scene_meta["frame_names"].keys())
94
+ num_views_in_scene = len(scene_file_names)
95
+
96
+ # Load the scene pairwise covisibility mmap
97
+ covisibility_version_key = "v0_mvsa_based"
98
+ covisibility_map_dir = os.path.join(
99
+ scene_root, "covisibility", covisibility_version_key
100
+ )
101
+ # Assumes the only .npy file in the directory is the covisibility map
102
+ covisibility_map_name = next(
103
+ f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
104
+ )
105
+ covisibility_map_path = os.path.join(
106
+ scene_root, "covisibility", covisibility_version_key, covisibility_map_name
107
+ )
108
+ pairwise_covisibility = load_data(covisibility_map_path, "mmap")
109
+
110
+ # Get the indices of the N views in the scene
111
+ view_indices = self._sample_view_indices(
112
+ num_views_to_sample, num_views_in_scene, pairwise_covisibility
113
+ )
114
+
115
+ # Get the views corresponding to the selected view indices
116
+ views = []
117
+ for view_index in view_indices:
118
+ # Load the data corresponding to the view
119
+ view_file_name = scene_file_names[view_index]
120
+ view_data = load_frame(
121
+ scene_root,
122
+ view_file_name,
123
+ modalities=[
124
+ "image",
125
+ "pred_depth/mvsanywhere",
126
+ "pred_mask/moge2",
127
+ "depth_confidence/mvsanywhere",
128
+ ],
129
+ scene_meta=scene_meta,
130
+ )
131
+
132
+ # Convert necessary data to numpy
133
+ image = view_data["image"].permute(1, 2, 0).numpy()
134
+ image = (image * 255).astype(np.uint8)
135
+ depthmap = view_data["pred_depth/mvsanywhere"].numpy().astype(np.float32)
136
+ intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
137
+ c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
138
+
139
+ # Ensure that the depthmap has all valid values
140
+ depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
141
+
142
+ # Get the dimensions of the original image
143
+ img_h, img_w = image.shape[:2]
144
+
145
+ # Resize depth to match image aspect ratio while ensuring that depth resolution doesn't increase
146
+ depthmap, target_depth_h, target_depth_w = (
147
+ resize_with_nearest_interpolation_to_match_aspect_ratio(
148
+ input_data=depthmap, img_h=img_h, img_w=img_w
149
+ )
150
+ )
151
+
152
+ # Now resize the image and update intrinsics to match the resized depth
153
+ image, _, intrinsics, _ = rescale_image_and_other_optional_info(
154
+ image=image,
155
+ output_resolution=(target_depth_w, target_depth_h),
156
+ depthmap=None,
157
+ camera_intrinsics=intrinsics,
158
+ )
159
+ image = np.array(image)
160
+
161
+ # Get the depth confidence map and mask out the MVS depth
162
+ confidence_map = view_data["depth_confidence/mvsanywhere"].numpy()
163
+ confidence_mask = (
164
+ confidence_map > self.mvs_confidence_filter_thres
165
+ ).astype(int)
166
+ confidence_mask = cv2.resize(
167
+ confidence_mask,
168
+ (image.shape[1], image.shape[0]),
169
+ interpolation=cv2.INTER_NEAREST,
170
+ )
171
+ depthmap = np.where(confidence_mask, depthmap, 0)
172
+
173
+ # Get the non_ambiguous_mask and ensure it matches image resolution
174
+ non_ambiguous_mask = view_data["pred_mask/moge2"].numpy().astype(int)
175
+ non_ambiguous_mask = cv2.resize(
176
+ non_ambiguous_mask,
177
+ (image.shape[1], image.shape[0]),
178
+ interpolation=cv2.INTER_NEAREST,
179
+ )
180
+
181
+ # Mask out the GT depth using the non_ambiguous_mask
182
+ depthmap = np.where(non_ambiguous_mask, depthmap, 0)
183
+
184
+ # Resize the data to match the desired resolution
185
+ additional_quantities_to_resize = [non_ambiguous_mask]
186
+ image, depthmap, intrinsics, additional_quantities_to_resize = (
187
+ self._crop_resize_if_necessary(
188
+ image=image,
189
+ resolution=resolution,
190
+ depthmap=depthmap,
191
+ intrinsics=intrinsics,
192
+ additional_quantities=additional_quantities_to_resize,
193
+ )
194
+ )
195
+ non_ambiguous_mask = additional_quantities_to_resize[0]
196
+
197
+ # Append the view dictionary to the list of views
198
+ views.append(
199
+ dict(
200
+ img=image,
201
+ depthmap=depthmap,
202
+ camera_pose=c2w_pose, # cam2world
203
+ camera_intrinsics=intrinsics,
204
+ non_ambiguous_mask=non_ambiguous_mask,
205
+ dataset="DL3DV",
206
+ label=scene_name,
207
+ instance=os.path.join("images", str(view_file_name)),
208
+ )
209
+ )
210
+
211
+ return views
212
+
213
+
214
+ def get_parser():
215
+ import argparse
216
+
217
+ parser = argparse.ArgumentParser()
218
+ parser.add_argument("-rd", "--root_dir", default="/fsx/xrtech/data/dl3dv", type=str)
219
+ parser.add_argument(
220
+ "-dmd",
221
+ "--dataset_metadata_dir",
222
+ default="/fsx/nkeetha/mapanything_dataset_metadata",
223
+ type=str,
224
+ )
225
+ parser.add_argument(
226
+ "-nv",
227
+ "--num_of_views",
228
+ default=2,
229
+ type=int,
230
+ )
231
+ parser.add_argument("--viz", action="store_true")
232
+
233
+ return parser
234
+
235
+
236
+ if __name__ == "__main__":
237
+ import rerun as rr
238
+ from tqdm import tqdm
239
+
240
+ from mapanything.datasets.base.base_dataset import view_name
241
+ from mapanything.utils.image import rgb
242
+ from mapanything.utils.viz import script_add_rerun_args
243
+
244
+ parser = get_parser()
245
+ script_add_rerun_args(
246
+ parser
247
+ ) # Options: --headless, --connect, --serve, --addr, --save, --stdout
248
+ args = parser.parse_args()
249
+
250
+ dataset = DL3DVWAI(
251
+ num_views=args.num_of_views,
252
+ split="train",
253
+ covisibility_thres=0.25,
254
+ ROOT=args.root_dir,
255
+ dataset_metadata_dir=args.dataset_metadata_dir,
256
+ mvs_confidence_filter_thres=0.25,
257
+ resolution=(518, 294),
258
+ aug_crop=16,
259
+ transform="colorjitter+grayscale+gaublur",
260
+ data_norm_type="dinov2",
261
+ )
262
+ # dataset = DL3DVWAI(
263
+ # num_views=args.num_of_views,
264
+ # split="val",
265
+ # covisibility_thres=0.25,
266
+ # ROOT=args.root_dir,
267
+ # dataset_metadata_dir=args.dataset_metadata_dir,
268
+ # mvs_confidence_filter_thres=0.25,
269
+ # resolution=(518, 294),
270
+ # seed=777,
271
+ # transform="imgnorm",
272
+ # data_norm_type="dinov2",
273
+ # )
274
+ print(dataset.get_stats())
275
+
276
+ if args.viz:
277
+ rr.script_setup(args, "DL3DV_Dataloader")
278
+ rr.set_time("stable_time", sequence=0)
279
+ rr.log("world", rr.ViewCoordinates.RDF, static=True)
280
+
281
+ sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)
282
+
283
+ for num, idx in enumerate(tqdm(sampled_indices)):
284
+ views = dataset[idx]
285
+ assert len(views) == args.num_of_views
286
+ sample_name = f"{idx}"
287
+ for view_idx in range(args.num_of_views):
288
+ sample_name += f" {view_name(views[view_idx])}"
289
+ print(sample_name)
290
+ for view_idx in range(args.num_of_views):
291
+ image = rgb(
292
+ views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
293
+ )
294
+ depthmap = views[view_idx]["depthmap"]
295
+ pose = views[view_idx]["camera_pose"]
296
+ intrinsics = views[view_idx]["camera_intrinsics"]
297
+ pts3d = views[view_idx]["pts3d"]
298
+ valid_mask = views[view_idx]["valid_mask"]
299
+ if "non_ambiguous_mask" in views[view_idx]:
300
+ non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
301
+ else:
302
+ non_ambiguous_mask = None
303
+ if "prior_depth_along_ray" in views[view_idx]:
304
+ prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
305
+ else:
306
+ prior_depth_along_ray = None
307
+ if args.viz:
308
+ rr.set_time("stable_time", sequence=num)
309
+ base_name = f"world/view_{view_idx}"
310
+ pts_name = f"world/view_{view_idx}_pointcloud"
311
+ # Log camera info and loaded data
312
+ height, width = image.shape[0], image.shape[1]
313
+ rr.log(
314
+ base_name,
315
+ rr.Transform3D(
316
+ translation=pose[:3, 3],
317
+ mat3x3=pose[:3, :3],
318
+ ),
319
+ )
320
+ rr.log(
321
+ f"{base_name}/pinhole",
322
+ rr.Pinhole(
323
+ image_from_camera=intrinsics,
324
+ height=height,
325
+ width=width,
326
+ camera_xyz=rr.ViewCoordinates.RDF,
327
+ ),
328
+ )
329
+ rr.log(
330
+ f"{base_name}/pinhole/rgb",
331
+ rr.Image(image),
332
+ )
333
+ rr.log(
334
+ f"{base_name}/pinhole/depth",
335
+ rr.DepthImage(depthmap),
336
+ )
337
+ if prior_depth_along_ray is not None:
338
+ rr.log(
339
+ f"prior_depth_along_ray_{view_idx}",
340
+ rr.DepthImage(prior_depth_along_ray),
341
+ )
342
+ if non_ambiguous_mask is not None:
343
+ rr.log(
344
+ f"{base_name}/pinhole/non_ambiguous_mask",
345
+ rr.SegmentationImage(non_ambiguous_mask.astype(int)),
346
+ )
347
+ # Log points in 3D
348
+ filtered_pts = pts3d[valid_mask]
349
+ filtered_pts_col = image[valid_mask]
350
+ rr.log(
351
+ pts_name,
352
+ rr.Points3D(
353
+ positions=filtered_pts.reshape(-1, 3),
354
+ colors=filtered_pts_col.reshape(-1, 3),
355
+ ),
356
+ )
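Note: the `mvs_confidence_filter_thres` gate in `_get_views` above is a simple per-pixel confidence threshold on the MVSAnywhere depth. A minimal standalone sketch with synthetic inputs (shapes illustrative only):

import cv2
import numpy as np

mvs_confidence_filter_thres = 0.25  # default used by DL3DVWAI above

depthmap = np.random.rand(294, 518).astype(np.float32)  # MVS depth, image-aligned
confidence_map = np.random.rand(147, 259).astype(np.float32)  # per-pixel confidence

# Threshold first, then resize with INTER_NEAREST so the gate stays binary
confidence_mask = (confidence_map > mvs_confidence_filter_thres).astype(np.uint8)
confidence_mask = cv2.resize(
    confidence_mask,
    (depthmap.shape[1], depthmap.shape[0]),
    interpolation=cv2.INTER_NEAREST,
)
# Low-confidence pixels are zeroed out and treated as invalid depth downstream
depthmap = np.where(confidence_mask, depthmap, 0)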
mapanything/datasets/wai/dynamicreplica.py ADDED
@@ -0,0 +1,297 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ Dynamic Replica Dataset using WAI format data.
8
+ """
9
+
10
+ import os
11
+
12
+ import numpy as np
13
+
14
+ from mapanything.datasets.base.base_dataset import BaseDataset
15
+ from mapanything.utils.wai.core import load_data, load_frame
16
+
17
+
18
+ class DynamicReplicaWAI(BaseDataset):
19
+ """
20
+ Dynamic Replica dataset containing synthetic scenes with humans and animals.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ *args,
26
+ ROOT,
27
+ dataset_metadata_dir,
28
+ split,
29
+ overfit_num_sets=None,
30
+ sample_specific_scene: bool = False,
31
+ specific_scene_name: str = None,
32
+ **kwargs,
33
+ ):
34
+ """
35
+ Initialize the dataset attributes.
36
+ Args:
37
+ ROOT: Root directory of the dataset.
38
+ dataset_metadata_dir: Path to the dataset metadata directory.
39
+ split: Dataset split (train, val, test).
40
+ overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
41
+ sample_specific_scene: Whether to sample a specific scene from the dataset.
42
+ specific_scene_name: Name of the specific scene to sample.
43
+ """
44
+ # Initialize the dataset attributes
45
+ super().__init__(*args, **kwargs)
46
+ self.ROOT = ROOT
47
+ self.dataset_metadata_dir = dataset_metadata_dir
48
+ self.split = split
49
+ self.overfit_num_sets = overfit_num_sets
50
+ self.sample_specific_scene = sample_specific_scene
51
+ self.specific_scene_name = specific_scene_name
52
+ self._load_data()
53
+
54
+ # Define the dataset type flags
55
+ self.is_metric_scale = True
56
+ self.is_synthetic = True
57
+
58
+ def _load_data(self):
59
+ "Load the precomputed dataset metadata"
60
+ # Load the dataset metadata corresponding to the split
61
+ split_metadata_path = os.path.join(
62
+ self.dataset_metadata_dir,
63
+ self.split,
64
+ f"dynamicreplica_scene_list_{self.split}.npy",
65
+ )
66
+ split_scene_list = np.load(split_metadata_path, allow_pickle=True)
67
+
68
+ # Get the list of all scenes
69
+ if not self.sample_specific_scene:
70
+ self.scenes = list(split_scene_list)
71
+ else:
72
+ self.scenes = [self.specific_scene_name]
73
+ self.num_of_scenes = len(self.scenes)
74
+
75
+ def _get_views(self, sampled_idx, num_views_to_sample, resolution):
76
+ # Get the scene name of the sampled index
77
+ scene_index = sampled_idx
78
+ scene_name = self.scenes[scene_index]
79
+
80
+ # Get the metadata corresponding to the scene
81
+ scene_root = os.path.join(self.ROOT, scene_name)
82
+ scene_meta = load_data(
83
+ os.path.join(scene_root, "scene_meta.json"), "scene_meta"
84
+ )
85
+ scene_file_names = list(scene_meta["frame_names"].keys())
86
+ num_views_in_scene = len(scene_file_names)
87
+
88
+ # Load the scene pairwise covisibility mmap
89
+ covisibility_version_key = "v0"
90
+ covisibility_map_dir = os.path.join(
91
+ scene_root, "covisibility", covisibility_version_key
92
+ )
93
+ # Assumes only npy file in directory is covisibility map
94
+ covisibility_map_name = next(
95
+ f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
96
+ )
97
+ covisibility_map_path = os.path.join(
98
+ scene_root, "covisibility", covisibility_version_key, covisibility_map_name
99
+ )
100
+ pairwise_covisibility = load_data(covisibility_map_path, "mmap")
101
+
102
+ # Get the indices of the N views in the scene
103
+ view_indices = self._sample_view_indices(
104
+ num_views_to_sample, num_views_in_scene, pairwise_covisibility
105
+ )
106
+
107
+ # Get the views corresponding to the selected view indices
108
+ views = []
109
+ for view_index in view_indices:
110
+ # Load the data corresponding to the view
111
+ view_file_name = scene_file_names[view_index]
112
+ view_data = load_frame(
113
+ scene_root,
114
+ view_file_name,
115
+ modalities=["image", "depth"],
116
+ scene_meta=scene_meta,
117
+ )
118
+
119
+ # Convert necessary data to numpy
120
+ image = view_data["image"].permute(1, 2, 0).numpy()
121
+ image = image[:, :, :3] # RGBA to RGB
122
+ image = (image * 255).astype(np.uint8)
123
+ depthmap = view_data["depth"].numpy().astype(np.float32)
124
+ intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
125
+ c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
126
+
127
+ # Ensure that the depthmap has all valid values
128
+ depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
129
+
130
+ # Resize the data to match the desired resolution
131
+ image, depthmap, intrinsics = self._crop_resize_if_necessary(
132
+ image=image,
133
+ resolution=resolution,
134
+ depthmap=depthmap,
135
+ intrinsics=intrinsics,
136
+ additional_quantities=None,
137
+ )
138
+
139
+ # Append the view dictionary to the list of views
140
+ views.append(
141
+ dict(
142
+ img=image,
143
+ depthmap=depthmap,
144
+ camera_pose=c2w_pose, # cam2world
145
+ camera_intrinsics=intrinsics,
146
+ dataset="DynamicReplica",
147
+ label=scene_name,
148
+ instance=os.path.join("images", str(view_file_name)),
149
+ )
150
+ )
151
+
152
+ return views
153
+
154
+
155
+ def get_parser():
156
+ import argparse
157
+
158
+ parser = argparse.ArgumentParser()
159
+ parser.add_argument(
160
+ "-rd", "--root_dir", default="/fsx/xrtech/data/dynamicreplica", type=str
161
+ )
162
+ parser.add_argument(
163
+ "-dmd",
164
+ "--dataset_metadata_dir",
165
+ default="/fsx/nkeetha/mapanything_dataset_metadata",
166
+ type=str,
167
+ )
168
+ parser.add_argument(
169
+ "-nv",
170
+ "--num_of_views",
171
+ default=2,
172
+ type=int,
173
+ )
174
+ parser.add_argument("--viz", action="store_true")
175
+
176
+ return parser
177
+
178
+
179
+ if __name__ == "__main__":
180
+ import rerun as rr
181
+ from tqdm import tqdm
182
+
183
+ from mapanything.datasets.base.base_dataset import view_name
184
+ from mapanything.utils.image import rgb
185
+ from mapanything.utils.viz import script_add_rerun_args
186
+
187
+ parser = get_parser()
188
+ script_add_rerun_args(
189
+ parser
190
+ ) # Options: --headless, --connect, --serve, --addr, --save, --stdout
191
+ args = parser.parse_args()
192
+
193
+ dataset = DynamicReplicaWAI(
194
+ num_views=args.num_of_views,
195
+ split="train",
196
+ covisibility_thres=0.25,
197
+ ROOT=args.root_dir,
198
+ dataset_metadata_dir=args.dataset_metadata_dir,
199
+ resolution=(518, 294),
200
+ aug_crop=16,
201
+ transform="colorjitter+grayscale+gaublur",
202
+ data_norm_type="dinov2",
203
+ )
204
+ # dataset = DynamicReplicaWAI(
205
+ # num_views=args.num_of_views,
206
+ # split="val",
207
+ # covisibility_thres=0.25,
208
+ # ROOT=args.root_dir,
209
+ # dataset_metadata_dir=args.dataset_metadata_dir,
210
+ # resolution=(518, 294),
211
+ # seed=777,
212
+ # transform="imgnorm",
213
+ # data_norm_type="dinov2",
214
+ # )
215
+ print(dataset.get_stats())
216
+
217
+ if args.viz:
218
+ rr.script_setup(args, "DynamicReplica_Dataloader")
219
+ rr.set_time("stable_time", sequence=0)
220
+ rr.log("world", rr.ViewCoordinates.RDF, static=True)
221
+
222
+ sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)
223
+
224
+ for num, idx in enumerate(tqdm(sampled_indices)):
225
+ views = dataset[idx]
226
+ assert len(views) == args.num_of_views
227
+ sample_name = f"{idx}"
228
+ for view_idx in range(args.num_of_views):
229
+ sample_name += f" {view_name(views[view_idx])}"
230
+ print(sample_name)
231
+ for view_idx in range(args.num_of_views):
232
+ image = rgb(
233
+ views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
234
+ )
235
+ depthmap = views[view_idx]["depthmap"]
236
+ pose = views[view_idx]["camera_pose"]
237
+ intrinsics = views[view_idx]["camera_intrinsics"]
238
+ pts3d = views[view_idx]["pts3d"]
239
+ valid_mask = views[view_idx]["valid_mask"]
240
+ if "non_ambiguous_mask" in views[view_idx]:
241
+ non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
242
+ else:
243
+ non_ambiguous_mask = None
244
+ if "prior_depth_along_ray" in views[view_idx]:
245
+ prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
246
+ else:
247
+ prior_depth_along_ray = None
248
+ if args.viz:
249
+ rr.set_time("stable_time", sequence=num)
250
+ base_name = f"world/view_{view_idx}"
251
+ pts_name = f"world/view_{view_idx}_pointcloud"
252
+ # Log camera info and loaded data
253
+ height, width = image.shape[0], image.shape[1]
254
+ rr.log(
255
+ base_name,
256
+ rr.Transform3D(
257
+ translation=pose[:3, 3],
258
+ mat3x3=pose[:3, :3],
259
+ ),
260
+ )
261
+ rr.log(
262
+ f"{base_name}/pinhole",
263
+ rr.Pinhole(
264
+ image_from_camera=intrinsics,
265
+ height=height,
266
+ width=width,
267
+ camera_xyz=rr.ViewCoordinates.RDF,
268
+ ),
269
+ )
270
+ rr.log(
271
+ f"{base_name}/pinhole/rgb",
272
+ rr.Image(image),
273
+ )
274
+ rr.log(
275
+ f"{base_name}/pinhole/depth",
276
+ rr.DepthImage(depthmap),
277
+ )
278
+ if prior_depth_along_ray is not None:
279
+ rr.log(
280
+ f"prior_depth_along_ray_{view_idx}",
281
+ rr.DepthImage(prior_depth_along_ray),
282
+ )
283
+ if non_ambiguous_mask is not None:
284
+ rr.log(
285
+ f"{base_name}/pinhole/non_ambiguous_mask",
286
+ rr.SegmentationImage(non_ambiguous_mask.astype(int)),
287
+ )
288
+ # Log points in 3D
289
+ filtered_pts = pts3d[valid_mask]
290
+ filtered_pts_col = image[valid_mask]
291
+ rr.log(
292
+ pts_name,
293
+ rr.Points3D(
294
+ positions=filtered_pts.reshape(-1, 3),
295
+ colors=filtered_pts_col.reshape(-1, 3),
296
+ ),
297
+ )
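A minimal usage sketch for the loader above, mirroring the __main__ demo (the two paths are placeholders; the remaining keyword arguments are consumed by BaseDataset):

from mapanything.datasets.wai.dynamicreplica import DynamicReplicaWAI

dataset = DynamicReplicaWAI(
    num_views=2,
    split="train",
    covisibility_thres=0.25,
    ROOT="/path/to/dynamicreplica",            # placeholder path
    dataset_metadata_dir="/path/to/metadata",  # placeholder path
    resolution=(518, 294),
    transform="imgnorm",
    data_norm_type="dinov2",
)
views = dataset[0]  # list of num_views dicts: img, depthmap, camera_pose, ...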
mapanything/datasets/wai/eth3d.py ADDED
@@ -0,0 +1,277 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

"""
ETH3D Dataset using WAI format data.
"""

import os

import numpy as np

from mapanything.datasets.base.base_dataset import BaseDataset
from mapanything.utils.wai.core import load_data, load_frame


class ETH3DWAI(BaseDataset):
    """
    ETH3D dataset containing high-quality outdoor and indoor scans of the ETH Zurich campus.
    """

    def __init__(
        self,
        *args,
        ROOT,
        dataset_metadata_dir,
        overfit_num_sets=None,
        sample_specific_scene: bool = False,
        specific_scene_name: str = None,
        **kwargs,
    ):
        """
        Initialize the dataset attributes.
        Args:
            ROOT: Root directory of the dataset.
            dataset_metadata_dir: Path to the dataset metadata directory.
            overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
            sample_specific_scene: Whether to sample a specific scene from the dataset.
            specific_scene_name: Name of the specific scene to sample.
        """
        # Initialize the dataset attributes
        super().__init__(*args, **kwargs)
        self.ROOT = ROOT
        self.dataset_metadata_dir = dataset_metadata_dir
        self.split = "test"
        self.overfit_num_sets = overfit_num_sets
        self.sample_specific_scene = sample_specific_scene
        self.specific_scene_name = specific_scene_name
        self._load_data()

        # Define the dataset type flags
        self.is_metric_scale = True
        self.is_synthetic = False

    def _load_data(self):
        "Load the precomputed dataset metadata"
        # Load the dataset metadata corresponding to the split
        split_metadata_path = os.path.join(
            self.dataset_metadata_dir,
            self.split,
            f"eth3d_scene_list_{self.split}.npy",
        )
        split_scene_list = np.load(split_metadata_path, allow_pickle=True)

        # Get the list of all scenes
        if not self.sample_specific_scene:
            self.scenes = list(split_scene_list)
        else:
            self.scenes = [self.specific_scene_name]
        self.num_of_scenes = len(self.scenes)

    def _get_views(self, sampled_idx, num_views_to_sample, resolution):
        # Get the scene name of the sampled index
        scene_index = sampled_idx
        scene_name = self.scenes[scene_index]

        # Get the metadata corresponding to the scene
        scene_root = os.path.join(self.ROOT, scene_name)
        scene_meta = load_data(
            os.path.join(scene_root, "scene_meta.json"), "scene_meta"
        )
        scene_file_names = list(scene_meta["frame_names"].keys())
        num_views_in_scene = len(scene_file_names)

        # Load the scene pairwise covisibility mmap
        covisibility_version_key = "v0"
        covisibility_map_dir = os.path.join(
            scene_root, "covisibility", covisibility_version_key
        )
        # Assumes only npy file in directory is covisibility map
        covisibility_map_name = next(
            f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
        )
        covisibility_map_path = os.path.join(
            scene_root, "covisibility", covisibility_version_key, covisibility_map_name
        )
        pairwise_covisibility = load_data(covisibility_map_path, "mmap")

        # Get the indices of the N views in the scene
        view_indices = self._sample_view_indices(
            num_views_to_sample, num_views_in_scene, pairwise_covisibility
        )

        # Get the views corresponding to the selected view indices
        views = []
        for view_index in view_indices:
            # Load the data corresponding to the view
            view_file_name = scene_file_names[view_index]
            view_data = load_frame(
                scene_root,
                view_file_name,
                modalities=["image", "depth"],
                scene_meta=scene_meta,
            )

            # Convert necessary data to numpy
            image = view_data["image"].permute(1, 2, 0).numpy()
            image = (image * 255).astype(np.uint8)
            depthmap = view_data["depth"].numpy().astype(np.float32)
            intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
            c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)

            # Resize the data to match the desired resolution
            image, depthmap, intrinsics = self._crop_resize_if_necessary(
                image=image,
                resolution=resolution,
                depthmap=depthmap,
                intrinsics=intrinsics,
                additional_quantities=None,
            )

            # Append the view dictionary to the list of views
            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=c2w_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    dataset="ETH3D",
                    label=scene_name,
                    instance=os.path.join("images", str(view_file_name)),
                )
            )

        return views


def get_parser():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-rd", "--root_dir", default="/fsx/xrtech/data/eth3d", type=str)
    parser.add_argument(
        "-dmd",
        "--dataset_metadata_dir",
        default="/fsx/nkeetha/mapanything_dataset_metadata",
        type=str,
    )
    parser.add_argument(
        "-nv",
        "--num_of_views",
        default=2,
        type=int,
    )
    parser.add_argument("--viz", action="store_true")

    return parser


if __name__ == "__main__":
    import rerun as rr
    from tqdm import tqdm

    from mapanything.datasets.base.base_dataset import view_name
    from mapanything.utils.image import rgb
    from mapanything.utils.viz import script_add_rerun_args

    parser = get_parser()
    script_add_rerun_args(
        parser
    )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
    args = parser.parse_args()

    dataset = ETH3DWAI(
        num_views=args.num_of_views,
        covisibility_thres=0.025,
        ROOT=args.root_dir,
        dataset_metadata_dir=args.dataset_metadata_dir,
        resolution=(518, 336),
        seed=777,
        transform="imgnorm",
        data_norm_type="dinov2",
    )
    print(dataset.get_stats())

    if args.viz:
        rr.script_setup(args, "ETH3D_Dataloader")
        rr.set_time("stable_time", sequence=0)
        rr.log("world", rr.ViewCoordinates.RDF, static=True)

    sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)

    for num, idx in enumerate(tqdm(sampled_indices)):
        views = dataset[idx]
        assert len(views) == args.num_of_views
        sample_name = f"{idx}"
        for view_idx in range(args.num_of_views):
            sample_name += f" {view_name(views[view_idx])}"
        print(sample_name)
        for view_idx in range(args.num_of_views):
            image = rgb(
                views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
            )
            depthmap = views[view_idx]["depthmap"]
            pose = views[view_idx]["camera_pose"]
            intrinsics = views[view_idx]["camera_intrinsics"]
            pts3d = views[view_idx]["pts3d"]
            valid_mask = views[view_idx]["valid_mask"]
            if "non_ambiguous_mask" in views[view_idx]:
                non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
            else:
                non_ambiguous_mask = None
            if "prior_depth_along_ray" in views[view_idx]:
                prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
            else:
                prior_depth_along_ray = None
            if args.viz:
                rr.set_time("stable_time", sequence=num)
                base_name = f"world/view_{view_idx}"
                pts_name = f"world/view_{view_idx}_pointcloud"
                # Log camera info and loaded data
                height, width = image.shape[0], image.shape[1]
                rr.log(
                    base_name,
                    rr.Transform3D(
                        translation=pose[:3, 3],
                        mat3x3=pose[:3, :3],
                    ),
                )
                rr.log(
                    f"{base_name}/pinhole",
                    rr.Pinhole(
                        image_from_camera=intrinsics,
                        height=height,
                        width=width,
                        camera_xyz=rr.ViewCoordinates.RDF,
                    ),
                )
                rr.log(
                    f"{base_name}/pinhole/rgb",
                    rr.Image(image),
                )
                rr.log(
                    f"{base_name}/pinhole/depth",
                    rr.DepthImage(depthmap),
                )
                if prior_depth_along_ray is not None:
                    rr.log(
                        f"prior_depth_along_ray_{view_idx}",
                        rr.DepthImage(prior_depth_along_ray),
                    )
                if non_ambiguous_mask is not None:
                    rr.log(
                        f"{base_name}/pinhole/non_ambiguous_mask",
                        rr.SegmentationImage(non_ambiguous_mask.astype(int)),
                    )
                # Log points in 3D
                filtered_pts = pts3d[valid_mask]
                filtered_pts_col = image[valid_mask]
                rr.log(
                    pts_name,
                    rr.Points3D(
                        positions=filtered_pts.reshape(-1, 3),
                        colors=filtered_pts_col.reshape(-1, 3),
                    ),
                )
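The view selection in _get_views delegates to BaseDataset._sample_view_indices, which is not part of this diff. As a rough illustration of how an N x N pairwise-covisibility matrix plus the covisibility_thres used in these demos could drive sampling (assumed semantics only, not the actual implementation):

import numpy as np


def sample_covisible_views(pairwise_covisibility, num_views, thres, seed=None):
    """Pick a reference view, then draw views whose covisibility with it
    exceeds thres. Illustrative sketch; the real sampler may differ."""
    rng = np.random.default_rng(seed)
    num_total = pairwise_covisibility.shape[0]
    ref = int(rng.integers(num_total))
    candidates = np.flatnonzero(pairwise_covisibility[ref] >= thres)
    candidates = candidates[candidates != ref]
    if len(candidates) == 0:
        candidates = np.array([ref])  # degenerate scene: fall back to the reference
    picked = rng.choice(
        candidates, size=num_views - 1, replace=len(candidates) < num_views - 1
    )
    return np.concatenate(([ref], picked))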
mapanything/datasets/wai/megadepth.py ADDED
@@ -0,0 +1,314 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

"""
MegaDepth Dataset using WAI format data.
"""

import os

import cv2
import numpy as np

from mapanything.datasets.base.base_dataset import BaseDataset
from mapanything.utils.wai.core import load_data, load_frame


class MegaDepthWAI(BaseDataset):
    """
    MegaDepth dataset containing outdoor phototourism and in-the-wild scenes.
    Also includes Tanks & Temples scenes.
    """

    def __init__(
        self,
        *args,
        ROOT,
        dataset_metadata_dir,
        split,
        overfit_num_sets=None,
        sample_specific_scene: bool = False,
        specific_scene_name: str = None,
        **kwargs,
    ):
        """
        Initialize the dataset attributes.
        Args:
            ROOT: Root directory of the dataset.
            dataset_metadata_dir: Path to the dataset metadata directory.
            split: Dataset split (train, val, test).
            overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
            sample_specific_scene: Whether to sample a specific scene from the dataset.
            specific_scene_name: Name of the specific scene to sample.
        """
        # Initialize the dataset attributes
        super().__init__(*args, **kwargs)
        self.ROOT = ROOT
        self.dataset_metadata_dir = dataset_metadata_dir
        self.split = split
        self.overfit_num_sets = overfit_num_sets
        self.sample_specific_scene = sample_specific_scene
        self.specific_scene_name = specific_scene_name
        self._load_data()

        # Define the dataset type flags
        self.is_metric_scale = False
        self.is_synthetic = False

    def _load_data(self):
        "Load the precomputed dataset metadata"
        # Load the dataset metadata corresponding to the split
        split_metadata_path = os.path.join(
            self.dataset_metadata_dir,
            self.split,
            f"megadepth_scene_list_{self.split}.npy",
        )
        split_scene_list = np.load(split_metadata_path, allow_pickle=True)

        # Get the list of all scenes
        if not self.sample_specific_scene:
            self.scenes = list(split_scene_list)
        else:
            self.scenes = [self.specific_scene_name]
        self.num_of_scenes = len(self.scenes)

    def _get_views(self, sampled_idx, num_views_to_sample, resolution):
        # Get the scene name of the sampled index
        scene_index = sampled_idx
        scene_name = self.scenes[scene_index]

        # Get the metadata corresponding to the scene
        scene_root = os.path.join(self.ROOT, scene_name)
        scene_meta = load_data(
            os.path.join(scene_root, "scene_meta.json"), "scene_meta"
        )
        scene_file_names = list(scene_meta["frame_names"].keys())
        num_views_in_scene = len(scene_file_names)

        # Load the scene pairwise covisibility mmap
        covisibility_version_key = "v0"
        covisibility_map_dir = os.path.join(
            scene_root, "covisibility", covisibility_version_key
        )
        # Assumes only npy file in directory is covisibility map
        covisibility_map_name = next(
            f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
        )
        covisibility_map_path = os.path.join(
            scene_root, "covisibility", covisibility_version_key, covisibility_map_name
        )
        pairwise_covisibility = load_data(covisibility_map_path, "mmap")

        # Get the indices of the N views in the scene
        view_indices = self._sample_view_indices(
            num_views_to_sample, num_views_in_scene, pairwise_covisibility
        )

        # Get the views corresponding to the selected view indices
        views = []
        for view_index in view_indices:
            # Load the data corresponding to the view
            view_file_name = scene_file_names[view_index]
            view_data = load_frame(
                scene_root,
                view_file_name,
                modalities=["image", "depth", "pred_mask/moge2"],
                scene_meta=scene_meta,
            )

            # Convert necessary data to numpy
            image = view_data["image"].permute(1, 2, 0).numpy()
            image = (image * 255).astype(np.uint8)
            depthmap = view_data["depth"].numpy().astype(np.float32)
            intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
            c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)

            # Ensure that the depthmap has all valid values
            depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)

            # Get the non_ambiguous_mask and ensure it matches image resolution
            non_ambiguous_mask = view_data["pred_mask/moge2"].numpy().astype(int)
            non_ambiguous_mask = cv2.resize(
                non_ambiguous_mask,
                (image.shape[1], image.shape[0]),
                interpolation=cv2.INTER_NEAREST,
            )

            # Mask out the GT depth using the non_ambiguous_mask
            depthmap = np.where(non_ambiguous_mask, depthmap, 0)

            # Resize the data to match the desired resolution
            additional_quantities_to_resize = [non_ambiguous_mask]
            image, depthmap, intrinsics, additional_quantities_to_resize = (
                self._crop_resize_if_necessary(
                    image=image,
                    resolution=resolution,
                    depthmap=depthmap,
                    intrinsics=intrinsics,
                    additional_quantities=additional_quantities_to_resize,
                )
            )
            non_ambiguous_mask = additional_quantities_to_resize[0]

            # Append the view dictionary to the list of views
            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=c2w_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    non_ambiguous_mask=non_ambiguous_mask,
                    dataset="MegaDepth",
                    label=scene_name,
                    instance=os.path.join("images", str(view_file_name)),
                )
            )

        return views


def get_parser():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-rd", "--root_dir", default="/fsx/xrtech/data/megadepth", type=str
    )
    parser.add_argument(
        "-dmd",
        "--dataset_metadata_dir",
        default="/fsx/nkeetha/mapanything_dataset_metadata",
        type=str,
    )
    parser.add_argument(
        "-nv",
        "--num_of_views",
        default=2,
        type=int,
    )
    parser.add_argument("--viz", action="store_true")

    return parser


if __name__ == "__main__":
    import rerun as rr
    from tqdm import tqdm

    from mapanything.datasets.base.base_dataset import view_name
    from mapanything.utils.image import rgb
    from mapanything.utils.viz import script_add_rerun_args

    parser = get_parser()
    script_add_rerun_args(
        parser
    )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
    args = parser.parse_args()

    dataset = MegaDepthWAI(
        num_views=args.num_of_views,
        split="train",
        covisibility_thres=0.25,
        ROOT=args.root_dir,
        dataset_metadata_dir=args.dataset_metadata_dir,
        resolution=(518, 336),
        aug_crop=16,
        transform="colorjitter+grayscale+gaublur",
        data_norm_type="dinov2",
    )
    # dataset = MegaDepthWAI(
    #     num_views=args.num_of_views,
    #     split="val",
    #     covisibility_thres=0.25,
    #     ROOT=args.root_dir,
    #     dataset_metadata_dir=args.dataset_metadata_dir,
    #     resolution=(518, 336),
    #     seed=777,
    #     transform="imgnorm",
    #     data_norm_type="dinov2",
    # )
    print(dataset.get_stats())

    if args.viz:
        rr.script_setup(args, "MegaDepth_Dataloader")
        rr.set_time("stable_time", sequence=0)
        rr.log("world", rr.ViewCoordinates.RDF, static=True)

    sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)

    for num, idx in enumerate(tqdm(sampled_indices)):
        views = dataset[idx]
        assert len(views) == args.num_of_views
        sample_name = f"{idx}"
        for view_idx in range(args.num_of_views):
            sample_name += f" {view_name(views[view_idx])}"
        print(sample_name)
        for view_idx in range(args.num_of_views):
            image = rgb(
                views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
            )
            depthmap = views[view_idx]["depthmap"]
            pose = views[view_idx]["camera_pose"]
            intrinsics = views[view_idx]["camera_intrinsics"]
            pts3d = views[view_idx]["pts3d"]
            valid_mask = views[view_idx]["valid_mask"]
            if "non_ambiguous_mask" in views[view_idx]:
                non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
            else:
                non_ambiguous_mask = None
            if "prior_depth_along_ray" in views[view_idx]:
                prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
            else:
                prior_depth_along_ray = None
            if args.viz:
                rr.set_time("stable_time", sequence=num)
                base_name = f"world/view_{view_idx}"
                pts_name = f"world/view_{view_idx}_pointcloud"
                # Log camera info and loaded data
                height, width = image.shape[0], image.shape[1]
                rr.log(
                    base_name,
                    rr.Transform3D(
                        translation=pose[:3, 3],
                        mat3x3=pose[:3, :3],
                    ),
                )
                rr.log(
                    f"{base_name}/pinhole",
                    rr.Pinhole(
                        image_from_camera=intrinsics,
                        height=height,
                        width=width,
                        camera_xyz=rr.ViewCoordinates.RDF,
                    ),
                )
                rr.log(
                    f"{base_name}/pinhole/rgb",
                    rr.Image(image),
                )
                rr.log(
                    f"{base_name}/pinhole/depth",
                    rr.DepthImage(depthmap),
                )
                if prior_depth_along_ray is not None:
                    rr.log(
                        f"prior_depth_along_ray_{view_idx}",
                        rr.DepthImage(prior_depth_along_ray),
                    )
                if non_ambiguous_mask is not None:
                    rr.log(
                        f"{base_name}/pinhole/non_ambiguous_mask",
                        rr.SegmentationImage(non_ambiguous_mask.astype(int)),
                    )
                # Log points in 3D
                filtered_pts = pts3d[valid_mask]
                filtered_pts_col = image[valid_mask]
                rr.log(
                    pts_name,
                    rr.Points3D(
                        positions=filtered_pts.reshape(-1, 3),
                        colors=filtered_pts_col.reshape(-1, 3),
                    ),
                )
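A note on the cv2.resize call above: the MoGe-2 mask is label data, so it is resized with nearest-neighbor interpolation; a linear filter would blend the labels into fractional values at object boundaries. A tiny self-contained check (values are illustrative):

import cv2
import numpy as np

mask = np.array([[0, 255], [255, 0]], dtype=np.uint8)  # toy binary mask
up_nearest = cv2.resize(mask, (4, 4), interpolation=cv2.INTER_NEAREST)
up_linear = cv2.resize(mask, (4, 4), interpolation=cv2.INTER_LINEAR)
assert set(np.unique(up_nearest)) <= {0, 255}  # labels stay crisp
assert len(np.unique(up_linear)) > 2           # blended values appear at edges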
mapanything/datasets/wai/mpsd.py ADDED
@@ -0,0 +1,311 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

"""
MPSD Dataset using WAI format data.
"""

import os

import cv2
import numpy as np

from mapanything.datasets.base.base_dataset import BaseDataset
from mapanything.utils.wai.core import load_data, load_frame


class MPSDWAI(BaseDataset):
    """
    MPSD dataset containing outdoor planet-scale metric reconstructions.
    """

    def __init__(
        self,
        *args,
        ROOT,
        dataset_metadata_dir,
        split,
        overfit_num_sets=None,
        sample_specific_scene: bool = False,
        specific_scene_name: str = None,
        **kwargs,
    ):
        """
        Initialize the dataset attributes.
        Args:
            ROOT: Root directory of the dataset.
            dataset_metadata_dir: Path to the dataset metadata directory.
            split: Dataset split (train, val, test).
            overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
            sample_specific_scene: Whether to sample a specific scene from the dataset.
            specific_scene_name: Name of the specific scene to sample.
        """
        # Initialize the dataset attributes
        super().__init__(*args, **kwargs)
        self.ROOT = ROOT
        self.dataset_metadata_dir = dataset_metadata_dir
        self.split = split
        self.overfit_num_sets = overfit_num_sets
        self.sample_specific_scene = sample_specific_scene
        self.specific_scene_name = specific_scene_name
        self._load_data()

        # Define the dataset type flags
        self.is_metric_scale = True
        self.is_synthetic = False

    def _load_data(self):
        "Load the precomputed dataset metadata"
        # Load the dataset metadata corresponding to the split
        split_metadata_path = os.path.join(
            self.dataset_metadata_dir,
            self.split,
            f"mpsd_scene_list_{self.split}.npy",
        )
        split_scene_list = np.load(split_metadata_path, allow_pickle=True)

        # Get the list of all scenes
        if not self.sample_specific_scene:
            self.scenes = list(split_scene_list)
        else:
            self.scenes = [self.specific_scene_name]
        self.num_of_scenes = len(self.scenes)

    def _get_views(self, sampled_idx, num_views_to_sample, resolution):
        # Get the scene name of the sampled index
        scene_index = sampled_idx
        scene_name = self.scenes[scene_index]

        # Get the metadata corresponding to the scene
        scene_root = os.path.join(self.ROOT, scene_name)
        scene_meta = load_data(
            os.path.join(scene_root, "scene_meta.json"), "scene_meta"
        )
        scene_file_names = list(scene_meta["frame_names"].keys())
        num_views_in_scene = len(scene_file_names)

        # Load the scene pairwise covisibility mmap
        covisibility_version_key = "v0"
        covisibility_map_dir = os.path.join(
            scene_root, "covisibility", covisibility_version_key
        )
        # Assumes only npy file in directory is covisibility map
        covisibility_map_name = next(
            f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
        )
        covisibility_map_path = os.path.join(
            scene_root, "covisibility", covisibility_version_key, covisibility_map_name
        )
        pairwise_covisibility = load_data(covisibility_map_path, "mmap")

        # Get the indices of the N views in the scene
        view_indices = self._sample_view_indices(
            num_views_to_sample, num_views_in_scene, pairwise_covisibility
        )

        # Get the views corresponding to the selected view indices
        views = []
        for view_index in view_indices:
            # Load the data corresponding to the view
            view_file_name = scene_file_names[view_index]
            view_data = load_frame(
                scene_root,
                view_file_name,
                modalities=["image", "depth", "pred_mask/moge2"],
                scene_meta=scene_meta,
            )

            # Convert necessary data to numpy
            image = view_data["image"].permute(1, 2, 0).numpy()
            image = (image * 255).astype(np.uint8)
            depthmap = view_data["depth"].numpy().astype(np.float32)
            intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
            c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)

            # Ensure that the depthmap has all valid values
            depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)

            # Get the non_ambiguous_mask and ensure it matches image resolution
            non_ambiguous_mask = view_data["pred_mask/moge2"].numpy().astype(int)
            non_ambiguous_mask = cv2.resize(
                non_ambiguous_mask,
                (image.shape[1], image.shape[0]),
                interpolation=cv2.INTER_NEAREST,
            )

            # Mask out the GT depth using the non_ambiguous_mask
            depthmap = np.where(non_ambiguous_mask, depthmap, 0)

            # Resize the data to match the desired resolution
            additional_quantities_to_resize = [non_ambiguous_mask]
            image, depthmap, intrinsics, additional_quantities_to_resize = (
                self._crop_resize_if_necessary(
                    image=image,
                    resolution=resolution,
                    depthmap=depthmap,
                    intrinsics=intrinsics,
                    additional_quantities=additional_quantities_to_resize,
                )
            )
            non_ambiguous_mask = additional_quantities_to_resize[0]

            # Append the view dictionary to the list of views
            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=c2w_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    non_ambiguous_mask=non_ambiguous_mask,
                    dataset="MPSD",
                    label=scene_name,
                    instance=os.path.join("images", str(view_file_name)),
                )
            )

        return views


def get_parser():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-rd", "--root_dir", default="/fsx/xrtech/data/mpsd", type=str)
    parser.add_argument(
        "-dmd",
        "--dataset_metadata_dir",
        default="/fsx/nkeetha/mapanything_dataset_metadata",
        type=str,
    )
    parser.add_argument(
        "-nv",
        "--num_of_views",
        default=2,
        type=int,
    )
    parser.add_argument("--viz", action="store_true")

    return parser


if __name__ == "__main__":
    import rerun as rr
    from tqdm import tqdm

    from mapanything.datasets.base.base_dataset import view_name
    from mapanything.utils.image import rgb
    from mapanything.utils.viz import script_add_rerun_args

    parser = get_parser()
    script_add_rerun_args(
        parser
    )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
    args = parser.parse_args()

    dataset = MPSDWAI(
        num_views=args.num_of_views,
        split="train",
        covisibility_thres=0.15,
        ROOT=args.root_dir,
        dataset_metadata_dir=args.dataset_metadata_dir,
        resolution=(518, 392),
        aug_crop=16,
        transform="colorjitter+grayscale+gaublur",
        data_norm_type="dinov2",
    )
    # dataset = MPSDWAI(
    #     num_views=args.num_of_views,
    #     split="val",
    #     covisibility_thres=0.15,
    #     ROOT=args.root_dir,
    #     dataset_metadata_dir=args.dataset_metadata_dir,
    #     resolution=(518, 392),
    #     seed=777,
    #     transform="imgnorm",
    #     data_norm_type="dinov2",
    # )
    print(dataset.get_stats())

    if args.viz:
        rr.script_setup(args, "MPSD_Dataloader")
        rr.set_time("stable_time", sequence=0)
        rr.log("world", rr.ViewCoordinates.RDF, static=True)

    sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)

    for num, idx in enumerate(tqdm(sampled_indices)):
        views = dataset[idx]
        assert len(views) == args.num_of_views
        sample_name = f"{idx}"
        for view_idx in range(args.num_of_views):
            sample_name += f" {view_name(views[view_idx])}"
        print(sample_name)
        for view_idx in range(args.num_of_views):
            image = rgb(
                views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
            )
            depthmap = views[view_idx]["depthmap"]
            pose = views[view_idx]["camera_pose"]
            intrinsics = views[view_idx]["camera_intrinsics"]
            pts3d = views[view_idx]["pts3d"]
            valid_mask = views[view_idx]["valid_mask"]
            if "non_ambiguous_mask" in views[view_idx]:
                non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
            else:
                non_ambiguous_mask = None
            if "prior_depth_along_ray" in views[view_idx]:
                prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
            else:
                prior_depth_along_ray = None
            if args.viz:
                rr.set_time("stable_time", sequence=num)
                base_name = f"world/view_{view_idx}"
                pts_name = f"world/view_{view_idx}_pointcloud"
                # Log camera info and loaded data
                height, width = image.shape[0], image.shape[1]
                rr.log(
                    base_name,
                    rr.Transform3D(
                        translation=pose[:3, 3],
                        mat3x3=pose[:3, :3],
                    ),
                )
                rr.log(
                    f"{base_name}/pinhole",
                    rr.Pinhole(
                        image_from_camera=intrinsics,
                        height=height,
                        width=width,
                        camera_xyz=rr.ViewCoordinates.RDF,
                    ),
                )
                rr.log(
                    f"{base_name}/pinhole/rgb",
                    rr.Image(image),
                )
                rr.log(
                    f"{base_name}/pinhole/depth",
                    rr.DepthImage(depthmap),
                )
                if prior_depth_along_ray is not None:
                    rr.log(
                        f"prior_depth_along_ray_{view_idx}",
                        rr.DepthImage(prior_depth_along_ray),
                    )
                if non_ambiguous_mask is not None:
                    rr.log(
                        f"{base_name}/pinhole/non_ambiguous_mask",
                        rr.SegmentationImage(non_ambiguous_mask.astype(int)),
                    )
                # Log points in 3D
                filtered_pts = pts3d[valid_mask]
                filtered_pts_col = image[valid_mask]
                rr.log(
                    pts_name,
                    rr.Points3D(
                        positions=filtered_pts.reshape(-1, 3),
                        colors=filtered_pts_col.reshape(-1, 3),
                    ),
                )
mapanything/datasets/wai/mvs_synth.py ADDED
@@ -0,0 +1,308 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

"""
MVS Synth Dataset using WAI format data.
"""

import os

import numpy as np

from mapanything.datasets.base.base_dataset import BaseDataset
from mapanything.utils.wai.core import load_data, load_frame


class MVSSynthWAI(BaseDataset):
    """
    MVS Synth dataset containing a large diversity of synthetic in-the-wild scenes.
    """

    def __init__(
        self,
        *args,
        ROOT,
        dataset_metadata_dir,
        split,
        overfit_num_sets=None,
        sample_specific_scene: bool = False,
        specific_scene_name: str = None,
        **kwargs,
    ):
        """
        Initialize the dataset attributes.
        Args:
            ROOT: Root directory of the dataset.
            dataset_metadata_dir: Path to the dataset metadata directory.
            split: Dataset split (train, val, test).
            overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
            sample_specific_scene: Whether to sample a specific scene from the dataset.
            specific_scene_name: Name of the specific scene to sample.
        """
        # Initialize the dataset attributes
        super().__init__(*args, **kwargs)
        self.ROOT = ROOT
        self.dataset_metadata_dir = dataset_metadata_dir
        self.split = split
        self.overfit_num_sets = overfit_num_sets
        self.sample_specific_scene = sample_specific_scene
        self.specific_scene_name = specific_scene_name
        self._load_data()

        # Define the dataset type flags
        self.is_metric_scale = True
        self.is_synthetic = True

    def _load_data(self):
        "Load the precomputed dataset metadata"
        # Load the dataset metadata corresponding to the split
        split_metadata_path = os.path.join(
            self.dataset_metadata_dir,
            self.split,
            f"mvs_synth_scene_list_{self.split}.npy",
        )
        split_scene_list = np.load(split_metadata_path, allow_pickle=True)

        # Get the list of all scenes
        if not self.sample_specific_scene:
            self.scenes = list(split_scene_list)
        else:
            self.scenes = [self.specific_scene_name]
        self.num_of_scenes = len(self.scenes)

    def _get_views(self, sampled_idx, num_views_to_sample, resolution):
        # Get the scene name of the sampled index
        scene_index = sampled_idx
        scene_name = self.scenes[scene_index]

        # Get the metadata corresponding to the scene
        scene_root = os.path.join(self.ROOT, scene_name)
        scene_meta = load_data(
            os.path.join(scene_root, "scene_meta.json"), "scene_meta"
        )
        scene_file_names = list(scene_meta["frame_names"].keys())
        num_views_in_scene = len(scene_file_names)

        # Load the scene pairwise covisibility mmap
        covisibility_version_key = "v0"
        covisibility_map_dir = os.path.join(
            scene_root, "covisibility", covisibility_version_key
        )
        # Assumes only npy file in directory is covisibility map
        covisibility_map_name = next(
            f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
        )
        covisibility_map_path = os.path.join(
            scene_root, "covisibility", covisibility_version_key, covisibility_map_name
        )
        pairwise_covisibility = load_data(covisibility_map_path, "mmap")

        # Get the indices of the N views in the scene
        view_indices = self._sample_view_indices(
            num_views_to_sample, num_views_in_scene, pairwise_covisibility
        )

        # Get the views corresponding to the selected view indices
        views = []
        for view_index in view_indices:
            # Load the data corresponding to the view
            view_file_name = scene_file_names[view_index]
            view_data = load_frame(
                scene_root,
                view_file_name,
                modalities=["image", "depth"],
                scene_meta=scene_meta,
            )

            # Convert necessary data to numpy
            image = view_data["image"].permute(1, 2, 0).numpy()
            image = (image * 255).astype(np.uint8)
            depthmap = view_data["depth"].numpy().astype(np.float32)
            intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
            c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)

            # Ensure that the depthmap has all valid values
            depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)

            # Get the non ambiguous mask (zero depth pixels are sky or ambiguous)
            non_ambiguous_mask = (depthmap > 0).astype(int)

            # Mask out outlier depth beyond the 95th percentile (e.g., the horizon)
            percentile_depth = np.percentile(depthmap, 95)
            depthmap[depthmap > percentile_depth] = 0

            # Resize the data to match the desired resolution
            additional_quantities_to_resize = [non_ambiguous_mask]
            image, depthmap, intrinsics, additional_quantities_to_resize = (
                self._crop_resize_if_necessary(
                    image=image,
                    resolution=resolution,
                    depthmap=depthmap,
                    intrinsics=intrinsics,
                    additional_quantities=additional_quantities_to_resize,
                )
            )
            non_ambiguous_mask = additional_quantities_to_resize[0]

            # Append the view dictionary to the list of views
            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=c2w_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    non_ambiguous_mask=non_ambiguous_mask,
                    dataset="MVSSynth",
                    label=scene_name,
                    instance=os.path.join("images", str(view_file_name)),
                )
            )

        return views


def get_parser():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-rd", "--root_dir", default="/fsx/xrtech/data/mvs_synth", type=str
    )
    parser.add_argument(
        "-dmd",
        "--dataset_metadata_dir",
        default="/fsx/nkeetha/mapanything_dataset_metadata",
        type=str,
    )
    parser.add_argument(
        "-nv",
        "--num_of_views",
        default=2,
        type=int,
    )
    parser.add_argument("--viz", action="store_true")

    return parser


if __name__ == "__main__":
    import rerun as rr
    from tqdm import tqdm

    from mapanything.datasets.base.base_dataset import view_name
    from mapanything.utils.image import rgb
    from mapanything.utils.viz import script_add_rerun_args

    parser = get_parser()
    script_add_rerun_args(
        parser
    )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
    args = parser.parse_args()

    dataset = MVSSynthWAI(
        num_views=args.num_of_views,
        split="train",
        covisibility_thres=0.25,
        ROOT=args.root_dir,
        dataset_metadata_dir=args.dataset_metadata_dir,
        resolution=(518, 294),
        aug_crop=16,
        transform="colorjitter+grayscale+gaublur",
        data_norm_type="dinov2",
    )
    # dataset = MVSSynthWAI(
    #     num_views=args.num_of_views,
    #     split="val",
    #     covisibility_thres=0.25,
    #     ROOT=args.root_dir,
    #     dataset_metadata_dir=args.dataset_metadata_dir,
    #     resolution=(518, 294),
    #     seed=777,
    #     transform="imgnorm",
    #     data_norm_type="dinov2",
    # )
    print(dataset.get_stats())

    if args.viz:
        rr.script_setup(args, "MVSSynth_Dataloader")
        rr.set_time("stable_time", sequence=0)
        rr.log("world", rr.ViewCoordinates.RDF, static=True)

    sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)

    for num, idx in enumerate(tqdm(sampled_indices)):
        views = dataset[idx]
        assert len(views) == args.num_of_views
        sample_name = f"{idx}"
        for view_idx in range(args.num_of_views):
            sample_name += f" {view_name(views[view_idx])}"
        print(sample_name)
        for view_idx in range(args.num_of_views):
            image = rgb(
                views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
            )
            depthmap = views[view_idx]["depthmap"]
            pose = views[view_idx]["camera_pose"]
            intrinsics = views[view_idx]["camera_intrinsics"]
            pts3d = views[view_idx]["pts3d"]
            valid_mask = views[view_idx]["valid_mask"]
            if "non_ambiguous_mask" in views[view_idx]:
                non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
            else:
                non_ambiguous_mask = None
            if "prior_depth_along_ray" in views[view_idx]:
                prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
            else:
                prior_depth_along_ray = None
            if args.viz:
                rr.set_time("stable_time", sequence=num)
                base_name = f"world/view_{view_idx}"
                pts_name = f"world/view_{view_idx}_pointcloud"
                # Log camera info and loaded data
                height, width = image.shape[0], image.shape[1]
                rr.log(
                    base_name,
                    rr.Transform3D(
                        translation=pose[:3, 3],
                        mat3x3=pose[:3, :3],
                    ),
                )
                rr.log(
                    f"{base_name}/pinhole",
                    rr.Pinhole(
                        image_from_camera=intrinsics,
                        height=height,
                        width=width,
                        camera_xyz=rr.ViewCoordinates.RDF,
                    ),
                )
                rr.log(
                    f"{base_name}/pinhole/rgb",
                    rr.Image(image),
                )
                rr.log(
                    f"{base_name}/pinhole/depth",
                    rr.DepthImage(depthmap),
                )
                if prior_depth_along_ray is not None:
                    rr.log(
                        f"prior_depth_along_ray_{view_idx}",
                        rr.DepthImage(prior_depth_along_ray),
                    )
                if non_ambiguous_mask is not None:
                    rr.log(
                        f"{base_name}/pinhole/non_ambiguous_mask",
                        rr.SegmentationImage(non_ambiguous_mask.astype(int)),
                    )
                # Log points in 3D
                filtered_pts = pts3d[valid_mask]
                filtered_pts_col = image[valid_mask]
                rr.log(
                    pts_name,
                    rr.Points3D(
                        positions=filtered_pts.reshape(-1, 3),
                        colors=filtered_pts_col.reshape(-1, 3),
                    ),
                )
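The 95th-percentile clipping above in concrete numbers: depths beyond the percentile (the distant horizon in these synthetic scenes) are zeroed. Note the non-ambiguous mask is computed before the clipping, so clipped horizon pixels keep their mask value but carry zero (hence invalid) depth. A tiny numeric check:

import numpy as np

depth = np.array([1.0, 2.0, 3.0, 4.0, 1e6])  # 1e6 stands in for the horizon
p95 = np.percentile(depth, 95)               # interpolated, between 4.0 and 1e6
depth[depth > p95] = 0.0
assert depth[-1] == 0.0 and np.all(depth[:4] > 0)  # only the outlier is dropped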
mapanything/datasets/wai/paralleldomain4d.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ Parallel Domain 4D Dataset using WAI format data.
8
+ """
9
+
10
+ import os
11
+
12
+ import numpy as np
13
+
14
+ from mapanything.datasets.base.base_dataset import BaseDataset
15
+ from mapanything.utils.wai.core import load_data, load_frame
16
+
17
+
18
+ class ParallelDomain4DWAI(BaseDataset):
19
+ """
20
+ Parallel Domain 4D dataset containing large diversity of synthetic AV scenes.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ *args,
26
+ ROOT,
27
+ dataset_metadata_dir,
28
+ split,
29
+ overfit_num_sets=None,
30
+ sample_specific_scene: bool = False,
31
+ specific_scene_name: str = None,
32
+ **kwargs,
33
+ ):
34
+ """
35
+ Initialize the dataset attributes.
36
+ Args:
37
+ ROOT: Root directory of the dataset.
38
+ dataset_metadata_dir: Path to the dataset metadata directory.
39
+ split: Dataset split (train, val, test).
40
+ overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
41
+ sample_specific_scene: Whether to sample a specific scene from the dataset.
42
+ specific_scene_name: Name of the specific scene to sample.
43
+ """
44
+ # Initialize the dataset attributes
45
+ super().__init__(*args, **kwargs)
46
+ self.ROOT = ROOT
47
+ self.dataset_metadata_dir = dataset_metadata_dir
48
+ self.split = split
49
+ self.overfit_num_sets = overfit_num_sets
50
+ self.sample_specific_scene = sample_specific_scene
51
+ self.specific_scene_name = specific_scene_name
52
+ self._load_data()
53
+
54
+ # Define the dataset type flags
55
+ self.is_metric_scale = True
56
+ self.is_synthetic = True
57
+
58
+ def _load_data(self):
59
+ "Load the precomputed dataset metadata"
60
+ # Load the dataset metadata corresponding to the split
61
+ split_metadata_path = os.path.join(
62
+ self.dataset_metadata_dir,
63
+ self.split,
64
+ f"paralleldomain4d_scene_list_{self.split}.npy",
65
+ )
66
+ split_scene_list = np.load(split_metadata_path, allow_pickle=True)
67
+
68
+ # Get the list of all scenes
69
+ if not self.sample_specific_scene:
70
+ self.scenes = list(split_scene_list)
71
+ else:
72
+ self.scenes = [self.specific_scene_name]
73
+ self.num_of_scenes = len(self.scenes)
74
+
75
+ def _get_views(self, sampled_idx, num_views_to_sample, resolution):
76
+ # Get the scene name of the sampled index
77
+ scene_index = sampled_idx
78
+ scene_name = self.scenes[scene_index]
79
+
80
+ # Get the metadata corresponding to the scene
81
+ scene_root = os.path.join(self.ROOT, scene_name)
82
+ scene_meta = load_data(
83
+ os.path.join(scene_root, "scene_meta.json"), "scene_meta"
84
+ )
85
+ scene_file_names = list(scene_meta["frame_names"].keys())
86
+ num_views_in_scene = len(scene_file_names)
87
+
88
+ # Load the scene pairwise covisibility mmap
89
+ covisibility_version_key = "v0"
90
+ covisibility_map_dir = os.path.join(
91
+ scene_root, "covisibility", covisibility_version_key
92
+ )
93
+ # Assumes only npy file in directory is covisibility map
94
+ covisibility_map_name = next(
95
+ f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
96
+ )
97
+ covisibility_map_path = os.path.join(
98
+ scene_root, "covisibility", covisibility_version_key, covisibility_map_name
99
+ )
100
+ pairwise_covisibility = load_data(covisibility_map_path, "mmap")
101
+
102
+ # Get the indices of the N views in the scene
103
+ view_indices = self._sample_view_indices(
104
+ num_views_to_sample, num_views_in_scene, pairwise_covisibility
105
+ )
106
+
107
+ # Get the views corresponding to the selected view indices
108
+ views = []
109
+         for view_index in view_indices:
+             # Load the data corresponding to the view
+             view_file_name = scene_file_names[view_index]
+             view_data = load_frame(
+                 scene_root,
+                 view_file_name,
+                 modalities=["image", "depth"],
+                 scene_meta=scene_meta,
+             )
+
+             # Convert necessary data to numpy
+             image = view_data["image"].permute(1, 2, 0).numpy()
+             image = image[:, :, :3]  # RGBA to RGB
+             image = (image * 255).astype(np.uint8)
+             depthmap = view_data["depth"].numpy().astype(np.float32)
+             intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
+             c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
+
+             # Ensure that the depthmap has all valid values
+             depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
+
+             # Get the non ambiguous mask (zero depth pixels are sky or ambiguous)
+             non_ambiguous_mask = (depthmap > 0).astype(int)
+
+             # Mask out the outlier depth (horizon depth)
+             percentile_depth = np.percentile(depthmap, 95)
+             depthmap[depthmap > percentile_depth] = 0
+
+             # Resize the data to match the desired resolution
+             additional_quantities_to_resize = [non_ambiguous_mask]
+             image, depthmap, intrinsics, additional_quantities_to_resize = (
+                 self._crop_resize_if_necessary(
+                     image=image,
+                     resolution=resolution,
+                     depthmap=depthmap,
+                     intrinsics=intrinsics,
+                     additional_quantities=additional_quantities_to_resize,
+                 )
+             )
+             non_ambiguous_mask = additional_quantities_to_resize[0]
+
+             # Append the view dictionary to the list of views
+             views.append(
+                 dict(
+                     img=image,
+                     depthmap=depthmap,
+                     camera_pose=c2w_pose,  # cam2world
+                     camera_intrinsics=intrinsics,
+                     non_ambiguous_mask=non_ambiguous_mask,
+                     dataset="ParallelDomain4D",
+                     label=scene_name,
+                     instance=os.path.join("images", str(view_file_name)),
+                 )
+             )
+
+         return views
+
+
+ def get_parser():
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-rd", "--root_dir", default="/fsx/xrtech/data/paralleldomain4d", type=str
+     )
+     parser.add_argument(
+         "-dmd",
+         "--dataset_metadata_dir",
+         default="/fsx/nkeetha/mapanything_dataset_metadata",
+         type=str,
+     )
+     parser.add_argument(
+         "-nv",
+         "--num_of_views",
+         default=2,
+         type=int,
+     )
+     parser.add_argument("--viz", action="store_true")
+
+     return parser
+
+
+ if __name__ == "__main__":
+     import rerun as rr
+     from tqdm import tqdm
+
+     from mapanything.datasets.base.base_dataset import view_name
+     from mapanything.utils.image import rgb
+     from mapanything.utils.viz import script_add_rerun_args
+
+     parser = get_parser()
+     script_add_rerun_args(
+         parser
+     )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
+     args = parser.parse_args()
+
+     dataset = ParallelDomain4DWAI(
+         num_views=args.num_of_views,
+         split="train",
+         covisibility_thres=0.25,
+         ROOT=args.root_dir,
+         dataset_metadata_dir=args.dataset_metadata_dir,
+         resolution=(518, 392),
+         aug_crop=16,
+         transform="colorjitter+grayscale+gaublur",
+         data_norm_type="dinov2",
+     )
+     # dataset = ParallelDomain4DWAI(
+     #     num_views=args.num_of_views,
+     #     split="val",
+     #     covisibility_thres=0.25,
+     #     ROOT=args.root_dir,
+     #     dataset_metadata_dir=args.dataset_metadata_dir,
+     #     resolution=(518, 392),
+     #     seed=777,
+     #     transform="imgnorm",
+     #     data_norm_type="dinov2",
+     # )
+     print(dataset.get_stats())
+
+     if args.viz:
+         rr.script_setup(args, "ParallelDomain4D_Dataloader")
+         rr.set_time("stable_time", sequence=0)
+         rr.log("world", rr.ViewCoordinates.RDF, static=True)
+
+     sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)
+
+     for num, idx in enumerate(tqdm(sampled_indices)):
+         views = dataset[idx]
+         assert len(views) == args.num_of_views
+         sample_name = f"{idx}"
+         for view_idx in range(args.num_of_views):
+             sample_name += f" {view_name(views[view_idx])}"
+         print(sample_name)
+         for view_idx in range(args.num_of_views):
+             image = rgb(
+                 views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
+             )
+             depthmap = views[view_idx]["depthmap"]
+             pose = views[view_idx]["camera_pose"]
+             intrinsics = views[view_idx]["camera_intrinsics"]
+             pts3d = views[view_idx]["pts3d"]
+             valid_mask = views[view_idx]["valid_mask"]
+             if "non_ambiguous_mask" in views[view_idx]:
+                 non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
+             else:
+                 non_ambiguous_mask = None
+             if "prior_depth_along_ray" in views[view_idx]:
+                 prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
+             else:
+                 prior_depth_along_ray = None
+             if args.viz:
+                 rr.set_time("stable_time", sequence=num)
+                 base_name = f"world/view_{view_idx}"
+                 pts_name = f"world/view_{view_idx}_pointcloud"
+                 # Log camera info and loaded data
+                 height, width = image.shape[0], image.shape[1]
+                 rr.log(
+                     base_name,
+                     rr.Transform3D(
+                         translation=pose[:3, 3],
+                         mat3x3=pose[:3, :3],
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole",
+                     rr.Pinhole(
+                         image_from_camera=intrinsics,
+                         height=height,
+                         width=width,
+                         camera_xyz=rr.ViewCoordinates.RDF,
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/rgb",
+                     rr.Image(image),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/depth",
+                     rr.DepthImage(depthmap),
+                 )
+                 if prior_depth_along_ray is not None:
+                     rr.log(
+                         f"prior_depth_along_ray_{view_idx}",
+                         rr.DepthImage(prior_depth_along_ray),
+                     )
+                 if non_ambiguous_mask is not None:
+                     rr.log(
+                         f"{base_name}/pinhole/non_ambiguous_mask",
+                         rr.SegmentationImage(non_ambiguous_mask.astype(int)),
+                     )
+                 # Log points in 3D
+                 filtered_pts = pts3d[valid_mask]
+                 filtered_pts_col = image[valid_mask]
+                 rr.log(
+                     pts_name,
+                     rr.Points3D(
+                         positions=filtered_pts.reshape(-1, 3),
+                         colors=filtered_pts_col.reshape(-1, 3),
+                     ),
+                 )
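
Several loaders in this directory (ParallelDomain4D, SAIL-VOS 3D, TartanAirV2-WB, UnrealStereo4K) share the 95th-percentile depth clamp used above. A minimal standalone sketch of its effect on a toy depth map; note that, as in the loader, the percentile is taken over all pixels, already-invalid zeros included:

import numpy as np

# Toy depth map: mostly near geometry plus one extreme "horizon" value
depthmap = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 500.0]], dtype=np.float32)

# Validity is recorded before outlier suppression, as in the loader above
non_ambiguous_mask = (depthmap > 0).astype(int)

# Zero out depths beyond the 95th percentile (suspected horizon/sky hits)
percentile_depth = np.percentile(depthmap, 95)  # 376.25 for this toy input
depthmap[depthmap > percentile_depth] = 0

print(depthmap)  # the 500.0 entry is now 0; the rest are untouched

Because the mask is computed before the clamp, the suppressed far-field pixels keep their non-ambiguous label but carry zero depth afterwards.
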
mapanything/datasets/wai/sailvos3d.py ADDED
@@ -0,0 +1,308 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ SAIL-VOS 3D Dataset using WAI format data.
+ """
+
+ import os
+
+ import numpy as np
+
+ from mapanything.datasets.base.base_dataset import BaseDataset
+ from mapanything.utils.wai.core import load_data, load_frame
+
+
+ class SAILVOS3DWAI(BaseDataset):
+     """
+     SAIL-VOS 3D dataset containing a large diversity of synthetic in-the-wild cutscenes from GTA.
+     """
+
+     def __init__(
+         self,
+         *args,
+         ROOT,
+         dataset_metadata_dir,
+         split,
+         overfit_num_sets=None,
+         sample_specific_scene: bool = False,
+         specific_scene_name: str = None,
+         **kwargs,
+     ):
+         """
+         Initialize the dataset attributes.
+         Args:
+             ROOT: Root directory of the dataset.
+             dataset_metadata_dir: Path to the dataset metadata directory.
+             split: Dataset split (train, val, test).
+             overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
+             sample_specific_scene: Whether to sample a specific scene from the dataset.
+             specific_scene_name: Name of the specific scene to sample.
+         """
+         # Initialize the dataset attributes
+         super().__init__(*args, **kwargs)
+         self.ROOT = ROOT
+         self.dataset_metadata_dir = dataset_metadata_dir
+         self.split = split
+         self.overfit_num_sets = overfit_num_sets
+         self.sample_specific_scene = sample_specific_scene
+         self.specific_scene_name = specific_scene_name
+         self._load_data()
+
+         # Define the dataset type flags
+         self.is_metric_scale = True
+         self.is_synthetic = True
+
+     def _load_data(self):
+         "Load the precomputed dataset metadata"
+         # Load the dataset metadata corresponding to the split
+         split_metadata_path = os.path.join(
+             self.dataset_metadata_dir,
+             self.split,
+             f"sailvos3d_scene_list_{self.split}.npy",
+         )
+         split_scene_list = np.load(split_metadata_path, allow_pickle=True)
+
+         # Get the list of all scenes
+         if not self.sample_specific_scene:
+             self.scenes = list(split_scene_list)
+         else:
+             self.scenes = [self.specific_scene_name]
+         self.num_of_scenes = len(self.scenes)
+
+     def _get_views(self, sampled_idx, num_views_to_sample, resolution):
+         # Get the scene name of the sampled index
+         scene_index = sampled_idx
+         scene_name = self.scenes[scene_index]
+
+         # Get the metadata corresponding to the scene
+         scene_root = os.path.join(self.ROOT, scene_name)
+         scene_meta = load_data(
+             os.path.join(scene_root, "scene_meta.json"), "scene_meta"
+         )
+         scene_file_names = list(scene_meta["frame_names"].keys())
+         num_views_in_scene = len(scene_file_names)
+
+         # Load the scene pairwise covisibility mmap
+         covisibility_version_key = "v0"
+         covisibility_map_dir = os.path.join(
+             scene_root, "covisibility", covisibility_version_key
+         )
+         # Assumes the only .npy file in the directory is the covisibility map
+         covisibility_map_name = next(
+             f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
+         )
+         covisibility_map_path = os.path.join(
+             scene_root, "covisibility", covisibility_version_key, covisibility_map_name
+         )
+         pairwise_covisibility = load_data(covisibility_map_path, "mmap")
+
+         # Get the indices of the N views in the scene
+         view_indices = self._sample_view_indices(
+             num_views_to_sample, num_views_in_scene, pairwise_covisibility
+         )
+
+         # Get the views corresponding to the selected view indices
+         views = []
+         for view_index in view_indices:
+             # Load the data corresponding to the view
+             view_file_name = scene_file_names[view_index]
+             view_data = load_frame(
+                 scene_root,
+                 view_file_name,
+                 modalities=["image", "depth"],
+                 scene_meta=scene_meta,
+             )
+
+             # Convert necessary data to numpy
+             image = view_data["image"].permute(1, 2, 0).numpy()
+             image = (image * 255).astype(np.uint8)
+             depthmap = view_data["depth"].numpy().astype(np.float32)
+             intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
+             c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
+
+             # Ensure that the depthmap has all valid values
+             depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
+
+             # Get the non ambiguous mask (zero depth pixels are sky or ambiguous)
+             non_ambiguous_mask = (depthmap > 0).astype(int)
+
+             # Mask out the outlier depth (horizon depth)
+             percentile_depth = np.percentile(depthmap, 95)
+             depthmap[depthmap > percentile_depth] = 0
+
+             # Resize the data to match the desired resolution
+             additional_quantities_to_resize = [non_ambiguous_mask]
+             image, depthmap, intrinsics, additional_quantities_to_resize = (
+                 self._crop_resize_if_necessary(
+                     image=image,
+                     resolution=resolution,
+                     depthmap=depthmap,
+                     intrinsics=intrinsics,
+                     additional_quantities=additional_quantities_to_resize,
+                 )
+             )
+             non_ambiguous_mask = additional_quantities_to_resize[0]
+
+             # Append the view dictionary to the list of views
+             views.append(
+                 dict(
+                     img=image,
+                     depthmap=depthmap,
+                     camera_pose=c2w_pose,  # cam2world
+                     camera_intrinsics=intrinsics,
+                     non_ambiguous_mask=non_ambiguous_mask,
+                     dataset="SAILVOS3D",
+                     label=scene_name,
+                     instance=os.path.join("images", str(view_file_name)),
+                 )
+             )
+
+         return views
+
+
+ def get_parser():
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-rd", "--root_dir", default="/fsx/xrtech/data/sailvos3d", type=str
+     )
+     parser.add_argument(
+         "-dmd",
+         "--dataset_metadata_dir",
+         default="/fsx/nkeetha/mapanything_dataset_metadata",
+         type=str,
+     )
+     parser.add_argument(
+         "-nv",
+         "--num_of_views",
+         default=2,
+         type=int,
+     )
+     parser.add_argument("--viz", action="store_true")
+
+     return parser
+
+
+ if __name__ == "__main__":
+     import rerun as rr
+     from tqdm import tqdm
+
+     from mapanything.datasets.base.base_dataset import view_name
+     from mapanything.utils.image import rgb
+     from mapanything.utils.viz import script_add_rerun_args
+
+     parser = get_parser()
+     script_add_rerun_args(
+         parser
+     )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
+     args = parser.parse_args()
+
+     dataset = SAILVOS3DWAI(
+         num_views=args.num_of_views,
+         split="train",
+         covisibility_thres=0.25,
+         ROOT=args.root_dir,
+         dataset_metadata_dir=args.dataset_metadata_dir,
+         resolution=(518, 336),
+         aug_crop=16,
+         transform="colorjitter+grayscale+gaublur",
+         data_norm_type="dinov2",
+     )
+     # dataset = SAILVOS3DWAI(
+     #     num_views=args.num_of_views,
+     #     split="val",
+     #     covisibility_thres=0.25,
+     #     ROOT=args.root_dir,
+     #     dataset_metadata_dir=args.dataset_metadata_dir,
+     #     resolution=(518, 336),
+     #     seed=777,
+     #     transform="imgnorm",
+     #     data_norm_type="dinov2",
+     # )
+     print(dataset.get_stats())
+
+     if args.viz:
+         rr.script_setup(args, "SAILVOS3D_Dataloader")
+         rr.set_time("stable_time", sequence=0)
+         rr.log("world", rr.ViewCoordinates.RDF, static=True)
+
+     sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)
+
+     for num, idx in enumerate(tqdm(sampled_indices)):
+         views = dataset[idx]
+         assert len(views) == args.num_of_views
+         sample_name = f"{idx}"
+         for view_idx in range(args.num_of_views):
+             sample_name += f" {view_name(views[view_idx])}"
+         print(sample_name)
+         for view_idx in range(args.num_of_views):
+             image = rgb(
+                 views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
+             )
+             depthmap = views[view_idx]["depthmap"]
+             pose = views[view_idx]["camera_pose"]
+             intrinsics = views[view_idx]["camera_intrinsics"]
+             pts3d = views[view_idx]["pts3d"]
+             valid_mask = views[view_idx]["valid_mask"]
+             if "non_ambiguous_mask" in views[view_idx]:
+                 non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
+             else:
+                 non_ambiguous_mask = None
+             if "prior_depth_along_ray" in views[view_idx]:
+                 prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
+             else:
+                 prior_depth_along_ray = None
+             if args.viz:
+                 rr.set_time("stable_time", sequence=num)
+                 base_name = f"world/view_{view_idx}"
+                 pts_name = f"world/view_{view_idx}_pointcloud"
+                 # Log camera info and loaded data
+                 height, width = image.shape[0], image.shape[1]
+                 rr.log(
+                     base_name,
+                     rr.Transform3D(
+                         translation=pose[:3, 3],
+                         mat3x3=pose[:3, :3],
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole",
+                     rr.Pinhole(
+                         image_from_camera=intrinsics,
+                         height=height,
+                         width=width,
+                         camera_xyz=rr.ViewCoordinates.RDF,
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/rgb",
+                     rr.Image(image),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/depth",
+                     rr.DepthImage(depthmap),
+                 )
+                 if prior_depth_along_ray is not None:
+                     rr.log(
+                         f"prior_depth_along_ray_{view_idx}",
+                         rr.DepthImage(prior_depth_along_ray),
+                     )
+                 if non_ambiguous_mask is not None:
+                     rr.log(
+                         f"{base_name}/pinhole/non_ambiguous_mask",
+                         rr.SegmentationImage(non_ambiguous_mask.astype(int)),
+                     )
+                 # Log points in 3D
+                 filtered_pts = pts3d[valid_mask]
+                 filtered_pts_col = image[valid_mask]
+                 rr.log(
+                     pts_name,
+                     rr.Points3D(
+                         positions=filtered_pts.reshape(-1, 3),
+                         colors=filtered_pts_col.reshape(-1, 3),
+                     ),
+                 )
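
The covisibility lookup above grabs the first .npy via next(...) and, per its comment, assumes it is the only one in the directory. A more defensive variant (a sketch, not repo code; the helper name is hypothetical) would fail loudly when that assumption breaks:

import os

def find_covisibility_map(covisibility_map_dir: str) -> str:
    """Hypothetical helper: return the single covisibility .npy in a directory."""
    npy_files = [f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")]
    if len(npy_files) != 1:
        raise RuntimeError(
            f"Expected exactly one covisibility .npy in {covisibility_map_dir}, "
            f"found {len(npy_files)}: {npy_files}"
        )
    return os.path.join(covisibility_map_dir, npy_files[0])
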
mapanything/datasets/wai/scannetpp.py ADDED
@@ -0,0 +1,307 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ ScanNet++V2 Dataset using WAI format data.
+ """
+
+ import os
+
+ import numpy as np
+
+ from mapanything.datasets.base.base_dataset import BaseDataset
+ from mapanything.utils.wai.core import load_data, load_frame
+
+
+ class ScanNetPPWAI(BaseDataset):
+     """
+     ScanNet++V2 dataset containing a large diversity of indoor scenes.
+     """
+
+     def __init__(
+         self,
+         *args,
+         ROOT,
+         dataset_metadata_dir,
+         split,
+         overfit_num_sets=None,
+         sample_specific_scene: bool = False,
+         specific_scene_name: str = None,
+         **kwargs,
+     ):
+         """
+         Initialize the dataset attributes.
+         Args:
+             ROOT: Root directory of the dataset.
+             dataset_metadata_dir: Path to the dataset metadata directory.
+             split: Dataset split (train, val, test).
+             overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
+             sample_specific_scene: Whether to sample a specific scene from the dataset.
+             specific_scene_name: Name of the specific scene to sample.
+         """
+         # Initialize the dataset attributes
+         super().__init__(*args, **kwargs)
+         self.ROOT = ROOT
+         self.dataset_metadata_dir = dataset_metadata_dir
+         self.split = split
+         self.overfit_num_sets = overfit_num_sets
+         self.sample_specific_scene = sample_specific_scene
+         self.specific_scene_name = specific_scene_name
+         self._load_data()
+
+         # Define the dataset type flags
+         self.is_metric_scale = True
+         self.is_synthetic = False
+
+     def _load_data(self):
+         "Load the precomputed dataset metadata"
+         # Load the dataset metadata corresponding to the split
+         split_metadata_path = os.path.join(
+             self.dataset_metadata_dir,
+             self.split,
+             f"scannetppv2_scene_list_{self.split}.npy",
+         )
+         split_scene_list = np.load(split_metadata_path, allow_pickle=True)
+
+         # Get the list of all scenes
+         if not self.sample_specific_scene:
+             self.scenes = list(split_scene_list)
+         else:
+             self.scenes = [self.specific_scene_name]
+         self.num_of_scenes = len(self.scenes)
+
+     def _get_views(self, sampled_idx, num_views_to_sample, resolution):
+         # Get the scene name of the sampled index
+         scene_index = sampled_idx
+         scene_name = self.scenes[scene_index]
+
+         # Get the metadata corresponding to the scene
+         scene_root = os.path.join(self.ROOT, scene_name)
+         scene_meta = load_data(
+             os.path.join(scene_root, "scene_meta.json"), "scene_meta"
+         )
+         scene_file_names = list(scene_meta["frame_names"].keys())
+         num_views_in_scene = len(scene_file_names)
+
+         # Load the scene pairwise covisibility mmap
+         covisibility_version_key = "v0"
+         covisibility_map_dir = os.path.join(
+             scene_root, "covisibility", covisibility_version_key
+         )
+         # Assumes the only .npy file in the directory is the covisibility map
+         covisibility_map_name = next(
+             f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
+         )
+         covisibility_map_path = os.path.join(
+             scene_root, "covisibility", covisibility_version_key, covisibility_map_name
+         )
+         pairwise_covisibility = load_data(covisibility_map_path, "mmap")
+
+         # Get the indices of the N views in the scene
+         view_indices = self._sample_view_indices(
+             num_views_to_sample, num_views_in_scene, pairwise_covisibility
+         )
+
+         # Get the views corresponding to the selected view indices
+         views = []
+         for view_index in view_indices:
+             # Load the data corresponding to the view
+             view_file_name = scene_file_names[view_index]
+             view_data = load_frame(
+                 scene_root,
+                 view_file_name,
+                 modalities=["image", "rendered_depth"],
+                 scene_meta=scene_meta,
+             )
+
+             # Convert necessary data to numpy
+             image = view_data["image"].permute(1, 2, 0).numpy()
+             image = (image * 255).astype(np.uint8)
+             depthmap = view_data["rendered_depth"].numpy().astype(np.float32)
+             intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
+             c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
+
+             # Ensure that the depthmap has all valid values
+             depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
+
+             # Resize the data to match the desired resolution
+             image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                 image=image,
+                 resolution=resolution,
+                 depthmap=depthmap,
+                 intrinsics=intrinsics,
+                 additional_quantities=None,
+             )
+
+             # Append the view dictionary to the list of views
+             views.append(
+                 dict(
+                     img=image,
+                     depthmap=depthmap,
+                     camera_pose=c2w_pose,  # cam2world
+                     camera_intrinsics=intrinsics,
+                     dataset="ScanNetPP",
+                     label=scene_name,
+                     instance=os.path.join("images", str(view_file_name)),
+                 )
+             )
+
+         return views
+
+
+ def get_parser():
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-rd", "--root_dir", default="/fsx/xrtech/data/scannetppv2", type=str
+     )
+     parser.add_argument(
+         "-dmd",
+         "--dataset_metadata_dir",
+         default="/fsx/nkeetha/mapanything_dataset_metadata",
+         type=str,
+     )
+     parser.add_argument(
+         "-nv",
+         "--num_of_views",
+         default=2,
+         type=int,
+     )
+     parser.add_argument("--viz", action="store_true")
+
+     return parser
+
+
+ if __name__ == "__main__":
+     import rerun as rr
+     from tqdm import tqdm
+
+     from mapanything.datasets.base.base_dataset import view_name
+     from mapanything.utils.image import rgb
+     from mapanything.utils.viz import script_add_rerun_args
+
+     parser = get_parser()
+     script_add_rerun_args(
+         parser
+     )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
+     args = parser.parse_args()
+
+     dataset = ScanNetPPWAI(
+         num_views=args.num_of_views,
+         split="train",
+         covisibility_thres=0.25,
+         ROOT=args.root_dir,
+         dataset_metadata_dir=args.dataset_metadata_dir,
+         resolution=(518, 336),
+         aug_crop=16,
+         transform="colorjitter+grayscale+gaublur",
+         data_norm_type="dinov2",
+     )
+     # dataset = ScanNetPPWAI(
+     #     num_views=args.num_of_views,
+     #     split="val",
+     #     covisibility_thres=0.25,
+     #     ROOT=args.root_dir,
+     #     dataset_metadata_dir=args.dataset_metadata_dir,
+     #     resolution=(518, 336),
+     #     seed=777,
+     #     transform="imgnorm",
+     #     data_norm_type="dinov2",
+     # )
+     # dataset = ScanNetPPWAI(
+     #     num_views=args.num_of_views,
+     #     split="test",
+     #     covisibility_thres=0.25,
+     #     ROOT=args.root_dir,
+     #     dataset_metadata_dir=args.dataset_metadata_dir,
+     #     resolution=(518, 336),
+     #     seed=777,
+     #     transform="imgnorm",
+     #     data_norm_type="dinov2",
+     # )
+     print(dataset.get_stats())
+
+     if args.viz:
+         rr.script_setup(args, "ScanNetPP_Dataloader")
+         rr.set_time("stable_time", sequence=0)
+         rr.log("world", rr.ViewCoordinates.RDF, static=True)
+
+     sampled_indices = np.random.choice(len(dataset), size=10, replace=False)
+
+     for num, idx in enumerate(tqdm(sampled_indices)):
+         views = dataset[idx]
+         assert len(views) == args.num_of_views
+         sample_name = f"{idx}"
+         for view_idx in range(args.num_of_views):
+             sample_name += f" {view_name(views[view_idx])}"
+         print(sample_name)
+         for view_idx in range(args.num_of_views):
+             image = rgb(
+                 views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
+             )
+             depthmap = views[view_idx]["depthmap"]
+             pose = views[view_idx]["camera_pose"]
+             intrinsics = views[view_idx]["camera_intrinsics"]
+             pts3d = views[view_idx]["pts3d"]
+             valid_mask = views[view_idx]["valid_mask"]
+             if "non_ambiguous_mask" in views[view_idx]:
+                 non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
+             else:
+                 non_ambiguous_mask = None
+             if "prior_depth_along_ray" in views[view_idx]:
+                 prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
+             else:
+                 prior_depth_along_ray = None
+             if args.viz:
+                 rr.set_time("stable_time", sequence=num)
+                 base_name = f"world/view_{view_idx}"
+                 pts_name = f"world/view_{view_idx}_pointcloud"
+                 # Log camera info and loaded data
+                 height, width = image.shape[0], image.shape[1]
+                 rr.log(
+                     base_name,
+                     rr.Transform3D(
+                         translation=pose[:3, 3],
+                         mat3x3=pose[:3, :3],
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole",
+                     rr.Pinhole(
+                         image_from_camera=intrinsics,
+                         height=height,
+                         width=width,
+                         camera_xyz=rr.ViewCoordinates.RDF,
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/rgb",
+                     rr.Image(image),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/depth",
+                     rr.DepthImage(depthmap),
+                 )
+                 if prior_depth_along_ray is not None:
+                     rr.log(
+                         f"prior_depth_along_ray_{view_idx}",
+                         rr.DepthImage(prior_depth_along_ray),
+                     )
+                 if non_ambiguous_mask is not None:
+                     rr.log(
+                         f"{base_name}/pinhole/non_ambiguous_mask",
+                         rr.SegmentationImage(non_ambiguous_mask.astype(int)),
+                     )
+                 # Log points in 3D
+                 filtered_pts = pts3d[valid_mask]
+                 filtered_pts_col = image[valid_mask]
+                 rr.log(
+                     pts_name,
+                     rr.Points3D(
+                         positions=filtered_pts.reshape(-1, 3),
+                         colors=filtered_pts_col.reshape(-1, 3),
+                     ),
+                 )
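
The viz loop reads pts3d and valid_mask, which the base dataset derives from the depth map, intrinsics, and cam2world pose. For reference, standard pinhole unprojection produces world-frame points as follows (a self-contained sketch of the geometry, not the base class's actual code):

import numpy as np

def unproject_depth(depthmap, intrinsics, c2w_pose):
    """Sketch: lift a depth map to world-frame points with a pinhole model."""
    h, w = depthmap.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    fx, fy = intrinsics[0, 0], intrinsics[1, 1]
    cx, cy = intrinsics[0, 2], intrinsics[1, 2]
    # Back-project the pixel grid into the camera frame (z forward, RDF as above)
    x = (u - cx) * depthmap / fx
    y = (v - cy) * depthmap / fy
    pts_cam = np.stack([x, y, depthmap], axis=-1).reshape(-1, 3)
    # Move the points into the world frame with the cam2world pose
    pts_world = pts_cam @ c2w_pose[:3, :3].T + c2w_pose[:3, 3]
    return pts_world.reshape(h, w, 3)

depth = np.full((2, 2), 2.0, dtype=np.float32)
K = np.array([[100.0, 0.0, 1.0], [0.0, 100.0, 1.0], [0.0, 0.0, 1.0]])
print(unproject_depth(depth, K, np.eye(4)).shape)  # (2, 2, 3)
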
mapanything/datasets/wai/spring.py ADDED
@@ -0,0 +1,316 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ Spring Dataset using WAI format data.
+ """
+
+ import os
+
+ import cv2
+ import numpy as np
+
+ from mapanything.datasets.base.base_dataset import BaseDataset
+ from mapanything.utils.wai.core import load_data, load_frame
+
+
+ class SpringWAI(BaseDataset):
+     """
+     Spring dataset containing high-quality large-scale in-the-wild scenes with unique animated objects.
+     """
+
+     def __init__(
+         self,
+         *args,
+         ROOT,
+         dataset_metadata_dir,
+         split,
+         overfit_num_sets=None,
+         sample_specific_scene: bool = False,
+         specific_scene_name: str = None,
+         **kwargs,
+     ):
+         """
+         Initialize the dataset attributes.
+         Args:
+             ROOT: Root directory of the dataset.
+             dataset_metadata_dir: Path to the dataset metadata directory.
+             split: Dataset split (train, val, test).
+             overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
+             sample_specific_scene: Whether to sample a specific scene from the dataset.
+             specific_scene_name: Name of the specific scene to sample.
+         """
+         # Initialize the dataset attributes
+         super().__init__(*args, **kwargs)
+         self.ROOT = ROOT
+         self.dataset_metadata_dir = dataset_metadata_dir
+         self.split = split
+         self.overfit_num_sets = overfit_num_sets
+         self.sample_specific_scene = sample_specific_scene
+         self.specific_scene_name = specific_scene_name
+         self._load_data()
+
+         # Define the dataset type flags
+         self.is_metric_scale = True
+         self.is_synthetic = True
+
+     def _load_data(self):
+         "Load the precomputed dataset metadata"
+         # Load the dataset metadata corresponding to the split
+         split_metadata_path = os.path.join(
+             self.dataset_metadata_dir,
+             self.split,
+             f"spring_scene_list_{self.split}.npy",
+         )
+         split_scene_list = np.load(split_metadata_path, allow_pickle=True)
+
+         # Get the list of all scenes
+         if not self.sample_specific_scene:
+             self.scenes = list(split_scene_list)
+         else:
+             self.scenes = [self.specific_scene_name]
+         self.num_of_scenes = len(self.scenes)
+
+     def _get_views(self, sampled_idx, num_views_to_sample, resolution):
+         # Get the scene name of the sampled index
+         scene_index = sampled_idx
+         scene_name = self.scenes[scene_index]
+
+         # Get the metadata corresponding to the scene
+         scene_root = os.path.join(self.ROOT, scene_name)
+         scene_meta = load_data(
+             os.path.join(scene_root, "scene_meta.json"), "scene_meta"
+         )
+         scene_file_names = list(scene_meta["frame_names"].keys())
+         num_views_in_scene = len(scene_file_names)
+
+         # Load the scene pairwise covisibility mmap
+         covisibility_version_key = "v0"
+         covisibility_map_dir = os.path.join(
+             scene_root, "covisibility", covisibility_version_key
+         )
+         covisibility_map_name = next(
+             f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
+         )  # Assumes only npy file in directory is covisibility map
+         covisibility_map_path = os.path.join(
+             scene_root, "covisibility", covisibility_version_key, covisibility_map_name
+         )
+         pairwise_covisibility = load_data(covisibility_map_path, "mmap")
+
+         # Get the indices of the N views in the scene
+         view_indices = self._sample_view_indices(
+             num_views_to_sample, num_views_in_scene, pairwise_covisibility
+         )
+
+         # Get the views corresponding to the selected view indices
+         views = []
+         for view_index in view_indices:
+             # Load the data corresponding to the view
+             view_file_name = scene_file_names[view_index]
+             view_data = load_frame(
+                 scene_root,
+                 view_file_name,
+                 modalities=["image", "depth", "skymask", "pred_mask/moge2"],
+                 scene_meta=scene_meta,
+             )
+
+             # Convert necessary data to numpy
+             image = view_data["image"].permute(1, 2, 0).numpy()
+             image = (image * 255).astype(np.uint8)
+             depthmap = view_data["depth"].numpy().astype(np.float32)
+             intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
+             c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
+
+             # Get the sky mask and mask out GT depth
+             sky_mask = view_data["skymask"].numpy().astype(int)
+             depthmap = np.where(sky_mask, 0, depthmap)
+
+             # Ensure that the depthmap has all valid values
+             depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
+
+             # Get the non_ambiguous_mask and ensure it matches image resolution
+             non_ambiguous_mask = view_data["pred_mask/moge2"].numpy().astype(int)
+             non_ambiguous_mask = cv2.resize(
+                 non_ambiguous_mask,
+                 (image.shape[1], image.shape[0]),
+                 interpolation=cv2.INTER_NEAREST,
+             )
+
+             # Mask out the GT depth using the non_ambiguous_mask
+             depthmap = np.where(non_ambiguous_mask, depthmap, 0)
+
+             # Resize the data to match the desired resolution
+             additional_quantities_to_resize = [non_ambiguous_mask]
+             image, depthmap, intrinsics, additional_quantities_to_resize = (
+                 self._crop_resize_if_necessary(
+                     image=image,
+                     resolution=resolution,
+                     depthmap=depthmap,
+                     intrinsics=intrinsics,
+                     additional_quantities=additional_quantities_to_resize,
+                 )
+             )
+             non_ambiguous_mask = additional_quantities_to_resize[0]
+
+             # Append the view dictionary to the list of views
+             views.append(
+                 dict(
+                     img=image,
+                     depthmap=depthmap,
+                     camera_pose=c2w_pose,  # cam2world
+                     camera_intrinsics=intrinsics,
+                     non_ambiguous_mask=non_ambiguous_mask,
+                     dataset="Spring",
+                     label=scene_name,
+                     instance=os.path.join("images", str(view_file_name)),
+                 )
+             )
+
+         return views
+
+
+ def get_parser():
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-rd", "--root_dir", default="/fsx/xrtech/data/spring", type=str
+     )
+     parser.add_argument(
+         "-dmd",
+         "--dataset_metadata_dir",
+         default="/fsx/nkeetha/mapanything_dataset_metadata",
+         type=str,
+     )
+     parser.add_argument(
+         "-nv",
+         "--num_of_views",
+         default=2,
+         type=int,
+     )
+     parser.add_argument("--viz", action="store_true")
+
+     return parser
+
+
+ if __name__ == "__main__":
+     import rerun as rr
+     from tqdm import tqdm
+
+     from mapanything.datasets.base.base_dataset import view_name
+     from mapanything.utils.image import rgb
+     from mapanything.utils.viz import script_add_rerun_args
+
+     parser = get_parser()
+     script_add_rerun_args(
+         parser
+     )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
+     args = parser.parse_args()
+
+     dataset = SpringWAI(
+         num_views=args.num_of_views,
+         split="train",
+         covisibility_thres=0.25,
+         ROOT=args.root_dir,
+         dataset_metadata_dir=args.dataset_metadata_dir,
+         resolution=(518, 294),
+         aug_crop=16,
+         transform="colorjitter+grayscale+gaublur",
+         data_norm_type="dinov2",
+     )
+     # dataset = SpringWAI(
+     #     num_views=args.num_of_views,
+     #     split="val",
+     #     covisibility_thres=0.25,
+     #     ROOT=args.root_dir,
+     #     dataset_metadata_dir=args.dataset_metadata_dir,
+     #     resolution=(518, 294),
+     #     seed=777,
+     #     transform="imgnorm",
+     #     data_norm_type="dinov2",
+     # )
+     print(dataset.get_stats())
+
+     if args.viz:
+         rr.script_setup(args, "Spring_Dataloader")
+         rr.set_time("stable_time", sequence=0)
+         rr.log("world", rr.ViewCoordinates.RDF, static=True)
+
+     sampled_indices = np.random.choice(len(dataset), size=10, replace=False)
+
+     for num, idx in enumerate(tqdm(sampled_indices)):
+         views = dataset[idx]
+         assert len(views) == args.num_of_views
+         sample_name = f"{idx}"
+         for view_idx in range(args.num_of_views):
+             sample_name += f" {view_name(views[view_idx])}"
+         print(sample_name)
+         for view_idx in range(args.num_of_views):
+             image = rgb(
+                 views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
+             )
+             depthmap = views[view_idx]["depthmap"]
+             pose = views[view_idx]["camera_pose"]
+             intrinsics = views[view_idx]["camera_intrinsics"]
+             pts3d = views[view_idx]["pts3d"]
+             valid_mask = views[view_idx]["valid_mask"]
+             if "non_ambiguous_mask" in views[view_idx]:
+                 non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
+             else:
+                 non_ambiguous_mask = None
+             if "prior_depth_along_ray" in views[view_idx]:
+                 prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
+             else:
+                 prior_depth_along_ray = None
+             if args.viz:
+                 rr.set_time("stable_time", sequence=num)
+                 base_name = f"world/view_{view_idx}"
+                 pts_name = f"world/view_{view_idx}_pointcloud"
+                 # Log camera info and loaded data
+                 height, width = image.shape[0], image.shape[1]
+                 rr.log(
+                     base_name,
+                     rr.Transform3D(
+                         translation=pose[:3, 3],
+                         mat3x3=pose[:3, :3],
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole",
+                     rr.Pinhole(
+                         image_from_camera=intrinsics,
+                         height=height,
+                         width=width,
+                         camera_xyz=rr.ViewCoordinates.RDF,
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/rgb",
+                     rr.Image(image),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/depth",
+                     rr.DepthImage(depthmap),
+                 )
+                 if prior_depth_along_ray is not None:
+                     rr.log(
+                         f"prior_depth_along_ray_{view_idx}",
+                         rr.DepthImage(prior_depth_along_ray),
+                     )
+                 if non_ambiguous_mask is not None:
+                     rr.log(
+                         f"{base_name}/pinhole/non_ambiguous_mask",
+                         rr.SegmentationImage(non_ambiguous_mask.astype(int)),
+                     )
+                 # Log points in 3D
+                 filtered_pts = pts3d[valid_mask]
+                 filtered_pts_col = image[valid_mask]
+                 rr.log(
+                     pts_name,
+                     rr.Points3D(
+                         positions=filtered_pts.reshape(-1, 3),
+                         colors=filtered_pts_col.reshape(-1, 3),
+                     ),
+                 )
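
The Spring loader invalidates depth in two stages: rendered sky pixels are zeroed first, then anything the MoGe-predicted mask flags as ambiguous. A toy demonstration of the same np.where composition:

import numpy as np

depth = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
sky_mask = np.array([[1, 0], [0, 0]])    # 1 = sky
valid_mask = np.array([[1, 1], [0, 1]])  # 1 = non-ambiguous (e.g. from MoGe)

depth = np.where(sky_mask, 0, depth)     # drop sky pixels
depth = np.where(valid_mask, depth, 0)   # drop ambiguous pixels
print(depth)  # [[0. 2.] [0. 4.]]
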
mapanything/datasets/wai/tav2_wb.py ADDED
@@ -0,0 +1,328 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ TartanAirV2-WB Dataset using WAI format data.
+ """
+
+ import os
+
+ import cv2
+ import numpy as np
+
+ from mapanything.datasets.base.base_dataset import BaseDataset
+ from mapanything.utils.wai.core import load_data, load_frame
+
+
+ class TartanAirV2WBWAI(BaseDataset):
+     """
+     TartanAirV2-WB dataset containing vast in-the-wild synthetic scenes.
+     """
+
+     def __init__(
+         self,
+         *args,
+         ROOT,
+         dataset_metadata_dir,
+         split,
+         overfit_num_sets=None,
+         sample_specific_scene: bool = False,
+         specific_scene_name: str = None,
+         **kwargs,
+     ):
+         """
+         Initialize the dataset attributes.
+         Args:
+             ROOT: Root directory of the dataset.
+             dataset_metadata_dir: Path to the dataset metadata directory.
+             split: Dataset split (train, val, test).
+             overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
+             sample_specific_scene: Whether to sample a specific scene from the dataset.
+             specific_scene_name: Name of the specific scene to sample.
+         """
+         # Initialize the dataset attributes
+         super().__init__(*args, **kwargs)
+         self.ROOT = ROOT
+         self.dataset_metadata_dir = dataset_metadata_dir
+         self.split = split
+         self.overfit_num_sets = overfit_num_sets
+         self.sample_specific_scene = sample_specific_scene
+         self.specific_scene_name = specific_scene_name
+         self._load_data()
+
+         # Define the dataset type flags
+         self.is_metric_scale = True
+         self.is_synthetic = True
+
+     def _load_data(self):
+         "Load the precomputed dataset metadata"
+         # Load the dataset metadata corresponding to the split
+         split_metadata_path = os.path.join(
+             self.dataset_metadata_dir,
+             self.split,
+             f"tav2_wb_scene_list_{self.split}.npy",
+         )
+         split_scene_list = np.load(split_metadata_path, allow_pickle=True)
+
+         # Get the list of all scenes
+         if not self.sample_specific_scene:
+             self.scenes = list(split_scene_list)
+         else:
+             self.scenes = [self.specific_scene_name]
+         self.num_of_scenes = len(self.scenes)
+
+     def _get_views(self, sampled_idx, num_views_to_sample, resolution):
+         # Get the scene name of the sampled index
+         scene_index = sampled_idx
+         scene_name = self.scenes[scene_index]
+
+         # Get the metadata corresponding to the scene
+         scene_root = os.path.join(self.ROOT, scene_name)
+         scene_meta = load_data(
+             os.path.join(scene_root, "scene_meta.json"), "scene_meta"
+         )
+         scene_file_names = list(scene_meta["frame_names"].keys())
+         num_views_in_scene = len(scene_file_names)
+
+         # Load the scene pairwise covisibility mmap
+         covisibility_version_key = "v0"
+         covisibility_map_dir = os.path.join(
+             scene_root, "covisibility", covisibility_version_key
+         )
+         # Assumes the only .npy file in the directory is the covisibility map
+         covisibility_map_name = next(
+             f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
+         )
+         covisibility_map_path = os.path.join(
+             scene_root, "covisibility", covisibility_version_key, covisibility_map_name
+         )
+         pairwise_covisibility = load_data(covisibility_map_path, "mmap")
+
+         # Get the indices of the N views in the scene
+         view_indices = self._sample_view_indices(
+             num_views_to_sample, num_views_in_scene, pairwise_covisibility
+         )
+
+         # Get the views corresponding to the selected view indices
+         views = []
+         for view_index in view_indices:
+             # Load the data corresponding to the view
+             view_file_name = scene_file_names[view_index]
+             view_data = load_frame(
+                 scene_root,
+                 view_file_name,
+                 modalities=["image", "depth", "pred_mask/moge2"],
+                 scene_meta=scene_meta,
+             )
+
+             # Convert necessary data to numpy
+             image = view_data["image"].permute(1, 2, 0).numpy()
+             image = (image * 255).astype(np.uint8)
+             depthmap = view_data["depth"].numpy().astype(np.float32)
+             intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
+             c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
+
+             # Ensure that the depthmap has all valid values
+             depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
+
+             # Mask out the outlier depth caused by transparent windows in TartanAirV2
+             percentile_depth = np.percentile(depthmap, 95)
+             depthmap[depthmap > percentile_depth] = 0
+
+             # Get the non_ambiguous_mask and ensure it matches image resolution
+             non_ambiguous_mask = view_data["pred_mask/moge2"].numpy().astype(int)
+             non_ambiguous_mask = cv2.resize(
+                 non_ambiguous_mask,
+                 (image.shape[1], image.shape[0]),
+                 interpolation=cv2.INTER_NEAREST,
+             )
+
+             # Mask out the GT depth using the non_ambiguous_mask
+             depthmap = np.where(non_ambiguous_mask, depthmap, 0)
+
+             # Resize the data to match the desired resolution
+             additional_quantities_to_resize = [non_ambiguous_mask]
+             image, depthmap, intrinsics, additional_quantities_to_resize = (
+                 self._crop_resize_if_necessary(
+                     image=image,
+                     resolution=resolution,
+                     depthmap=depthmap,
+                     intrinsics=intrinsics,
+                     additional_quantities=additional_quantities_to_resize,
+                 )
+             )
+             non_ambiguous_mask = additional_quantities_to_resize[0]
+
+             # Append the view dictionary to the list of views
+             views.append(
+                 dict(
+                     img=image,
+                     depthmap=depthmap,
+                     camera_pose=c2w_pose,  # cam2world
+                     camera_intrinsics=intrinsics,
+                     non_ambiguous_mask=non_ambiguous_mask,
+                     dataset="TartanAirV2WB",
+                     label=scene_name,
+                     instance=os.path.join("images", str(view_file_name)),
+                 )
+             )
+
+         return views
+
+
+ def get_parser():
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-rd", "--root_dir", default="/fsx/xrtech/data/tav2_wb", type=str
+     )
+     parser.add_argument(
+         "-dmd",
+         "--dataset_metadata_dir",
+         default="/fsx/nkeetha/mapanything_dataset_metadata",
+         type=str,
+     )
+     parser.add_argument(
+         "-nv",
+         "--num_of_views",
+         default=2,
+         type=int,
+     )
+     parser.add_argument("--viz", action="store_true")
+
+     return parser
+
+
+ if __name__ == "__main__":
+     import rerun as rr
+     from tqdm import tqdm
+
+     from mapanything.datasets.base.base_dataset import view_name
+     from mapanything.utils.image import rgb
+     from mapanything.utils.viz import script_add_rerun_args
+
+     parser = get_parser()
+     script_add_rerun_args(
+         parser
+     )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
+     args = parser.parse_args()
+
+     dataset = TartanAirV2WBWAI(
+         num_views=args.num_of_views,
+         split="train",
+         covisibility_thres=0.25,
+         ROOT=args.root_dir,
+         dataset_metadata_dir=args.dataset_metadata_dir,
+         resolution=(518, 518),
+         aug_crop=16,
+         transform="colorjitter+grayscale+gaublur",
+         data_norm_type="dinov2",
+     )
+     # dataset = TartanAirV2WBWAI(
+     #     num_views=args.num_of_views,
+     #     split="val",
+     #     covisibility_thres=0.25,
+     #     ROOT=args.root_dir,
+     #     dataset_metadata_dir=args.dataset_metadata_dir,
+     #     resolution=(518, 518),
+     #     seed=777,
+     #     transform="imgnorm",
+     #     data_norm_type="dinov2",
+     # )
+     # dataset = TartanAirV2WBWAI(
+     #     num_views=args.num_of_views,
+     #     split="test",
+     #     covisibility_thres=0.25,
+     #     ROOT=args.root_dir,
+     #     dataset_metadata_dir=args.dataset_metadata_dir,
+     #     resolution=(518, 518),
+     #     seed=777,
+     #     transform="imgnorm",
+     #     data_norm_type="dinov2",
+     # )
+     print(dataset.get_stats())
+
+     if args.viz:
+         rr.script_setup(args, "TartanAirV2WB_Dataloader")
+         rr.set_time("stable_time", sequence=0)
+         rr.log("world", rr.ViewCoordinates.RDF, static=True)
+
+     sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)
+
+     for num, idx in enumerate(tqdm(sampled_indices)):
+         views = dataset[idx]
+         assert len(views) == args.num_of_views
+         sample_name = f"{idx}"
+         for view_idx in range(args.num_of_views):
+             sample_name += f" {view_name(views[view_idx])}"
+         print(sample_name)
+         for view_idx in range(args.num_of_views):
+             image = rgb(
+                 views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
+             )
+             depthmap = views[view_idx]["depthmap"]
+             pose = views[view_idx]["camera_pose"]
+             intrinsics = views[view_idx]["camera_intrinsics"]
+             pts3d = views[view_idx]["pts3d"]
+             valid_mask = views[view_idx]["valid_mask"]
+             if "non_ambiguous_mask" in views[view_idx]:
+                 non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
+             else:
+                 non_ambiguous_mask = None
+             if "prior_depth_along_ray" in views[view_idx]:
+                 prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
+             else:
+                 prior_depth_along_ray = None
+             if args.viz:
+                 rr.set_time("stable_time", sequence=num)
+                 base_name = f"world/view_{view_idx}"
+                 pts_name = f"world/view_{view_idx}_pointcloud"
+                 # Log camera info and loaded data
+                 height, width = image.shape[0], image.shape[1]
+                 rr.log(
+                     base_name,
+                     rr.Transform3D(
+                         translation=pose[:3, 3],
+                         mat3x3=pose[:3, :3],
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole",
+                     rr.Pinhole(
+                         image_from_camera=intrinsics,
+                         height=height,
+                         width=width,
+                         camera_xyz=rr.ViewCoordinates.RDF,
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/rgb",
+                     rr.Image(image),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/depth",
+                     rr.DepthImage(depthmap),
+                 )
+                 if prior_depth_along_ray is not None:
+                     rr.log(
+                         f"prior_depth_along_ray_{view_idx}",
+                         rr.DepthImage(prior_depth_along_ray),
+                     )
+                 if non_ambiguous_mask is not None:
+                     rr.log(
+                         f"{base_name}/pinhole/non_ambiguous_mask",
+                         rr.SegmentationImage(non_ambiguous_mask.astype(int)),
+                     )
+                 # Log points in 3D
+                 filtered_pts = pts3d[valid_mask]
+                 filtered_pts_col = image[valid_mask]
+                 rr.log(
+                     pts_name,
+                     rr.Points3D(
+                         positions=filtered_pts.reshape(-1, 3),
+                         colors=filtered_pts_col.reshape(-1, 3),
+                     ),
+                 )
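
Both the Spring and TartanAirV2-WB loaders resize the binary MoGe mask with cv2.INTER_NEAREST, which keeps the output strictly binary; any interpolating mode would smear fractional values along the mask boundary. A minimal demonstration:

import cv2
import numpy as np

mask = np.array([[0, 1], [1, 0]], dtype=np.float32)

nearest = cv2.resize(mask, (4, 4), interpolation=cv2.INTER_NEAREST)
linear = cv2.resize(mask, (4, 4), interpolation=cv2.INTER_LINEAR)

print(np.unique(nearest))  # [0. 1.] -> still a valid binary mask
print(np.unique(linear))   # fractional values appear at the 0/1 boundary
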
mapanything/datasets/wai/unrealstereo4k.py ADDED
@@ -0,0 +1,309 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ UnrealStereo4K Dataset using WAI format data.
8
+ """
9
+
10
+ import os
11
+
12
+ import numpy as np
13
+
14
+ from mapanything.datasets.base.base_dataset import BaseDataset
15
+ from mapanything.utils.wai.core import load_data, load_frame
16
+
17
+
18
+ class UnrealStereo4KWAI(BaseDataset):
19
+ """
20
+ UnrealStereo4K dataset containing synthetic in-the-wild scenes.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ *args,
26
+ ROOT,
27
+ dataset_metadata_dir,
28
+ split,
29
+ overfit_num_sets=None,
30
+ sample_specific_scene: bool = False,
31
+ specific_scene_name: str = None,
32
+ **kwargs,
33
+ ):
34
+ """
35
+ Initialize the dataset attributes.
36
+ Args:
37
+ ROOT: Root directory of the dataset.
38
+ dataset_metadata_dir: Path to the dataset metadata directory.
39
+ split: Dataset split (train, val, test).
40
+ overfit_num_sets: If None, use all sets. Else, the dataset will be truncated to this number of sets.
41
+ sample_specific_scene: Whether to sample a specific scene from the dataset.
42
+ specific_scene_name: Name of the specific scene to sample.
43
+ """
44
+ # Initialize the dataset attributes
45
+ super().__init__(*args, **kwargs)
46
+ self.ROOT = ROOT
47
+ self.dataset_metadata_dir = dataset_metadata_dir
48
+ self.split = split
49
+ self.overfit_num_sets = overfit_num_sets
50
+ self.sample_specific_scene = sample_specific_scene
51
+ self.specific_scene_name = specific_scene_name
52
+ self._load_data()
53
+
54
+ # Define the dataset type flags
55
+ self.is_metric_scale = True
56
+ self.is_synthetic = True
57
+
58
+ def _load_data(self):
59
+ "Load the precomputed dataset metadata"
60
+ # Load the dataset metadata corresponding to the split
61
+ split_metadata_path = os.path.join(
62
+ self.dataset_metadata_dir,
63
+ self.split,
64
+ f"unrealstereo4k_scene_list_{self.split}.npy",
65
+ )
66
+ split_scene_list = np.load(split_metadata_path, allow_pickle=True)
67
+
68
+ # Get the list of all scenes
69
+ if not self.sample_specific_scene:
70
+ self.scenes = list(split_scene_list)
71
+ else:
72
+ self.scenes = [self.specific_scene_name]
73
+ self.num_of_scenes = len(self.scenes)
74
+
75
+ def _get_views(self, sampled_idx, num_views_to_sample, resolution):
76
+ # Get the scene name of the sampled index
77
+ scene_index = sampled_idx
78
+ scene_name = self.scenes[scene_index]
79
+
80
+ # Get the metadata corresponding to the scene
81
+ scene_root = os.path.join(self.ROOT, scene_name)
82
+ scene_meta = load_data(
83
+ os.path.join(scene_root, "scene_meta.json"), "scene_meta"
84
+ )
85
+ scene_file_names = list(scene_meta["frame_names"].keys())
86
+ num_views_in_scene = len(scene_file_names)
87
+
88
+ # Load the scene pairwise covisibility mmap
89
+ covisibility_version_key = "v0"
90
+ covisibility_map_dir = os.path.join(
91
+ scene_root, "covisibility", covisibility_version_key
92
+ )
93
+ # Assumes only npy file in directory is covisibility map
94
+ covisibility_map_name = next(
95
+ f for f in os.listdir(covisibility_map_dir) if f.endswith(".npy")
96
+ )
97
+ covisibility_map_path = os.path.join(
98
+ scene_root, "covisibility", covisibility_version_key, covisibility_map_name
99
+ )
100
+ pairwise_covisibility = load_data(covisibility_map_path, "mmap")
101
+
102
+ # Get the indices of the N views in the scene
103
+ view_indices = self._sample_view_indices(
104
+ num_views_to_sample, num_views_in_scene, pairwise_covisibility
105
+ )
106
+
107
+ # Get the views corresponding to the selected view indices
108
+ views = []
109
+ for view_index in view_indices:
110
+ # Load the data corresponding to the view
111
+ view_file_name = scene_file_names[view_index]
112
+ view_data = load_frame(
113
+ scene_root,
114
+ view_file_name,
115
+ modalities=["image", "depth"],
116
+ scene_meta=scene_meta,
117
+ )
118
+
119
+ # Convert necessary data to numpy
120
+ image = view_data["image"].permute(1, 2, 0).numpy()
121
+ image = image[:, :, :3] # RGBA to RGB
122
+ image = (image * 255).astype(np.uint8)
123
+ depthmap = view_data["depth"].numpy().astype(np.float32)
124
+ intrinsics = view_data["intrinsics"].numpy().astype(np.float32)
125
+ c2w_pose = view_data["extrinsics"].numpy().astype(np.float32)
126
+
127
+ # Ensure that the depthmap has all valid values
128
+ depthmap = np.nan_to_num(depthmap, nan=0.0, posinf=0.0, neginf=0.0)
129
+
130
+ # Get the non ambiguous mask (zero depth pixels are sky or ambiguous)
131
+ non_ambiguous_mask = (depthmap > 0).astype(int)
132
+
133
+ # Mask out the outlier depth (horizon depth)
134
+ percentile_depth = np.percentile(depthmap, 95)
135
+ depthmap[depthmap > percentile_depth] = 0
136
+
137
+ # Resize the data to match the desired resolution
138
+ additional_quantities_to_resize = [non_ambiguous_mask]
139
+ image, depthmap, intrinsics, additional_quantities_to_resize = (
140
+ self._crop_resize_if_necessary(
141
+ image=image,
142
+ resolution=resolution,
143
+ depthmap=depthmap,
144
+ intrinsics=intrinsics,
145
+ additional_quantities=additional_quantities_to_resize,
146
+ )
147
+ )
148
+ non_ambiguous_mask = additional_quantities_to_resize[0]
149
+
150
+ # Append the view dictionary to the list of views
151
+ views.append(
152
+ dict(
153
+ img=image,
154
+ depthmap=depthmap,
155
+ camera_pose=c2w_pose, # cam2world
156
+ camera_intrinsics=intrinsics,
157
+ non_ambiguous_mask=non_ambiguous_mask,
158
+ dataset="UnrealStereo4K",
159
+ label=scene_name,
160
+ instance=os.path.join("images", str(view_file_name)),
161
+ )
162
+ )
163
+
164
+ return views
165
+
166
+
167
+ def get_parser():
168
+ import argparse
169
+
170
+ parser = argparse.ArgumentParser()
171
+ parser.add_argument(
172
+ "-rd", "--root_dir", default="/fsx/xrtech/data/unrealstereo4k", type=str
173
+ )
174
+ parser.add_argument(
175
+ "-dmd",
176
+ "--dataset_metadata_dir",
177
+ default="/fsx/nkeetha/mapanything_dataset_metadata",
178
+ type=str,
179
+ )
180
+ parser.add_argument(
181
+ "-nv",
182
+ "--num_of_views",
183
+ default=2,
184
+ type=int,
185
+ )
186
+ parser.add_argument("--viz", action="store_true")
187
+
188
+ return parser
189
+
190
+
191
+ if __name__ == "__main__":
+     import rerun as rr
+     from tqdm import tqdm
+
+     from mapanything.datasets.base.base_dataset import view_name
+     from mapanything.utils.image import rgb
+     from mapanything.utils.viz import script_add_rerun_args
+
+     parser = get_parser()
+     script_add_rerun_args(
+         parser
+     )  # Options: --headless, --connect, --serve, --addr, --save, --stdout
+     args = parser.parse_args()
+
+     dataset = UnrealStereo4KWAI(
+         num_views=args.num_of_views,
+         split="train",
+         covisibility_thres=0.25,
+         ROOT=args.root_dir,
+         dataset_metadata_dir=args.dataset_metadata_dir,
+         resolution=(518, 294),
+         aug_crop=16,
+         transform="colorjitter+grayscale+gaublur",
+         data_norm_type="dinov2",
+     )
+     # dataset = UnrealStereo4KWAI(
+     #     num_views=args.num_of_views,
+     #     split="val",
+     #     covisibility_thres=0.25,
+     #     ROOT=args.root_dir,
+     #     dataset_metadata_dir=args.dataset_metadata_dir,
+     #     resolution=(518, 294),
+     #     seed=777,
+     #     transform="imgnorm",
+     #     data_norm_type="dinov2",
+     # )
+     print(dataset.get_stats())
+
+     if args.viz:
+         rr.script_setup(args, "UnrealStereo4K_Dataloader")
+         rr.set_time("stable_time", sequence=0)
+         rr.log("world", rr.ViewCoordinates.RDF, static=True)
+
+     sampled_indices = np.random.choice(len(dataset), size=len(dataset), replace=False)
+
+     for num, idx in enumerate(tqdm(sampled_indices)):
+         views = dataset[idx]
+         assert len(views) == args.num_of_views
+         sample_name = f"{idx}"
+         for view_idx in range(args.num_of_views):
+             sample_name += f" {view_name(views[view_idx])}"
+         print(sample_name)
+         for view_idx in range(args.num_of_views):
+             image = rgb(
+                 views[view_idx]["img"], norm_type=views[view_idx]["data_norm_type"]
+             )
+             depthmap = views[view_idx]["depthmap"]
+             pose = views[view_idx]["camera_pose"]
+             intrinsics = views[view_idx]["camera_intrinsics"]
+             pts3d = views[view_idx]["pts3d"]
+             valid_mask = views[view_idx]["valid_mask"]
+             if "non_ambiguous_mask" in views[view_idx]:
+                 non_ambiguous_mask = views[view_idx]["non_ambiguous_mask"]
+             else:
+                 non_ambiguous_mask = None
+             if "prior_depth_along_ray" in views[view_idx]:
+                 prior_depth_along_ray = views[view_idx]["prior_depth_along_ray"]
+             else:
+                 prior_depth_along_ray = None
+             if args.viz:
+                 rr.set_time("stable_time", sequence=num)
+                 base_name = f"world/view_{view_idx}"
+                 pts_name = f"world/view_{view_idx}_pointcloud"
+                 # Log camera info and loaded data
+                 height, width = image.shape[0], image.shape[1]
+                 rr.log(
+                     base_name,
+                     rr.Transform3D(
+                         translation=pose[:3, 3],
+                         mat3x3=pose[:3, :3],
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole",
+                     rr.Pinhole(
+                         image_from_camera=intrinsics,
+                         height=height,
+                         width=width,
+                         camera_xyz=rr.ViewCoordinates.RDF,
+                     ),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/rgb",
+                     rr.Image(image),
+                 )
+                 rr.log(
+                     f"{base_name}/pinhole/depth",
+                     rr.DepthImage(depthmap),
+                 )
+                 if prior_depth_along_ray is not None:
+                     rr.log(
+                         f"prior_depth_along_ray_{view_idx}",
+                         rr.DepthImage(prior_depth_along_ray),
+                     )
+                 if non_ambiguous_mask is not None:
+                     rr.log(
+                         f"{base_name}/pinhole/non_ambiguous_mask",
+                         rr.SegmentationImage(non_ambiguous_mask.astype(int)),
+                     )
+                 # Log points in 3D
+                 filtered_pts = pts3d[valid_mask]
+                 filtered_pts_col = image[valid_mask]
+                 rr.log(
+                     pts_name,
+                     rr.Points3D(
+                         positions=filtered_pts.reshape(-1, 3),
+                         colors=filtered_pts_col.reshape(-1, 3),
+                     ),
+                 )
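A minimal sketch (not part of this commit) of consuming the dataset without the Rerun loop above; `batch_size=None` is a hypothetical choice that keeps each sample as the list of per-view dicts returned by `dataset[idx]`:

    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=None, shuffle=True, num_workers=4)
    for views in loader:
        # Each item is one multi-view sample, exactly as in the loop above
        images = [v["img"] for v in views]          # per-view image tensors
        poses = [v["camera_pose"] for v in views]   # per-view 4x4 pose matrices
        break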
mapanything/models/__init__.py ADDED
@@ -0,0 +1,190 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ Model Factory for MapAnything
+ """
+
+ import importlib.util
+ import logging
+ import warnings
+
+ import numpy as np
+ from omegaconf import DictConfig, OmegaConf
+
+ # Core models that are always available
+ from mapanything.models.mapanything import (
+     MapAnything,
+     MapAnythingAblations,
+     ModularDUSt3R,
+ )
+
+ # Suppress DINOv2 warnings
+ logging.getLogger("dinov2").setLevel(logging.WARNING)
+ warnings.filterwarnings("ignore", message="xFormers is available", category=UserWarning)
+ warnings.filterwarnings(
+     "ignore", message="xFormers is not available", category=UserWarning
+ )
+
+
+ def resolve_special_float(value):
+     if value == "inf":
+         return np.inf
+     elif value == "-inf":
+         return -np.inf
+     else:
+         raise ValueError(f"Unknown special float value: {value}")
+
+
+ def init_model(
+     model_str: str, model_config: DictConfig, torch_hub_force_reload: bool = False
+ ):
+     """
+     Initialize a model using OmegaConf configuration.
+
+     Args:
+         model_str (str): Name of the model class to create.
+         model_config (DictConfig): OmegaConf model configuration.
+         torch_hub_force_reload (bool): Whether to force reload relevant parts of the model from torch hub.
+     """
+     if not OmegaConf.has_resolver("special_float"):
+         OmegaConf.register_new_resolver("special_float", resolve_special_float)
+     model_dict = OmegaConf.to_container(model_config, resolve=True)
+     model = model_factory(
+         model_str, torch_hub_force_reload=torch_hub_force_reload, **model_dict
+     )
+
+     return model
+
+
+ # Define model configurations with import paths
+ MODEL_CONFIGS = {
+     # Core models
+     "mapanything": {
+         "class": MapAnything,
+     },
+     "mapanything_ablations": {
+         "class": MapAnythingAblations,
+     },
+     "modular_dust3r": {
+         "class": ModularDUSt3R,
+     },
+     # External models
+     "anycalib": {
+         "module": "mapanything.models.external.anycalib",
+         "class_name": "AnyCalibWrapper",
+     },
+     "dust3r": {
+         "module": "mapanything.models.external.dust3r",
+         "class_name": "DUSt3RBAWrapper",
+     },
+     "mast3r": {
+         "module": "mapanything.models.external.mast3r",
+         "class_name": "MASt3RSGAWrapper",
+     },
+     "moge": {
+         "module": "mapanything.models.external.moge",
+         "class_name": "MoGeWrapper",
+     },
+     "must3r": {
+         "module": "mapanything.models.external.must3r",
+         "class_name": "MUSt3RWrapper",
+     },
+     "pi3": {
+         "module": "mapanything.models.external.pi3",
+         "class_name": "Pi3Wrapper",
+     },
+     "pow3r": {
+         "module": "mapanything.models.external.pow3r",
+         "class_name": "Pow3RWrapper",
+     },
+     "pow3r_ba": {
+         "module": "mapanything.models.external.pow3r",
+         "class_name": "Pow3RBAWrapper",
+     },
+     "vggt": {
+         "module": "mapanything.models.external.vggt",
+         "class_name": "VGGTWrapper",
+     },
+     # Add other model classes here
+ }
+
+
+ def check_module_exists(module_path):
+     """
+     Check if a module can be imported without actually importing it.
+
+     Args:
+         module_path (str): The path to the module to check.
+
+     Returns:
+         bool: True if the module can be imported, False otherwise.
+     """
+     return importlib.util.find_spec(module_path) is not None
+
+
+ def model_factory(model_str: str, **kwargs):
+     """
+     Model factory for MapAnything.
+
+     Args:
+         model_str (str): Name of the model to create.
+         **kwargs: Additional keyword arguments to pass to the model constructor.
+
+     Returns:
+         nn.Module: An instance of the specified model.
+     """
+     if model_str not in MODEL_CONFIGS:
+         raise ValueError(
+             f"Unknown model: {model_str}. Valid options are: {', '.join(MODEL_CONFIGS.keys())}"
+         )
+
+     model_config = MODEL_CONFIGS[model_str]
+
+     # Handle core models directly
+     if "class" in model_config:
+         model_class = model_config["class"]
+     # Handle external models with dynamic imports
+     elif "module" in model_config:
+         module_path = model_config["module"]
+         class_name = model_config["class_name"]
+
+         # Check if the module can be imported
+         if not check_module_exists(module_path):
+             raise ImportError(
+                 f"Model '{model_str}' requires module '{module_path}' which is not installed. "
+                 f"Please install the corresponding submodule or package."
+             )
+
+         # Dynamically import the module and get the class
+         try:
+             module = importlib.import_module(module_path)
+             model_class = getattr(module, class_name)
+         except (ImportError, AttributeError) as e:
+             raise ImportError(
+                 f"Failed to import {class_name} from {module_path}: {str(e)}"
+             )
+     else:
+         raise ValueError(f"Invalid model configuration for {model_str}")
+
+     print(f"Initializing {model_class} with kwargs: {kwargs}")
+     if model_str != "org_dust3r":
+         return model_class(**kwargs)
+     else:
+         eval_str = kwargs.get("model_eval_str", None)
+         return eval(eval_str)
+
+
+ def get_available_models() -> list:
+     """
+     Get a list of available models in MapAnything.
+
+     Returns:
+         list: A list of available model names.
+     """
+     return list(MODEL_CONFIGS.keys())
+
+
+ __all__ = ["model_factory", "get_available_models"]
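A small usage sketch grounded in the factory above: valid names come from `get_available_models()`, and unknown names raise a `ValueError` that lists the options (the bad name below is a deliberate placeholder):

    from mapanything.models import get_available_models, model_factory

    print(get_available_models())  # ['mapanything', 'mapanything_ablations', ...]
    try:
        model_factory("not_a_model")  # hypothetical bad name
    except ValueError as err:
        print(err)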
mapanything/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (5.63 kB)
 
mapanything/models/external/README.md ADDED
@@ -0,0 +1,5 @@
+ # External Model Code for Benchmarking & Re-Training
+
+ This directory contains external model code that we use to train and benchmark external models fairly. These libraries are not part of the core MapAnything codebase and are included only for benchmarking purposes. The code in this directory is licensed under the same license as the source code from which it was derived, unless otherwise specified.
+
+ The open-source Apache 2.0 License of MapAnything does not apply to these libraries.
mapanything/models/external/__init__.py ADDED
File without changes
mapanything/models/external/anycalib/__init__.py ADDED
@@ -0,0 +1,100 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ """
+ Inference wrapper for AnyCalib
+ """
+
+ import torch
+ from anycalib import AnyCalib
+
+ from mapanything.utils.geometry import get_rays_in_camera_frame
+
+
+ class AnyCalibWrapper(torch.nn.Module):
+     def __init__(
+         self,
+         name,
+         model_id="anycalib_pinhole",
+         **kwargs,
+     ):
+         super().__init__()
+         self.name = name
+         self.model_id = model_id
+
+         # Initialize the model
+         self.model = AnyCalib(model_id=self.model_id)
+
+     def forward(self, views):
+         """
+         Forward pass wrapper for AnyCalib.
+
+         Assumption:
+         - The number of input views is 1.
+         - The output camera model is pinhole (fx, fy, cx, cy).
+           This can be relaxed by not hardcoding the cam_id.
+
+         Args:
+             views (List[dict]): List of dictionaries containing the input views' images and instance information.
+                 Length of the list should be 1.
+                 Each dictionary should contain the following keys:
+                     "img" (tensor): Image tensor of shape (B, C, H, W).
+                     "data_norm_type" (list): ["identity"]
+
+         Returns:
+             List[dict]: A list containing the final outputs for the single view. Length of the list will be 1.
+         """
+         # Check that the number of input views is 1
+         assert len(views) == 1, "AnyCalib only supports 1 input view."
+
+         # Get input shape of the images and batch size per view
+         _, _, height, width = views[0]["img"].shape
+
+         # Check the data norm type
+         # AnyCalib expects a normalized image but without the DINOv2 mean and std applied ("identity")
+         data_norm_type = views[0]["data_norm_type"][0]
+         assert data_norm_type == "identity", (
+             "AnyCalib expects a normalized image but without the DINOv2 mean and std applied"
+         )
+
+         # Run AnyCalib inference
+         # Corresponding batched output dictionary:
+         # {
+         #     "intrinsics": List[(D_i,) tensors] for each camera model "i" at the original input resolution,
+         #     "fov_field": (B, N, 2) tensor with the regressed FoV field by the network. N≈320^2 (resolution close to the one seen during training),
+         #     "tangent_coords": alias for "fov_field",
+         #     "rays": (B, N, 3) tensor with the corresponding (via the exponential map) ray directions in the camera frame (x right, y down, z forward),
+         #     "pred_size": (H, W) tuple with the image size used by the network. It can be used e.g. for resizing the FoV/ray fields to the original image size.
+         # }
+         # For "pinhole" camera model, the intrinsics are (fx, fy, cx, cy).
+         model_outputs = self.model.predict(views[0]["img"], cam_id="pinhole")
+
+         # Convert the list of intrinsics to a tensor
+         intrinsics = []
+         for intrinsics_per_sample in model_outputs["intrinsics"]:
+             pred_fx, pred_fy, pred_cx, pred_cy = intrinsics_per_sample
+             intrinsics_per_sample = torch.tensor(
+                 [
+                     [pred_fx, 0, pred_cx],
+                     [0, pred_fy, pred_cy],
+                     [0, 0, 1],
+                 ],
+                 device=views[0]["img"].device,
+             )
+             intrinsics.append(intrinsics_per_sample)
+
+         # Convert the list of intrinsics to a tensor of size (batch_size_per_view, 3, 3)
+         intrinsics = torch.stack(intrinsics)
+
+         # Get the ray directions
+         with torch.autocast("cuda", enabled=False):
+             _, ray_directions = get_rays_in_camera_frame(
+                 intrinsics, height, width, normalize_to_unit_sphere=True
+             )
+
+         # Return the output in MapAnything format
+         res = [{"ray_directions": ray_directions, "intrinsics": intrinsics}]
+
+         return res
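A minimal sketch of driving the wrapper (assuming the `anycalib` package is installed; the 294x518 resolution is an arbitrary example, and the image must use the "identity" normalization per the assertion above):

    import torch

    wrapper = AnyCalibWrapper(name="anycalib")
    view = {"img": torch.rand(1, 3, 294, 518), "data_norm_type": ["identity"]}
    with torch.no_grad():
        (out,) = wrapper([view])
    print(out["intrinsics"].shape)  # (1, 3, 3) pinhole intrinsics
    # out["ray_directions"] holds per-pixel unit-norm rays in the camera frame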
mapanything/models/external/dinov2/__init__.py ADDED
@@ -0,0 +1,6 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ __version__ = "0.0.1"
mapanything/models/external/dinov2/hub/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
mapanything/models/external/dinov2/hub/backbones.py ADDED
@@ -0,0 +1,183 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ from enum import Enum
+ from typing import Union
+
+ import torch
+
+ from mapanything.models.external.dinov2.hub.utils import (
+     _DINOV2_BASE_URL,
+     _make_dinov2_model_name,
+ )
+
+
+ class Weights(Enum):
+     LVD142M = "LVD142M"
+
+
+ def _make_dinov2_model(
+     *,
+     arch_name: str = "vit_large",
+     img_size: int = 518,
+     patch_size: int = 14,
+     init_values: float = 1.0,
+     ffn_layer: str = "mlp",
+     block_chunks: int = 0,
+     num_register_tokens: int = 0,
+     interpolate_antialias: bool = False,
+     interpolate_offset: float = 0.1,
+     pretrained: bool = True,
+     weights: Union[Weights, str] = Weights.LVD142M,
+     **kwargs,
+ ):
+     from ..models import vision_transformer as vits
+
+     if isinstance(weights, str):
+         try:
+             weights = Weights[weights]
+         except KeyError:
+             raise AssertionError(f"Unsupported weights: {weights}")
+
+     model_base_name = _make_dinov2_model_name(arch_name, patch_size)
+     vit_kwargs = dict(
+         img_size=img_size,
+         patch_size=patch_size,
+         init_values=init_values,
+         ffn_layer=ffn_layer,
+         block_chunks=block_chunks,
+         num_register_tokens=num_register_tokens,
+         interpolate_antialias=interpolate_antialias,
+         interpolate_offset=interpolate_offset,
+     )
+     vit_kwargs.update(**kwargs)
+     model = vits.__dict__[arch_name](**vit_kwargs)
+
+     if pretrained:
+         model_full_name = _make_dinov2_model_name(
+             arch_name, patch_size, num_register_tokens
+         )
+         url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
+         state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
+         model.load_state_dict(state_dict, strict=True)
+
+     return model
+
+
+ def dinov2_vits14(
+     *, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs
+ ):
+     """
+     DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
+     """
+     return _make_dinov2_model(
+         arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs
+     )
+
+
+ def dinov2_vitb14(
+     *, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs
+ ):
+     """
+     DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
+     """
+     return _make_dinov2_model(
+         arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs
+     )
+
+
+ def dinov2_vitl14(
+     *, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs
+ ):
+     """
+     DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
+     """
+     return _make_dinov2_model(
+         arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs
+     )
+
+
+ def dinov2_vitg14(
+     *, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs
+ ):
+     """
+     DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
+     """
+     return _make_dinov2_model(
+         arch_name="vit_giant2",
+         ffn_layer="swiglufused",
+         weights=weights,
+         pretrained=pretrained,
+         **kwargs,
+     )
+
+
+ def dinov2_vits14_reg(
+     *, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs
+ ):
+     """
+     DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+     """
+     return _make_dinov2_model(
+         arch_name="vit_small",
+         pretrained=pretrained,
+         weights=weights,
+         num_register_tokens=4,
+         interpolate_antialias=True,
+         interpolate_offset=0.0,
+         **kwargs,
+     )
+
+
+ def dinov2_vitb14_reg(
+     *, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs
+ ):
+     """
+     DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+     """
+     return _make_dinov2_model(
+         arch_name="vit_base",
+         pretrained=pretrained,
+         weights=weights,
+         num_register_tokens=4,
+         interpolate_antialias=True,
+         interpolate_offset=0.0,
+         **kwargs,
+     )
+
+
+ def dinov2_vitl14_reg(
+     *, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs
+ ):
+     """
+     DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+     """
+     return _make_dinov2_model(
+         arch_name="vit_large",
+         pretrained=pretrained,
+         weights=weights,
+         num_register_tokens=4,
+         interpolate_antialias=True,
+         interpolate_offset=0.0,
+         **kwargs,
+     )
+
+
+ def dinov2_vitg14_reg(
+     *, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs
+ ):
+     """
+     DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+     """
+     return _make_dinov2_model(
+         arch_name="vit_giant2",
+         ffn_layer="swiglufused",
+         weights=weights,
+         pretrained=pretrained,
+         num_register_tokens=4,
+         interpolate_antialias=True,
+         interpolate_offset=0.0,
+         **kwargs,
+     )
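For example, the ViT-L/14 register variant can be built without downloading the checkpoint (`pretrained=False` skips the fetch; ViT-L has a 1024-dim embedding):

    model = dinov2_vitl14_reg(pretrained=False)
    print(model.embed_dim, model.patch_size)  # 1024 14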
mapanything/models/external/dinov2/hub/utils.py ADDED
@@ -0,0 +1,42 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ import itertools
+ import math
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
+
+
+ def _make_dinov2_model_name(
+     arch_name: str, patch_size: int, num_register_tokens: int = 0
+ ) -> str:
+     compact_arch_name = arch_name.replace("_", "")[:4]
+     registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
+     return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
+
+
+ class CenterPadding(nn.Module):
+     def __init__(self, multiple):
+         super().__init__()
+         self.multiple = multiple
+
+     def _get_pad(self, size):
+         new_size = math.ceil(size / self.multiple) * self.multiple
+         pad_size = new_size - size
+         pad_size_left = pad_size // 2
+         pad_size_right = pad_size - pad_size_left
+         return pad_size_left, pad_size_right
+
+     @torch.inference_mode()
+     def forward(self, x):
+         pads = list(
+             itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1])
+         )
+         output = F.pad(x, pads)
+         return output
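For instance, `CenterPadding(14)` pads H and W up to the next multiple of 14, splitting each pad as evenly as possible:

    import torch

    pad = CenterPadding(14)
    x = torch.zeros(1, 3, 300, 500)
    print(pad(x).shape)  # torch.Size([1, 3, 308, 504]): 300 -> 308, 500 -> 504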
mapanything/models/external/dinov2/layers/__init__.py ADDED
@@ -0,0 +1,14 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ from mapanything.models.external.dinov2.layers.dino_head import DINOHead  # noqa
+ from mapanything.models.external.dinov2.layers.mlp import Mlp  # noqa
+ from mapanything.models.external.dinov2.layers.patch_embed import PatchEmbed  # noqa
+ from mapanything.models.external.dinov2.layers.swiglu_ffn import (
+     SwiGLUFFN,  # noqa
+     SwiGLUFFNFused,  # noqa
+ )
+ from mapanything.models.external.dinov2.layers.block import NestedTensorBlock  # noqa
+ from mapanything.models.external.dinov2.layers.attention import MemEffAttention  # noqa
mapanything/models/external/dinov2/layers/attention.py ADDED
@@ -0,0 +1,90 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ # References:
+ #   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+ #   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+
+ import logging
+ import os
+
+ from torch import nn, Tensor
+
+ logger = logging.getLogger("dinov2")
+
+
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+ try:
+     if XFORMERS_ENABLED:
+         from xformers.ops import memory_efficient_attention, unbind
+
+         XFORMERS_AVAILABLE = True
+         # warnings.warn("xFormers is available (Attention)")
+     else:
+         # warnings.warn("xFormers is disabled (Attention)")
+         raise ImportError
+ except ImportError:
+     XFORMERS_AVAILABLE = False
+     # warnings.warn("xFormers is not available (Attention)")
+
+
+ class Attention(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         num_heads: int = 8,
+         qkv_bias: bool = False,
+         proj_bias: bool = True,
+         attn_drop: float = 0.0,
+         proj_drop: float = 0.0,
+     ) -> None:
+         super().__init__()
+         self.num_heads = num_heads
+         head_dim = dim // num_heads
+         self.scale = head_dim**-0.5
+
+         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+         self.attn_drop = nn.Dropout(attn_drop)
+         self.proj = nn.Linear(dim, dim, bias=proj_bias)
+         self.proj_drop = nn.Dropout(proj_drop)
+
+     def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+         B, N, C = x.shape
+         qkv = (
+             self.qkv(x)
+             .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+             .permute(2, 0, 3, 1, 4)
+         )
+
+         q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+         attn = q @ k.transpose(-2, -1)
+
+         attn = attn.softmax(dim=-1)
+         attn = self.attn_drop(attn)
+
+         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
+
+ class MemEffAttention(Attention):
+     def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+         if not XFORMERS_AVAILABLE:
+             if attn_bias is not None:
+                 raise AssertionError("xFormers is required for using nested tensors")
+             return super().forward(x)
+
+         B, N, C = x.shape
+         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+
+         q, k, v = unbind(qkv, 2)
+
+         x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+         x = x.reshape([B, N, C])
+
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
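Both attention classes are shape-preserving over (B, N, C) token tensors; when xFormers is unavailable and no `attn_bias` is given, `MemEffAttention` silently falls back to the plain implementation above. A quick sketch with arbitrary dims:

    import torch

    attn = MemEffAttention(dim=64, num_heads=8)
    tokens = torch.rand(2, 197, 64)
    print(attn(tokens).shape)  # torch.Size([2, 197, 64])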
mapanything/models/external/dinov2/layers/block.py ADDED
@@ -0,0 +1,290 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ # References:
+ #   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+ #   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+
+ import logging
+ import os
+ from typing import Any, Callable, Dict, List, Tuple
+
+ import torch
+ from torch import nn, Tensor
+
+ from mapanything.models.external.dinov2.layers.attention import (
+     Attention,
+     MemEffAttention,
+ )
+ from mapanything.models.external.dinov2.layers.drop_path import DropPath
+ from mapanything.models.external.dinov2.layers.layer_scale import LayerScale
+ from mapanything.models.external.dinov2.layers.mlp import Mlp
+
+ logger = logging.getLogger("dinov2")
+
+
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+ try:
+     if XFORMERS_ENABLED:
+         from xformers.ops import fmha, index_select_cat, scaled_index_add
+
+         XFORMERS_AVAILABLE = True
+         # warnings.warn("xFormers is available (Block)")
+     else:
+         # warnings.warn("xFormers is disabled (Block)")
+         raise ImportError
+ except ImportError:
+     XFORMERS_AVAILABLE = False
+     # warnings.warn("xFormers is not available (Block)")
+
+
+ class Block(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         num_heads: int,
+         mlp_ratio: float = 4.0,
+         qkv_bias: bool = False,
+         proj_bias: bool = True,
+         ffn_bias: bool = True,
+         drop: float = 0.0,
+         attn_drop: float = 0.0,
+         init_values=None,
+         drop_path: float = 0.0,
+         act_layer: Callable[..., nn.Module] = nn.GELU,
+         norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+         attn_class: Callable[..., nn.Module] = Attention,
+         ffn_layer: Callable[..., nn.Module] = Mlp,
+     ) -> None:
+         super().__init__()
+         # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
+         self.norm1 = norm_layer(dim)
+         self.attn = attn_class(
+             dim,
+             num_heads=num_heads,
+             qkv_bias=qkv_bias,
+             proj_bias=proj_bias,
+             attn_drop=attn_drop,
+             proj_drop=drop,
+         )
+         self.ls1 = (
+             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+         )
+         self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+         self.norm2 = norm_layer(dim)
+         mlp_hidden_dim = int(dim * mlp_ratio)
+         self.mlp = ffn_layer(
+             in_features=dim,
+             hidden_features=mlp_hidden_dim,
+             act_layer=act_layer,
+             drop=drop,
+             bias=ffn_bias,
+         )
+         self.ls2 = (
+             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+         )
+         self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+         self.sample_drop_ratio = drop_path
+
+     def forward(self, x: Tensor) -> Tensor:
+         def attn_residual_func(x: Tensor) -> Tensor:
+             return self.ls1(self.attn(self.norm1(x)))
+
+         def ffn_residual_func(x: Tensor) -> Tensor:
+             return self.ls2(self.mlp(self.norm2(x)))
+
+         if self.training and self.sample_drop_ratio > 0.1:
+             # the overhead is compensated only for a drop path rate larger than 0.1
+             x = drop_add_residual_stochastic_depth(
+                 x,
+                 residual_func=attn_residual_func,
+                 sample_drop_ratio=self.sample_drop_ratio,
+             )
+             x = drop_add_residual_stochastic_depth(
+                 x,
+                 residual_func=ffn_residual_func,
+                 sample_drop_ratio=self.sample_drop_ratio,
+             )
+         elif self.training and self.sample_drop_ratio > 0.0:
+             x = x + self.drop_path1(attn_residual_func(x))
+             x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
+         else:
+             x = x + attn_residual_func(x)
+             x = x + ffn_residual_func(x)
+         return x
+
+
+ def drop_add_residual_stochastic_depth(
+     x: Tensor,
+     residual_func: Callable[[Tensor], Tensor],
+     sample_drop_ratio: float = 0.0,
+ ) -> Tensor:
+     # 1) extract subset using permutation
+     b, n, d = x.shape
+     sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+     brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+     x_subset = x[brange]
+
+     # 2) apply residual_func to get residual
+     residual = residual_func(x_subset)
+
+     x_flat = x.flatten(1)
+     residual = residual.flatten(1)
+
+     residual_scale_factor = b / sample_subset_size
+
+     # 3) add the residual
+     x_plus_residual = torch.index_add(
+         x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor
+     )
+     return x_plus_residual.view_as(x)
+
+
+ def get_branges_scales(x, sample_drop_ratio=0.0):
+     b, n, d = x.shape
+     sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+     brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+     residual_scale_factor = b / sample_subset_size
+     return brange, residual_scale_factor
+
+
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+     if scaling_vector is None:
+         x_flat = x.flatten(1)
+         residual = residual.flatten(1)
+         x_plus_residual = torch.index_add(
+             x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor
+         )
+     else:
+         x_plus_residual = scaled_index_add(
+             x,
+             brange,
+             residual.to(dtype=x.dtype),
+             scaling=scaling_vector,
+             alpha=residual_scale_factor,
+         )
+     return x_plus_residual
+
+
+ attn_bias_cache: Dict[Tuple, Any] = {}
+
+
+ def get_attn_bias_and_cat(x_list, branges=None):
+     """
+     this will perform the index select, cat the tensors, and provide the attn_bias from cache
+     """
+     batch_sizes = (
+         [b.shape[0] for b in branges]
+         if branges is not None
+         else [x.shape[0] for x in x_list]
+     )
+     all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+     if all_shapes not in attn_bias_cache.keys():
+         seqlens = []
+         for b, x in zip(batch_sizes, x_list):
+             for _ in range(b):
+                 seqlens.append(x.shape[1])
+         attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+         attn_bias._batch_sizes = batch_sizes
+         attn_bias_cache[all_shapes] = attn_bias
+
+     if branges is not None:
+         cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(
+             1, -1, x_list[0].shape[-1]
+         )
+     else:
+         tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+         cat_tensors = torch.cat(tensors_bs1, dim=1)
+
+     return attn_bias_cache[all_shapes], cat_tensors
+
+
+ def drop_add_residual_stochastic_depth_list(
+     x_list: List[Tensor],
+     residual_func: Callable[[Tensor, Any], Tensor],
+     sample_drop_ratio: float = 0.0,
+     scaling_vector=None,
+ ) -> Tensor:
+     # 1) generate random set of indices for dropping samples in the batch
+     branges_scales = [
+         get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list
+     ]
+     branges = [s[0] for s in branges_scales]
+     residual_scale_factors = [s[1] for s in branges_scales]
+
+     # 2) get attention bias and index+concat the tensors
+     attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+
+     # 3) apply residual_func to get residual, and split the result
+     residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
+
+     outputs = []
+     for x, brange, residual, residual_scale_factor in zip(
+         x_list, branges, residual_list, residual_scale_factors
+     ):
+         outputs.append(
+             add_residual(
+                 x, brange, residual, residual_scale_factor, scaling_vector
+             ).view_as(x)
+         )
+     return outputs
+
+
+ class NestedTensorBlock(Block):
+     def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+         """
+         x_list contains a list of tensors to nest together and run
+         """
+         assert isinstance(self.attn, MemEffAttention)
+
+         if self.training and self.sample_drop_ratio > 0.0:
+
+             def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                 return self.attn(self.norm1(x), attn_bias=attn_bias)
+
+             def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                 return self.mlp(self.norm2(x))
+
+             x_list = drop_add_residual_stochastic_depth_list(
+                 x_list,
+                 residual_func=attn_residual_func,
+                 sample_drop_ratio=self.sample_drop_ratio,
+                 scaling_vector=self.ls1.gamma
+                 if isinstance(self.ls1, LayerScale)
+                 else None,
+             )
+             x_list = drop_add_residual_stochastic_depth_list(
+                 x_list,
+                 residual_func=ffn_residual_func,
+                 sample_drop_ratio=self.sample_drop_ratio,
+                 scaling_vector=self.ls2.gamma
+                 if isinstance(self.ls1, LayerScale)
+                 else None,
+             )
+             return x_list
+         else:
+
+             def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                 return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+
+             def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                 return self.ls2(self.mlp(self.norm2(x)))
+
+             attn_bias, x = get_attn_bias_and_cat(x_list)
+             x = x + attn_residual_func(x, attn_bias=attn_bias)
+             x = x + ffn_residual_func(x)
+             return attn_bias.split(x)
+
+     def forward(self, x_or_x_list):
+         if isinstance(x_or_x_list, Tensor):
+             return super().forward(x_or_x_list)
+         elif isinstance(x_or_x_list, list):
+             if not XFORMERS_AVAILABLE:
+                 raise AssertionError("xFormers is required for using nested tensors")
+             return self.forward_nested(x_or_x_list)
+         else:
+             raise AssertionError
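In eval mode (or with `drop_path=0`) a single-tensor forward reduces to the standard pre-norm residual block; a sketch, assuming the imports of this file:

    import torch

    block = NestedTensorBlock(dim=64, num_heads=8, attn_class=MemEffAttention)
    block.eval()
    x = torch.rand(2, 197, 64)
    print(block(x).shape)  # torch.Size([2, 197, 64]); list inputs require xFormers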
mapanything/models/external/dinov2/layers/dino_head.py ADDED
@@ -0,0 +1,67 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ import torch
+ import torch.nn as nn
+ from torch.nn.init import trunc_normal_
+ from torch.nn.utils import weight_norm
+
+
+ class DINOHead(nn.Module):
+     def __init__(
+         self,
+         in_dim,
+         out_dim,
+         use_bn=False,
+         nlayers=3,
+         hidden_dim=2048,
+         bottleneck_dim=256,
+         mlp_bias=True,
+     ):
+         super().__init__()
+         nlayers = max(nlayers, 1)
+         self.mlp = _build_mlp(
+             nlayers,
+             in_dim,
+             bottleneck_dim,
+             hidden_dim=hidden_dim,
+             use_bn=use_bn,
+             bias=mlp_bias,
+         )
+         self.apply(self._init_weights)
+         self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
+         self.last_layer.weight_g.data.fill_(1)
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             trunc_normal_(m.weight, std=0.02)
+             if isinstance(m, nn.Linear) and m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+
+     def forward(self, x):
+         x = self.mlp(x)
+         eps = 1e-6 if x.dtype == torch.float16 else 1e-12
+         x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
+         x = self.last_layer(x)
+         return x
+
+
+ def _build_mlp(
+     nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True
+ ):
+     if nlayers == 1:
+         return nn.Linear(in_dim, bottleneck_dim, bias=bias)
+     else:
+         layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
+         if use_bn:
+             layers.append(nn.BatchNorm1d(hidden_dim))
+         layers.append(nn.GELU())
+         for _ in range(nlayers - 2):
+             layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
+             if use_bn:
+                 layers.append(nn.BatchNorm1d(hidden_dim))
+             layers.append(nn.GELU())
+         layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
+         return nn.Sequential(*layers)
mapanything/models/external/dinov2/layers/drop_path.py ADDED
@@ -0,0 +1,36 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ # References:
+ #   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+ #   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+
+
+ from torch import nn
+
+
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+     if drop_prob == 0.0 or not training:
+         return x
+     keep_prob = 1 - drop_prob
+     shape = (x.shape[0],) + (1,) * (
+         x.ndim - 1
+     )  # work with diff dim tensors, not just 2D ConvNets
+     random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+     if keep_prob > 0.0:
+         random_tensor.div_(keep_prob)
+     output = x * random_tensor
+     return output
+
+
+ class DropPath(nn.Module):
+     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+     def __init__(self, drop_prob=None):
+         super(DropPath, self).__init__()
+         self.drop_prob = drop_prob
+
+     def forward(self, x):
+         return drop_path(x, self.drop_prob, self.training)
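Because surviving samples are rescaled by 1/keep_prob, `drop_path` is the identity in expectation; a quick check:

    import torch

    x = torch.ones(10000, 8)
    y = drop_path(x, drop_prob=0.3, training=True)
    # ~30% of rows are zeroed, the rest scaled by 1/0.7, so the mean stays near 1
    print(y.mean().item())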
mapanything/models/external/dinov2/layers/layer_scale.py ADDED
@@ -0,0 +1,26 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+
+ from typing import Union
+
+ import torch
+ from torch import nn, Tensor
+
+
+ class LayerScale(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         init_values: Union[float, Tensor] = 1e-5,
+         inplace: bool = False,
+     ) -> None:
+         super().__init__()
+         self.inplace = inplace
+         self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+     def forward(self, x: Tensor) -> Tensor:
+         return x.mul_(self.gamma) if self.inplace else x * self.gamma
mapanything/models/external/dinov2/layers/mlp.py ADDED
@@ -0,0 +1,40 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ # References:
+ #   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+ #   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
+
+
+ from typing import Callable, Optional
+
+ from torch import nn, Tensor
+
+
+ class Mlp(nn.Module):
+     def __init__(
+         self,
+         in_features: int,
+         hidden_features: Optional[int] = None,
+         out_features: Optional[int] = None,
+         act_layer: Callable[..., nn.Module] = nn.GELU,
+         drop: float = 0.0,
+         bias: bool = True,
+     ) -> None:
+         super().__init__()
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+         self.act = act_layer()
+         self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
+         self.drop = nn.Dropout(drop)
+
+     def forward(self, x: Tensor) -> Tensor:
+         x = self.fc1(x)
+         x = self.act(x)
+         x = self.drop(x)
+         x = self.fc2(x)
+         x = self.drop(x)
+         return x
mapanything/models/external/dinov2/layers/patch_embed.py ADDED
@@ -0,0 +1,100 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ # References:
+ #   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+ #   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+
+ from typing import Callable, Optional, Tuple, Union
+
+ import torch.nn as nn
+ from torch import Tensor
+
+
+ def make_2tuple(x):
+     if isinstance(x, tuple):
+         assert len(x) == 2
+         return x
+
+     assert isinstance(x, int)
+     return (x, x)
+
+
+ class PatchEmbed(nn.Module):
+     """
+     2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+
+     Args:
+         img_size: Image size.
+         patch_size: Patch token size.
+         in_chans: Number of input image channels.
+         embed_dim: Number of linear projection output channels.
+         norm_layer: Normalization layer.
+     """
+
+     def __init__(
+         self,
+         img_size: Union[int, Tuple[int, int]] = 224,
+         patch_size: Union[int, Tuple[int, int]] = 16,
+         in_chans: int = 3,
+         embed_dim: int = 768,
+         norm_layer: Optional[Callable] = None,
+         flatten_embedding: bool = True,
+     ) -> None:
+         super().__init__()
+
+         image_HW = make_2tuple(img_size)
+         patch_HW = make_2tuple(patch_size)
+         patch_grid_size = (
+             image_HW[0] // patch_HW[0],
+             image_HW[1] // patch_HW[1],
+         )
+
+         self.img_size = image_HW
+         self.patch_size = patch_HW
+         self.patches_resolution = patch_grid_size
+         self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+
+         self.in_chans = in_chans
+         self.embed_dim = embed_dim
+
+         self.flatten_embedding = flatten_embedding
+
+         self.proj = nn.Conv2d(
+             in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW
+         )
+         self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+     def forward(self, x: Tensor) -> Tensor:
+         _, _, H, W = x.shape
+         patch_H, patch_W = self.patch_size
+
+         assert H % patch_H == 0, (
+             f"Input image height {H} is not a multiple of patch height {patch_H}"
+         )
+         assert W % patch_W == 0, (
+             f"Input image width {W} is not a multiple of patch width: {patch_W}"
+         )
+
+         x = self.proj(x)  # B C H W
+         H, W = x.size(2), x.size(3)
+         x = x.flatten(2).transpose(1, 2)  # B HW C
+         x = self.norm(x)
+         if not self.flatten_embedding:
+             x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
+         return x
+
+     def flops(self) -> float:
+         Ho, Wo = self.patches_resolution
+         flops = (
+             Ho
+             * Wo
+             * self.embed_dim
+             * self.in_chans
+             * (self.patch_size[0] * self.patch_size[1])
+         )
+         if self.norm is not None:
+             flops += Ho * Wo * self.embed_dim
+         return flops
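For the 518x518 / patch-14 configuration used in the hub builders above, the patch grid works out to 37x37, i.e. 1369 tokens:

    embed = PatchEmbed(img_size=518, patch_size=14, embed_dim=1024)
    print(embed.patches_resolution, embed.num_patches)  # (37, 37) 1369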
mapanything/models/external/dinov2/layers/swiglu_ffn.py ADDED
@@ -0,0 +1,71 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ import os
+ from typing import Callable, Optional
+
+ import torch.nn.functional as F
+ from torch import nn, Tensor
+
+
+ class SwiGLUFFN(nn.Module):
+     def __init__(
+         self,
+         in_features: int,
+         hidden_features: Optional[int] = None,
+         out_features: Optional[int] = None,
+         act_layer: Callable[..., nn.Module] = None,
+         drop: float = 0.0,
+         bias: bool = True,
+     ) -> None:
+         super().__init__()
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+         self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+
+     def forward(self, x: Tensor) -> Tensor:
+         x12 = self.w12(x)
+         x1, x2 = x12.chunk(2, dim=-1)
+         hidden = F.silu(x1) * x2
+         return self.w3(hidden)
+
+
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+ try:
+     if XFORMERS_ENABLED:
+         from xformers.ops import SwiGLU
+
+         XFORMERS_AVAILABLE = True
+         # warnings.warn("xFormers is available (SwiGLU)")
+     else:
+         # warnings.warn("xFormers is disabled (SwiGLU)")
+         raise ImportError
+ except ImportError:
+     SwiGLU = SwiGLUFFN
+     XFORMERS_AVAILABLE = False
+
+     # warnings.warn("xFormers is not available (SwiGLU)")
+
+
+ class SwiGLUFFNFused(SwiGLU):
+     def __init__(
+         self,
+         in_features: int,
+         hidden_features: Optional[int] = None,
+         out_features: Optional[int] = None,
+         act_layer: Callable[..., nn.Module] = None,
+         drop: float = 0.0,
+         bias: bool = True,
+     ) -> None:
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+         super().__init__(
+             in_features=in_features,
+             hidden_features=hidden_features,
+             out_features=out_features,
+             bias=bias,
+         )
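`SwiGLUFFNFused` shrinks the requested hidden width by 2/3 (the SwiGLU input is split into two gated branches) and rounds up to a multiple of 8; e.g. for embed_dim=1024 with the usual mlp_ratio of 4:

    hidden = 4 * 1024                              # 4096 requested by the block
    hidden = (int(hidden * 2 / 3) + 7) // 8 * 8
    print(hidden)                                  # 2736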
mapanything/models/external/dinov2/models/__init__.py ADDED
@@ -0,0 +1,44 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the Apache License, Version 2.0
+ # found in the LICENSE file in the root directory of this source tree.
+
+ import logging
+
+ import mapanything.models.external.dinov2.models.vision_transformer as vits
+
+ logger = logging.getLogger("dinov2")
+
+
+ def build_model(args, only_teacher=False, img_size=224):
+     args.arch = args.arch.removesuffix("_memeff")
+     if "vit" in args.arch:
+         vit_kwargs = dict(
+             img_size=img_size,
+             patch_size=args.patch_size,
+             init_values=args.layerscale,
+             ffn_layer=args.ffn_layer,
+             block_chunks=args.block_chunks,
+             qkv_bias=args.qkv_bias,
+             proj_bias=args.proj_bias,
+             ffn_bias=args.ffn_bias,
+             num_register_tokens=args.num_register_tokens,
+             interpolate_offset=args.interpolate_offset,
+             interpolate_antialias=args.interpolate_antialias,
+         )
+         teacher = vits.__dict__[args.arch](**vit_kwargs)
+         if only_teacher:
+             return teacher, teacher.embed_dim
+         student = vits.__dict__[args.arch](
+             **vit_kwargs,
+             drop_path_rate=args.drop_path_rate,
+             drop_path_uniform=args.drop_path_uniform,
+         )
+         embed_dim = student.embed_dim
+         return student, teacher, embed_dim
+
+
+ def build_model_from_cfg(cfg, only_teacher=False):
+     return build_model(
+         cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size
+     )
mapanything/models/external/dinov2/models/vision_transformer.py ADDED
@@ -0,0 +1,448 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ import math
11
+ from functools import partial
12
+ from typing import Callable, Sequence, Tuple, Union
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn.init import trunc_normal_
17
+ from torch.utils.checkpoint import checkpoint
18
+
19
+ from mapanything.models.external.dinov2.layers import (
20
+ MemEffAttention,
21
+ Mlp,
22
+ NestedTensorBlock as Block,
23
+ PatchEmbed,
24
+ SwiGLUFFNFused,
25
+ )
26
+ from mapanything.models.external.pi3.layers.attention import FlashAttention
27
+
28
+ # logger = logging.getLogger("dinov2")
29
+
30
+
31
+ def named_apply(
32
+ fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False
33
+ ) -> nn.Module:
34
+ if not depth_first and include_root:
35
+ fn(module=module, name=name)
36
+ for child_name, child_module in module.named_children():
37
+ child_name = ".".join((name, child_name)) if name else child_name
38
+ named_apply(
39
+ fn=fn,
40
+ module=child_module,
41
+ name=child_name,
42
+ depth_first=depth_first,
43
+ include_root=True,
44
+ )
45
+ if depth_first and include_root:
46
+ fn(module=module, name=name)
47
+ return module
48
+
49
+
50
+ class BlockChunk(nn.ModuleList):
51
+ def forward(self, x):
52
+ for b in self:
53
+ x = b(x)
54
+ return x
55
+
56
+
57
+ class DinoVisionTransformer(nn.Module):
58
+ def __init__(
59
+ self,
60
+ img_size=224,
61
+ patch_size=16,
62
+ in_chans=3,
63
+ embed_dim=768,
64
+ depth=12,
65
+ num_heads=12,
66
+ mlp_ratio=4.0,
67
+ qkv_bias=True,
68
+ ffn_bias=True,
69
+ proj_bias=True,
70
+ drop_path_rate=0.0,
71
+ drop_path_uniform=False,
72
+ init_values=None, # for layerscale: None or 0 => no layerscale
73
+ embed_layer=PatchEmbed,
74
+ act_layer=nn.GELU,
75
+ block_fn=Block,
76
+ ffn_layer="mlp",
77
+ block_chunks=1,
78
+ num_register_tokens=0,
79
+ interpolate_antialias=False,
80
+ interpolate_offset=0.1,
81
+ ):
82
+ """
83
+ Args:
84
+ img_size (int, tuple): input image size
85
+ patch_size (int, tuple): patch size
86
+ in_chans (int): number of input channels
87
+ embed_dim (int): embedding dimension
88
+ depth (int): depth of transformer
89
+ num_heads (int): number of attention heads
90
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
91
+ qkv_bias (bool): enable bias for qkv if True
92
+ proj_bias (bool): enable bias for proj in attn if True
93
+ ffn_bias (bool): enable bias for ffn if True
94
+ drop_path_rate (float): stochastic depth rate
95
+ drop_path_uniform (bool): apply uniform drop rate across blocks
96
+ weight_init (str): weight init scheme
97
+ init_values (float): layer-scale init values
98
+ embed_layer (nn.Module): patch embedding layer
99
+ act_layer (nn.Module): MLP activation layer
100
+ block_fn (nn.Module): transformer block class
101
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
102
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
103
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
104
+ interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
105
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
106
+ """
107
+ super().__init__()
108
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
109
+
110
+ self.num_features = self.embed_dim = (
111
+ embed_dim # num_features for consistency with other models
112
+ )
113
+ self.num_tokens = 1
114
+ self.n_blocks = depth
115
+ self.num_heads = num_heads
116
+ self.patch_size = patch_size
117
+ self.num_register_tokens = num_register_tokens
118
+ self.interpolate_antialias = interpolate_antialias
119
+ self.interpolate_offset = interpolate_offset
120
+
121
+ self.patch_embed = embed_layer(
122
+ img_size=img_size,
123
+ patch_size=patch_size,
124
+ in_chans=in_chans,
125
+ embed_dim=embed_dim,
126
+ )
127
+ num_patches = self.patch_embed.num_patches
128
+
129
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
130
+ self.pos_embed = nn.Parameter(
131
+ torch.zeros(1, num_patches + self.num_tokens, embed_dim)
132
+ )
133
+ assert num_register_tokens >= 0
134
+ self.register_tokens = (
135
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim))
136
+ if num_register_tokens
137
+ else None
138
+ )
139
+
140
+ if drop_path_uniform is True:
141
+ dpr = [drop_path_rate] * depth
142
+ else:
143
+ dpr = [
144
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
145
+ ] # stochastic depth decay rule
146
+
147
+ if ffn_layer == "mlp":
148
+ # logger.info("using MLP layer as FFN")
149
+ ffn_layer = Mlp
150
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
151
+ # logger.info("using SwiGLU layer as FFN")
152
+ ffn_layer = SwiGLUFFNFused
153
+ elif ffn_layer == "identity":
154
+ # logger.info("using Identity layer as FFN")
155
+
156
+ def f(*args, **kwargs):
157
+ return nn.Identity()
158
+
159
+ ffn_layer = f
160
+ else:
161
+ raise NotImplementedError
162
+
163
+ blocks_list = [
164
+ block_fn(
165
+ dim=embed_dim,
166
+ num_heads=num_heads,
167
+ mlp_ratio=mlp_ratio,
168
+ qkv_bias=qkv_bias,
169
+ proj_bias=proj_bias,
170
+ ffn_bias=ffn_bias,
171
+ drop_path=dpr[i],
172
+ norm_layer=norm_layer,
173
+ act_layer=act_layer,
174
+ ffn_layer=ffn_layer,
175
+ init_values=init_values,
176
+ attn_class=FlashAttention,
177
+ )
178
+ for i in range(depth)
179
+ ]
180
+ if block_chunks > 0:
181
+ self.chunked_blocks = True
182
+ chunked_blocks = []
183
+ chunksize = depth // block_chunks
184
+ for i in range(0, depth, chunksize):
185
+ # this is to keep the block index consistent if we chunk the block list
186
+ chunked_blocks.append(
187
+ [nn.Identity()] * i + blocks_list[i : i + chunksize]
188
+ )
189
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
190
+ else:
191
+ self.chunked_blocks = False
192
+ self.blocks = nn.ModuleList(blocks_list)
193
+
194
+ self.norm = norm_layer(embed_dim)
195
+ self.head = nn.Identity()
196
+
197
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
198
+
199
+ self.init_weights()
200
+
201
+ def init_weights(self):
202
+ trunc_normal_(self.pos_embed, std=0.02)
203
+ nn.init.normal_(self.cls_token, std=1e-6)
204
+ if self.register_tokens is not None:
205
+ nn.init.normal_(self.register_tokens, std=1e-6)
206
+ named_apply(init_weights_vit_timm, self)
207
+
208
+ def interpolate_pos_encoding(self, x, w, h):
209
+ previous_dtype = x.dtype
210
+ npatch = x.shape[1] - 1
211
+ N = self.pos_embed.shape[1] - 1
212
+ if npatch == N and w == h:
213
+ return self.pos_embed
214
+ pos_embed = self.pos_embed.float()
215
+ class_pos_embed = pos_embed[:, 0]
216
+ patch_pos_embed = pos_embed[:, 1:]
217
+ dim = x.shape[-1]
218
+ w0 = w // self.patch_size
219
+ h0 = h // self.patch_size
220
+ M = int(math.sqrt(N)) # Recover the number of patches in each dimension
221
+ assert N == M * M
222
+ kwargs = {}
223
+ if self.interpolate_offset:
224
+ # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
225
+ # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
226
+ sx = float(w0 + self.interpolate_offset) / M
227
+ sy = float(h0 + self.interpolate_offset) / M
228
+ kwargs["scale_factor"] = (sx, sy)
229
+ else:
230
+ # Simply specify an output size instead of a scale factor
231
+ kwargs["size"] = (w0, h0)
232
+ patch_pos_embed = nn.functional.interpolate(
233
+ patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
234
+ mode="bicubic",
235
+ antialias=self.interpolate_antialias,
236
+ **kwargs,
237
+ )
238
+ assert (w0, h0) == patch_pos_embed.shape[-2:]
239
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
240
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(
241
+ previous_dtype
242
+ )
243
+
244
+ def prepare_tokens_with_masks(self, x, masks=None):
245
+ B, nc, w, h = x.shape
246
+ x = self.patch_embed(x)
247
+ if masks is not None:
248
+ x = torch.where(
249
+ masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x
250
+ )
251
+
252
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
253
+ x = x + self.interpolate_pos_encoding(x, w, h)
254
+
255
+ if self.register_tokens is not None:
256
+ x = torch.cat(
257
+ (
258
+ x[:, :1],
259
+ self.register_tokens.expand(x.shape[0], -1, -1),
260
+ x[:, 1:],
261
+ ),
262
+ dim=1,
263
+ )
264
+
265
+ return x
266
+
+    def forward_features_list(self, x_list, masks_list):
+        x = [
+            self.prepare_tokens_with_masks(x, masks)
+            for x, masks in zip(x_list, masks_list)
+        ]
+        for blk in self.blocks:
+            if self.training:
+                x = checkpoint(blk, x, use_reentrant=False)
+            else:
+                x = blk(x)
+
+        all_x = x
+        output = []
+        for x, masks in zip(all_x, masks_list):
+            x_norm = self.norm(x)
+            output.append(
+                {
+                    "x_norm_clstoken": x_norm[:, 0],
+                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+                    "x_prenorm": x,
+                    "masks": masks,
+                }
+            )
+        return output
+
+    def forward_features(self, x, masks=None):
+        if isinstance(x, list):
+            return self.forward_features_list(x, masks)
+
+        x = self.prepare_tokens_with_masks(x, masks)
+
+        for blk in self.blocks:
+            if self.training:
+                x = checkpoint(blk, x, use_reentrant=False)
+            else:
+                x = blk(x)
+
+        x_norm = self.norm(x)
+        return {
+            "x_norm_clstoken": x_norm[:, 0],
+            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+            "x_prenorm": x,
+            "masks": masks,
+        }
+
+    def _get_intermediate_layers_not_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        # If n is an int, take the n last blocks. If it's a list, take them
+        output, total_block_len = [], len(self.blocks)
+        blocks_to_take = (
+            range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        )
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i in blocks_to_take:
+                output.append(x)
+        assert len(output) == len(blocks_to_take), (
+            f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        )
+        return output
+
+    def _get_intermediate_layers_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        output, i, total_block_len = [], 0, len(self.blocks[-1])
+        # If n is an int, take the n last blocks. If it's a list, take them
+        blocks_to_take = (
+            range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        )
+        for block_chunk in self.blocks:
+            for blk in block_chunk[i:]:  # Passing the nn.Identity()
+                x = blk(x)
+                if i in blocks_to_take:
+                    output.append(x)
+                i += 1
+        assert len(output) == len(blocks_to_take), (
+            f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        )
+        return output
+
+    def get_intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
+        reshape: bool = False,
+        return_class_token: bool = False,
+        norm=True,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+        if self.chunked_blocks:
+            outputs = self._get_intermediate_layers_chunked(x, n)
+        else:
+            outputs = self._get_intermediate_layers_not_chunked(x, n)
+        if norm:
+            outputs = [self.norm(out) for out in outputs]
+        class_tokens = [out[:, 0] for out in outputs]
+        outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
+        if reshape:
+            B, _, w, h = x.shape
+            outputs = [
+                out.reshape(B, w // self.patch_size, h // self.patch_size, -1)
+                .permute(0, 3, 1, 2)
+                .contiguous()
+                for out in outputs
+            ]
+        if return_class_token:
+            return tuple(zip(outputs, class_tokens))
+        return tuple(outputs)
+
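A sketch of how get_intermediate_layers is typically called to obtain dense feature maps; the input sizes and the vit_small choice are illustrative, assuming the factories defined below:

# Illustrative usage: last 4 layers as (B, C, H/p, W/p) feature maps.
model = vit_small(patch_size=14, num_register_tokens=4, img_size=518)
imgs = torch.randn(2, 3, 224, 224)
feats = model.get_intermediate_layers(imgs, n=4, reshape=True, norm=True)
# len(feats) == 4; each tensor has shape (2, 384, 16, 16) since 224 / 14 = 16.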
+    def forward(self, *args, is_training=False, **kwargs):
+        ret = self.forward_features(*args, **kwargs)
+        if is_training:
+            return ret
+        else:
+            return self.head(ret["x_norm_clstoken"])
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+
+
+def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+
+
+def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+
+
+def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+
+
+def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
+    """
+    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+    """
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1536,
+        depth=40,
+        num_heads=24,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
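One property worth noting across the four factories above: every configuration keeps a 64-dim attention head, which is what the vit_giant2 docstring alludes to. A quick check over the embed_dim/num_heads pairs as defined above:

# Per-head dimension for each factory config defined above.
configs = {
    "vit_small": (384, 6),
    "vit_base": (768, 12),
    "vit_large": (1024, 16),
    "vit_giant2": (1536, 24),
}
for name, (embed_dim, num_heads) in configs.items():
    assert embed_dim // num_heads == 64, name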
mapanything/models/external/dinov2/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
mapanything/models/external/dinov2/utils/cluster.py ADDED
@@ -0,0 +1,102 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+
+import os
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+
+class ClusterType(Enum):
+    AWS = "aws"
+    FAIR = "fair"
+    RSC = "rsc"
+
+
+def _guess_cluster_type() -> ClusterType:
+    uname = os.uname()
+    if uname.sysname == "Linux":
+        if uname.release.endswith("-aws"):
+            # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws"
+            return ClusterType.AWS
+        elif uname.nodename.startswith("rsc"):
+            # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc"
+            return ClusterType.RSC
+
+    return ClusterType.FAIR
+
+
+def get_cluster_type(
+    cluster_type: Optional[ClusterType] = None,
+) -> Optional[ClusterType]:
+    if cluster_type is None:
+        return _guess_cluster_type()
+
+    return cluster_type
+
+
+def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
+    cluster_type = get_cluster_type(cluster_type)
+    if cluster_type is None:
+        return None
+
+    CHECKPOINT_DIRNAMES = {
+        ClusterType.AWS: "checkpoints",
+        ClusterType.FAIR: "checkpoint",
+        ClusterType.RSC: "checkpoint/dino",
+    }
+    return Path("/") / CHECKPOINT_DIRNAMES[cluster_type]
+
+
+def get_user_checkpoint_path(
+    cluster_type: Optional[ClusterType] = None,
+) -> Optional[Path]:
+    checkpoint_path = get_checkpoint_path(cluster_type)
+    if checkpoint_path is None:
+        return None
+
+    username = os.environ.get("USER")
+    assert username is not None
+    return checkpoint_path / username
+
+
+def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
+    cluster_type = get_cluster_type(cluster_type)
+    if cluster_type is None:
+        return None
+
+    SLURM_PARTITIONS = {
+        ClusterType.AWS: "learnlab",
+        ClusterType.FAIR: "learnlab",
+        ClusterType.RSC: "learn",
+    }
+    return SLURM_PARTITIONS[cluster_type]
+
+
+def get_slurm_executor_parameters(
+    nodes: int,
+    num_gpus_per_node: int,
+    cluster_type: Optional[ClusterType] = None,
+    **kwargs,
+) -> Dict[str, Any]:
+    # create default parameters
+    params = {
+        "mem_gb": 0,  # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html
+        "gpus_per_node": num_gpus_per_node,
+        "tasks_per_node": num_gpus_per_node,  # one task per GPU
+        "cpus_per_task": 10,
+        "nodes": nodes,
+        "slurm_partition": get_slurm_partition(cluster_type),
+    }
+    # apply cluster-specific adjustments
+    cluster_type = get_cluster_type(cluster_type)
+    if cluster_type == ClusterType.AWS:
+        params["cpus_per_task"] = 12
+        del params["mem_gb"]
+    elif cluster_type == ClusterType.RSC:
+        params["cpus_per_task"] = 12
+    # set additional parameters / apply overrides
+    params.update(kwargs)
+    return params
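A brief usage sketch for get_slurm_executor_parameters; the node counts and the timeout_min override are placeholders for illustration (extra kwargs are merged in via params.update, so any executor option can be passed through):

# Illustrative: parameters one might hand to a submitit SLURM executor.
params = get_slurm_executor_parameters(nodes=2, num_gpus_per_node=8, timeout_min=60)
# -> tasks_per_node == 8 (one per GPU), cpus_per_task 10 or 12 depending on
#    the detected cluster, plus the timeout_min override merged in last.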
mapanything/models/external/dinov2/utils/config.py ADDED
@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+import os
+
+import dinov2.distributed as distributed
+from dinov2.configs import dinov2_default_config
+from dinov2.logging import setup_logging
+from dinov2.utils import utils
+from omegaconf import OmegaConf
+
+logger = logging.getLogger("dinov2")
+
+
+def apply_scaling_rules_to_cfg(cfg):  # to fix
+    if cfg.optim.scaling_rule == "sqrt_wrt_1024":
+        base_lr = cfg.optim.base_lr
+        cfg.optim.lr = base_lr
+        cfg.optim.lr *= math.sqrt(
+            cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0
+        )
+        logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}")
+    else:
+        raise NotImplementedError
+    return cfg
+
+
+def write_config(cfg, output_dir, name="config.yaml"):
+    logger.info(OmegaConf.to_yaml(cfg))
+    saved_cfg_path = os.path.join(output_dir, name)
+    with open(saved_cfg_path, "w") as f:
+        OmegaConf.save(config=cfg, f=f)
+    return saved_cfg_path
+
+
+def get_cfg_from_args(args):
+    args.output_dir = os.path.abspath(args.output_dir)
+    args.opts += [f"train.output_dir={args.output_dir}"]
+    default_cfg = OmegaConf.create(dinov2_default_config)
+    cfg = OmegaConf.load(args.config_file)
+    cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts))
+    return cfg
+
+
+def default_setup(args):
+    distributed.enable(overwrite=True)
+    seed = getattr(args, "seed", 0)
+    rank = distributed.get_global_rank()
+
+    global logger
+    setup_logging(output=args.output_dir, level=logging.INFO)
+    logger = logging.getLogger("dinov2")
+
+    utils.fix_random_seeds(seed + rank)
+    logger.info("git:\n {}\n".format(utils.get_sha()))
+    logger.info(
+        "\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))
+    )
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg_from_args(args)
+    os.makedirs(args.output_dir, exist_ok=True)
+    default_setup(args)
+    apply_scaling_rules_to_cfg(cfg)
+    write_config(cfg, args.output_dir)
+    return cfg
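The sqrt_wrt_1024 rule above scales the base learning rate by the square root of the global batch size relative to 1024. A worked example with illustrative batch sizes:

# Illustrative: effective lr under the sqrt_wrt_1024 scaling rule.
import math
base_lr = 4e-3
batch_size_per_gpu, world_size = 32, 64      # global batch = 2048
lr = base_lr * math.sqrt(batch_size_per_gpu * world_size / 1024.0)
# sqrt(2048 / 1024) ~= 1.414, so lr ~= 5.66e-3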
mapanything/models/external/dinov2/utils/dtype.py ADDED
@@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+
+
+from typing import Dict, Union
+
+import numpy as np
+import torch
+
+TypeSpec = Union[str, np.dtype, torch.dtype]
+
+
+_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
+    np.dtype("bool"): torch.bool,
+    np.dtype("uint8"): torch.uint8,
+    np.dtype("int8"): torch.int8,
+    np.dtype("int16"): torch.int16,
+    np.dtype("int32"): torch.int32,
+    np.dtype("int64"): torch.int64,
+    np.dtype("float16"): torch.float16,
+    np.dtype("float32"): torch.float32,
+    np.dtype("float64"): torch.float64,
+    np.dtype("complex64"): torch.complex64,
+    np.dtype("complex128"): torch.complex128,
+}
+
+
+def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    if isinstance(dtype, str):
+        dtype = np.dtype(dtype)
+    assert isinstance(dtype, np.dtype), (
+        f"Expected an instance of numpy dtype, got {type(dtype)}"
+    )
+    return _NUMPY_TO_TORCH_DTYPE[dtype]
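A short sketch of the helper above: all three accepted TypeSpec forms resolve to the same torch dtype (torch dtypes are singletons, so identity comparison works).

# Illustrative: str, np.dtype, and torch.dtype inputs all normalize identically.
import numpy as np
import torch
assert as_torch_dtype("float32") is torch.float32
assert as_torch_dtype(np.dtype("float32")) is torch.float32
assert as_torch_dtype(torch.float32) is torch.float32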