Spaces:

RoyYang0714
/

3D-MOOD

Running on Zero

File size: 6,624 Bytes

9b33fca

"""3D-MOOD model config."""

from __future__ import annotations

from ml_collections import ConfigDict, FieldReference
from vis4d.config import class_config
from vis4d.config.typing import ExperimentParameters
from vis4d.op.fpp.fpn import FPN

from opendet3d.model.detect3d.grounding_dino_3d import GroundingDINO3D
from opendet3d.op.base.swin import SwinTransformer
from opendet3d.op.detect3d.grounding_dino_3d import (
    GroundingDINO3DCoder,
    GroundingDINO3DHead,
    RoI2Det3D,
    UniDepthHead,
)
from opendet3d.op.fpp.channel_mapper import ChannelMapper
from opendet3d.zoo.gdino.base.model import GDINO_MODEL_WEIGHTS


def get_gdino3d_hyperparams_cfg() -> ExperimentParameters:
    """Get the hyperparameters for 3D-MOOD."""
    params = ExperimentParameters()

    # Training
    params.samples_per_gpu = 2
    params.workers_per_gpu = 4
    params.accumulate_grad_batches = 1
    params.lr = 0.0004  # bs=128, lr=0.0004
    params.weight_decay = 0.0001

    # Learning rate schedule
    params.num_epochs = 120
    params.step_1 = 80
    params.step_2 = 110
    params.check_val_every_n_epoch = 1

    # Grounding DINO 3D Coder
    params.center_scale = 10.0
    params.depth_scale = 2.0
    params.dim_scale = 2.0
    params.orientation = "rotation_6d"

    # Grounding DINO 3D Loss
    params.loss_center_weight = 1.0
    params.loss_depth_weight = 1.0
    params.loss_dim_weight = 1.0
    params.loss_rot_weight = 1.0

    # Aux Depth Loss
    params.si_log_weight = 10.0

    # RoI2Det3D
    params.nms = False
    params.class_agnostic_nms = False
    params.max_per_img = 100
    params.score_threshold = 0.0
    params.iou_threshold = 0.5

    # Depth Head
    params.depth_output_scales = 1

    return params


def get_gdino3d_head_cfg(params: ExperimentParameters) -> ConfigDict:
    """Get the G-DINO 3D head config."""
    box_coder = class_config(
        GroundingDINO3DCoder,
        center_scale=params.center_scale,
        depth_scale=params.depth_scale,
        dim_scale=params.dim_scale,
        orientation=params.orientation,
    )

    bbox3d_head = class_config(
        GroundingDINO3DHead,
        box_coder=box_coder,
        depth_output_scales=params.depth_output_scales,
    )

    roi2det3d = class_config(
        RoI2Det3D,
        nms=params.nms,
        max_per_img=params.max_per_img,
        class_agnostic_nms=params.class_agnostic_nms,
        score_threshold=params.score_threshold,
        iou_threshold=params.iou_threshold,
        box_coder=box_coder,
    )

    return bbox3d_head, roi2det3d, box_coder


def get_gdino3d_cfg(
    params: ExperimentParameters,
    basemodel: ConfigDict,
    neck: ConfigDict,
    depth_fpn: ConfigDict,
    num_feature_levels: int = 4,
    chunked_size: int = -1,
    cat_mapping: dict[str, int] | None = None,
    pretrained: str | None = None,
    use_checkpoint: bool | FieldReference = False,
) -> ConfigDict:
    """Get the Grounding DINO with Swin-B model config."""
    # UniDepth Head
    depth_head = class_config(
        UniDepthHead,
        depth_scale=params.depth_scale,
        input_dims=[256, 256, 256, 256],
        output_scales=params.depth_output_scales,
    )

    bbox3d_head, roi2det3d, box_coder = get_gdino3d_head_cfg(params=params)

    if pretrained is not None:
        weights = GDINO_MODEL_WEIGHTS[pretrained]
    else:
        weights = None

    model = class_config(
        GroundingDINO3D,
        basemodel=basemodel,
        neck=neck,
        num_feature_levels=num_feature_levels,
        bbox3d_head=bbox3d_head,
        roi2det3d=roi2det3d,
        fpn=depth_fpn,
        depth_head=depth_head,
        use_checkpoint=use_checkpoint,
        weights=weights,
        chunked_size=chunked_size,
        cat_mapping=cat_mapping,
    )

    return model, box_coder


def get_gdino3d_swin_tiny_cfg(
    params: ExperimentParameters,
    chunked_size: int = -1,
    cat_mapping: dict[str, int] | None = None,
    pretrained: str | None = None,
    use_checkpoint: bool | FieldReference = False,
) -> ConfigDict:
    """Get the config of Swin-Tiny."""
    basemodel = class_config(
        SwinTransformer,
        convert_weights=True,
        embed_dims=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        drop_path_rate=0.2,
        out_indices=(0, 1, 2, 3),
        with_cp=use_checkpoint,
        pretrained="https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth",
    )

    neck = class_config(
        ChannelMapper,
        in_channels=[192, 384, 768],
        out_channels=256,
        num_outs=4,
        kernel_size=1,
        norm="GroupNorm",
        num_groups=32,
        activation=None,
        bias=True,
    )

    depth_fpn = class_config(
        FPN,
        in_channels_list=[96, 192, 384, 768],
        out_channels=256,
        extra_blocks=None,
        start_index=0,
    )

    return get_gdino3d_cfg(
        params,
        basemodel=basemodel,
        neck=neck,
        depth_fpn=depth_fpn,
        chunked_size=chunked_size,
        cat_mapping=cat_mapping,
        pretrained=pretrained,
        use_checkpoint=use_checkpoint,
    )


def get_gdino3d_swin_base_cfg(
    params: ExperimentParameters,
    chunked_size: int = -1,
    cat_mapping: dict[str, int] | None = None,
    pretrained: str | None = None,
    use_checkpoint: bool | FieldReference = False,
) -> ConfigDict:
    """Get the config of Swin-Base."""
    basemodel = class_config(
        SwinTransformer,
        convert_weights=True,
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        drop_path_rate=0.3,
        out_indices=(0, 1, 2, 3),
        with_cp=use_checkpoint,
        pretrained="https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth",
    )

    neck = class_config(
        ChannelMapper,
        in_channels=[256, 512, 1024],
        out_channels=256,
        num_outs=4,
        kernel_size=1,
        norm="GroupNorm",
        num_groups=32,
        activation=None,
        bias=True,
    )

    depth_fpn = class_config(
        FPN,
        in_channels_list=[128, 256, 512, 1024],
        out_channels=256,
        extra_blocks=None,
        start_index=0,
    )

    return get_gdino3d_cfg(
        params,
        basemodel=basemodel,
        neck=neck,
        depth_fpn=depth_fpn,
        chunked_size=chunked_size,
        cat_mapping=cat_mapping,
        pretrained=pretrained,
        use_checkpoint=use_checkpoint,
    )