Spaces:

3dlg-hcvc
/

opdmulti-demo

Running

App Files Files Community

atwang committed on Sep 19, 2023

Commit

01664b3

1 Parent(s): 20c01c5

[NOT TESTED] initial implementation of app

Browse files

Files changed (50) hide show

.gitignore +3 -0
app.py +131 -4
configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml +47 -0
configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml +44 -0
configs/coco/instance-segmentation/swin/opd_base.yaml +50 -0
configs/coco/instance-segmentation/swin/opd_v1_real.yaml +7 -0
dev-requirements.txt +3 -0
examples/59-4860.png +0 -0
examples/59-4860_d.png +0 -0
inference.py +836 -0
mask2former/__init__.py +11 -0
mask2former/config.py +125 -0
mask2former/maskformer_model.py +820 -0
mask2former/modeling/__init__.py +6 -0
mask2former/modeling/backbone/__init__.py +1 -0
mask2former/modeling/backbone/swin.py +770 -0
mask2former/modeling/criterion.py +547 -0
mask2former/modeling/matcher.py +192 -0
mask2former/modeling/meta_arch/__init__.py +1 -0
mask2former/modeling/meta_arch/mask_former_head.py +133 -0
mask2former/modeling/meta_arch/per_pixel_baseline.py +243 -0
mask2former/modeling/pixel_decoder/__init__.py +1 -0
mask2former/modeling/pixel_decoder/fpn.py +312 -0
mask2former/modeling/pixel_decoder/msdeformattn.py +358 -0
mask2former/modeling/pixel_decoder/ops/functions/__init__.py +13 -0
mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py +72 -0
mask2former/modeling/pixel_decoder/ops/make.sh +13 -0
mask2former/modeling/pixel_decoder/ops/modules/__init__.py +12 -0
mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py +125 -0
mask2former/modeling/pixel_decoder/ops/setup.py +78 -0
mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp +46 -0
mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h +38 -0
mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu +158 -0
mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h +35 -0
mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh +1332 -0
mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h +67 -0
mask2former/modeling/pixel_decoder/ops/src/vision.cpp +21 -0
mask2former/modeling/pixel_decoder/ops/test.py +92 -0
mask2former/modeling/transformer_decoder/__init__.py +4 -0
mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py +461 -0
mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py +188 -0
mask2former/modeling/transformer_decoder/opd_transformer_decoder.py +520 -0
mask2former/modeling/transformer_decoder/position_encoding.py +64 -0
mask2former/modeling/transformer_decoder/transformer.py +369 -0
mask2former/utils/__init__.py +2 -0
mask2former/utils/misc.py +111 -0
mask2former/utils/motion_visualizer.py +676 -0
mask2former/utils/tranform.py +169 -0
pre-requirements.txt +6 -0
requirements.txt +11 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+build/
+venv/
+__pycache__/

app.py CHANGED Viewed

@@ -1,7 +1,134 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

+import os
+import re
+from types import SimpleNamespace
+from typing import Any
 import gradio as gr
+import numpy as np
+from detectron2 import engine
+from inference import main, setup_cfg
+# internal settings
+NUM_PROCESSES = 1  # process count handed to detectron2's engine.launch in predict()
+CROP = False  # forwarded unchanged to inference.main by predict()
+SCORE_THRESHOLD = 0.8  # forwarded unchanged to inference.main by predict()
+MAX_PARTS = 5  # number of output image slots created in the UI and filled by predict()
+# Stand-in for the parsed command-line namespace that inference.setup_cfg reads its attributes from.
+ARGS = SimpleNamespace(
+    config_file="configs/coco/instance-segmentation/swin/opd_v1_real.yaml",
+    model="...",  # NOTE(review): looks like a placeholder rather than a real checkpoint path - confirm
+    input_format="RGB",
+    output=".output",  # directory that predict() later scans for generated gif files
+    cpu=True,
+)
+def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_samples: int) -> list[Any]:
+    """
+    Run the OPDMulti inference entry point on one RGB/depth pair and map the generated gifs onto the output slots.
+    :param rgb_image: file path of the uploaded RGB image
+    :param depth_image: file path of the uploaded depth image
+    :param intrinsics: camera intrinsics matrix as entered in the dataframe widget
+    :param num_samples: number of motion states to sample, forwarded to inference.main
+    :return: list of exactly MAX_PARTS gradio updates; slots without a gif are hidden
+    """
+    def find_gifs(path: str) -> list[str]:
+        """Collect all gif files located exactly one directory level below path, in sorted order."""
+        gifs = []
+        # os.listdir returns entries in arbitrary order; sorting keeps the part slots stable between runs
+        for entry in sorted(os.listdir(path)):
+            sub_path = os.path.join(path, entry)
+            if not os.path.isdir(sub_path):
+                continue
+            gifs.extend(
+                os.path.join(sub_path, image_file)
+                for image_file in sorted(os.listdir(sub_path))
+                if re.match(r".*\.gif$", image_file)
+            )
+        return gifs
+    cfg = setup_cfg(ARGS)
+    engine.launch(
+        main,
+        NUM_PROCESSES,
+        args=(
+            cfg,
+            rgb_image,
+            depth_image,
+            intrinsics,
+            num_samples,
+            CROP,
+            SCORE_THRESHOLD,
+        ),
+    )
+    # process output
+    # TODO: may want to select these in decreasing order of score
+    # NOTE(review): gifs left in ARGS.output by an earlier request are picked up as well - confirm whether the
+    # inference code clears the directory between runs
+    pre_outputs = find_gifs(ARGS.output)
+    outputs = []
+    for idx in range(MAX_PARTS):  # hide unused components
+        if idx < len(pre_outputs):
+            outputs.append(gr.update(value=pre_outputs[idx], visible=True))
+        else:
+            outputs.append(gr.update(visible=False))
+    return outputs
+def variable_outputs(idx):
+    """
+    Convert idx to an int and discard the result; the function has no return statement.
+    NOTE(review): looks like an unfinished stub - nothing in the visible part of this file calls it.
+    """
+    idx = int(idx)
+# Build the demo page: two image uploads, an editable intrinsics table, a sample-count field and a run button.
+with gr.Blocks() as app:
+    gr.Markdown(
+        """
+    # OPDMulti Demo
+    Upload an image to see its range of motion.
+    """
+    )
+    # TODO: add gr.Examples
+    with gr.Row():
+        # type="filepath" makes gradio hand predict() a path string rather than pixel data
+        rgb_image = gr.Image(
+            image_mode="RGB", source="upload", type="filepath", label="RGB Image", show_label=True, interactive=True
+        )
+        depth_image = gr.Image(
+            image_mode="L", source="upload", type="filepath", label="Depth Image", show_label=True, interactive=True
+        )
+    # Default 3x3 intrinsics; the same nine numbers are the default of --intrinsics in inference.py.
+    # NOTE(review): the last row holds what look like the principal point offsets, so the matrix appears to be the
+    # transpose of the usual K layout - confirm against the consumer in inference.main
+    intrinsics = gr.Dataframe(
+        value=[
+            [
+                214.85935872395834,
+                0.0,
+                0.0,
+            ],
+            [
+                0.0,
+                214.85935872395834,
+                0.0,
+            ],
+            [
+                125.90160319010417,
+                95.13726399739583,
+                1.0,
+            ],
+        ],
+        row_count=(3, "fixed"),
+        col_count=(3, "fixed"),
+        datatype="number",
+        type="numpy",
+        label="Intrinsics matrix",
+        show_label=True,
+        interactive=True,
+    )
+    # precision=0 requests an integer value from the widget
+    num_samples = gr.Number(
+        value=10,
+        label="Number of samples",
+        show_label=True,
+        interactive=True,
+        precision=0,
+        minimum=3,
+        maximum=20,
+    )
+    submit_btn = gr.Button("Run model")
+    # TODO: do we want to set a maximum limit on how many parts we render? We could also show the number of components
+    # identified.
+    # One initially hidden image slot per possible part; predict() returns one update per slot
+    outputs = [gr.Image(type="filepath", label=f"Part {idx + 1}", visible=False) for idx in range(MAX_PARTS)]
+    # TODO: maybe need to use a queue here so we don't overload the instance
+    submit_btn.click(
+        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=outputs, api_name="run_model"
+    )
+# Start serving the app as soon as the module is executed
+app.launch()

configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml ADDED Viewed

	@@ -0,0 +1,47 @@

+MODEL:
+  BACKBONE:
+    FREEZE_AT: 0
+    NAME: "build_resnet_backbone"
+  WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  RESNETS:
+    DEPTH: 50
+    STEM_TYPE: "basic"  # not used
+    STEM_OUT_CHANNELS: 64
+    STRIDE_IN_1X1: False
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+    # NORM: "SyncBN"
+    RES5_MULTI_GRID: [1, 1, 1]  # not used
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  STEPS: (327778, 355092)
+  MAX_ITER: 368750
+  WARMUP_FACTOR: 1.0
+  WARMUP_ITERS: 10
+  WEIGHT_DECAY: 0.05
+  OPTIMIZER: "ADAMW"
+  BACKBONE_MULTIPLIER: 0.1
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 0.01
+    NORM_TYPE: 2.0
+  AMP:
+    ENABLED: True
+INPUT:
+  IMAGE_SIZE: 1024
+  MIN_SCALE: 0.1
+  MAX_SCALE: 2.0
+  FORMAT: "RGB"
+  DATASET_MAPPER_NAME: "coco_instance_lsj"
+TEST:
+  EVAL_PERIOD: 5000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+  NUM_WORKERS: 4
+VERSION: 2

configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml ADDED Viewed

	@@ -0,0 +1,44 @@

+_BASE_: Base-COCO-InstanceSegmentation.yaml
+MODEL:
+  META_ARCHITECTURE: "MaskFormer"
+  SEM_SEG_HEAD:
+    NAME: "MaskFormerHead"
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 80
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 256
+    MASK_DIM: 256
+    NORM: "GN"
+    # pixel decoder
+    PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+  MASK_FORMER:
+    TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    DEEP_SUPERVISION: True
+    NO_OBJECT_WEIGHT: 0.1
+    CLASS_WEIGHT: 2.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    HIDDEN_DIM: 256
+    NUM_OBJECT_QUERIES: 100
+    NHEADS: 8
+    DROPOUT: 0.0
+    DIM_FEEDFORWARD: 2048
+    ENC_LAYERS: 0
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+    TEST:
+      SEMANTIC_ON: False
+      INSTANCE_ON: True
+      PANOPTIC_ON: False
+      OVERLAP_THRESHOLD: 0.8
+      OBJECT_MASK_THRESHOLD: 0.8

configs/coco/instance-segmentation/swin/opd_base.yaml ADDED Viewed

	@@ -0,0 +1,50 @@

+_BASE_: ../maskformer2_R50_bs16_50ep.yaml
+INPUT:
+  FORMAT: "RGB"
+  IMAGE_SIZE: 256
+  MAX_SIZE_TEST: 256
+  MAX_SIZE_TRAIN: 256
+  MIN_SIZE_TEST: 256
+  MIN_SIZE_TRAIN:
+  - 256
+  # DATASET_MAPPER_NAME: "motion_instance"
+DATALOADER:
+  NUM_WORKERS: 4
+DATASETS:
+  TRAIN: ("MotionNet_train",)
+  TEST: ("MotionNet_valid",)
+MODEL:
+  MOTIONNET:
+    TYPE: BMOC_V0
+  SEM_SEG_HEAD:
+    NUM_CLASSES: 3
+  MASK_ON: True # Useful for our MotionEvaluator, because it's from an older version detectron2
+  MASK_FORMER:
+    TRANSFORMER_DECODER_NAME: OPDMultiScaleMaskedTransformerDecoder
+    CLASS_WEIGHT: 2.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    MTYPE_WEIGHT: 2.0
+    MORIGIN_WEIGHT: 16.0
+    MAXIS_WEIGHT: 16.0
+    MSTATE_WEIGHT: 16.0
+    MSTATEMAX_WEIGHT: 16.0
+    EXTRINSIC_WEIGHT: 30.0
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  STEPS: (36000, 48000)
+  MAX_ITER: 60000
+  CHECKPOINT_PERIOD: 10000
+TEST:
+  AUG:
+    ENABLED: false
+    FLIP: false
+  EVAL_PERIOD: 10000
+SEED: 42

configs/coco/instance-segmentation/swin/opd_v1_real.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+_BASE_: ./opd_base.yaml
+MODEL:
+  MOTIONNET:
+    TYPE: BMOC_V1
+  PIXEL_MEAN: [142.60756197911175, 128.59507321750323, 110.82755928042158, 1267.231689453125]  # per-channel mean (R, G, B, depth) from MotionDataset_real train
+  PIXEL_STD: [24.008765143841437, 24.132018526763215, 27.228518892160068, 599.8106079101562] # per-channel stddev (R, G, B, depth) from MotionDataset_real train

dev-requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+black==23.9.1
+gradio==3.44.3
+huggingface-hub==0.17.2

examples/59-4860.png ADDED Viewed

examples/59-4860_d.png ADDED Viewed

inference.py ADDED Viewed

	@@ -0,0 +1,836 @@

+"""
+inference.py
+------------
+Provides functionality to run the OPDMulti model on an input image, independent of dataset and ground truth, and
+visualize the output. Large portions of the code originate from get_prediction.py, rgbd_to_pcd_vis.py,
+evaluate_on_log.py, and other related files. The primary goal was to create a more standalone script which could be
+converted more easily into a public demo, thus the goal was to sever most dependencies on existing ground truth or
+datasets.
+Example usage:
+python inference.py \
+    --rgb path/to/59-4860.png \
+    --depth path/to/59-4860_d.png \
+    --model path/to/model.pth \
+    --output path/to/output_dir
+"""
+import argparse
+import logging
+import os
+import time
+from copy import deepcopy
+from typing import Any
+import imageio
+import open3d as o3d
+import numpy as np
+import torch
+import torch.nn as nn
+from detectron2 import engine, evaluation
+from detectron2.modeling import build_model
+from detectron2.config import get_cfg, CfgNode
+from detectron2.projects.deeplab import add_deeplab_config
+from detectron2.structures import instances
+from detectron2.utils import comm
+from detectron2.utils.logger import setup_logger
+from PIL import Image, ImageChops
+from mask2former import (
+    add_maskformer2_config,
+    add_motionnet_config,
+)
+# import based on torch version. Required for model loading. Code is taken from fvcore.common.checkpoint.py, in order to
+# replicate model loading without the overhead of setting up an OPDTrainer
+# Keep only the first two dot-separated components of the torch version so it can be compared as an integer tuple
+TORCH_VERSION: tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
+if TORCH_VERSION >= (1, 11):
+    # from 1.11 on, the quantization helpers are imported from torch.ao
+    from torch.ao import quantization
+    from torch.ao.quantization import FakeQuantizeBase, ObserverBase
+elif (
+    TORCH_VERSION >= (1, 8)
+    and hasattr(torch.quantization, "FakeQuantizeBase")
+    and hasattr(torch.quantization, "ObserverBase")
+):
+    # earlier releases: import from torch.quantization, but only when both base classes exist there
+    from torch import quantization
+    from torch.quantization import FakeQuantizeBase, ObserverBase
+# NOTE(review): when neither branch runs, quantization, FakeQuantizeBase and ObserverBase remain undefined
+# TODO: find a global place for this instead of in many places in code
+# Integer id -> motion type name. NOTE(review): presumably indexed by the model's predicted motion type - confirm
+TYPE_CLASSIFICATION = {
+    0: "rotation",
+    1: "translation",
+}
+# Color written onto the points of the moved part by rotate_part
+POINT_COLOR = [1, 0, 0]  # red for demonstration
+# Image file suffixes; not referenced by any code visible in this part of the file
+IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")
+def get_parser() -> argparse.ArgumentParser:
+    """
+    Specify command-line arguments.
+    The primary inputs to the script should be the image paths (RGBD) and camera intrinsics. Other arguments are
+    provided to facilitate script testing and model changes. Run file with -h/--help to see all arguments.
+    Numeric options declare an explicit type so that values given on the command line are parsed to the same types as
+    their defaults (float for intrinsics and score threshold, int for sample and process counts).
+    :return: parser for extracting command-line arguments
+    """
+    parser = argparse.ArgumentParser(description="Inference for OPDMulti")
+    # The main arguments which should be specified by the user
+    parser.add_argument(
+        "--rgb",
+        dest="rgb_image",
+        metavar="FILE",
+        help="path to RGB image file on which to run model",
+    )
+    parser.add_argument(
+        "--depth",
+        dest="depth_image",
+        metavar="FILE",
+        help="path to depth image file on which to run model",
+    )
+    parser.add_argument(  # FIXME: might make more sense to make this a path
+        "-i",
+        "--intrinsics",
+        nargs=9,
+        type=float,  # without this, user-supplied values would be strings while the default holds floats
+        default=[
+            214.85935872395834,
+            0.0,
+            0.0,
+            0.0,
+            214.85935872395834,
+            0.0,
+            125.90160319010417,
+            95.13726399739583,
+            1.0,
+        ],
+        dest="intrinsics",
+        help="camera intrinsics matrix, as a list of values",
+    )
+    # optional parameters for user to specify
+    parser.add_argument(
+        "-n",
+        "--num-samples",
+        default=10,
+        type=int,  # sample counts must be integers; argparse yields str when no type is given
+        dest="num_samples",
+        metavar="NUM",
+        help="number of sample states to generate in visualization",
+    )
+    parser.add_argument(
+        "--crop",
+        action="store_true",
+        dest="crop",
+        help="crop whitespace out of images for visualization",
+    )
+    # local script development arguments
+    parser.add_argument(
+        "-m",
+        "--model",
+        default="path/to/model/file",  # FIXME: set a good default path
+        dest="model",
+        metavar="FILE",
+        help="path to model file to run",
+    )
+    parser.add_argument(
+        "-c",
+        "--config",
+        default="configs/coco/instance-segmentation/swin/opd_v1_real.yaml",
+        metavar="FILE",
+        dest="config_file",
+        help="path to config file",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        default="output",  # FIXME: set a good default path
+        dest="output",
+        help="path to output directory in which to save results",
+    )
+    parser.add_argument(
+        "--num-processes",
+        default=1,
+        type=int,  # keep the parsed value an int like the default
+        dest="num_processes",
+        help="number of processes per machine. When using GPUs, this should be the number of GPUs.",
+    )
+    parser.add_argument(
+        "-s",
+        "--score-threshold",
+        default=0.8,
+        type=float,
+        dest="score_threshold",
+        help="threshold between 0.0 and 1.0 by which to filter out bad predictions",
+    )
+    parser.add_argument(
+        "--input-format",
+        default="RGB",
+        dest="input_format",
+        help="input format of image. Must be one of RGB, RGBD, or depth",
+    )
+    parser.add_argument(
+        "--cpu",
+        action="store_true",
+        help="flag to require code to use CPU only",
+    )
+    return parser
+def setup_cfg(args: argparse.Namespace) -> CfgNode:
+    """
+    Build the frozen detectron2 config used for inference from the parsed arguments.
+    Reads config_file, model, output, cpu and input_format from args.
+    :param args: namespace carrying the attributes listed above
+    :return: frozen config node
+    :raises ValueError: if args.input_format is not one of RGB, depth or RGBD
+    """
+    cfg = get_cfg()
+    # register the extra config keys before the YAML file is merged
+    for add_config in (add_deeplab_config, add_maskformer2_config, add_motionnet_config):
+        add_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    # values that override or extend what the file provides
+    cfg.MODEL.WEIGHTS = args.model
+    cfg.OBJ_DETECT = False  # TODO: figure out if this is needed, and parameterize it
+    cfg.MODEL.MOTIONNET.VOTING = "none"
+    cfg.OUTPUT_DIR = args.output
+    if args.cpu:
+        cfg.MODEL.DEVICE = "cpu"
+    else:
+        cfg.MODEL.DEVICE = "cuda"
+    cfg.MODEL.MODELATTRPATH = None
+    # keep only the normalization statistics of the channels present in the chosen input format
+    input_format = args.input_format
+    cfg.INPUT.FORMAT = input_format
+    if input_format not in ("RGB", "depth", "RGBD"):
+        raise ValueError("Invalid input format")
+    if input_format != "RGBD":
+        channels = slice(0, 3) if input_format == "RGB" else slice(3, 4)
+        cfg.MODEL.PIXEL_MEAN = cfg.MODEL.PIXEL_MEAN[channels]
+        cfg.MODEL.PIXEL_STD = cfg.MODEL.PIXEL_STD[channels]
+    cfg.freeze()
+    engine.default_setup(cfg, args)
+    # logger registered under the name "opdformer", writing into the output directory
+    setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="opdformer")
+    return cfg
+def format_input(rgb_path: str) -> list[dict[str, Any]]:
+    """
+    Load an image from disk and wrap it in the list-of-dicts structure that is passed to the model.
+    :param rgb_path: path to RGB image file
+    :return: single-element list whose dictionary has the form
+        {
+            "file_name": the path that was passed in,
+            "image": float32 torch.Tensor with the last image axis moved to the front, i.e. [channel, height, width]
+        }
+    """
+    pixels = imageio.imread(rgb_path).astype(np.float32)
+    # reorder axes to channel-first and make the memory contiguous before handing it to torch
+    channel_first = np.ascontiguousarray(np.transpose(pixels, (2, 0, 1)))
+    sample = {"file_name": rgb_path, "image": torch.as_tensor(channel_first)}
+    return [sample]
+def load_model(model: nn.Module, checkpoint: Any) -> None:
+    """
+    Load weights from a checkpoint.
+    The majority of the function definition is taken from the DetectionCheckpointer implementation provided in
+    detectron2. While not all of this code is necessarily needed for model loading, it was ported with the intention
+    of keeping the implementation and output as close to the original as possible, and reusing the checkpoint class here
+    in isolation was determined to be infeasible.
+    Parameters whose shape differs between checkpoint and model are left out of the load, and the load itself is
+    non-strict; both situations are reported through the logging module instead of being dropped silently.
+    :param model: model for which to load weights
+    :param checkpoint: checkpoint contains the weights. Its "model" entry is popped, so the argument is modified.
+    :raises ValueError: if a checkpoint entry is neither a numpy array nor a torch tensor
+    """
+    def _strip_prefix_if_present(state_dict: dict[str, Any], prefix: str) -> None:
+        """If prefix is found on all keys in state dict, remove prefix."""
+        keys = sorted(state_dict.keys())
+        if not all(len(key) == 0 or key.startswith(prefix) for key in keys):
+            return
+        for key in keys:
+            newkey = key[len(prefix) :]
+            state_dict[newkey] = state_dict.pop(key)
+    def _get_module_for_key(root: torch.nn.Module, key: str) -> torch.nn.Module:
+        """Resolve the module owning a parameter/buffer key: foo.bar.param_or_buffer_name -> root.foo.bar."""
+        cur_module = root
+        for key_part in key.split(".")[:-1]:
+            cur_module = getattr(cur_module, key_part)
+        return cur_module
+    logger = logging.getLogger(__name__)
+    checkpoint_state_dict = checkpoint.pop("model")
+    # convert from numpy to tensor
+    for k, v in checkpoint_state_dict.items():
+        if not isinstance(v, np.ndarray) and not isinstance(v, torch.Tensor):
+            raise ValueError("Unsupported type found in checkpoint! {}: {}".format(k, type(v)))
+        if not isinstance(v, torch.Tensor):
+            checkpoint_state_dict[k] = torch.from_numpy(v)
+    # if the state_dict comes from a model that was wrapped in a
+    # DataParallel or DistributedDataParallel during serialization,
+    # remove the "module" prefix before performing the matching.
+    _strip_prefix_if_present(checkpoint_state_dict, "module.")
+    # workaround https://github.com/pytorch/pytorch/issues/24139
+    model_state_dict = model.state_dict()
+    incorrect_shapes = []
+    for k in list(checkpoint_state_dict.keys()):  # state dict is modified in loop, so list op is necessary
+        if k not in model_state_dict:
+            continue
+        model_param = model_state_dict[k]
+        # Allow mismatch for uninitialized parameters
+        if TORCH_VERSION >= (1, 8) and isinstance(model_param, nn.parameter.UninitializedParameter):
+            continue
+        shape_model = tuple(model_param.shape)
+        shape_checkpoint = tuple(checkpoint_state_dict[k].shape)
+        if shape_model == shape_checkpoint:
+            continue
+        # evaluated lazily: the quantization names only exist when the version-dependent import succeeded
+        has_observer_base_classes = (
+            TORCH_VERSION >= (1, 8)
+            and hasattr(quantization, "ObserverBase")
+            and hasattr(quantization, "FakeQuantizeBase")
+        )
+        if has_observer_base_classes:
+            # Handle the special case of quantization per channel observers,
+            # where buffer shape mismatches are expected.
+            cls_to_skip = (
+                ObserverBase,
+                FakeQuantizeBase,
+            )
+            if isinstance(_get_module_for_key(model, k), cls_to_skip):
+                # Keep entries with expected shape mismatches in the state_dict.
+                # These modules have special logic in _load_from_state_dict to handle the mismatches.
+                continue
+        incorrect_shapes.append((k, shape_checkpoint, shape_model))
+        checkpoint_state_dict.pop(k)
+    incompatible = model.load_state_dict(checkpoint_state_dict, strict=False)
+    # surface everything that was not loaded so a wrong checkpoint does not go unnoticed
+    for k, shape_checkpoint, shape_model in incorrect_shapes:
+        logger.warning(
+            "Skipped loading parameter '%s': checkpoint shape %s does not match model shape %s",
+            k,
+            shape_checkpoint,
+            shape_model,
+        )
+    if incompatible.missing_keys:
+        logger.warning("Model keys not found in checkpoint: %s", incompatible.missing_keys)
+    if incompatible.unexpected_keys:
+        logger.warning("Checkpoint keys not used by model: %s", incompatible.unexpected_keys)
+def predict(model: nn.Module, inp: list[dict[str, Any]]) -> list[dict[str, instances.Instances]]:
+    """
+    Run the model on the prepared input with gradient tracking disabled.
+    :param model: model to run on input
+    :param inp: list of per-image dictionaries as built by format_input, i.e.
+        {
+            "file_name": path to image,
+            "image": float32 torch.tensor of dimensions [channel, height, width]
+        }
+    :return: whatever the model returns for inp
+    """
+    # no_grad switches off autograd; inference_context is detectron2's evaluation helper wrapped around the call
+    with torch.no_grad():
+        with evaluation.inference_context(model):
+            return model(inp)
+def generate_rotation_visualization(
+    pcd: o3d.geometry.PointCloud,
+    axis_arrow: o3d.geometry.TriangleMesh,
+    mask: np.ndarray,
+    axis_vector: np.ndarray,
+    origin: np.ndarray,
+    range_min: float,
+    range_max: float,
+    num_samples: int,
+    output_dir: str,
+) -> None:
+    """
+    Generate visualization files for a rotation motion of a part.
+    One image named <index>.png is written to output_dir per sampled angle.
+    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
+    :param axis_arrow: mesh object representing axis arrow of rotation to be rendered in visualization
+    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
+    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation
+    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
+    :param range_min: float representing the minimum range of motion in radians
+    :param range_max: float representing the maximum range of motion in radians
+    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
+    :param output_dir: string path to directory in which to save visualization output
+    """
+    # Fixed view adjustment around the x-axis; identical for every frame, so it is built once
+    angle_x = np.pi * 5.5 / 5  # 198 degrees
+    view_rotation = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
+    # The range is already in radians, so the sampled angles are used directly
+    for idx, angle_rad in enumerate(np.linspace(range_min, range_max, num_samples)):
+        # Work on copies so every frame starts from the unmodified inputs
+        rotated_pcd = rotate_part(deepcopy(pcd), mask, axis_vector, origin, angle_rad)
+        rotated_arrow = deepcopy(axis_arrow)
+        # Create a Visualizer object for each rotation
+        vis = o3d.visualization.Visualizer()
+        vis.create_window()
+        try:
+            # Add the rotated geometries
+            vis.add_geometry(rotated_pcd)
+            vis.add_geometry(rotated_arrow)
+            # Turn both geometries around the point cloud's center
+            rotated_pcd.rotate(view_rotation, center=rotated_pcd.get_center())
+            rotated_arrow.rotate(view_rotation, center=rotated_pcd.get_center())
+            # Capture and save the image
+            output_filename = f"{output_dir}/{idx}.png"
+            vis.capture_screen_image(output_filename, do_render=True)
+        finally:
+            # Always release the window, even if adding geometry or rendering raised
+            vis.destroy_window()
+def generate_translation_visualization(
+    pcd: o3d.geometry.PointCloud,
+    axis_arrow: o3d.geometry.TriangleMesh,
+    mask: np.ndarray,
+    end: np.ndarray,
+    range_min: float,
+    range_max: float,
+    num_samples: int,
+    output_dir: str,
+) -> None:
+    """
+    Generate visualization files for a translation motion of a part.
+    One image named <index>.png is written to output_dir per sampled distance.
+    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
+    :param axis_arrow: mesh object representing axis arrow of translation to be rendered in visualization
+    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
+    :param end: np.array handed to translate_part as the vector of the axis of translation
+    :param range_min: float representing the minimum range of motion
+    :param range_max: float representing the maximum range of motion
+    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
+    :param output_dir: string path to directory in which to save visualization output
+    """
+    # Fixed view adjustment around the x-axis; identical for every frame, so it is built once
+    # TODO: not sure why we need this rotation for the translation, and when it would be desired
+    angle_x = np.pi * 5.5 / 5  # 198 degrees
+    view_rotation = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
+    for idx, translate_distance in enumerate(np.linspace(range_min, range_max, num_samples)):
+        # Work on copies so every frame starts from the unmodified inputs
+        translated_pcd = translate_part(deepcopy(pcd), mask, end, translate_distance.item())
+        translated_arrow = deepcopy(axis_arrow)
+        # Create a Visualizer object for each translation
+        vis = o3d.visualization.Visualizer()
+        vis.create_window()
+        try:
+            # Add the translated geometries
+            vis.add_geometry(translated_pcd)
+            vis.add_geometry(translated_arrow)
+            # Turn both geometries around the point cloud's center
+            translated_pcd.rotate(view_rotation, center=translated_pcd.get_center())
+            translated_arrow.rotate(view_rotation, center=translated_pcd.get_center())
+            # Capture and save the image
+            output_filename = f"{output_dir}/{idx}.png"
+            vis.capture_screen_image(output_filename, do_render=True)
+        finally:
+            # Always release the window, even if adding geometry or rendering raised
+            vis.destroy_window()
+def get_rotation_matrix_from_vectors(vec1: np.ndarray, vec2: np.ndarray) -> np.ndarray:
+    """
+    Find the rotation matrix that aligns vec1 to vec2
+    Parallel inputs give the identity and anti-parallel inputs give a half turn about an axis perpendicular to vec1;
+    neither case divides by zero.
+    :param vec1: A 3d "source" vector (non-zero)
+    :param vec2: A 3d "destination" vector (non-zero)
+    :return: A transform matrix (3x3) which when applied to vec1, aligns it with vec2.
+    """
+    a = np.asarray(vec1, dtype=float).reshape(3)
+    b = np.asarray(vec2, dtype=float).reshape(3)
+    a = a / np.linalg.norm(a)
+    b = b / np.linalg.norm(b)
+    c = np.dot(a, b)  # cosine of the angle between the vectors
+    if np.isclose(c, -1.0):
+        # Opposite directions: the cross product vanishes, so choose a rotation axis explicitly.
+        # Crossing with the coordinate axis least aligned with a guarantees a non-zero perpendicular vector.
+        helper = np.eye(3)[np.argmin(np.abs(a))]
+        axis = np.cross(a, helper)
+        axis = axis / np.linalg.norm(axis)
+        # rotation by pi about a unit axis u is 2 * u u^T - I
+        return 2.0 * np.outer(axis, axis) - np.eye(3)
+    v = np.cross(a, b)
+    kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
+    # (1 - c) / |v|^2 equals 1 / (1 + c); the latter stays finite when the vectors are parallel
+    return np.eye(3) + kmat + kmat.dot(kmat) / (1.0 + c)
+def draw_line(start_point: np.ndarray, end_point: np.ndarray) -> o3d.geometry.TriangleMesh:
+    """
+    Build an arrow mesh (cylinder shaft plus cone head) pointing from start_point towards end_point.
+    Only the direction of end_point - start_point is used; the arrow dimensions are fixed constants.
+    :param start_point: np.ndarray of dimensions (3, ) representing the start point of the axis
+    :param end_point: np.ndarray of dimensions (3, ) representing the end point of the axis
+    :return: mesh object combining shaft and head
+    """
+    # Fixed arrow dimensions
+    shaft_length, shaft_radius = 0.9, 0.01
+    head_height, head_radius = 0.1, 0.03
+    # Unit vector along the requested axis
+    offset = end_point - start_point
+    unit_direction = offset / np.linalg.norm(offset)
+    # Rotation that takes the +Z direction onto the axis direction
+    alignment = get_rotation_matrix_from_vectors(np.array([0, 0, 1]), unit_direction)
+    # Shaft: created by Open3D, rotated about the world origin, then moved to the start point
+    shaft = o3d.geometry.TriangleMesh.create_cylinder(radius=shaft_radius, height=shaft_length)
+    shaft.translate([0, 0, 0])  # zero offset, kept from the original sequence of calls
+    shaft.rotate(alignment, center=[0, 0, 0])
+    shaft.translate(start_point)
+    # Head: same treatment, placed 0.4 units along the axis from the start point
+    head = o3d.geometry.TriangleMesh.create_cone(radius=head_radius, height=head_height)
+    head.translate([-0, 0, 0])  # zero offset, kept from the original sequence of calls
+    head.rotate(alignment, center=[0, 0, 0])
+    head.translate(start_point + unit_direction * 0.4)
+    return shaft + head
+def rotate_part(
+    pcd: o3d.geometry.PointCloud, mask: np.ndarray, axis_vector: np.ndarray, origin: np.ndarray, angle_rad: float
+) -> o3d.geometry.PointCloud:
+    """
+    Generate rotated point cloud of mask based on provided angle around axis.
+    Points are addressed in row-major order: mask pixel (i, j) corresponds to point i * width + j. The selected points
+    are also recolored with POINT_COLOR. The passed point cloud is modified and returned.
+    :param pcd: point cloud object representing points of image
+    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
+    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation; it is normalized
+        internally, so it does not need to be of unit length
+    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
+    :param angle_rad: angle in radians to rotate mask part
+    :return: point cloud object after rotation of masked part
+    """
+    # Get the coordinates and colors of the point cloud as numpy arrays
+    points_np = np.asarray(pcd.points)
+    colors_np = np.asarray(pcd.colors)
+    # Rodrigues' formula only yields a rotation for a unit axis, so normalize first (a zero vector is left as is)
+    axis = np.asarray(axis_vector, dtype=float).reshape(3)
+    axis_norm = np.linalg.norm(axis)
+    if axis_norm > 0:
+        axis = axis / axis_norm
+    # Skew-symmetric cross-product matrix of the axis
+    K = np.array(
+        [
+            [0, -axis[2], axis[1]],
+            [axis[2], 0, -axis[0]],
+            [-axis[1], axis[0], 0],
+        ]
+    )
+    # Compute rotation matrix using Rodrigues' formula
+    R = np.eye(3) + np.sin(angle_rad) * K + (1 - np.cos(angle_rad)) * np.dot(K, K)
+    # Flat indices of the pixels that belong to the part; row-major flattening reproduces i * width + j
+    part_indices = np.flatnonzero(np.asarray(mask).reshape(-1) > 0)
+    # Move the rotation origin to the world origin, rotate all part points at once, then move back
+    points_np[part_indices] = (points_np[part_indices] - origin) @ R.T + origin
+    colors_np[part_indices] = POINT_COLOR
+    # Update the point cloud's coordinates
+    pcd.points = o3d.utility.Vector3dVector(points_np)
+    # Update point cloud colors
+    pcd.colors = o3d.utility.Vector3dVector(colors_np)
+    return pcd
+def translate_part(pcd, mask, axis_vector, distance):
+    """
+    Generate translated point cloud of mask based on provided angle around axis.
+    :param pcd: point cloud object representing points of image
+    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
+    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of translation
+    :param distance: distance within coordinate system to translate mask part
+    :return: point cloud object after translation of masked part
+    """
+    normalized_vector = axis_vector / np.linalg.norm(axis_vector)
+    translation_vector = normalized_vector * distance
+    # Convert point cloud colors to numpy array for easier manipulation
+    colors_np = np.asarray(pcd.colors)
+    # Get the coordinates of the point cloud as a numpy array
+    points_np = np.asarray(pcd.points)
+    # Iterate over the mask and assign the color to the points corresponding to the object pixels
+    for i in range(mask.shape[0]):
+        for j in range(mask.shape[1]):
+            if mask[i, j] > 0:  # This condition checks if the pixel belongs to the object
+                point_index = i * mask.shape[1] + j
+                colors_np[point_index] = POINT_COLOR
+                points_np[point_index] += translation_vector
+    # Update point cloud colors
+    pcd.colors = o3d.utility.Vector3dVector(colors_np)
+    # Update the point cloud's coordinates
+    pcd.points = o3d.utility.Vector3dVector(points_np)
+    return pcd
+def batch_trim(images_path: str, save_path: str, identical: bool = False) -> None:
+    """
+    Trim white spaces from all images in the given path and save new images to folder.
+    :param images_path: local path to folder containing all images. Images must have the extension ".png", ".jpg", or
+    ".jpeg".
+    :param save_path: local path to folder in which to save trimmed images
+    :param identical: if True, will apply same crop to all images, else each image will have its whitespace trimmed
+    independently. Note that in the latter case, each image may have a slightly different size.
+    """
+    def get_trim(im):
+        """Trim whitespace from an image and return the cropped image."""
+        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
+        diff = ImageChops.difference(im, bg)
+        diff = ImageChops.add(diff, diff, 2.0, -100)
+        bbox = diff.getbbox()
+        return bbox
+    if identical:  #
+        images = []
+        optimal_box = None
+        # load all images
+        for image_file in os.listdir(images_path):
+            if image_file.endswith(IMAGE_EXTENSIONS):
+                image_path = os.path.join(images_path, image_file)
+                images.append(Image.open(image_path))
+        # find optimal box size
+        for im in images:
+            bbox = get_trim(im)
+            if bbox is None:
+                bbox = (0, 0, im.size[0], im.size[1])  # bound entire image
+            if optimal_box is None:
+                optimal_box = bbox
+            else:
+                optimal_box = (
+                    min(optimal_box[0], bbox[0]),
+                    min(optimal_box[1], bbox[1]),
+                    max(optimal_box[2], bbox[2]),
+                    max(optimal_box[3], bbox[3]),
+                )
+        # apply cropping, if optimal box was found
+        if optimal_box:
+            for im in images:
+                im.crop(optimal_box)
+                im.close()
+    else:  # trim each image separately
+        for image_file in os.listdir(images_path):
+            if image_file.endswith(IMAGE_EXTENSIONS):
+                image_path = os.path.join(images_path, image_file)
+                with Image.open(image_path) as im:
+                    bbox = get_trim(im)
+                    trimmed = im.crop(bbox) if bbox else im
+                    trimmed.save(os.path.join(save_path, image_file))
+def create_gif(image_folder_path: str, num_samples: int, gif_filename: str = "output.gif") -> None:
+    """
+    Create gif out of folder of images and save to file.
+    :param image_folder_path: path to folder containing images (non-recursive). Assumes images are named as {i}.png for
+    each of i from 0 to num_samples.
+    :param num_samples: number of sampled images to compile into gif.
+    :param gif_filename: filename for gif, defaults to "output.gif"
+    """
+    # Generate a list of image filenames (assuming the images are saved as 0.png, 1.png, etc.)
+    image_files = [f"{image_folder_path}/{i}.png" for i in range(num_samples)]
+    # Read the images using imageio
+    images = [imageio.imread(image_file) for image_file in image_files]
+    # Save images as a gif
+    gif_output_path = f"{image_folder_path}/{gif_filename}"
+    imageio.mimsave(gif_output_path, images, duration=0.1)
+    return
+def main(
+    cfg: CfgNode,
+    rgb_image: str,
+    depth_image: str,
+    intrinsics: list[float],
+    num_samples: int,
+    crop: bool,
+    score_threshold: float,
+) -> None:
+    """
+    Main inference method.
+    :param cfg: configuration object
+    :param rgb_image: local path to RGB image
+    :param depth_image: local path to depth image
+    :param intrinsics: camera intrinsics matrix as a list of 9 values
+    :param num_samples: number of sample visualization states to generate
+    :param crop: if True, images will be cropped to remove whitespace before visualization
+    :param score_threshold: float between 0 and 1 representing threshold at which to filter instances based on score
+    """
+    logger = logging.getLogger("detectron2")
+    # setup data
+    logger.info("Loading image.")
+    inp = format_input(rgb_image)
+    # setup model
+    logger.info("Loading model.")
+    model = build_model(cfg)
+    weights = torch.load(cfg.MODEL.WEIGHTS, map_location=torch.device("cpu"))
+    if "model" not in weights:
+        weights = {"model": weights}
+    load_model(model, weights)
+    # run model on data
+    logger.info("Running model.")
+    prediction = predict(model, inp)[0]  # index 0 since there is only one image
+    # select best prediction to visualize
+    pred_instances = prediction["instances"]
+    score_ranking = np.argsort([-1 * pred_instances[i].scores.item() for i in range(len(pred_instances))])
+    score_ranking = [idx for idx in score_ranking if pred_instances[int(idx)].scores.item() > score_threshold]
+    if len(score_ranking) == 0:
+        logging.warning("The model did not predict any moving parts above the score threshold.")
+        return
+    for idx in score_ranking:  # iterate through all best predictions, by score threshold
+        pred = pred_instances[int(idx)]  # take highest predicted one
+        logger.info("Rendering prediction for instance %d", int(idx))
+        output_dir = os.path.join(cfg.OUTPUT_DIR, str(idx))
+        os.makedirs(output_dir, exist_ok=True)
+        # extract predicted values for visualization
+        mask = np.squeeze(pred.pred_masks.cpu().numpy())  # dim: [height, width]
+        origin = pred.morigin.cpu().numpy().flatten()  # dim: [3, ]
+        axis_vector = pred.maxis.cpu().numpy().flatten()  # dim: [3, ]
+        pred_type = TYPE_CLASSIFICATION.get(pred.mtype.item())
+        range_min = 0 - pred.mstate.cpu().numpy()
+        range_max = pred.mstatemax.cpu().numpy() - pred.mstate.cpu().numpy()
+        # process visualization
+        color = o3d.io.read_image(rgb_image)
+        depth = o3d.io.read_image(depth_image)
+        rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth(color, depth, convert_rgb_to_intensity=False)
+        color_np = np.asarray(color)
+        height, width = color_np.shape[:2]
+        # generate intrinsics
+        intrinsic_matrix = np.reshape(intrinsics, (3, 3), order="F")
+        intrinsic_obj = o3d.camera.PinholeCameraIntrinsic(
+            width,
+            height,
+            intrinsic_matrix[0, 0],
+            intrinsic_matrix[1, 1],
+            intrinsic_matrix[0, 2],
+            intrinsic_matrix[1, 2],
+        )
+        # Convert the RGBD image to a point cloud
+        pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_image, intrinsic_obj)
+        # Create a LineSet to visualize the direction vector
+        axis_arrow = draw_line(origin, axis_vector + origin)
+        axis_arrow.paint_uniform_color([0, 1, 0])
+        # if USE_GT:
+        #     anno_path = f"/localhome/atw7/projects/opdmulti/data/data_demo_dev/59-4860.json"
+        #     part_id = 32
+        #     # get annotation for the frame
+        #     import json
+        #     with open(anno_path, "r") as f:
+        #         anno = json.load(f)
+        #     articulations = anno["articulation"]
+        #     for articulation in articulations:
+        #         if articulation["partId"] == part_id:
+        #             range_min = articulation["rangeMin"] - articulation["state"]
+        #             range_max = articulation["rangeMax"] - articulation["state"]
+        #             break
+        if pred_type == "rotation":
+            generate_rotation_visualization(
+                pcd,
+                axis_arrow,
+                mask,
+                axis_vector,
+                origin,
+                range_min,
+                range_max,
+                num_samples,
+                output_dir,
+            )
+        elif pred_type == "translation":
+            generate_translation_visualization(
+                pcd,
+                axis_arrow,
+                mask,
+                axis_vector,
+                range_min,
+                range_max,
+                num_samples,
+                output_dir,
+            )
+        else:
+            raise ValueError(f"Invalid motion prediction type: {pred_type}")
+        if pred_type:
+            if crop:  # crop images to remove shared extraneous whitespace
+                output_dir_cropped = f"{output_dir}_cropped"
+                if not os.path.isdir(output_dir_cropped):
+                    os.makedirs(output_dir_cropped)
+                batch_trim(output_dir, output_dir_cropped, identical=True)
+                create_gif(output_dir_cropped, num_samples)
+            else:  # leave original dimensions of image as-is
+                create_gif(output_dir, num_samples)
+if __name__ == "__main__":
+    # parse arguments
+    start_time = time.time()
+    args = get_parser().parse_args()
+    cfg = setup_cfg(args)
+    # run main code
+    engine.launch(
+        main,
+        args.num_processes,
+        args=(
+            cfg,
+            args.rgb_image,
+            args.depth_image,
+            args.intrinsics,
+            args.num_samples,
+            args.crop,
+            args.score_threshold,
+        ),
+    )
+    end_time = time.time()
+    print(f"Inference time: {end_time - start_time:.2f} seconds")

mask2former/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Facebook, Inc. and its affiliates
+from . import modeling
+# config
+from .config import add_maskformer2_config, add_motionnet_config
+__all__ = [
+    "modeling",
+    "add_maskformer2_config",
+    "add_motionnet_config",
+]

mask2former/config.py ADDED Viewed

	@@ -0,0 +1,125 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+from detectron2.config import CfgNode as CN
+def add_motionnet_config(cfg: CN):
+    _C = cfg
+    _C.MODEL.MOTIONNET = CN()
+    _C.MODEL.MOTIONNET.TYPE = "BMOC_V0"
+    cfg.MODEL.MASK_FORMER.MTYPE_WEIGHT = 2.0
+    cfg.MODEL.MASK_FORMER.MORIGIN_WEIGHT = 16.0
+    cfg.MODEL.MASK_FORMER.MAXIS_WEIGHT = 16.0
+    cfg.MODEL.MASK_FORMER.MSTATE_WEIGHT = 16.0
+    cfg.MODEL.MASK_FORMER.MSTATEMAX_WEIGHT = 16.0
+    cfg.MODEL.MASK_FORMER.EXTRINSIC_WEIGHT = 30.0
+def add_maskformer2_config(cfg):
+    """
+    Add config for MASK_FORMER.
+    """
+    # NOTE: configs from original maskformer
+    # data config
+    # select the dataset mapper
+    cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
+    # Color augmentation
+    cfg.INPUT.COLOR_AUG_SSD = False
+    # We retry random cropping until no single category in semantic segmentation GT occupies more
+    # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
+    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
+    # Pad image and segmentation GT in dataset mapper.
+    cfg.INPUT.SIZE_DIVISIBILITY = -1
+    # solver config
+    # weight decay on embedding
+    cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
+    # optimizer
+    cfg.SOLVER.OPTIMIZER = "ADAMW"
+    cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
+    # mask_former model config
+    cfg.MODEL.MASK_FORMER = CN()
+    # loss
+    cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
+    cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
+    cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0
+    cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
+    cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0
+    # transformer config
+    cfg.MODEL.MASK_FORMER.NHEADS = 8
+    cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
+    cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
+    cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
+    cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
+    cfg.MODEL.MASK_FORMER.PRE_NORM = False
+    cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
+    cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100
+    cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
+    cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False
+    # mask_former inference config
+    cfg.MODEL.MASK_FORMER.TEST = CN()
+    cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True
+    cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False
+    cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
+    cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
+    cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
+    cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
+    # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
+    # you can use this config to override
+    cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32
+    # pixel decoder config
+    cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
+    # adding transformer in pixel decoder
+    cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
+    # pixel decoder
+    cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
+    # swin transformer backbone
+    cfg.MODEL.SWIN = CN()
+    cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
+    cfg.MODEL.SWIN.PATCH_SIZE = 4
+    cfg.MODEL.SWIN.EMBED_DIM = 96
+    cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
+    cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
+    cfg.MODEL.SWIN.WINDOW_SIZE = 7
+    cfg.MODEL.SWIN.MLP_RATIO = 4.0
+    cfg.MODEL.SWIN.QKV_BIAS = True
+    cfg.MODEL.SWIN.QK_SCALE = None
+    cfg.MODEL.SWIN.DROP_RATE = 0.0
+    cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
+    cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
+    cfg.MODEL.SWIN.APE = False
+    cfg.MODEL.SWIN.PATCH_NORM = True
+    cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
+    cfg.MODEL.SWIN.USE_CHECKPOINT = False
+    # NOTE: maskformer2 extra configs
+    # transformer module
+    cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"
+    # LSJ aug
+    cfg.INPUT.IMAGE_SIZE = 1024
+    cfg.INPUT.MIN_SCALE = 0.1
+    cfg.INPUT.MAX_SCALE = 2.0
+    # MSDeformAttn encoder configs
+    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
+    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
+    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8
+    # point loss configs
+    # Number of points sampled during training for a mask point head.
+    cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112
+    # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
+    # original paper.
+    cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0
+    # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in
+    # the original paper.
+    cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75

mask2former/maskformer_model.py ADDED Viewed

	@@ -0,0 +1,820 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import pdb
+from typing import Tuple
+from copy import deepcopy
+import torch
+from torch import device, nn
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.data import MetadataCatalog
+from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
+from detectron2.modeling.backbone import Backbone
+from detectron2.modeling.postprocessing import sem_seg_postprocess
+from detectron2.structures import Boxes, ImageList, Instances, BitMasks
+from detectron2.utils.memory import retry_if_cuda_oom
+from .modeling.criterion import SetCriterion
+from .modeling.matcher import HungarianMatcher
+from .utils.tranform import matrix_to_quaternion, quaternion_to_matrix, rotation_6d_to_matrix, matrix_to_rotation_6d, geometric_median
+from .modeling.criterion import convert_to_filled_tensor
+import numpy as np
+@META_ARCH_REGISTRY.register()
+class MaskFormer(nn.Module):
+    """
+    Main class for mask classification semantic segmentation architectures.
+    """
+    @configurable
+    def __init__(
+        self,
+        *,
+        backbone: Backbone,
+        sem_seg_head: nn.Module,
+        criterion: nn.Module,
+        mask2former_backbone: nn.Module,
+        mask2former_sem_seg_head: nn.Module,
+        num_queries: int,
+        object_mask_threshold: float,
+        overlap_threshold: float,
+        metadata,
+        size_divisibility: int,
+        sem_seg_postprocess_before_inference: bool,
+        pixel_mean: Tuple[float],
+        pixel_std: Tuple[float],
+        # inference
+        semantic_on: bool,
+        panoptic_on: bool,
+        instance_on: bool,
+        test_topk_per_image: int,
+        # OPD
+        motionnet_type,
+        voting,
+        gtdet,
+        inference_matcher,
+        gtextrinsic,
+        only_DET,
+        obj_method
+    ):
+        """
+        Store the submodules and options of the model. All arguments are keyword-only.
+        Args:
+            backbone: a backbone module, must follow detectron2's backbone interface
+            sem_seg_head: a module that predicts semantic segmentation from backbone features
+            criterion: a module that defines the loss
+            mask2former_backbone: second backbone that `forward` runs to predict object masks when
+                `obj_method` is set; `from_config` passes None when cfg.OBJ_DETECT is not set
+            mask2former_sem_seg_head: head applied to the `mask2former_backbone` features; None under
+                the same condition
+            num_queries: int, number of queries
+            object_mask_threshold: float, threshold to filter query based on classification score
+                for panoptic segmentation inference
+            overlap_threshold: overlap threshold used in general inference for panoptic segmentation
+            metadata: dataset meta, get `thing` and `stuff` category names for panoptic
+                segmentation inference
+            size_divisibility: Some backbones require the input height and width to be divisible by a
+                specific integer. We can use this to override such requirement. A negative value
+                means "use the backbone's own size_divisibility".
+            sem_seg_postprocess_before_inference: whether to resize the prediction back
+                to original input size before semantic segmentation inference or after.
+                For high-resolution dataset like Mapillary, resizing predictions before
+                inference will cause OOM error.
+            pixel_mean, pixel_std: list or tuple with #channels element, representing
+                the per-channel mean and std to be used to normalize the input image
+            semantic_on: bool, whether to output semantic segmentation prediction
+            instance_on: bool, whether to output instance segmentation prediction
+            panoptic_on: bool, whether to output panoptic segmentation prediction
+            test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
+            motionnet_type: motion network variant name; `from_config` reads it from
+                cfg.MODEL.MOTIONNET.TYPE (e.g. "BMOC_V0")
+            voting: value of cfg.MODEL.MOTIONNET.VOTING, or None when that key is absent
+            gtdet: value of cfg.MODEL.GTDET (False when absent); when set, `forward` also prepares
+                targets at inference time for the groundtruth ablation
+            inference_matcher: matcher called at inference when `gtdet` or `gtextrinsic` is set;
+                None otherwise
+            gtextrinsic: value of cfg.MODEL.GTEXTRINSIC, or None when that key is absent
+            only_DET: value of cfg.MODEL.ONLY_DET (False when absent); the same value is given to
+                the criterion in `from_config`
+            obj_method: value of cfg.OBJ_DETECT; when truthy, `forward` feeds the object masks from
+                the mask2former modules to `sem_seg_head`
+        """
+        super().__init__()
+        # Submodules (mask2former_* may be None, see above)
+        self.backbone = backbone
+        self.sem_seg_head = sem_seg_head
+        self.mask2former_backbone = mask2former_backbone
+        self.mask2former_sem_seg_head = mask2former_sem_seg_head
+        self.criterion = criterion
+        self.num_queries = num_queries
+        self.overlap_threshold = overlap_threshold
+        self.object_mask_threshold = object_mask_threshold
+        self.metadata = metadata
+        if size_divisibility < 0:
+            # use backbone size_divisibility if not set
+            size_divisibility = self.backbone.size_divisibility
+        self.size_divisibility = size_divisibility
+        self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
+        # Normalization constants reshaped to (C, 1, 1); the trailing False makes the buffers
+        # non-persistent, i.e. they are not saved in the state dict
+        self.register_buffer("pixel_mean", torch.Tensor(
+            pixel_mean).view(-1, 1, 1), False)
+        self.register_buffer("pixel_std", torch.Tensor(
+            pixel_std).view(-1, 1, 1), False)
+        # additional args
+        self.semantic_on = semantic_on
+        self.instance_on = instance_on
+        self.panoptic_on = panoptic_on
+        self.test_topk_per_image = test_topk_per_image
+        # Without semantic output, predictions must be resized before inference
+        if not self.semantic_on:
+            assert self.sem_seg_postprocess_before_inference
+        # OPD
+        self.motionnet_type = motionnet_type
+        self.voting = voting
+        self.gtdet = gtdet
+        self.inference_matcher = inference_matcher
+        self.gtextrinsic = gtextrinsic
+        self.only_DET = only_DET
+        self.obj_method = obj_method
+    @classmethod
+    def from_config(cls, cfg):
+        backbone = build_backbone(cfg)
+        sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
+        # TODO: add mask2former backbone and semseghead to get object mask
+        if cfg.OBJ_DETECT:
+            mask2former_backbone = build_backbone(cfg.MASK2FORMER)
+            mask2former_sem_seg_head = build_sem_seg_head(
+                cfg.MASK2FORMER, backbone.output_shape())
+        else:
+            mask2former_backbone = None
+            mask2former_sem_seg_head = None
+        # Loss parameters:
+        deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
+        no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT
+        # loss weights
+        class_weight = cfg.MODEL.MASK_FORMER.CLASS_WEIGHT
+        dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT
+        mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT
+        # OPD
+        mtype_weight = cfg.MODEL.MASK_FORMER.MTYPE_WEIGHT
+        morigin_weight = cfg.MODEL.MASK_FORMER.MORIGIN_WEIGHT
+        maxis_weight = cfg.MODEL.MASK_FORMER.MAXIS_WEIGHT
+        extrinsic_weight = cfg.MODEL.MASK_FORMER.EXTRINSIC_WEIGHT
+        mstate_weight = cfg.MODEL.MASK_FORMER.MSTATE_WEIGHT
+        mstatemax_weight = cfg.MODEL.MASK_FORMER.MSTATEMAX_WEIGHT
+        motionnet_type = cfg.MODEL.MOTIONNET.TYPE
+        # building criterion
+        matcher = HungarianMatcher(
+            cost_class=class_weight,
+            cost_mask=mask_weight,
+            cost_dice=dice_weight,
+            num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
+        )
+        if "GTDET" in cfg.MODEL:
+            gtdet = cfg.MODEL.GTDET
+        else:
+            gtdet = False
+        if "GTEXTRINSIC" in cfg.MODEL:
+            gtextrinsic = cfg.MODEL.GTEXTRINSIC
+        else:
+            gtextrinsic = None
+        if gtdet or gtextrinsic:
+            # This inference matcher is used for GT ablation when inferencing
+            inference_matcher = matcher
+        else:
+            inference_matcher = None
+        if "ONLY_DET" in cfg.MODEL:
+            only_DET = cfg.MODEL.ONLY_DET
+        else:
+            only_DET = False
+        # OPD
+        weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight, "loss_mtype": mtype_weight,
+                       "loss_morigin": morigin_weight, "loss_maxis": maxis_weight, "loss_mstate": mstate_weight, "loss_mstatemax": mstatemax_weight}
+        if motionnet_type == "BMOC_V1" or motionnet_type == "BMOC_V2" or motionnet_type == "BMOC_V3" or motionnet_type == "BMOC_V4" or motionnet_type == "BMOC_V5" or motionnet_type == "BMOC_V6":
+            weight_dict["loss_extrinsic"] = extrinsic_weight
+        if deep_supervision:
+            dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS
+            aux_weight_dict = {}
+            for i in range(dec_layers - 1):
+                aux_weight_dict.update(
+                    {k + f"_{i}": v for k, v in weight_dict.items()})
+            weight_dict.update(aux_weight_dict)
+        # OPD
+        if motionnet_type == "BMOC_V0":
+            weight_dict["loss_extrinsic"] = extrinsic_weight
+        # OPD
+        losses = ["labels", "masks", "mtypes", "morigins",
+                  "maxises", "extrinsics", "mstates", "mstatemaxs"]
+        criterion = SetCriterion(
+            sem_seg_head.num_classes,
+            matcher=matcher,
+            weight_dict=weight_dict,
+            eos_coef=no_object_weight,
+            losses=losses,
+            num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
+            oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO,
+            importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO,
+            motionnet_type=motionnet_type,
+            only_DET=only_DET,
+        )
+        # OPD
+        if "VOTING" in cfg.MODEL.MOTIONNET:
+            voting = cfg.MODEL.MOTIONNET.VOTING
+        else:
+            voting = None
+        return {
+            "backbone": backbone,
+            "sem_seg_head": sem_seg_head,
+            "mask2former_backbone": mask2former_backbone,
+            "mask2former_sem_seg_head": mask2former_sem_seg_head,
+            "criterion": criterion,
+            "num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES,
+            "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD,
+            "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD,
+            "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
+            "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY,
+            "sem_seg_postprocess_before_inference": (
+                cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE
+                or cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON
+                or cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON
+            ),
+            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
+            "pixel_std": cfg.MODEL.PIXEL_STD,
+            # inference
+            "semantic_on": cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON,
+            "instance_on": cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON,
+            "panoptic_on": cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON,
+            "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
+            # OPD
+            "motionnet_type": motionnet_type,
+            "voting": voting,
+            "gtdet": gtdet,
+            "inference_matcher": inference_matcher,
+            "gtextrinsic": gtextrinsic,
+            "only_DET": only_DET,
+            "obj_method": cfg.OBJ_DETECT
+        }
+    @property
+    def device(self):
+        return self.pixel_mean.device
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+                For now, each item in the list is a dict that contains:
+                   * "image": Tensor, image in (C, H, W) format.
+                   * "instances": per-region ground truth
+                   * Other information that's included in the original dicts, such as:
+                     "height", "width" (int): the output resolution of the model (may be different
+                     from input resolution), used in inference.
+        Returns:
+            list[dict]:
+                each dict has the results for one image. The dict contains the following keys:
+                * "sem_seg":
+                    A Tensor that represents the
+                    per-pixel segmentation predicted by the head.
+                    The prediction has shape KxHxW that represents the logits of
+                    each class for each pixel.
+                * "panoptic_seg":
+                    A tuple that represent panoptic output
+                    panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
+                    segments_info (list[dict]): Describe each segment in `panoptic_seg`.
+                        Each dict contains keys "id", "category_id", "isthing".
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.size_divisibility)
+        # Load the targets if it's training or it's in the groundtruth ablation study
+        if self.training or self.gtdet or self.gtextrinsic:
+            # get the groundtruth
+            if "instances" in batched_inputs[0]:
+                gt_instances = [x["instances"].to(
+                    self.device) for x in batched_inputs]
+                targets = self.prepare_targets(gt_instances, images)
+            else:
+                targets = None
+        if not self.obj_method:
+            features = self.backbone(images.tensor)
+            outputs = self.sem_seg_head(features)
+        else:
+            # TODO: add frozen model to extract object mask.
+            for para in self.mask2former_backbone.parameters():
+                para.requires_grad = False
+            for para in self.mask2former_sem_seg_head.parameters():
+                para.requires_grad = False
+            obj_feature = self.mask2former_backbone(images.tensor)
+            obj_output = self.mask2former_sem_seg_head(obj_feature)
+            pred_obj_masks = obj_output["pred_masks"]
+            # prob_masks = torch.sigmoid(pred_obj_masks)
+            pred_cls_results = obj_output["pred_logits"]
+            # TODO: use object prediction to help object pose prediction, find a way to calculate the IoU of part and object mask
+            for indice, pred_obj_mask in enumerate(pred_obj_masks):
+                # get binary mask
+                for idx, mask in enumerate(pred_obj_mask):
+                    max_score = torch.max(mask)
+                    pred_obj_mask[idx] = (mask > (max_score*0.5)).float()
+                # replace the pred masks with binary masks
+                pred_obj_masks[indice] = pred_obj_mask
+            # import pdb
+            # pdb.set_trace()
+            features = self.backbone(images.tensor)
+            outputs = self.sem_seg_head(features, pred_obj_masks)
+            # import pdb
+            # pdb.set_trace()
+        if self.training:
+            # bipartite matching-based loss
+            losses = self.criterion(outputs, targets)
+            for k in list(losses.keys()):
+                if k in self.criterion.weight_dict:
+                    losses[k] *= self.criterion.weight_dict[k]
+                else:
+                    # remove this loss if not specified in `weight_dict`
+                    print(f"Warning: {k} is not in loss")
+                    losses.pop(k)
+            return losses
+        else:
+            mask_cls_results = outputs["pred_logits"]
+            mask_pred_results = outputs["pred_masks"]
+            # OPD
+            mask_mtype_results = outputs["pred_mtypes"]
+            mask_morigin_results = outputs["pred_morigins"]
+            mask_maxis_results = outputs["pred_maxises"]
+            mask_mstate_results = outputs["pred_mstates"]
+            mask_mstatemax_results = outputs["pred_mstatemaxs"]
+            if "BMOC" in self.motionnet_type:
+                mask_extrinsic_results = outputs["pred_extrinsics"]
+            # upsample masks
+            mask_pred_results = F.interpolate(
+                mask_pred_results,
+                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
+                mode="bilinear",
+                align_corners=False,
+            )
+            if self.gtdet or self.gtextrinsic:
+                if self.gtdet:
+                    # Make other predictions be bad, so that they will not consider when evaluating
+                    mask_pred_results[:, :, :, :] = -30
+                    mask_cls_results[:, :, :3] = 0
+                    mask_cls_results[:, :, 3] = 15  # weight for softmax
+                # Initialize the predicted class and predicted mask to the default value
+                if targets[0]["masks"].shape[0] != 0:
+                    outputs_without_aux = {
+                        k: v for k, v in outputs.items() if k != "aux_outputs"}
+                    # Retrieve the matching between the outputs of the last layer and the targets
+                    indices = self.inference_matcher(
+                        outputs_without_aux, targets)
+                    def _get_src_permutation_idx(indices):
+                        # permute predictions following indices
+                        batch_idx = torch.cat(
+                            [torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
+                        src_idx = torch.cat([src for (src, _) in indices])
+                        return batch_idx, src_idx
+                    def _get_tgt_permutation_idx(indices):
+                        # permute targets following indices
+                        batch_idx = torch.cat(
+                            [torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+                        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
+                        return batch_idx, tgt_idx
+                    src_idx = _get_src_permutation_idx(indices)
+                    tgt_idx = _get_tgt_permutation_idx(indices)
+                    if self.gtdet:
+                        mask_pred_results[src_idx] = targets[0]["masks"].unsqueeze(0)[
+                            tgt_idx].float() * 30
+                        mask_pred_results[mask_pred_results == 0] = -30
+                        mask_cls_results[src_idx] = F.one_hot(
+                            targets[0]["labels"][tgt_idx[1]], num_classes=self.sem_seg_head.num_classes+1).float() * 15
+                    if self.gtextrinsic:
+                        if self.motionnet_type == "BMOC_V6":
+                            gt_extrinsic_raw = targets[0]["gt_extrinsic"][0]
+                            gt_extrinsic = torch.cat(
+                                [
+                                    gt_extrinsic_raw[0:3],
+                                    gt_extrinsic_raw[4:7],
+                                    gt_extrinsic_raw[8:11],
+                                    gt_extrinsic_raw[12:15],
+                                ],
+                                0,
+                            )
+                            mask_extrinsic_results[0] = gt_extrinsic
+                        else:
+                            raise ValueError("Not Implemented")
+            del outputs
+            if "BMOC" in self.motionnet_type:
+                processed_results = []
+                for mask_cls_result, mask_pred_result, input_per_image, image_size, mask_mtype_result, mask_morigin_result, mask_maxis_result, mask_mstate_result, mask_mstatemax_result, mask_extrinsic_result in zip(
+                    mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes, mask_mtype_results, mask_morigin_results, mask_maxis_results, mask_mstate_results, mask_mstatemax_results, mask_extrinsic_results
+                ):
+                    height = input_per_image.get("height", image_size[0])
+                    width = input_per_image.get("width", image_size[1])
+                    processed_results.append({})
+                    if self.sem_seg_postprocess_before_inference:
+                        mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
+                            mask_pred_result, image_size, height, width
+                        )
+                        mask_cls_result = mask_cls_result.to(mask_pred_result)
+                        # OPD
+                        mask_mtype_result = mask_mtype_result.to(
+                            mask_pred_result)
+                        mask_morigin_result = mask_morigin_result.to(
+                            mask_pred_result)
+                        mask_maxis_result = mask_maxis_result.to(
+                            mask_pred_result)
+                        mask_mstate_result = mask_mstate_result.to(
+                            mask_pred_result)
+                        mask_mstatemax_result = mask_mstatemax_result.to(
+                            mask_pred_result)
+                        mask_extrinsic_result = mask_extrinsic_result.to(
+                            mask_pred_result)
+                    # semantic segmentation inference
+                    if self.semantic_on:
+                        r = retry_if_cuda_oom(self.semantic_inference)(
+                            mask_cls_result, mask_pred_result)
+                        if not self.sem_seg_postprocess_before_inference:
+                            r = retry_if_cuda_oom(sem_seg_postprocess)(
+                                r, image_size, height, width)
+                        processed_results[-1]["sem_seg"] = r
+                    # panoptic segmentation inference
+                    if self.panoptic_on:
+                        panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(
+                            mask_cls_result, mask_pred_result)
+                        processed_results[-1]["panoptic_seg"] = panoptic_r
+                    # instance segmentation inference
+                    if self.instance_on:
+                        instance_r = retry_if_cuda_oom(self.instance_inference)(
+                            mask_cls_result, mask_pred_result, mask_mtype_result, mask_morigin_result, mask_maxis_result, mask_mstate_result, mask_mstatemax_result, mask_extrinsic_result)
+                        processed_results[-1]["instances"] = instance_r
+            else:
+                processed_results = []
+                for mask_cls_result, mask_pred_result, input_per_image, image_size, mask_mtype_result, mask_morigin_result, mask_maxis_result, mask_mstate_result, mask_mstatemax_result in zip(
+                    mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes, mask_mtype_results, mask_morigin_results, mask_maxis_results, mask_mstate_results, mask_mstatemax_results
+                ):
+                    height = input_per_image.get("height", image_size[0])
+                    width = input_per_image.get("width", image_size[1])
+                    processed_results.append({})
+                    if self.sem_seg_postprocess_before_inference:
+                        mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
+                            mask_pred_result, image_size, height, width
+                        )
+                        mask_cls_result = mask_cls_result.to(mask_pred_result)
+                        # OPD
+                        mask_mtype_result = mask_mtype_result.to(
+                            mask_pred_result)
+                        mask_morigin_result = mask_morigin_result.to(
+                            mask_pred_result)
+                        mask_maxis_result = mask_maxis_result.to(
+                            mask_pred_result)
+                        mask_mstate_result = mask_mstate_result.to(
+                            mask_pred_result)
+                        mask_mstatemax_result = mask_mstatemax_result.to(
+                            mask_pred_result)
+                    # semantic segmentation inference
+                    if self.semantic_on:
+                        r = retry_if_cuda_oom(self.semantic_inference)(
+                            mask_cls_result, mask_pred_result)
+                        if not self.sem_seg_postprocess_before_inference:
+                            r = retry_if_cuda_oom(sem_seg_postprocess)(
+                                r, image_size, height, width)
+                        processed_results[-1]["sem_seg"] = r
+                    # panoptic segmentation inference
+                    if self.panoptic_on:
+                        panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(
+                            mask_cls_result, mask_pred_result)
+                        processed_results[-1]["panoptic_seg"] = panoptic_r
+                    # instance segmentation inference
+                    if self.instance_on:
+                        instance_r = retry_if_cuda_oom(self.instance_inference)(
+                            mask_cls_result, mask_pred_result, mask_mtype_result, mask_morigin_result, mask_maxis_result, mask_mstate_result, mask_mstatemax_result, None)
+                        processed_results[-1]["instances"] = instance_r
+            return processed_results
+    def prepare_targets(self, targets, images):
+        h_pad, w_pad = images.tensor.shape[-2:]
+        new_targets = []
+        for targets_per_image in targets:
+            if hasattr(targets_per_image, "gt_masks"):
+                # pad gt
+                gt_masks = targets_per_image.gt_masks
+                padded_masks = torch.zeros(
+                    (gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
+                padded_masks[:, : gt_masks.shape[1],
+                             : gt_masks.shape[2]] = gt_masks
+            else:
+                padded_masks = torch.tensor([])
+            if "BMOC" in self.motionnet_type:
+                new_targets.append(
+                    {
+                        "labels": targets_per_image.gt_classes,
+                        "masks": padded_masks,
+                        # OPD
+                        "gt_motion_valids": targets_per_image.gt_motion_valids,
+                        "gt_types": targets_per_image.gt_types,
+                        "gt_origins": targets_per_image.gt_origins,
+                        "gt_axises": targets_per_image.gt_axises,
+                        "gt_states": targets_per_image.gt_states,
+                        "gt_statemaxs": targets_per_image.gt_statemaxs,
+                        "gt_extrinsic": targets_per_image.gt_extrinsic,
+                        "gt_extrinsic_quaternion": targets_per_image.gt_extrinsic_quaternion,
+                        "gt_extrinsic_6d": targets_per_image.gt_extrinsic_6d,
+                    }
+                )
+            else:
+                new_targets.append(
+                    {
+                        "labels": targets_per_image.gt_classes,
+                        "masks": padded_masks,
+                        # OPD
+                        "gt_motion_valids": targets_per_image.gt_motion_valids,
+                        "gt_types": targets_per_image.gt_types,
+                        "gt_origins": targets_per_image.gt_origins,
+                        "gt_axises": targets_per_image.gt_axises,
+                        "gt_states": targets_per_image.gt_states,
+                        "gt_statemaxs": targets_per_image.gt_statemaxs,
+                    }
+                )
+        return new_targets
+    def semantic_inference(self, mask_cls, mask_pred):
+        mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
+        mask_pred = mask_pred.sigmoid()
+        semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
+        return semseg
+    def panoptic_inference(self, mask_cls, mask_pred):
+        scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
+        mask_pred = mask_pred.sigmoid()
+        keep = labels.ne(self.sem_seg_head.num_classes) & (
+            scores > self.object_mask_threshold)
+        cur_scores = scores[keep]
+        cur_classes = labels[keep]
+        cur_masks = mask_pred[keep]
+        cur_mask_cls = mask_cls[keep]
+        cur_mask_cls = cur_mask_cls[:, :-1]
+        cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
+        h, w = cur_masks.shape[-2:]
+        panoptic_seg = torch.zeros(
+            (h, w), dtype=torch.int32, device=cur_masks.device)
+        segments_info = []
+        current_segment_id = 0
+        if cur_masks.shape[0] == 0:
+            # We didn't detect any mask :(
+            return panoptic_seg, segments_info
+        else:
+            # take argmax
+            cur_mask_ids = cur_prob_masks.argmax(0)
+            stuff_memory_list = {}
+            for k in range(cur_classes.shape[0]):
+                pred_class = cur_classes[k].item()
+                isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values()
+                mask_area = (cur_mask_ids == k).sum().item()
+                original_area = (cur_masks[k] >= 0.5).sum().item()
+                mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)
+                if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
+                    if mask_area / original_area < self.overlap_threshold:
+                        continue
+                    # merge stuff regions
+                    if not isthing:
+                        if int(pred_class) in stuff_memory_list.keys():
+                            panoptic_seg[mask] = stuff_memory_list[int(
+                                pred_class)]
+                            continue
+                        else:
+                            stuff_memory_list[int(
+                                pred_class)] = current_segment_id + 1
+                    current_segment_id += 1
+                    panoptic_seg[mask] = current_segment_id
+                    segments_info.append(
+                        {
+                            "id": current_segment_id,
+                            "isthing": bool(isthing),
+                            "category_id": int(pred_class),
+                        }
+                    )
+            return panoptic_seg, segments_info
+    # Voting algorithms for inference
+    def votingProcess(self, x, voting):
+        device = x.device
+        if voting == "median":
+            final = torch.median(x, axis=0)[0]
+        elif voting == "mean":
+            final = torch.mean(x, axis=0)
+        elif voting == "geo-median":
+            x = x.detach().cpu().numpy()
+            final = geometric_median(x)
+            final = torch.from_numpy(final).to(device)
+        return final
+    def convert_to_valid_extrinsic(self, mask_extrinsic, dim=0):
+        if dim == 0:
+            translation = mask_extrinsic[9:12]
+            rotation_mat = quaternion_to_matrix(matrix_to_quaternion(
+                torch.transpose(mask_extrinsic[:9].reshape(3, 3), 0, 1)))
+            rotation_vector = torch.flatten(rotation_mat.transpose(0, 1))
+            final_mask_extrinsic = torch.cat((rotation_vector, translation))
+        elif dim == 1:
+            translation = mask_extrinsic[:, 9:12]
+            rotation_mat = quaternion_to_matrix(matrix_to_quaternion(
+                torch.transpose(mask_extrinsic[:, :9].reshape(-1, 3, 3), 1, 2)))
+            rotation_vector = torch.flatten(
+                rotation_mat.transpose(1, 2), start_dim=1)
+            final_mask_extrinsic = torch.cat(
+                (rotation_vector, translation), dim=1)
+        return final_mask_extrinsic
+    def instance_inference(self, mask_cls, mask_pred, mask_mtype, mask_morigin, mask_maxis, mask_mstate, mask_mstatemax, mask_extrinsic):
+        """Assemble the ``Instances`` result for one image from per-query outputs.
+        Steps, in order:
+          1. Pick the ``self.test_topk_per_image`` best (query, class) pairs by softmax
+             class score (last class column excluded) and gather masks and motion
+             outputs for the selected queries.
+          2. Depending on ``self.motionnet_type`` / ``self.voting``, convert
+             ``mask_extrinsic`` into a flattened rotation + translation vector, either
+             one per selected query or a single voted one.
+          3. For "BMOC" types, transform the motion origin and axis with the inverse of
+             the 4x4 matrix built from that extrinsic.
+          4. Fill an ``Instances`` with masks, boxes, scores, classes and motion fields.
+        Args:
+            mask_cls: class logits indexed ``[query, class]``.
+            mask_pred: mask logits indexed ``[query, h, w]``, already at output resolution.
+            mask_mtype: motion-type logits indexed ``[query, type]``.
+            mask_morigin, mask_maxis: per-query motion origin / axis
+                (presumably 3 values per query -- TODO confirm).
+            mask_mstate, mask_mstatemax: per-query motion state outputs.
+            mask_extrinsic: extrinsic prediction; per-query for BMOC_V1/V2/V3, a single
+                vector for BMOC_V0/V4/V5/V6; callers pass ``None`` for non-BMOC types.
+        Returns:
+            Instances: with ``pred_masks``, ``pred_boxes``, ``scores``, ``pred_classes``,
+            ``mtype``, ``morigin``, ``maxis``, ``mstate``, ``mstatemax`` and, for the
+            BMOC types handled below, ``mextrinsic``.
+        """
+        # mask_pred is already processed to have the same shape as original input
+        image_size = mask_pred.shape[-2:]
+        # [Q, K]
+        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
+        # class id for every flattened (query, class) slot, so labels[i] matches scores.flatten()[i]
+        labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(
+            0).repeat(self.num_queries, 1).flatten(0, 1)
+        # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False)
+        scores_per_image, topk_indices = scores.flatten(
+            0, 1).topk(self.test_topk_per_image, sorted=False)
+        labels_per_image = labels[topk_indices]
+        # recover the query index from the flattened (query, class) index
+        topk_indices = topk_indices // self.sem_seg_head.num_classes
+        # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1)
+        mask_pred = mask_pred[topk_indices]
+        # OPD
+        mask_mtype = mask_mtype[topk_indices]
+        # motion type = index of the most probable type, stored as float
+        pred_probs = F.softmax(mask_mtype, dim=1)
+        mask_mtype = torch.argmax(pred_probs, 1).float()
+        mask_morigin = mask_morigin[topk_indices]
+        mask_maxis = mask_maxis[topk_indices]
+        mask_mstate = mask_mstate[topk_indices]
+        mask_mstatemax = mask_mstatemax[topk_indices]
+        # From here each branch rewrites mask_extrinsic as flattened (transposed) rotation
+        # followed by translation. With voting enabled the result is a single vector;
+        # without voting (V1/V2/V3) it stays one row per selected query.
+        if self.motionnet_type == "BMOC_V1":
+            # V1: per-query 12-value extrinsic (9 rotation + 3 translation)
+            mask_extrinsic = mask_extrinsic[topk_indices]
+            mask_extrinsic = self.convert_to_valid_extrinsic(
+                mask_extrinsic, dim=1)
+            if self.voting != "none":
+                # translation is always voted with the median; rotation with self.voting on quaternions
+                final_translation = torch.median(
+                    mask_extrinsic[:, 9:12], axis=0)[0]
+                quaternions = matrix_to_quaternion(torch.transpose(
+                    mask_extrinsic[:, :9].reshape(-1, 3, 3), 1, 2))
+                final_quaternion = self.votingProcess(quaternions, self.voting)
+                final_rotation = quaternion_to_matrix(final_quaternion)
+                final_rotation_vector = torch.flatten(
+                    final_rotation.transpose(0, 1))
+                mask_extrinsic = torch.cat(
+                    (final_rotation_vector, final_translation))
+        elif self.motionnet_type == "BMOC_V2":
+            # V2: per-query 7-value extrinsic (4 quaternion + 3 translation)
+            mask_extrinsic = mask_extrinsic[topk_indices]
+            if self.voting != "none":
+                final_translation = torch.median(
+                    mask_extrinsic[:, 4:7], axis=0)[0]
+                final_quaternion = self.votingProcess(
+                    mask_extrinsic[:, :4], self.voting)
+                final_rotation = quaternion_to_matrix(final_quaternion)
+                final_rotation_vector = torch.flatten(
+                    final_rotation.transpose(0, 1))
+                mask_extrinsic = torch.cat(
+                    (final_rotation_vector, final_translation))
+            elif self.voting == "none":
+                translations = mask_extrinsic[:, 4:7]
+                quaternions = mask_extrinsic[:, :4]
+                rotation_vector = torch.flatten(
+                    quaternion_to_matrix(quaternions).transpose(1, 2), 1)
+                mask_extrinsic = torch.cat((rotation_vector, translations), 1)
+        elif self.motionnet_type == "BMOC_V3":
+            # V3: per-query 9-value extrinsic (6D rotation + 3 translation)
+            mask_extrinsic = mask_extrinsic[topk_indices]
+            if self.voting != "none":
+                final_translation = torch.median(
+                    mask_extrinsic[:, 6:9], axis=0)[0]
+                final_6d = self.votingProcess(
+                    mask_extrinsic[:, :6], self.voting)
+                final_rotation = rotation_6d_to_matrix(final_6d)
+                final_rotation_vector = torch.flatten(
+                    final_rotation.transpose(0, 1))
+                mask_extrinsic = torch.cat(
+                    (final_rotation_vector, final_translation))
+            elif self.voting == "none":
+                translations = mask_extrinsic[:, 6:9]
+                rotation_6ds = mask_extrinsic[:, :6]
+                rotation_vector = torch.flatten(
+                    rotation_6d_to_matrix(rotation_6ds).transpose(1, 2), 1)
+                mask_extrinsic = torch.cat((rotation_vector, translations), 1)
+        elif self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5":
+            # V4/V5: a single 7-value extrinsic (4 quaternion + 3 translation), not indexed per query
+            translation = mask_extrinsic[4:7]
+            quaternion = mask_extrinsic[:4]
+            rotation_vector = torch.flatten(
+                quaternion_to_matrix(quaternion).transpose(0, 1))
+            mask_extrinsic = torch.cat((rotation_vector, translation))
+        elif self.motionnet_type == "BMOC_V0" or self.motionnet_type == "BMOC_V6":
+            # V0/V6: a single 12-value extrinsic, only re-encoded through the quaternion round trip
+            mask_extrinsic = self.convert_to_valid_extrinsic(
+                mask_extrinsic, dim=0)
+        if "BMOC" in self.motionnet_type:
+            # Use the predicted extrinsic matrix to convert the predicted morigin and maxis back to camera coordinate
+            # the axis is transformed as a point (origin + axis) and turned back into a direction afterwards
+            maxis_end = mask_morigin + mask_maxis
+            mextrinsic_c2w = torch.eye(4, device=mask_morigin.device).repeat(
+                mask_morigin.shape[0], 1, 1
+            )
+            # single shared extrinsic: broadcast the same 3x4 block to every selected query
+            if self.motionnet_type == "BMOC_V0" or self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5" or self.motionnet_type == "BMOC_V6" or (self.motionnet_type == "BMOC_V1" and self.voting != "none") or (self.motionnet_type == "BMOC_V2" and self.voting != "none") or (self.motionnet_type == "BMOC_V3" and self.voting != "none"):
+                mextrinsic_c2w[:, 0:3, 0:4] = torch.transpose(
+                    mask_extrinsic.reshape(4, 3).repeat(
+                        mask_morigin.shape[0], 1, 1), 1, 2
+                )
+            # per-query extrinsics (V1/V2/V3 without voting): one 3x4 block per selected query
+            elif self.motionnet_type == "BMOC_V1" or self.motionnet_type == "BMOC_V2" or self.motionnet_type == "BMOC_V3":
+                mextrinsic_c2w[:, 0:3, 0:4] = torch.transpose(
+                    mask_extrinsic.reshape(-1, 4, 3), 1, 2
+                )
+            mextrinsic_w2c = torch.inverse(mextrinsic_c2w)
+            # apply the inverse transform as rotation (3x3 block) plus translation (last column)
+            mask_morigin = (
+                torch.matmul(
+                    mextrinsic_w2c[:, :3,
+                                   :3], mask_morigin.unsqueeze(2)
+                ).squeeze(2)
+                + mextrinsic_w2c[:, :3, 3]
+            )
+            end_in_cam = (
+                torch.matmul(
+                    mextrinsic_w2c[:, :3, :3], maxis_end.unsqueeze(2)
+                ).squeeze(2)
+                + mextrinsic_w2c[:, :3, 3]
+            )
+            mask_maxis = end_in_cam - mask_morigin
+        # if this is panoptic segmentation, we only keep the "thing" classes
+        # NOTE(review): only scores, labels and mask_pred are filtered here; the motion
+        # tensors (mtype, morigin, maxis, mstate, mstatemax, extrinsic) keep their
+        # unfiltered length, so they may no longer line up with the kept instances
+        # when panoptic_on is set -- confirm whether this path is ever used.
+        if self.panoptic_on:
+            keep = torch.zeros_like(scores_per_image).bool()
+            for i, lab in enumerate(labels_per_image):
+                keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values()
+            scores_per_image = scores_per_image[keep]
+            labels_per_image = labels_per_image[keep]
+            mask_pred = mask_pred[keep]
+        result = Instances(image_size)
+        # mask (before sigmoid)
+        result.pred_masks = (mask_pred > 0).float()
+        # result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
+        # Uncomment the following to get boxes from masks (this is slow)
+        result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes()
+        # calculate average mask prob
+        # (mean sigmoid probability over the pixels inside each binary mask; 1e-6 avoids division by zero)
+        mask_scores_per_image = (mask_pred.sigmoid().flatten(
+            1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6)
+        # final score = class score weighted by the average in-mask probability
+        result.scores = scores_per_image * mask_scores_per_image
+        result.pred_classes = labels_per_image
+        # OPD
+        result.mtype = mask_mtype
+        result.morigin = mask_morigin
+        result.maxis = mask_maxis
+        result.mstate = mask_mstate
+        result.mstatemax = mask_mstatemax
+        # single shared extrinsic: repeat it so every instance carries a copy
+        if self.motionnet_type == "BMOC_V0" or self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5" or self.motionnet_type == "BMOC_V6" or (self.motionnet_type == "BMOC_V1" and self.voting != "none") or (self.motionnet_type == "BMOC_V2" and self.voting != "none") or (self.motionnet_type == "BMOC_V3" and self.voting != "none"):
+            result.mextrinsic = mask_extrinsic.repeat(mask_morigin.shape[0], 1)
+        # per-query extrinsics are already one row per instance
+        elif self.motionnet_type == "BMOC_V1" or self.motionnet_type == "BMOC_V2" or self.motionnet_type == "BMOC_V3":
+            result.mextrinsic = mask_extrinsic
+        return result

mask2former/modeling/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+from .backbone.swin import D2SwinTransformer
+from .pixel_decoder.fpn import BasePixelDecoder
+from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder
+from .meta_arch.mask_former_head import MaskFormerHead
+from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead

mask2former/modeling/backbone/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Copyright (c) Facebook, Inc. and its affiliates.

mask2former/modeling/backbone/swin.py ADDED Viewed

	@@ -0,0 +1,770 @@

+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu, Yutong Lin, Yixuan Wei
+# --------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/mmseg/models/backbones/swin_transformer.py
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
+class Mlp(nn.Module):
+    """Multilayer perceptron."""
+    def __init__(
+        self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+def window_partition(x, window_size):
+    """
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    return windows
+def window_reverse(windows, window_size, H, W):
+    """
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    B = int(windows.shape[0] / (H * W / window_size / window_size))
+    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+class WindowAttention(nn.Module):
+    """Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+    def __init__(
+        self,
+        dim,
+        window_size,
+        num_heads,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
+        )  # 2*Wh-1 * 2*Ww-1, nH
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index)
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        trunc_normal_(self.relative_position_bias_table, std=0.02)
+        self.softmax = nn.Softmax(dim=-1)
+    def forward(self, x, mask=None):
+        """Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        """
+        B_, N, C = x.shape
+        qkv = (
+            self.qkv(x)
+            .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.view(-1)
+        ].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
+        )  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(
+            2, 0, 1
+        ).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class SwinTransformerBlock(nn.Module):
+    """Swin Transformer Block.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        window_size=7,
+        shift_size=0,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop
+        )
+        self.H = None
+        self.W = None
+    def forward(self, x, mask_matrix):
+        """Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+            mask_matrix: Attention mask for cyclic shift.
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+        _, Hp, Wp, _ = x.shape
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size
+        )  # nW*B, window_size, window_size, C
+        x_windows = x_windows.view(
+            -1, self.window_size * self.window_size, C
+        )  # nW*B, window_size*window_size, C
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            x = shifted_x
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()
+        x = x.view(B, H * W, C)
+        # FFN
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+class PatchMerging(nn.Module):
+    """Patch Merging Layer
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+    def forward(self, x, H, W):
+        """Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+        x = x.view(B, H, W, C)
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+        x = self.norm(x)
+        x = self.reduction(x)
+        return x
+class BasicLayer(nn.Module):
+    """A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (int): Local window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+    def __init__(
+        self,
+        dim,
+        depth,
+        num_heads,
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        norm_layer=nn.LayerNorm,
+        downsample=None,
+        use_checkpoint=False,
+    ):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        # build blocks
+        self.blocks = nn.ModuleList(
+            [
+                SwinTransformerBlock(
+                    dim=dim,
+                    num_heads=num_heads,
+                    window_size=window_size,
+                    shift_size=0 if (i % 2 == 0) else window_size // 2,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop,
+                    attn_drop=attn_drop,
+                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                    norm_layer=norm_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+    def forward(self, x, H, W):
+        """Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
+        h_slices = (
+            slice(0, -self.window_size),
+            slice(-self.window_size, -self.shift_size),
+            slice(-self.shift_size, None),
+        )
+        w_slices = (
+            slice(0, -self.window_size),
+            slice(-self.window_size, -self.shift_size),
+            slice(-self.shift_size, None),
+        )
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+        mask_windows = window_partition(
+            img_mask, self.window_size
+        )  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
+            attn_mask == 0, float(0.0)
+        )
+        for blk in self.blocks:
+            blk.H, blk.W = H, W
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x, attn_mask)
+            else:
+                x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+class PatchEmbed(nn.Module):
+    """Image to Patch Embedding
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+    def forward(self, x):
+        """Forward function."""
+        # padding
+        _, _, H, W = x.size()
+        if W % self.patch_size[1] != 0:
+            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
+        if H % self.patch_size[0] != 0:
+            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
+        x = self.proj(x)  # B C Wh Ww
+        if self.norm is not None:
+            Wh, Ww = x.size(2), x.size(3)
+            x = x.flatten(2).transpose(1, 2)
+            x = self.norm(x)
+            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
+        return x
+class SwinTransformer(nn.Module):
+    """Swin Transformer backbone.
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
+          https://arxiv.org/pdf/2103.14030
+    Args:
+        pretrain_img_size (int): Input image size for training the pretrained model,
+            used in absolute postion embedding. Default 224.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+    def __init__(
+        self,
+        pretrain_img_size=224,
+        patch_size=4,
+        in_chans=3,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.2,
+        norm_layer=nn.LayerNorm,
+        ape=False,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        use_checkpoint=False,
+    ):
+        super().__init__()
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None,
+        )
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1],
+            ]
+            self.absolute_pos_embed = nn.Parameter(
+                torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
+            )
+            trunc_normal_(self.absolute_pos_embed, std=0.02)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        # stochastic depth
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]  # stochastic depth decay rule
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2 ** i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
+                use_checkpoint=use_checkpoint,
+            )
+            self.layers.append(layer)
+        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
+        self.num_features = num_features
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f"norm{i_layer}"
+            self.add_module(layer_name, layer)
+        self._freeze_stages()
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.requires_grad = False
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+    def init_weights(self, pretrained=None):
+        """Initialize the weights in backbone.
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None.
+        """
+        def _init_weights(m):
+            if isinstance(m, nn.Linear):
+                trunc_normal_(m.weight, std=0.02)
+                if isinstance(m, nn.Linear) and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.constant_(m.bias, 0)
+                nn.init.constant_(m.weight, 1.0)
+    def forward(self, x):
+        """Forward function."""
+        x = self.patch_embed(x)
+        Wh, Ww = x.size(2), x.size(3)
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
+            )
+            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose(1, 2)
+        x = self.pos_drop(x)
+        outs = {}
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+            if i in self.out_indices:
+                norm_layer = getattr(self, f"norm{i}")
+                x_out = norm_layer(x_out)
+                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
+                outs["res{}".format(i + 2)] = out
+        return outs
+    def train(self, mode=True):
+        """Convert the model into training mode while keep layers freezed."""
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
+@BACKBONE_REGISTRY.register()
+class D2SwinTransformer(SwinTransformer, Backbone):
+    def __init__(self, cfg, input_shape):
+        pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE
+        patch_size = cfg.MODEL.SWIN.PATCH_SIZE
+        in_chans = 3
+        embed_dim = cfg.MODEL.SWIN.EMBED_DIM
+        depths = cfg.MODEL.SWIN.DEPTHS
+        num_heads = cfg.MODEL.SWIN.NUM_HEADS
+        window_size = cfg.MODEL.SWIN.WINDOW_SIZE
+        mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO
+        qkv_bias = cfg.MODEL.SWIN.QKV_BIAS
+        qk_scale = cfg.MODEL.SWIN.QK_SCALE
+        drop_rate = cfg.MODEL.SWIN.DROP_RATE
+        attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE
+        drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE
+        norm_layer = nn.LayerNorm
+        ape = cfg.MODEL.SWIN.APE
+        patch_norm = cfg.MODEL.SWIN.PATCH_NORM
+        use_checkpoint = cfg.MODEL.SWIN.USE_CHECKPOINT
+        super().__init__(
+            pretrain_img_size,
+            patch_size,
+            in_chans,
+            embed_dim,
+            depths,
+            num_heads,
+            window_size,
+            mlp_ratio,
+            qkv_bias,
+            qk_scale,
+            drop_rate,
+            attn_drop_rate,
+            drop_path_rate,
+            norm_layer,
+            ape,
+            patch_norm,
+            use_checkpoint=use_checkpoint,
+        )
+        self._out_features = cfg.MODEL.SWIN.OUT_FEATURES
+        self._out_feature_strides = {
+            "res2": 4,
+            "res3": 8,
+            "res4": 16,
+            "res5": 32,
+        }
+        self._out_feature_channels = {
+            "res2": self.num_features[0],
+            "res3": self.num_features[1],
+            "res4": self.num_features[2],
+            "res5": self.num_features[3],
+        }
+    def forward(self, x):
+        """
+        Args:
+            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
+        Returns:
+            dict[str->Tensor]: names and the corresponding features
+        """
+        assert (
+            x.dim() == 4
+        ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!"
+        outputs = {}
+        y = super().forward(x)
+        for k in y.keys():
+            if k in self._out_features:
+                outputs[k] = y[k]
+        return outputs
+    def output_shape(self):
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
+    @property
+    def size_divisibility(self):
+        return 32

mask2former/modeling/criterion.py ADDED Viewed

	@@ -0,0 +1,547 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+"""
+MaskFormer criterion.
+"""
+import logging
+import torch
+import torch.nn.functional as F
+from torch import nn
+from detectron2.utils.comm import get_world_size
+from detectron2.projects.point_rend.point_features import (
+    get_uncertain_point_coords_with_randomness,
+    point_sample,
+)
+from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list, _max_by_axis
+from ..utils.tranform import matrix_to_quaternion, quaternion_to_matrix
+def dice_loss(
+        inputs: torch.Tensor,
+        targets: torch.Tensor,
+        num_masks: float,
+    ):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    """
+    inputs = inputs.sigmoid()
+    inputs = inputs.flatten(1)
+    numerator = 2 * (inputs * targets).sum(-1)
+    denominator = inputs.sum(-1) + targets.sum(-1)
+    loss = 1 - (numerator + 1) / (denominator + 1)
+    return loss.sum() / num_masks
+# TorchScript-compiled variant of dice_loss; the compilation runs once, when this module is imported.
+dice_loss_jit = torch.jit.script(
+    dice_loss
+)  # type: torch.jit.ScriptModule
+def sigmoid_ce_loss(
+        inputs: torch.Tensor,
+        targets: torch.Tensor,
+        num_masks: float,
+    ):
+    """
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    Returns:
+        Loss tensor
+    """
+    loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    return loss.mean(1).sum() / num_masks
+# TorchScript-compiled variant of sigmoid_ce_loss; the compilation runs once, when this module is imported.
+sigmoid_ce_loss_jit = torch.jit.script(
+    sigmoid_ce_loss
+)  # type: torch.jit.ScriptModule
+def calculate_uncertainty(logits):
+    """
+    We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the
+        foreground class in `classes`.
+    Args:
+        logits (Tensor): A tensor of shape (R, 1, ...) for class-specific or
+            class-agnostic, where R is the total number of predicted masks in all images and C is
+            the number of foreground classes. The values are logits.
+    Returns:
+        scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with
+            the most uncertain locations having the highest uncertainty score.
+    """
+    assert logits.shape[1] == 1
+    gt_class_logits = logits.clone()
+    return -(torch.abs(gt_class_logits))
+def convert_to_filled_tensor(tensor_list):
+    max_size = _max_by_axis([list(tensor.shape) for tensor in tensor_list])
+    batch_shape = [len(tensor_list)] + max_size
+    dtype = tensor_list[0].dtype
+    device = tensor_list[0].device
+    filled_tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
+    for old, new in zip(tensor_list, filled_tensor):
+        new[:old.shape[0]] = old
+    return filled_tensor
+def smooth_l1_loss(
+    input: torch.Tensor, target: torch.Tensor, beta: float, reduction: str = "none"
+) -> torch.Tensor:
+    """
+    Smooth L1 loss defined in the Fast R-CNN paper as:
+    ::
+                      | 0.5 * x ** 2 / beta   if abs(x) < beta
+        smoothl1(x) = |
+                      | abs(x) - 0.5 * beta   otherwise,
+    where x = input - target.
+    Smooth L1 loss is related to Huber loss, which is defined as:
+    ::
+                    | 0.5 * x ** 2                  if abs(x) < beta
+         huber(x) = |
+                    | beta * (abs(x) - 0.5 * beta)  otherwise
+    Smooth L1 loss is equal to huber(x) / beta. This leads to the following
+    differences:
+     - As beta -> 0, Smooth L1 loss converges to L1 loss, while Huber loss
+       converges to a constant 0 loss.
+     - As beta -> +inf, Smooth L1 converges to a constant 0 loss, while Huber loss
+       converges to L2 loss.
+     - For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant
+       slope of 1. For Huber loss, the slope of the L1 segment is beta.
+    Smooth L1 loss can be seen as exactly L1 loss, but with the abs(x) < beta
+    portion replaced with a quadratic function such that at abs(x) = beta, its
+    slope is 1. The quadratic segment smooths the L1 loss near x = 0.
+    Args:
+        input (Tensor): input tensor of any shape
+        target (Tensor): target value tensor with the same shape as input
+        beta (float): L1 to L2 change point.
+            For beta values < 1e-5, L1 loss is computed.
+        reduction: 'none' | 'mean' | 'sum'
+                 'none': No reduction will be applied to the output.
+                 'mean': The output will be averaged.
+                 'sum': The output will be summed.
+    Returns:
+        The loss with the reduction option applied.
+    Note:
+        PyTorch's builtin "Smooth L1 loss" implementation does not actually
+        implement Smooth L1 loss, nor does it implement Huber loss. It implements
+        the special case of both in which they are equal (beta=1).
+        See: https://pytorch.org/docs/stable/nn.html#torch.nn.SmoothL1Loss.
+    """
+    if beta < 1e-5:
+        # if beta == 0, then torch.where will result in nan gradients when
+        # the chain rule is applied due to pytorch implementation details
+        # (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of
+        # zeros, rather than "no gradient"). To avoid this issue, we define
+        # small values of beta to be exactly l1 loss.
+        loss = torch.abs(input - target)
+    else:
+        n = torch.abs(input - target)
+        cond = n < beta
+        loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
+    if reduction == "mean":
+        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+    elif reduction == "sum":
+        loss = loss.sum()
+    return loss
+class SetCriterion(nn.Module):
+    """This class computes the loss for DETR.
+    The process happens in two steps:
+        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
+        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
+    """
+    def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
+                 num_points, oversample_ratio, importance_sample_ratio, motionnet_type, only_DET):
+        """Create the criterion.
+        Parameters:
+            num_classes: number of object categories, omitting the special no-object category
+            matcher: module able to compute a matching between targets and proposals
+            weight_dict: dict containing as key the names of the losses and as values their relative weight.
+            eos_coef: relative classification weight applied to the no-object category
+            losses: list of all the losses to be applied. See get_loss for list of available losses.
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.matcher = matcher
+        self.weight_dict = weight_dict
+        self.eos_coef = eos_coef
+        self.losses = losses
+        empty_weight = torch.ones(self.num_classes + 1)
+        empty_weight[-1] = self.eos_coef
+        self.register_buffer("empty_weight", empty_weight)
+        # pointwise mask loss parameters
+        self.num_points = num_points
+        self.oversample_ratio = oversample_ratio
+        self.importance_sample_ratio = importance_sample_ratio
+        # OPD
+        self.motionnet_type = motionnet_type
+        self.only_DET = only_DET
+    def loss_labels(self, outputs, targets, indices, num_masks):
+        """Classification loss (NLL)
+        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
+        """
+        assert "pred_logits" in outputs
+        src_logits = outputs["pred_logits"].float()
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(
+            src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
+        )
+        target_classes[idx] = target_classes_o
+        loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
+        losses = {"loss_ce": loss_ce}
+        return losses
+    # OPD
+    def loss_mtypes(self, outputs, targets, indices, num_masks):
+        assert "pred_mtypes" in outputs
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        target_motion_valid = convert_to_filled_tensor([t["gt_motion_valids"] for t in targets])[tgt_idx]
+        src_mtypes = outputs["pred_mtypes"][src_idx][target_motion_valid]
+        target_mtypes = convert_to_filled_tensor([t["gt_types"] for t in targets])[tgt_idx][target_motion_valid]
+        if src_mtypes.shape[0] == 0:
+            return {"loss_mtype": 0.0 * src_mtypes.sum()}
+        loss_mtype = F.cross_entropy(src_mtypes, target_mtypes.long(), reduction="sum") / num_masks
+        losses = {"loss_mtype": loss_mtype}
+        return losses
+    def loss_morigins(self, outputs, targets, indices, num_masks):
+        assert "pred_morigins" in outputs
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        target_motion_valid = convert_to_filled_tensor([t["gt_motion_valids"] for t in targets])[tgt_idx]
+        # Only calculate origin loss for the rotation axis
+        target_mtypes = convert_to_filled_tensor([t["gt_types"] for t in targets])[tgt_idx][target_motion_valid]
+        rot_inds = (
+                (target_mtypes == 0).nonzero().unbind(1)[0]
+            )
+        src_morigins = outputs["pred_morigins"][src_idx][target_motion_valid][rot_inds]
+        target_morigins = convert_to_filled_tensor([t["gt_origins"] for t in targets])[tgt_idx][target_motion_valid][rot_inds]
+        if src_morigins.shape[0] == 0:
+            return {"loss_morigin": 0.0 * src_morigins.sum()}
+        loss_morigin = smooth_l1_loss(src_morigins, target_morigins, 1.0, reduction="sum") / num_masks
+        losses = {"loss_morigin": loss_morigin}
+        return losses
+    def loss_maxises(self, outputs, targets, indices, num_masks):
+        assert "pred_maxises" in outputs
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        target_motion_valid = convert_to_filled_tensor([t["gt_motion_valids"] for t in targets])[tgt_idx]
+        src_maxises = outputs["pred_maxises"][src_idx][target_motion_valid]
+        target_maxises = convert_to_filled_tensor([t["gt_axises"] for t in targets])[tgt_idx][target_motion_valid]
+        if src_maxises.shape[0] == 0:
+            return {"loss_maxis": 0.0 * src_maxises.sum()}
+        loss_maxis = smooth_l1_loss(src_maxises, target_maxises, 1.0, reduction="sum") / num_masks
+        losses = {"loss_maxis": loss_maxis}
+        return losses
+    #TODO: add loss for motion state
+    def loss_mstates(self, outputs, targets, indices, num_masks):
+        assert "pred_mstates" in outputs
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        target_motion_valid = convert_to_filled_tensor([t["gt_motion_valids"] for t in targets])[tgt_idx]
+        src_mstate = outputs["pred_mstates"][src_idx][target_motion_valid]
+        target_mstate = convert_to_filled_tensor([t["gt_states"] for t in targets])[tgt_idx][target_motion_valid]
+        if src_mstate.shape[0] == 0:
+            return {"loss_mstate": 0.0 * src_mstate.sum()}
+        loss_mstate = smooth_l1_loss(src_mstate, target_mstate, 1.0, reduction="sum") / num_masks
+        losses = {"loss_mstate": loss_mstate}
+        return losses
+    def loss_mstatemaxs(self, outputs, targets, indices, num_masks):
+        assert "pred_mstatemaxs" in outputs
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        target_motion_valid = convert_to_filled_tensor([t["gt_motion_valids"] for t in targets])[tgt_idx]
+        src_mstatemax = outputs["pred_mstatemaxs"][src_idx][target_motion_valid]
+        target_mstatemax = convert_to_filled_tensor([t["gt_statemaxs"] for t in targets])[tgt_idx][target_motion_valid]
+        if src_mstatemax.shape[0] == 0:
+            return {"loss_mstatemax": 0.0 * src_mstatemax.sum()}
+        loss_mstatemax = smooth_l1_loss(src_mstatemax, target_mstatemax, 1.0, reduction="sum") / num_masks
+        losses = {"loss_mstatemax": loss_mstatemax}
+        return losses
+    def loss_extrinsics(self, outputs, targets, indices, num_masks):
+        assert "pred_extrinsics" in outputs
+        if self.motionnet_type == "BMOC_V0" or self.motionnet_type == "BMOC_V6":
+            target_motion_valid = torch.tensor([t["gt_motion_valids"][0] for t in targets], device=outputs["pred_extrinsics"].device)
+            src_extrinsics = outputs["pred_extrinsics"][target_motion_valid]
+            target_extrinsics_full = [t["gt_extrinsic"][0] for t in targets]
+            target_extrinsics = convert_to_filled_tensor([torch.cat(
+                                [
+                                    extrinsic[0:3],
+                                    extrinsic[4:7],
+                                    extrinsic[8:11],
+                                    extrinsic[12:15],
+                                ],
+                                0,
+                            ) for extrinsic in target_extrinsics_full])[target_motion_valid]
+            if src_extrinsics.shape[0] == 0:
+                return {"loss_extrinsic": 0.0 * src_extrinsics.sum()}
+            # Much proper to make sure each valid image gives the same contribution to the loss
+            # Therefore, here use the number of images to average
+            loss_extrinsic = smooth_l1_loss(src_extrinsics, target_extrinsics, 1.0, reduction="sum") / outputs["pred_extrinsics"].shape[0]
+        elif self.motionnet_type == "BMOC_V1":
+            src_idx = self._get_src_permutation_idx(indices)
+            tgt_idx = self._get_tgt_permutation_idx(indices)
+            target_motion_valid = convert_to_filled_tensor([t["gt_motion_valids"] for t in targets])[tgt_idx]
+            src_extrinsics = outputs["pred_extrinsics"][src_idx][target_motion_valid]
+            target_extrinsics_full = []
+            for t in targets:
+                extrinsics = t["gt_extrinsic"]
+                target_extrinsics_full.append(torch.cat(
+                                [
+                                    extrinsics[:, 0:3],
+                                    extrinsics[:, 4:7],
+                                    extrinsics[:, 8:11],
+                                    extrinsics[:, 12:15],
+                                ],
+                                1,
+                            ))
+            target_extrinsics = convert_to_filled_tensor(target_extrinsics_full)[tgt_idx][target_motion_valid]
+            if src_extrinsics.shape[0] == 0:
+                return {"loss_extrinsic": 0.0 * src_extrinsics.sum()}
+            # Much proper to make sure each valid image gives the same contribution to the loss
+            # Therefore, here use the number of images to average
+            loss_extrinsic = smooth_l1_loss(src_extrinsics, target_extrinsics, 1.0, reduction="sum") / num_masks
+        elif self.motionnet_type == "BMOC_V2":
+            src_idx = self._get_src_permutation_idx(indices)
+            tgt_idx = self._get_tgt_permutation_idx(indices)
+            target_motion_valid = convert_to_filled_tensor([t["gt_motion_valids"] for t in targets])[tgt_idx]
+            src_extrinsics = outputs["pred_extrinsics"][src_idx][target_motion_valid]
+            target_extrinsics = convert_to_filled_tensor([t["gt_extrinsic_quaternion"] for t in targets])[tgt_idx][target_motion_valid]
+            if src_extrinsics.shape[0] == 0:
+                return {"loss_extrinsic": 0.0 * src_extrinsics.sum()}
+            # Much proper to make sure each valid image gives the same contribution to the loss
+            # Therefore, here use the number of images to average
+            loss_extrinsic = smooth_l1_loss(src_extrinsics, target_extrinsics, 1.0, reduction="sum") / num_masks
+        elif self.motionnet_type == "BMOC_V3":
+            src_idx = self._get_src_permutation_idx(indices)
+            tgt_idx = self._get_tgt_permutation_idx(indices)
+            target_motion_valid = convert_to_filled_tensor([t["gt_motion_valids"] for t in targets])[tgt_idx]
+            src_extrinsics = outputs["pred_extrinsics"][src_idx][target_motion_valid]
+            target_extrinsics = convert_to_filled_tensor([t["gt_extrinsic_6d"] for t in targets])[tgt_idx][target_motion_valid]
+            if src_extrinsics.shape[0] == 0:
+                return {"loss_extrinsic": 0.0 * src_extrinsics.sum()}
+            # Much proper to make sure each valid image gives the same contribution to the loss
+            # Therefore, here use the number of images to average
+            loss_extrinsic = smooth_l1_loss(src_extrinsics, target_extrinsics, 1.0, reduction="sum") / num_masks
+        elif self.motionnet_type == "BMOC_V4"  or self.motionnet_type == "BMOC_V5":
+            target_motion_valid = torch.tensor([t["gt_motion_valids"][0] for t in targets], device=outputs["pred_extrinsics"].device)
+            src_extrinsics = outputs["pred_extrinsics"][target_motion_valid]
+            target_extrinsics = convert_to_filled_tensor([t["gt_extrinsic_quaternion"][0] for t in targets])[target_motion_valid]
+            if src_extrinsics.shape[0] == 0:
+                return {"loss_extrinsic": 0.0 * src_extrinsics.sum()}
+            # Much proper to make sure each valid image gives the same contribution to the loss
+            # Therefore, here use the number of images to average
+            loss_extrinsic = smooth_l1_loss(src_extrinsics, target_extrinsics, 1.0, reduction="sum") / outputs["pred_extrinsics"].shape[0]
+        return {"loss_extrinsic": loss_extrinsic}
+    def loss_masks(self, outputs, targets, indices, num_masks):
+        """Compute the losses related to the masks: the focal loss and the dice loss.
+        targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
+        """
+        assert "pred_masks" in outputs
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        src_masks = outputs["pred_masks"]
+        src_masks = src_masks[src_idx]
+        masks = [t["masks"] for t in targets]
+        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
+        target_masks = target_masks.to(src_masks)
+        target_masks = target_masks[tgt_idx]
+        # No need to upsample predictions as we are using normalized coordinates :)
+        # N x 1 x H x W
+        src_masks = src_masks[:, None]
+        target_masks = target_masks[:, None]
+        with torch.no_grad():
+            # sample point_coords
+            point_coords = get_uncertain_point_coords_with_randomness(
+                src_masks,
+                lambda logits: calculate_uncertainty(logits),
+                self.num_points,
+                self.oversample_ratio,
+                self.importance_sample_ratio,
+            )
+            # get gt labels
+            point_labels = point_sample(
+                target_masks,
+                point_coords,
+                align_corners=False,
+            ).squeeze(1)
+        point_logits = point_sample(
+            src_masks,
+            point_coords,
+            align_corners=False,
+        ).squeeze(1)
+        losses = {
+            "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks),
+            "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks),
+        }
+        del src_masks
+        del target_masks
+        return losses
+    def _get_src_permutation_idx(self, indices):
+        # permute predictions following indices
+        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
+        src_idx = torch.cat([src for (src, _) in indices])
+        return batch_idx, src_idx
+    def _get_tgt_permutation_idx(self, indices):
+        # permute targets following indices
+        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
+        return batch_idx, tgt_idx
+    def get_loss(self, loss, outputs, targets, indices, num_masks):
+        tmp_device = outputs["pred_logits"].device
+        tmp_list = ["mtypes", "morigins", "maxises"]
+        loss_map = {
+            'labels': self.loss_labels,
+            'masks': self.loss_masks,
+            # OPD
+            "mtypes": self.loss_mtypes,
+            "morigins": self.loss_morigins,
+            "maxises": self.loss_maxises,
+            "extrinsics": self.loss_extrinsics,
+            "mstates": self.loss_mstates,
+            "mstatemaxs": self.loss_mstatemaxs,
+        }
+        assert loss in loss_map, f"do you really want to compute {loss} loss?"
+        tmp_loss = loss_map[loss](outputs, targets, indices, num_masks)
+        if self.only_DET and loss in tmp_list:
+            tmp_key = list(tmp_loss.keys())[0]
+            tmp_loss[tmp_key] = torch.tensor(0.0, device=tmp_device)
+            return tmp_loss
+        else:
+            return tmp_loss
+        # return loss_map[loss](outputs, targets, indices, num_masks)
+    def forward(self, outputs, targets):
+        """This performs the loss computation.
+        Parameters:
+             outputs: dict of tensors, see the output specification of the model for the format
+             targets: list of dicts, such that len(targets) == batch_size.
+                      The expected keys in each dict depends on the losses applied, see each loss' doc
+        """
+        tmp_device = outputs["pred_logits"].device
+        outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
+        # Retrieve the matching between the outputs of the last layer and the targets
+        indices = self.matcher(outputs_without_aux, targets)
+        # Compute the average number of target boxes accross all nodes, for normalization purposes
+        num_masks = sum(len(t["labels"]) for t in targets)
+        num_masks = torch.as_tensor(
+            [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device
+        )
+        if is_dist_avail_and_initialized():
+            torch.distributed.all_reduce(num_masks)
+        num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            if loss == "extrinsics" and self.motionnet_type == "BMCC":
+                    continue
+            losses.update(self.get_loss(loss, outputs, targets, indices, num_masks))
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if "aux_outputs" in outputs:
+            for i, aux_outputs in enumerate(outputs["aux_outputs"]):
+                indices = self.matcher(aux_outputs, targets)
+                for loss in self.losses:
+                    if loss == "extrinsics" and (self.motionnet_type == "BMOC_V0" or self.motionnet_type == "BMCC"):
+                        continue
+                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_masks)
+                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+        return losses
+    def __repr__(self):
+        head = "Criterion " + self.__class__.__name__
+        body = [
+            "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)),
+            "losses: {}".format(self.losses),
+            "weight_dict: {}".format(self.weight_dict),
+            "num_classes: {}".format(self.num_classes),
+            "eos_coef: {}".format(self.eos_coef),
+            "num_points: {}".format(self.num_points),
+            "oversample_ratio: {}".format(self.oversample_ratio),
+            "importance_sample_ratio: {}".format(self.importance_sample_ratio),
+        ]
+        _repr_indent = 4
+        lines = [head] + [" " * _repr_indent + line for line in body]
+        return "\n".join(lines)

mask2former/modeling/matcher.py ADDED Viewed

	@@ -0,0 +1,192 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
+"""
+Modules to compute the matching cost and solve the corresponding LSAP.
+"""
+import torch
+import torch.nn.functional as F
+from scipy.optimize import linear_sum_assignment
+from torch import nn
+from torch.cuda.amp import autocast
+from detectron2.projects.point_rend.point_features import point_sample
+def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    """
+    inputs = inputs.sigmoid()
+    inputs = inputs.flatten(1)
+    numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
+    denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
+    loss = 1 - (numerator + 1) / (denominator + 1)
+    return loss
+batch_dice_loss_jit = torch.jit.script(
+    batch_dice_loss
+)  # type: torch.jit.ScriptModule
+def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
+    """
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+    Returns:
+        Loss tensor
+    """
+    hw = inputs.shape[1]
+    pos = F.binary_cross_entropy_with_logits(
+        inputs, torch.ones_like(inputs), reduction="none"
+    )
+    neg = F.binary_cross_entropy_with_logits(
+        inputs, torch.zeros_like(inputs), reduction="none"
+    )
+    loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum(
+        "nc,mc->nm", neg, (1 - targets)
+    )
+    return loss / hw
+batch_sigmoid_ce_loss_jit = torch.jit.script(
+    batch_sigmoid_ce_loss
+)  # type: torch.jit.ScriptModule
+class HungarianMatcher(nn.Module):
+    """This class computes an assignment between the targets and the predictions of the network
+    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
+    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
+    while the others are un-matched (and thus treated as non-objects).
+    """
+    def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0):
+        """Creates the matcher
+        Params:
+            cost_class: This is the relative weight of the classification error in the matching cost
+            cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
+            cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost
+        """
+        super().__init__()
+        self.cost_class = cost_class
+        self.cost_mask = cost_mask
+        self.cost_dice = cost_dice
+        assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0"
+        self.num_points = num_points
+    @torch.no_grad()
+    def memory_efficient_forward(self, outputs, targets):
+        """More memory-friendly matching"""
+        bs, num_queries = outputs["pred_logits"].shape[:2]
+        indices = []
+        # Iterate through batch size
+        for b in range(bs):
+            out_prob = outputs["pred_logits"][b].softmax(-1)  # [num_queries, num_classes]
+            tgt_ids = targets[b]["labels"]
+            # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+            # but approximate it in 1 - proba[target class].
+            # The 1 is a constant that doesn't change the matching, it can be ommitted.
+            cost_class = -out_prob[:, tgt_ids]
+            out_mask = outputs["pred_masks"][b]  # [num_queries, H_pred, W_pred]
+            # gt masks are already padded when preparing target
+            tgt_mask = targets[b]["masks"].to(out_mask)
+            out_mask = out_mask[:, None]
+            tgt_mask = tgt_mask[:, None]
+            # all masks share the same set of points for efficient matching!
+            point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device)
+            # get gt labels
+            tgt_mask = point_sample(
+                tgt_mask,
+                point_coords.repeat(tgt_mask.shape[0], 1, 1),
+                align_corners=False,
+            ).squeeze(1)
+            out_mask = point_sample(
+                out_mask,
+                point_coords.repeat(out_mask.shape[0], 1, 1),
+                align_corners=False,
+            ).squeeze(1)
+            with autocast(enabled=False):
+                out_mask = out_mask.float()
+                tgt_mask = tgt_mask.float()
+                # Compute the focal loss between masks
+                if out_mask.shape[0] == 0 or tgt_mask.shape[0] == 0:
+                    cost_mask = batch_sigmoid_ce_loss(out_mask, tgt_mask)
+                    # Compute the dice loss betwen masks
+                    cost_dice = batch_dice_loss(out_mask, tgt_mask)
+                else:
+                    cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask)
+                    # Compute the dice loss betwen masks
+                    cost_dice = batch_dice_loss_jit(out_mask, tgt_mask)
+            # Final cost matrix
+            C = (
+                self.cost_mask * cost_mask
+                + self.cost_class * cost_class
+                + self.cost_dice * cost_dice
+            )
+            C = C.reshape(num_queries, -1).cpu()
+            indices.append(linear_sum_assignment(C))
+        return [
+            (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
+            for i, j in indices
+        ]
+    @torch.no_grad()
+    def forward(self, outputs, targets):
+        """Performs the matching
+        Params:
+            outputs: This is a dict that contains at least these entries:
+                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                 "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks
+            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
+                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
+                           objects in the target) containing the class labels
+                 "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks
+        Returns:
+            A list of size batch_size, containing tuples of (index_i, index_j) where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds:
+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        return self.memory_efficient_forward(outputs, targets)
+    def __repr__(self, _repr_indent=4):
+        head = "Matcher " + self.__class__.__name__
+        body = [
+            "cost_class: {}".format(self.cost_class),
+            "cost_mask: {}".format(self.cost_mask),
+            "cost_dice: {}".format(self.cost_dice),
+        ]
+        lines = [head] + [" " * _repr_indent + line for line in body]
+        return "\n".join(lines)

mask2former/modeling/meta_arch/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Copyright (c) Facebook, Inc. and its affiliates.

mask2former/modeling/meta_arch/mask_former_head.py ADDED Viewed

	@@ -0,0 +1,133 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from copy import deepcopy
+from typing import Callable, Dict, List, Optional, Tuple, Union
+import fvcore.nn.weight_init as weight_init
+from torch import nn
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
+from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder
+from ..pixel_decoder.fpn import build_pixel_decoder
+@SEM_SEG_HEADS_REGISTRY.register()
+class MaskFormerHead(nn.Module):
+    _version = 2
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        version = local_metadata.get("version", None)
+        if version is None or version < 2:
+            # Do not warn if train from scratch
+            scratch = True
+            logger = logging.getLogger(__name__)
+            for k in list(state_dict.keys()):
+                newk = k
+                if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
+                    newk = k.replace(prefix, prefix + "pixel_decoder.")
+                    # logger.debug(f"{k} ==> {newk}")
+                if newk != k:
+                    state_dict[newk] = state_dict[k]
+                    del state_dict[k]
+                    scratch = False
+            if not scratch:
+                logger.warning(
+                    f"Weight format of {self.__class__.__name__} have changed! "
+                    "Please upgrade your models. Applying automatic conversion now ..."
+                )
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        num_classes: int,
+        pixel_decoder: nn.Module,
+        loss_weight: float = 1.0,
+        ignore_value: int = -1,
+        # extra parameters
+        transformer_predictor: nn.Module,
+        transformer_in_feature: str,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            num_classes: number of classes to predict
+            pixel_decoder: the pixel decoder module
+            loss_weight: loss weight
+            ignore_value: category id to be ignored during training.
+            transformer_predictor: the transformer decoder that makes prediction
+            transformer_in_feature: input feature name to the transformer_predictor
+        """
+        super().__init__()
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        self.in_features = [k for k, v in input_shape]
+        feature_strides = [v.stride for k, v in input_shape]
+        feature_channels = [v.channels for k, v in input_shape]
+        self.ignore_value = ignore_value
+        self.common_stride = 4
+        self.loss_weight = loss_weight
+        self.pixel_decoder = pixel_decoder
+        self.predictor = transformer_predictor
+        self.transformer_in_feature = transformer_in_feature
+        self.num_classes = num_classes
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        # figure out in_channels to transformer predictor
+        if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
+            transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+        elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding":
+            transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
+        elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder":  # for maskformer2
+            transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+        else:
+            transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels
+        return {
+            "input_shape": {
+                k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+            },
+            "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+            "pixel_decoder": build_pixel_decoder(cfg, input_shape),
+            "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
+            "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
+            "transformer_predictor": build_transformer_decoder(
+                cfg,
+                transformer_predictor_in_channels,
+                mask_classification=True,
+            ),
+        }
+    def forward(self, features, mask=None):
+        return self.layers(features, mask)
+    def layers(self, features, mask=None):
+        mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features)
+        if self.transformer_in_feature == "multi_scale_pixel_decoder":
+            # TODO: pass object mask prediction to this function
+            predictions = self.predictor(multi_scale_features, mask_features, mask)
+        else:
+            if self.transformer_in_feature == "transformer_encoder":
+                assert (
+                    transformer_encoder_features is not None
+                ), "Please use the TransformerEncoderPixelDecoder."
+                predictions = self.predictor(transformer_encoder_features, mask_features, mask)
+            elif self.transformer_in_feature == "pixel_embedding":
+                predictions = self.predictor(mask_features, mask_features, mask)
+            else:
+                predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask)
+        return predictions

mask2former/modeling/meta_arch/per_pixel_baseline.py ADDED Viewed

	@@ -0,0 +1,243 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from typing import Callable, Dict, List, Optional, Tuple, Union
+import fvcore.nn.weight_init as weight_init
+from torch import nn
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
+from ..transformer_decoder.maskformer_transformer_decoder import StandardTransformerDecoder
+from ..pixel_decoder.fpn import build_pixel_decoder
+@SEM_SEG_HEADS_REGISTRY.register()
+class PerPixelBaselineHead(nn.Module):
+    _version = 2
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        version = local_metadata.get("version", None)
+        if version is None or version < 2:
+            logger = logging.getLogger(__name__)
+            # Do not warn if train from scratch
+            scratch = True
+            logger = logging.getLogger(__name__)
+            for k in list(state_dict.keys()):
+                newk = k
+                if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
+                    newk = k.replace(prefix, prefix + "pixel_decoder.")
+                    # logger.warning(f"{k} ==> {newk}")
+                if newk != k:
+                    state_dict[newk] = state_dict[k]
+                    del state_dict[k]
+                    scratch = False
+            if not scratch:
+                logger.warning(
+                    f"Weight format of {self.__class__.__name__} have changed! "
+                    "Please upgrade your models. Applying automatic conversion now ..."
+                )
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        num_classes: int,
+        pixel_decoder: nn.Module,
+        loss_weight: float = 1.0,
+        ignore_value: int = -1,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            num_classes: number of classes to predict
+            pixel_decoder: the pixel decoder module
+            loss_weight: loss weight
+            ignore_value: category id to be ignored during training.
+        """
+        super().__init__()
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        self.in_features = [k for k, v in input_shape]
+        feature_strides = [v.stride for k, v in input_shape]
+        feature_channels = [v.channels for k, v in input_shape]
+        self.ignore_value = ignore_value
+        self.common_stride = 4
+        self.loss_weight = loss_weight
+        self.pixel_decoder = pixel_decoder
+        self.predictor = Conv2d(
+            self.pixel_decoder.mask_dim, num_classes, kernel_size=1, stride=1, padding=0
+        )
+        weight_init.c2_msra_fill(self.predictor)
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        return {
+            "input_shape": {
+                k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+            },
+            "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+            "pixel_decoder": build_pixel_decoder(cfg, input_shape),
+            "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
+        }
+    def forward(self, features, targets=None):
+        """
+        Returns:
+            In training, returns (None, dict of losses)
+            In inference, returns (CxHxW logits, {})
+        """
+        x = self.layers(features)
+        if self.training:
+            return None, self.losses(x, targets)
+        else:
+            x = F.interpolate(
+                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
+            )
+            return x, {}
+    def layers(self, features):
+        x, _, _ = self.pixel_decoder.forward_features(features)
+        x = self.predictor(x)
+        return x
+    def losses(self, predictions, targets):
+        predictions = predictions.float()  # https://github.com/pytorch/pytorch/issues/48163
+        predictions = F.interpolate(
+            predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False
+        )
+        loss = F.cross_entropy(
+            predictions, targets, reduction="mean", ignore_index=self.ignore_value
+        )
+        losses = {"loss_sem_seg": loss * self.loss_weight}
+        return losses
+@SEM_SEG_HEADS_REGISTRY.register()
+class PerPixelBaselinePlusHead(PerPixelBaselineHead):
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        version = local_metadata.get("version", None)
+        if version is None or version < 2:
+            # Do not warn if train from scratch
+            scratch = True
+            logger = logging.getLogger(__name__)
+            for k in list(state_dict.keys()):
+                newk = k
+                if "sem_seg_head" in k and not k.startswith(prefix + "predictor"):
+                    newk = k.replace(prefix, prefix + "pixel_decoder.")
+                    logger.debug(f"{k} ==> {newk}")
+                if newk != k:
+                    state_dict[newk] = state_dict[k]
+                    del state_dict[k]
+                    scratch = False
+            if not scratch:
+                logger.warning(
+                    f"Weight format of {self.__class__.__name__} have changed! "
+                    "Please upgrade your models. Applying automatic conversion now ..."
+                )
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        # extra parameters
+        transformer_predictor: nn.Module,
+        transformer_in_feature: str,
+        deep_supervision: bool,
+        # inherit parameters
+        num_classes: int,
+        pixel_decoder: nn.Module,
+        loss_weight: float = 1.0,
+        ignore_value: int = -1,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            transformer_predictor: the transformer decoder that makes prediction
+            transformer_in_feature: input feature name to the transformer_predictor
+            deep_supervision: whether or not to add supervision to the output of
+                every transformer decoder layer
+            num_classes: number of classes to predict
+            pixel_decoder: the pixel decoder module
+            loss_weight: loss weight
+            ignore_value: category id to be ignored during training.
+        """
+        super().__init__(
+            input_shape,
+            num_classes=num_classes,
+            pixel_decoder=pixel_decoder,
+            loss_weight=loss_weight,
+            ignore_value=ignore_value,
+        )
+        del self.predictor
+        self.predictor = transformer_predictor
+        self.transformer_in_feature = transformer_in_feature
+        self.deep_supervision = deep_supervision
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        ret = super().from_config(cfg, input_shape)
+        ret["transformer_in_feature"] = cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE
+        if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
+            in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+        else:
+            in_channels = input_shape[ret["transformer_in_feature"]].channels
+        ret["transformer_predictor"] = StandardTransformerDecoder(
+            cfg, in_channels, mask_classification=False
+        )
+        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
+        return ret
+    def forward(self, features, targets=None):
+        """
+        Returns:
+            In training, returns (None, dict of losses)
+            In inference, returns (CxHxW logits, {})
+        """
+        x, aux_outputs = self.layers(features)
+        if self.training:
+            if self.deep_supervision:
+                losses = self.losses(x, targets)
+                for i, aux_output in enumerate(aux_outputs):
+                    losses["loss_sem_seg" + f"_{i}"] = self.losses(
+                        aux_output["pred_masks"], targets
+                    )["loss_sem_seg"]
+                return None, losses
+            else:
+                return None, self.losses(x, targets)
+        else:
+            x = F.interpolate(
+                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
+            )
+            return x, {}
+    def layers(self, features):
+        mask_features, transformer_encoder_features, _ = self.pixel_decoder.forward_features(features)
+        if self.transformer_in_feature == "transformer_encoder":
+            assert (
+                transformer_encoder_features is not None
+            ), "Please use the TransformerEncoderPixelDecoder."
+            predictions = self.predictor(transformer_encoder_features, mask_features)
+        else:
+            predictions = self.predictor(features[self.transformer_in_feature], mask_features)
+        if self.deep_supervision:
+            return predictions["pred_masks"], predictions["aux_outputs"]
+        else:
+            return predictions["pred_masks"], None

mask2former/modeling/pixel_decoder/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Copyright (c) Facebook, Inc. and its affiliates.

mask2former/modeling/pixel_decoder/fpn.py ADDED Viewed

	@@ -0,0 +1,312 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+from typing import Callable, Dict, List, Optional, Tuple, Union
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
+from torch.cuda.amp import autocast
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, DeformConv, ShapeSpec, get_norm
+from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
+from ..transformer_decoder.position_encoding import PositionEmbeddingSine
+from ..transformer_decoder.transformer import TransformerEncoder, TransformerEncoderLayer, _get_clones, _get_activation_fn
+def build_pixel_decoder(cfg, input_shape):
+    """
+    Build a pixel decoder from `cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME`.
+    """
+    name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME
+    model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
+    forward_features = getattr(model, "forward_features", None)
+    if not callable(forward_features):
+        raise ValueError(
+            "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. "
+            f"Please implement forward_features for {name} to only return mask features."
+        )
+    return model
+# This is a modified FPN decoder.
+@SEM_SEG_HEADS_REGISTRY.register()
+class BasePixelDecoder(nn.Module):
+    """
+    FPN-style pixel decoder: fuses backbone features top-down and projects the
+    final (highest-resolution) map to `mask_dim` channels.
+    """
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        conv_dim: int,
+        mask_dim: int,
+        norm: Optional[Union[str, Callable]] = None,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            conv_dim: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+            norm (str or callable): normalization for all conv layers
+        """
+        super().__init__()
+        # Sort features by stride so index 0 is the highest-resolution one.
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        self.in_features = [k for k, v in input_shape]  # starting from "res2" to "res5"
+        feature_channels = [v.channels for k, v in input_shape]
+        lateral_convs = []
+        output_convs = []
+        # Convs get a bias only when no normalization is requested (norm == "").
+        use_bias = norm == ""
+        for idx, in_channels in enumerate(feature_channels):
+            if idx == len(self.in_features) - 1:
+                # Largest-stride feature: no lateral conv, the 3x3 output conv
+                # consumes the backbone channels directly.
+                output_norm = get_norm(norm, conv_dim)
+                output_conv = Conv2d(
+                    in_channels,
+                    conv_dim,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=use_bias,
+                    norm=output_norm,
+                    activation=F.relu,
+                )
+                weight_init.c2_xavier_fill(output_conv)
+                self.add_module("layer_{}".format(idx + 1), output_conv)
+                lateral_convs.append(None)
+                output_convs.append(output_conv)
+            else:
+                # Other levels: 1x1 lateral conv ("adapter_k") followed by a
+                # 3x3 output conv ("layer_k") applied after the top-down sum.
+                lateral_norm = get_norm(norm, conv_dim)
+                output_norm = get_norm(norm, conv_dim)
+                lateral_conv = Conv2d(
+                    in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm
+                )
+                output_conv = Conv2d(
+                    conv_dim,
+                    conv_dim,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=use_bias,
+                    norm=output_norm,
+                    activation=F.relu,
+                )
+                weight_init.c2_xavier_fill(lateral_conv)
+                weight_init.c2_xavier_fill(output_conv)
+                self.add_module("adapter_{}".format(idx + 1), lateral_conv)
+                self.add_module("layer_{}".format(idx + 1), output_conv)
+                lateral_convs.append(lateral_conv)
+                output_convs.append(output_conv)
+        # Place convs into top-down order (from low to high resolution)
+        # to make the top-down computation in forward clearer.
+        # These are plain Python lists; the modules themselves are registered
+        # through the add_module calls above.
+        self.lateral_convs = lateral_convs[::-1]
+        self.output_convs = output_convs[::-1]
+        self.mask_dim = mask_dim
+        # Final 3x3 conv producing the mask features (conv_dim -> mask_dim).
+        self.mask_features = Conv2d(
+            conv_dim,
+            mask_dim,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+        weight_init.c2_xavier_fill(self.mask_features)
+        self.maskformer_num_feature_levels = 3  # always use 3 scales
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        """Collect the constructor kwargs from `cfg.MODEL.SEM_SEG_HEAD`."""
+        ret = {}
+        # Keep only the features named in IN_FEATURES.
+        ret["input_shape"] = {
+            k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+        }
+        ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
+        ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM
+        return ret
+    def forward_features(self, features):
+        """
+        Run the top-down pathway over `features` (indexed by feature name).
+
+        Returns:
+            A 3-tuple (mask features, None, multi_scale_features), where
+            multi_scale_features holds the outputs of at most the first
+            `maskformer_num_feature_levels` levels in top-down order.
+        """
+        multi_scale_features = []
+        num_cur_levels = 0
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[::-1]):
+            x = features[f]
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            if lateral_conv is None:
+                # First (lowest-resolution) level: nothing to merge with yet.
+                y = output_conv(x)
+            else:
+                cur_fpn = lateral_conv(x)
+                # Following FPN implementation, we use nearest upsampling here
+                y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
+                y = output_conv(y)
+            if num_cur_levels < self.maskformer_num_feature_levels:
+                multi_scale_features.append(y)
+                num_cur_levels += 1
+        return self.mask_features(y), None, multi_scale_features
+    def forward(self, features, targets=None):
+        """Log a warning and delegate to forward_features; `targets` is unused."""
+        logger = logging.getLogger(__name__)
+        logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.")
+        return self.forward_features(features)
+class TransformerEncoderOnly(nn.Module):
+    def __init__(
+        self,
+        d_model=512,
+        nhead=8,
+        num_encoder_layers=6,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+    ):
+        super().__init__()
+        encoder_layer = TransformerEncoderLayer(
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+        )
+        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+        self._reset_parameters()
+        self.d_model = d_model
+        self.nhead = nhead
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+    def forward(self, src, mask, pos_embed):
+        # flatten NxCxHxW to HWxNxC
+        bs, c, h, w = src.shape
+        src = src.flatten(2).permute(2, 0, 1)
+        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
+        if mask is not None:
+            mask = mask.flatten(1)
+        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
+        return memory.permute(1, 2, 0).view(bs, c, h, w)
+# This is a modified FPN decoder with extra Transformer encoder that processes the lowest-resolution feature map.
+@SEM_SEG_HEADS_REGISTRY.register()
+class TransformerEncoderPixelDecoder(BasePixelDecoder):
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        transformer_dropout: float,
+        transformer_nheads: int,
+        transformer_dim_feedforward: int,
+        transformer_enc_layers: int,
+        transformer_pre_norm: bool,
+        conv_dim: int,
+        mask_dim: int,
+        norm: Optional[Union[str, Callable]] = None,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            transformer_dropout: dropout probability in transformer
+            transformer_nheads: number of heads in transformer
+            transformer_dim_feedforward: dimension of feedforward network
+            transformer_enc_layers: number of transformer encoder layers
+            transformer_pre_norm: whether to use pre-layernorm or not
+            conv_dims: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+            norm (str or callable): normalization for all conv layers
+        """
+        super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm)
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        self.in_features = [k for k, v in input_shape]  # starting from "res2" to "res5"
+        feature_strides = [v.stride for k, v in input_shape]
+        feature_channels = [v.channels for k, v in input_shape]
+        in_channels = feature_channels[len(self.in_features) - 1]
+        self.input_proj = Conv2d(in_channels, conv_dim, kernel_size=1)
+        weight_init.c2_xavier_fill(self.input_proj)
+        self.transformer = TransformerEncoderOnly(
+            d_model=conv_dim,
+            dropout=transformer_dropout,
+            nhead=transformer_nheads,
+            dim_feedforward=transformer_dim_feedforward,
+            num_encoder_layers=transformer_enc_layers,
+            normalize_before=transformer_pre_norm,
+        )
+        N_steps = conv_dim // 2
+        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
+        # update layer
+        use_bias = norm == ""
+        output_norm = get_norm(norm, conv_dim)
+        output_conv = Conv2d(
+            conv_dim,
+            conv_dim,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=use_bias,
+            norm=output_norm,
+            activation=F.relu,
+        )
+        weight_init.c2_xavier_fill(output_conv)
+        delattr(self, "layer_{}".format(len(self.in_features)))
+        self.add_module("layer_{}".format(len(self.in_features)), output_conv)
+        self.output_convs[0] = output_conv
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        ret = super().from_config(cfg, input_shape)
+        ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
+        ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
+        ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
+        ret[
+            "transformer_enc_layers"
+        ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS  # a separate config
+        ret["transformer_pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
+        return ret
+    def forward_features(self, features):
+        multi_scale_features = []
+        num_cur_levels = 0
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[::-1]):
+            x = features[f]
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            if lateral_conv is None:
+                transformer = self.input_proj(x)
+                pos = self.pe_layer(x)
+                transformer = self.transformer(transformer, None, pos)
+                y = output_conv(transformer)
+                # save intermediate feature as input to Transformer decoder
+                transformer_encoder_features = transformer
+            else:
+                cur_fpn = lateral_conv(x)
+                # Following FPN implementation, we use nearest upsampling here
+                y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
+                y = output_conv(y)
+            if num_cur_levels < self.maskformer_num_feature_levels:
+                multi_scale_features.append(y)
+                num_cur_levels += 1
+        return self.mask_features(y), transformer_encoder_features, multi_scale_features
+    def forward(self, features, targets=None):
+        logger = logging.getLogger(__name__)
+        logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.")
+        return self.forward_features(features)

mask2former/modeling/pixel_decoder/msdeformattn.py ADDED Viewed

	@@ -0,0 +1,358 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+from typing import Callable, Dict, List, Optional, Tuple, Union
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
+from torch.cuda.amp import autocast
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
+from ..transformer_decoder.position_encoding import PositionEmbeddingSine
+from ..transformer_decoder.transformer import _get_clones, _get_activation_fn
+from .ops.modules import MSDeformAttn
+# MSDeformAttn Transformer encoder in deformable detr
+class MSDeformAttnTransformerEncoderOnly(nn.Module):
+    """
+    Multi-scale deformable-attention encoder: flattens a list of feature maps
+    into one token sequence and runs it through MSDeformAttnTransformerEncoder.
+    """
+    def __init__(self, d_model=256, nhead=8,
+                 num_encoder_layers=6, dim_feedforward=1024, dropout=0.1,
+                 activation="relu",
+                 num_feature_levels=4, enc_n_points=4,
+        ):
+        super().__init__()
+        self.d_model = d_model
+        self.nhead = nhead
+        encoder_layer = MSDeformAttnTransformerEncoderLayer(d_model, dim_feedforward,
+                                                            dropout, activation,
+                                                            num_feature_levels, nhead, enc_n_points)
+        self.encoder = MSDeformAttnTransformerEncoder(encoder_layer, num_encoder_layers)
+        # One learned embedding per feature level, added to the positional
+        # encoding in forward(); initialized in _reset_parameters().
+        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
+        self._reset_parameters()
+    def _reset_parameters(self):
+        """Initialize weights: Xavier for >1-D params, then module-specific resets."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        # MSDeformAttn modules re-apply their own initialization afterwards.
+        for m in self.modules():
+            if isinstance(m, MSDeformAttn):
+                m._reset_parameters()
+        normal_(self.level_embed)
+    def get_valid_ratio(self, mask):
+        """
+        Return, per batch element, the fraction of non-masked (False) entries
+        along the first row and first column of `mask`, stacked as (w, h).
+        """
+        _, H, W = mask.shape
+        valid_H = torch.sum(~mask[:, :, 0], 1)
+        valid_W = torch.sum(~mask[:, 0, :], 1)
+        valid_ratio_h = valid_H.float() / H
+        valid_ratio_w = valid_W.float() / W
+        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+        return valid_ratio
+    def forward(self, srcs, pos_embeds):
+        """
+        Encode a list of feature maps with their positional encodings.
+
+        Returns:
+            (memory, spatial_shapes, level_start_index): the encoder output over
+            the concatenated tokens, the (h, w) of each level, and the offset of
+            each level's first token in the concatenated sequence.
+        """
+        # No padding is modelled: every mask is all-False, so all valid ratios are 1.
+        masks = [torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) for x in srcs]
+        # prepare input for encoder
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
+            bs, c, h, w = src.shape
+            spatial_shape = (h, w)
+            spatial_shapes.append(spatial_shape)
+            # (bs, c, h, w) -> (bs, h*w, c)
+            src = src.flatten(2).transpose(1, 2)
+            mask = mask.flatten(1)
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)
+            # Add the level embedding so tokens of different levels are distinguishable.
+            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            src_flatten.append(src)
+            mask_flatten.append(mask)
+        # Concatenate all levels along the token dimension.
+        src_flatten = torch.cat(src_flatten, 1)
+        mask_flatten = torch.cat(mask_flatten, 1)
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
+        # Exclusive cumulative sum of h*w per level: start offset of each level.
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+        # encoder
+        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)
+        return memory, spatial_shapes, level_start_index
+class MSDeformAttnTransformerEncoderLayer(nn.Module):
+    def __init__(self,
+                 d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4):
+        super().__init__()
+        # self attention
+        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+    def forward_ffn(self, src):
+        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+        src = src + self.dropout3(src2)
+        src = self.norm2(src)
+        return src
+    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None):
+        # self attention
+        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask)
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        # ffn
+        src = self.forward_ffn(src)
+        return src
+class MSDeformAttnTransformerEncoder(nn.Module):
+    def __init__(self, encoder_layer, num_layers):
+        super().__init__()
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        reference_points_list = []
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
+                                          torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
+            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
+            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
+            ref = torch.stack((ref_x, ref_y), -1)
+            reference_points_list.append(ref)
+        reference_points = torch.cat(reference_points_list, 1)
+        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+        return reference_points
+    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
+        output = src
+        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
+        for _, layer in enumerate(self.layers):
+            output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)
+        return output
+@SEM_SEG_HEADS_REGISTRY.register()
+class MSDeformAttnPixelDecoder(nn.Module):
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        transformer_dropout: float,
+        transformer_nheads: int,
+        transformer_dim_feedforward: int,
+        transformer_enc_layers: int,
+        conv_dim: int,
+        mask_dim: int,
+        norm: Optional[Union[str, Callable]] = None,
+        # deformable transformer encoder args
+        transformer_in_features: List[str],
+        common_stride: int,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            transformer_dropout: dropout probability in transformer
+            transformer_nheads: number of heads in transformer
+            transformer_dim_feedforward: dimension of feedforward network
+            transformer_enc_layers: number of transformer encoder layers
+            conv_dim: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+            norm (str or callable): normalization for all conv layers
+            transformer_in_features: names of the input features fed to the
+                deformable transformer encoder
+            common_stride: stride used together with the smallest transformer
+                feature stride to decide how many extra FPN levels are built
+        """
+        super().__init__()
+        # Subset of the inputs that goes through the transformer encoder.
+        transformer_input_shape = {
+            k: v for k, v in input_shape.items() if k in transformer_in_features
+        }
+        # this is the input shape of pixel decoder
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        self.in_features = [k for k, v in input_shape]  # starting from "res2" to "res5"
+        self.feature_strides = [v.stride for k, v in input_shape]
+        self.feature_channels = [v.channels for k, v in input_shape]
+        # this is the input shape of transformer encoder (could use less features than pixel decoder)
+        transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride)
+        self.transformer_in_features = [k for k, v in transformer_input_shape]  # starting from "res2" to "res5"
+        transformer_in_channels = [v.channels for k, v in transformer_input_shape]
+        self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape]  # to decide extra FPN layers
+        self.transformer_num_feature_levels = len(self.transformer_in_features)
+        # One 1x1 conv + GroupNorm(32 groups) projection per transformer input level.
+        if self.transformer_num_feature_levels > 1:
+            input_proj_list = []
+            # from low resolution to high resolution (res5 -> res2)
+            for in_channels in transformer_in_channels[::-1]:
+                input_proj_list.append(nn.Sequential(
+                    nn.Conv2d(in_channels, conv_dim, kernel_size=1),
+                    nn.GroupNorm(32, conv_dim),
+                ))
+            self.input_proj = nn.ModuleList(input_proj_list)
+        else:
+            self.input_proj = nn.ModuleList([
+                nn.Sequential(
+                    nn.Conv2d(transformer_in_channels[-1], conv_dim, kernel_size=1),
+                    nn.GroupNorm(32, conv_dim),
+                )])
+        # Xavier-initialize the projection convs and zero their biases.
+        for proj in self.input_proj:
+            nn.init.xavier_uniform_(proj[0].weight, gain=1)
+            nn.init.constant_(proj[0].bias, 0)
+        self.transformer = MSDeformAttnTransformerEncoderOnly(
+            d_model=conv_dim,
+            dropout=transformer_dropout,
+            nhead=transformer_nheads,
+            dim_feedforward=transformer_dim_feedforward,
+            num_encoder_layers=transformer_enc_layers,
+            num_feature_levels=self.transformer_num_feature_levels,
+        )
+        N_steps = conv_dim // 2
+        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
+        self.mask_dim = mask_dim
+        # use 1x1 conv instead
+        self.mask_features = Conv2d(
+            conv_dim,
+            mask_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        weight_init.c2_xavier_fill(self.mask_features)
+        self.maskformer_num_feature_levels = 3  # always use 3 scales
+        self.common_stride = common_stride
+        # extra fpn levels
+        # log2 gap between the finest transformer stride and the common stride.
+        stride = min(self.transformer_feature_strides)
+        self.num_fpn_levels = int(np.log2(stride) - np.log2(self.common_stride))
+        lateral_convs = []
+        output_convs = []
+        # Convs get a bias only when no normalization is requested (norm == "").
+        use_bias = norm == ""
+        # Only the first num_fpn_levels (smallest-stride) inputs get FPN convs.
+        for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]):
+            lateral_norm = get_norm(norm, conv_dim)
+            output_norm = get_norm(norm, conv_dim)
+            lateral_conv = Conv2d(
+                in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm
+            )
+            output_conv = Conv2d(
+                conv_dim,
+                conv_dim,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=use_bias,
+                norm=output_norm,
+                activation=F.relu,
+            )
+            weight_init.c2_xavier_fill(lateral_conv)
+            weight_init.c2_xavier_fill(output_conv)
+            self.add_module("adapter_{}".format(idx + 1), lateral_conv)
+            self.add_module("layer_{}".format(idx + 1), output_conv)
+            lateral_convs.append(lateral_conv)
+            output_convs.append(output_conv)
+        # Place convs into top-down order (from low to high resolution)
+        # to make the top-down computation in forward clearer.
+        self.lateral_convs = lateral_convs[::-1]
+        self.output_convs = output_convs[::-1]
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        ret = {}
+        ret["input_shape"] = {
+            k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+        }
+        ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
+        ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM
+        ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
+        ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
+        # ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
+        ret["transformer_dim_feedforward"] = 1024  # use 1024 for deformable transformer encoder
+        ret[
+            "transformer_enc_layers"
+        ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS  # a separate config
+        ret["transformer_in_features"] = cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES
+        ret["common_stride"] = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE
+        return ret
+    @autocast(enabled=False)
+    def forward_features(self, features):
+        srcs = []
+        pos = []
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.transformer_in_features[::-1]):
+            x = features[f].float()  # deformable detr does not support half precision
+            srcs.append(self.input_proj[idx](x))
+            pos.append(self.pe_layer(x))
+        y, spatial_shapes, level_start_index = self.transformer(srcs, pos)
+        bs = y.shape[0]
+        split_size_or_sections = [None] * self.transformer_num_feature_levels
+        for i in range(self.transformer_num_feature_levels):
+            if i < self.transformer_num_feature_levels - 1:
+                split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i]
+            else:
+                split_size_or_sections[i] = y.shape[1] - level_start_index[i]
+        y = torch.split(y, split_size_or_sections, dim=1)
+        out = []
+        multi_scale_features = []
+        num_cur_levels = 0
+        for i, z in enumerate(y):
+            out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1]))
+        # append `out` with extra FPN levels
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]):
+            x = features[f].float()
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            cur_fpn = lateral_conv(x)
+            # Following FPN implementation, we use nearest upsampling here
+            y = cur_fpn + F.interpolate(out[-1], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False)
+            y = output_conv(y)
+            out.append(y)
+        for o in out:
+            if num_cur_levels < self.maskformer_num_feature_levels:
+                multi_scale_features.append(o)
+                num_cur_levels += 1
+        return self.mask_features(out[-1]), out[0], multi_scale_features

mask2former/modeling/pixel_decoder/ops/functions/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+from .ms_deform_attn_func import MSDeformAttnFunction

mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+try:
+    import MultiScaleDeformableAttention as MSDA
+except ModuleNotFoundError as e:
+    info_string = (
+        "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
+        "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
+        "\t`sh make.sh`\n"
+    )
+    raise ModuleNotFoundError(info_string)
+class MSDeformAttnFunction(Function):
+    @staticmethod
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
+        ctx.im2col_step = im2col_step
+        output = MSDA.ms_deform_attn_forward(
+            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
+        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
+        return output
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
+        grad_value, grad_sampling_loc, grad_attn_weight = \
+            MSDA.ms_deform_attn_backward(
+                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
+        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
+    # for debug and test only,
+    # need to use cuda version instead
+    N_, S_, M_, D_ = value.shape
+    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
+    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
+    sampling_grids = 2 * sampling_locations - 1
+    sampling_value_list = []
+    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
+        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
+        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
+        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
+        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
+        # N_*M_, D_, Lq_, P_
+        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
+                                          mode='bilinear', padding_mode='zeros', align_corners=False)
+        sampling_value_list.append(sampling_value_l_)
+    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
+    attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
+    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
+    return output.transpose(1, 2).contiguous()

mask2former/modeling/pixel_decoder/ops/make.sh ADDED Viewed

	@@ -0,0 +1,13 @@

+#!/usr/bin/env bash
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+python setup.py build install

mask2former/modeling/pixel_decoder/ops/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+from .ms_deform_attn import MSDeformAttn

mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py ADDED Viewed

	@@ -0,0 +1,125 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import warnings
+import math
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+from ..functions import MSDeformAttnFunction
+from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
+def _is_power_of_2(n):
+    if (not isinstance(n, int)) or (n < 0):
+        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return (n & (n-1) == 0) and n != 0
+class MSDeformAttn(nn.Module):
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
+        """
+        Multi-Scale Deformable Attention Module
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                          "which is more efficient in our CUDA implementation.")
+        self.im2col_step = 128
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+        self._reset_parameters()
+    def _reset_parameters(self):
+        constant_(self.sampling_offsets.weight.data, 0.)
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        """
+        :param query                       (N, Length_{query}, C)
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
+        :return output                     (N, Length_{query}, C)
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+        # N, Len_q, n_heads, n_levels, n_points, 2
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+        try:
+            output = MSDeformAttnFunction.apply(
+                value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+        except:
+            # CPU
+            output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
+        # # For FLOPs calculation only
+        # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
+        output = self.output_proj(output)
+        return output

mask2former/modeling/pixel_decoder/ops/setup.py ADDED Viewed

	@@ -0,0 +1,78 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+import os
+import glob
+import torch
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+from setuptools import find_packages
+from setuptools import setup
+requirements = ["torch", "torchvision"]
+def get_extensions():
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "src")
+    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
+    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
+    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
+    sources = main_file + source_cpu
+    extension = CppExtension
+    extra_compile_args = {"cxx": []}
+    define_macros = []
+    # Force cuda since torch ask for a device, not if cuda is in fact available.
+    if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None:
+        extension = CUDAExtension
+        sources += source_cuda
+        define_macros += [("WITH_CUDA", None)]
+        extra_compile_args["nvcc"] = [
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+    else:
+        if CUDA_HOME is None:
+            raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
+        else:
+            raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')
+    sources = [os.path.join(extensions_dir, s) for s in sources]
+    include_dirs = [extensions_dir]
+    ext_modules = [
+        extension(
+            "MultiScaleDeformableAttention",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+    return ext_modules
+# Package definition: registers the extension returned by get_extensions()
+# and uses torch's BuildExtension as the `build_ext` command.
+setup(
+    name="MultiScaleDeformableAttention",
+    version="1.0",
+    author="Weijie Su",
+    url="https://github.com/fundamentalvision/Deformable-DETR",
+    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
+    packages=find_packages(exclude=("configs", "tests",)),
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)

mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp ADDED Viewed

	@@ -0,0 +1,46 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+#include <vector>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+// CPU forward entry point: a stub. It takes the same arguments as the CUDA
+// forward function but performs no computation and unconditionally reports
+// an error through AT_ERROR.
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}
+// CPU backward entry point: a stub. It takes the same arguments as the CUDA
+// backward function but performs no computation and unconditionally reports
+// an error through AT_ERROR.
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}

mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h ADDED Viewed

	@@ -0,0 +1,38 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+#pragma once
+#include <torch/extension.h>
+// Declaration of the CPU forward entry point (defined in ms_deform_attn_cpu.cpp).
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+// Declaration of the CPU backward entry point (defined in ms_deform_attn_cpu.cpp).
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);

mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu ADDED Viewed

	@@ -0,0 +1,158 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+#include <vector>
+#include "cuda/ms_deform_im2col_cuda.cuh"
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+// CUDA forward pass of multi-scale deformable attention.
+//
+// All five tensor arguments must be contiguous CUDA tensors. `value` is read
+// as (batch, spatial_size, num_heads, channels); `num_levels` comes from
+// spatial_shapes.size(0), `num_query` and `num_point` from sampling_loc
+// dimensions 1 and 4. The batch is processed in chunks of
+// min(batch, im2col_step) samples.
+//
+// Returns a tensor of shape (batch, num_query, num_heads * channels).
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+    // Effective chunk size: never larger than the batch itself.
+    const int im2col_step_ = std::min(batch, im2col_step);
+    // The check below requires batch to be a multiple of im2col_step_
+    // (the message text words the relationship the other way round).
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+    const int batch_n = im2col_step_;
+    // View the output as (num_chunks, chunk, ...) so each chunk can be selected.
+    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    // Element counts of one sample in each flat buffer, used as pointer strides.
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        // `columns` aliases the output slice of chunk n; the kernel writes into it.
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data<int64_t>(),
+                level_start_index.data<int64_t>(),
+                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data<scalar_t>());
+        }));
+    }
+    // Merge the head and channel dimensions for the caller.
+    output = output.view({batch, num_query, num_heads*channels});
+    return output;
+}
+// CUDA backward pass of multi-scale deformable attention.
+//
+// All six tensor arguments must be contiguous CUDA tensors. Dimensions are
+// read exactly as in the forward pass, and the batch is processed in chunks
+// of min(batch, im2col_step) samples.
+//
+// Returns {grad_value, grad_sampling_loc, grad_attn_weight}, each allocated
+// zero-filled with the shape of the corresponding input.
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+    // Effective chunk size: never larger than the batch itself.
+    const int im2col_step_ = std::min(batch, im2col_step);
+    // The check below requires batch to be a multiple of im2col_step_
+    // (the message text words the relationship the other way round).
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+    const int batch_n = im2col_step_;
+    // Element counts of one sample in each flat buffer, used as pointer strides.
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    // View the incoming gradient as (num_chunks, chunk, ...) so each chunk can be selected.
+    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        // Inputs and the three gradient buffers are all offset to chunk n.
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data<scalar_t>(),
+                                    value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                                    spatial_shapes.data<int64_t>(),
+                                    level_start_index.data<int64_t>(),
+                                    sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data<scalar_t>() +  n * im2col_step_ * per_value_size,
+                                    grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+        }));
+    }
+    return {
+        grad_value, grad_sampling_loc, grad_attn_weight
+    };
+}

mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h ADDED Viewed

	@@ -0,0 +1,35 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+#pragma once
+#include <torch/extension.h>
+// Declaration of the CUDA forward entry point (defined in ms_deform_attn_cuda.cu).
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+// Declaration of the CUDA backward entry point (defined in ms_deform_attn_cuda.cu).
+// The returned vector holds grad_value, grad_sampling_loc and grad_attn_weight, in that order.
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);

mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh ADDED Viewed

	@@ -0,0 +1,1332 @@

+/*!
+**************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************
+* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
+* Copyright (c) 2018 Microsoft
+**************************************************************************
+*/
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <THC/THCAtomics.cuh>
+#define CUDA_KERNEL_LOOP(i, n)                          /* grid-stride loop header: i walks this thread's share of [0, n) */ \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   /* start at this thread's global index */ \
+      i < (n);                                          /* stop once the n work items are exhausted */ \
+      i += blockDim.x * gridDim.x)                      /* step by the total number of threads in the grid */
+const int CUDA_NUM_THREADS = 1024;  // thread-count constant; NOTE(review): not referenced in the visible part of this file - presumably the threads-per-block value used at launch sites, confirm
+// Returns how many groups of num_threads are needed to cover N items,
+// i.e. N / num_threads rounded up.
+inline int GET_BLOCKS(const int N, const int num_threads)
+{
+  // Bias the numerator so that the truncating division rounds upwards.
+  const int biased_count = N + num_threads - 1;
+  return biased_count / num_threads;
+}
+// Bilinearly interpolates bottom_data at the fractional position (h, w) for
+// head m and channel c. bottom_data is addressed as
+// (row * width + col) * nheads * channels + m * channels + c.
+// A corner lying outside [0, height-1] x [0, width-1] contributes 0.
+template <typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
+{
+  // Integer corners that enclose (h, w).
+  const int row_lo = floor(h);
+  const int col_lo = floor(w);
+  const int row_hi = row_lo + 1;
+  const int col_hi = col_lo + 1;
+
+  // Fractional distance from the low corner and its complement.
+  const scalar_t frac_row = h - row_lo;
+  const scalar_t frac_col = w - col_lo;
+  const scalar_t rest_row = 1 - frac_row;
+  const scalar_t rest_col = 1 - frac_col;
+
+  // Element strides of one column step and one row step.
+  const int col_stride = nheads * channels;
+  const int row_stride = width * col_stride;
+  const int row_lo_off = row_lo * row_stride;
+  const int row_hi_off = row_lo_off + row_stride;
+  const int col_lo_off = col_lo * col_stride;
+  const int col_hi_off = col_lo_off + col_stride;
+  const int lane_off = m * channels + c;
+
+  // Which of the four neighbours are inside the feature map.
+  const bool row_lo_ok = (row_lo >= 0);
+  const bool row_hi_ok = (row_hi <= height - 1);
+  const bool col_lo_ok = (col_lo >= 0);
+  const bool col_hi_ok = (col_hi <= width - 1);
+
+  scalar_t val_ll = 0;
+  scalar_t val_lh = 0;
+  scalar_t val_hl = 0;
+  scalar_t val_hh = 0;
+  if (row_lo_ok && col_lo_ok)
+  {
+    val_ll = bottom_data[row_lo_off + col_lo_off + lane_off];
+  }
+  if (row_lo_ok && col_hi_ok)
+  {
+    val_lh = bottom_data[row_lo_off + col_hi_off + lane_off];
+  }
+  if (row_hi_ok && col_lo_ok)
+  {
+    val_hl = bottom_data[row_hi_off + col_lo_off + lane_off];
+  }
+  if (row_hi_ok && col_hi_ok)
+  {
+    val_hh = bottom_data[row_hi_off + col_hi_off + lane_off];
+  }
+
+  // Interpolation weights; the summation order is kept as ll, lh, hl, hh.
+  const scalar_t wgt_ll = rest_row * rest_col;
+  const scalar_t wgt_lh = rest_row * frac_col;
+  const scalar_t wgt_hl = frac_row * rest_col;
+  const scalar_t wgt_hh = frac_row * frac_col;
+  const scalar_t result = (wgt_ll * val_ll + wgt_lh * val_lh + wgt_hl * val_hl + wgt_hh * val_hh);
+  return result;
+}
+template <typename scalar_t> /*
+ * Backward of the bilinear sample taken at fractional position (h, w) for head m, channel c.
+ * For each of the four neighbouring corners that lies inside the feature map it
+ *   - reads the corner value from bottom_data,
+ *   - accumulates that corner's contribution to d(val)/dh and d(val)/dw,
+ *   - atomically adds (corner weight * top_grad * attn_weight) into grad_value.
+ * It then OVERWRITES (plain stores, no atomics) three outputs:
+ *   *grad_attn_weight     = top_grad * interpolated value
+ *   grad_sampling_loc[0]  = width  * d(val)/dw * top_grad * attn_weight
+ *   grad_sampling_loc[1]  = height * d(val)/dh * top_grad * attn_weight
+ * The width/height factors match callers in this file that compute w = loc_w * width - 0.5
+ * and h = loc_h * height - 0.5.
+ * bottom_data and grad_value share the indexing
+ * (row * width + col) * nheads * channels + m * channels + c.
+ */
+__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,  // upstream gradient for this output element
+                                                   const scalar_t &attn_weight,  // attention weight applied to this sample
+                                                   scalar_t* &grad_value,  // accumulated into with atomicAdd
+                                                   scalar_t* grad_sampling_loc,  // two consecutive slots, written as [w-gradient, h-gradient]
+                                                   scalar_t* grad_attn_weight)  // single slot, overwritten
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+  const scalar_t lh = h - h_low;  // fractional offset from the low row
+  const scalar_t lw = w - w_low;  // fractional offset from the low column
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+  const int w_stride = nheads * channels;  // elements per column step
+  const int h_stride = width * w_stride;  // elements per row step
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;  // bilinear weights of the four corners
+  const scalar_t top_grad_value = top_grad * attn_weight;  // gradient reaching the sampled value
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;  // d(val)/dh and d(val)/dw, built corner by corner
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)  // corner (h_low, w_low)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)  // corner (h_low, w_high)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)  // corner (h_high, w_low)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value);
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)  // corner (h_high, w_high)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);  // the forward interpolated value
+  *grad_attn_weight = top_grad * val;
+  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
+  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
+}
+template <typename scalar_t> /*
+ * Backward of the bilinear sample taken at fractional position (h, w) for head m, channel c,
+ * with every output accumulated atomically.
+ * For each of the four neighbouring corners that lies inside the feature map it
+ *   - reads the corner value from bottom_data,
+ *   - accumulates that corner's contribution to d(val)/dh and d(val)/dw,
+ *   - atomically adds (corner weight * top_grad * attn_weight) into grad_value.
+ * It then atomicAdd-s into three outputs (the existing contents are kept and added to):
+ *   *grad_attn_weight     += top_grad * interpolated value
+ *   grad_sampling_loc[0]  += width  * d(val)/dw * top_grad * attn_weight
+ *   grad_sampling_loc[1]  += height * d(val)/dh * top_grad * attn_weight
+ * NOTE(review): the "_gm" suffix presumably means the outputs point at global memory - confirm at the call sites.
+ * bottom_data and grad_value share the indexing
+ * (row * width + col) * nheads * channels + m * channels + c.
+ */
+__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,  // upstream gradient for this output element
+                                                   const scalar_t &attn_weight,  // attention weight applied to this sample
+                                                   scalar_t* &grad_value,  // accumulated into with atomicAdd
+                                                   scalar_t* grad_sampling_loc,  // two consecutive slots, accumulated as [w-gradient, h-gradient]
+                                                   scalar_t* grad_attn_weight)  // single slot, accumulated with atomicAdd
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+  const scalar_t lh = h - h_low;  // fractional offset from the low row
+  const scalar_t lw = w - w_low;  // fractional offset from the low column
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+  const int w_stride = nheads * channels;  // elements per column step
+  const int h_stride = width * w_stride;  // elements per row step
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;  // bilinear weights of the four corners
+  const scalar_t top_grad_value = top_grad * attn_weight;  // gradient reaching the sampled value
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;  // d(val)/dh and d(val)/dw, built corner by corner
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)  // corner (h_low, w_low)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)  // corner (h_low, w_high)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)  // corner (h_high, w_low)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value);
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)  // corner (h_high, w_high)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);  // the forward interpolated value
+  atomicAdd(grad_attn_weight, top_grad * val);
+  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
+  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+}
+// Forward kernel: each thread produces one element of data_col.
+// The flat output index is decoded as
+//   ((b * num_query + q) * num_heads + m) * channels + c
+// and the result is the attention-weighted sum, over all levels and sampling
+// points, of the bilinearly interpolated value at each sampling location.
+// Sampling locations are read as (w, h) pairs and scaled by the level's
+// spatial size; samples falling outside the (-1, size) window are skipped.
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(const int n,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Peel the coordinates off the flat index, fastest-varying first.
+    const int c_col = index % channels;
+    const int sampling_index = index / channels;  // flat (b, q, m) index
+    const int m_col = sampling_index % num_heads;
+    const int b_col = (sampling_index / num_heads) / num_query;
+
+    // Cursors into the attention-weight and sampling-location arrays; the
+    // location array holds two entries (w, h) per weight entry.
+    int weight_cursor = sampling_index * num_levels * num_point;
+    int loc_cursor = weight_cursor << 1;
+
+    const int qid_stride = num_heads * channels;
+    const int batch_value_offset = b_col * spatial_size * qid_stride;
+
+    scalar_t accum = 0;
+    for (int l_col = 0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int shape_cursor = l_col << 1;
+      const int spatial_h = data_spatial_shapes[shape_cursor];
+      const int spatial_w = data_spatial_shapes[shape_cursor + 1];
+      const scalar_t *level_value_ptr = data_value + (batch_value_offset + level_start_id * qid_stride);
+
+      for (int p_col = 0; p_col < num_point; ++p_col, weight_cursor += 1, loc_cursor += 2)
+      {
+        const scalar_t loc_w = data_sampling_loc[loc_cursor];
+        const scalar_t loc_h = data_sampling_loc[loc_cursor + 1];
+        const scalar_t weight = data_attn_weight[weight_cursor];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+        const bool in_window = (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w);
+        if (!in_window)
+        {
+          continue;
+        }
+        accum += ms_deform_attn_im2col_bilinear(level_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
+      }
+    }
+    data_col[index] = accum;
+  }
+}
+/*!
+* Backward (col2im) kernel using statically sized shared-memory caches and a serial reduction.
+*
+* Each thread handles one flat index of grad_col, decoded as
+* ((b * num_query + q) * num_heads + m) * channels + c. For every (level, point) sample it
+* scatters the value gradient into grad_value (atomicAdd inside
+* ms_deform_attn_col2im_bilinear) and stores its partial sampling-location and
+* attention-weight gradients in shared memory. Thread 0 then adds up the blockSize partials
+* one after another and stores the totals to grad_sampling_loc / grad_attn_weight.
+*
+* NOTE(review): the block-wide total is written through thread 0's output cursor, so all
+* threads of a block must share the same sampling_index (presumably
+* blockDim.x == blockSize == channels), and the __syncthreads() calls inside the loop need
+* every thread of a block to execute the same iterations - confirm both at the launch site.
+*/
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    unsigned int tid = threadIdx.x;
+    // Decode the flat index, fastest-varying coordinate first.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query coordinate itself is not needed, only the batch index
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    // Output cursors are per-iteration locals. Advancing the kernel parameters themselves
+    // would carry this iteration's offset into the next grid-stride iteration.
+    scalar_t *grad_sampling_loc_ptr = grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_ptr = grad_attn_weight + grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero to the reduction.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        __syncthreads();
+        if (tid == 0)
+        {
+          // Serial reduction of all blockSize partials by a single thread.
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int i = 1; i < blockSize; ++i)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[i];
+            sid += 2;
+          }
+          *grad_sampling_loc_ptr = _grad_w;
+          *(grad_sampling_loc_ptr + 1) = _grad_h;
+          *grad_attn_weight_ptr = _grad_a;
+        }
+        // Keep the caches intact until thread 0 has finished reading them.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_ptr += grad_weight_stride;
+        grad_sampling_loc_ptr += grad_loc_stride;
+      }
+    }
+  }
+}
+/*!
+* Backward (col2im) kernel using statically sized shared-memory caches and a tree reduction.
+*
+* Each thread handles one flat index of grad_col, decoded as
+* ((b * num_query + q) * num_heads + m) * channels + c. For every (level, point) sample it
+* scatters the value gradient into grad_value (atomicAdd inside
+* ms_deform_attn_col2im_bilinear) and stores its partial sampling-location and
+* attention-weight gradients in shared memory. The partials are then combined by repeatedly
+* folding the upper half of the cache onto the lower half, and thread 0 stores the totals to
+* grad_sampling_loc / grad_attn_weight.
+*
+* NOTE(review): the halving loop starts at blockSize/2 and only covers every slot when
+* blockSize is a power of two. The block-wide total is written through thread 0's output
+* cursor, so all threads of a block must share the same sampling_index (presumably
+* blockDim.x == blockSize == channels), and the __syncthreads() calls inside the loop need
+* every thread of a block to execute the same iterations - confirm at the launch site.
+*/
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    unsigned int tid = threadIdx.x;
+    // Decode the flat index, fastest-varying coordinate first.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query coordinate itself is not needed, only the batch index
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    // Output cursors are per-iteration locals. Advancing the kernel parameters themselves
+    // would carry this iteration's offset into the next grid-stride iteration.
+    scalar_t *grad_sampling_loc_ptr = grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_ptr = grad_attn_weight + grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero to the reduction.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        __syncthreads();
+        // Tree reduction: each pass folds slots [s, 2s) onto slots [0, s).
+        for (unsigned int s=blockSize/2; s>0; s>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();
+        }
+        if (tid == 0)
+        {
+          *grad_sampling_loc_ptr = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc_ptr + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight_ptr = cache_grad_attn_weight[0];
+        }
+        // Keep the caches intact until thread 0 has finished reading them.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_ptr += grad_weight_stride;
+        grad_sampling_loc_ptr += grad_loc_stride;
+      }
+    }
+  }
+}
+/*!
+* Backward (col2im) kernel using dynamically sized shared memory and a serial reduction.
+*
+* Each thread handles one flat index of grad_col, decoded as
+* ((b * num_query + q) * num_heads + m) * channels + c. For every (level, point) sample it
+* scatters the value gradient into grad_value (atomicAdd inside
+* ms_deform_attn_col2im_bilinear) and stores its partial sampling-location and
+* attention-weight gradients in shared memory. Thread 0 then adds up the blockDim.x partials
+* one after another and stores the totals to grad_sampling_loc / grad_attn_weight.
+*
+* The dynamic shared buffer is split into 2 * blockDim.x location slots followed by
+* blockDim.x weight slots, so the launch must provide at least
+* 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared memory.
+*
+* NOTE(review): the block-wide total is written through thread 0's output cursor, so all
+* threads of a block must share the same sampling_index (presumably blockDim.x == channels),
+* and the __syncthreads() calls inside the loop need every thread of a block to execute the
+* same iterations - confirm both at the launch site.
+*/
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    // Carve the dynamic buffer into the location cache followed by the weight cache.
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    // Decode the flat index, fastest-varying coordinate first.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query coordinate itself is not needed, only the batch index
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    // Output cursors are per-iteration locals. Advancing the kernel parameters themselves
+    // would carry this iteration's offset into the next grid-stride iteration.
+    scalar_t *grad_sampling_loc_ptr = grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_ptr = grad_attn_weight + grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero to the reduction.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        __syncthreads();
+        if (tid == 0)
+        {
+          // Serial reduction of all blockDim.x partials by a single thread.
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int i = 1; i < blockDim.x; ++i)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[i];
+            sid += 2;
+          }
+          *grad_sampling_loc_ptr = _grad_w;
+          *(grad_sampling_loc_ptr + 1) = _grad_h;
+          *grad_attn_weight_ptr = _grad_a;
+        }
+        // Keep the caches intact until thread 0 has finished reading them.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_ptr += grad_weight_stride;
+        grad_sampling_loc_ptr += grad_loc_stride;
+      }
+    }
+  }
+}
+/*!
+* Backward (col2im) kernel using dynamically sized shared memory and a tree reduction that
+* also copes with block sizes that are not a power of two.
+*
+* Each thread handles one flat index of grad_col, decoded as
+* ((b * num_query + q) * num_heads + m) * channels + c. For every (level, point) sample it
+* scatters the value gradient into grad_value (atomicAdd inside
+* ms_deform_attn_col2im_bilinear) and stores its partial sampling-location and
+* attention-weight gradients in shared memory. The partials are folded in halves; whenever
+* the number of live slots (spre) is odd, the leftover last slot is added by the thread
+* whose index satisfies tid + 2s < spre. Thread 0 finally stores the totals to
+* grad_sampling_loc / grad_attn_weight.
+*
+* The dynamic shared buffer is split into 2 * blockDim.x location slots followed by
+* blockDim.x weight slots, so the launch must provide at least
+* 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared memory.
+*
+* NOTE(review): the block-wide total is written through thread 0's output cursor, so all
+* threads of a block must share the same sampling_index (presumably blockDim.x == channels),
+* and the __syncthreads() calls inside the loop need every thread of a block to execute the
+* same iterations - confirm both at the launch site.
+*/
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    // Carve the dynamic buffer into the location cache followed by the weight cache.
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    // Decode the flat index, fastest-varying coordinate first.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;  // the query coordinate itself is not needed, only the batch index
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    // Output cursors are per-iteration locals. Advancing the kernel parameters themselves
+    // would carry this iteration's offset into the next grid-stride iteration.
+    scalar_t *grad_sampling_loc_ptr = grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_ptr = grad_attn_weight + grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero to the reduction.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        __syncthreads();
+        // Tree reduction: s is the half being folded, spre the number of live slots before the fold.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            // Odd spre leaves slot 2s unpaired; fold it in as well.
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+        if (tid == 0)
+        {
+          *grad_sampling_loc_ptr = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc_ptr + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight_ptr = cache_grad_attn_weight[0];
+        }
+        // Keep the caches intact until thread 0 has finished reading them.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_ptr += grad_weight_stride;
+        grad_sampling_loc_ptr += grad_loc_stride;
+      }
+    }
+  }
+}
+/*!
+ * Backward kernel that sums the sampling-location and attention-weight gradient
+ * contributions of a block in dynamically sized shared memory and publishes the per-block
+ * totals with atomicAdd.
+ *
+ * For every flat index below n visited by CUDA_KERNEL_LOOP (macro defined elsewhere in this
+ * file) the index is decoded into channel, head, query and batch positions. For each level
+ * and sampling point the thread calls ms_deform_attn_col2im_bilinear with its own
+ * shared-memory slots (that helper also receives grad_value_ptr; it is defined outside this
+ * view), the slots of the whole block are summed, and thread 0 adds the sums to
+ * grad_sampling_loc / grad_attn_weight.
+ *
+ * Shared memory layout, in scalar_t units: [0, 2 * blockDim.x) holds two sampling-location
+ * slots per thread, followed by one attention-weight slot per thread. The launch therefore
+ * has to provide at least 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared memory.
+ *
+ * The destination offsets depend only on index / channels, so every channel of one
+ * (batch, query, head) triple targets the same gradient entries; atomicAdd keeps the totals
+ * correct when those channels are handled by more than one block.
+ *
+ * batch_size is accepted but not read by this kernel.
+ */
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    // Peel the flat index apart: channel is the fastest-varying digit, then head, query, batch.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    // index / channels addresses one (batch, query, head) triple.
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    // Sampling locations store two values (w, h) per attention weight.
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    // Local cursors are rebuilt from the base pointers on every pass of the loop. Advancing
+    // the by-value parameters themselves would let the offsets of one pass leak into the
+    // next if a thread ever executes the loop body more than once.
+    scalar_t *grad_sampling_loc_ptr = grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_ptr = grad_attn_weight + grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      // data_spatial_shapes stores (h, w) pairs, one pair per level.
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the stored location by the level size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so samples that fail the range test contribute zero.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // Every thread's slots must be written before the block-wide sum starts.
+        __syncthreads();
+        // Halving reduction: s is the number of active threads in a round, spre the number
+        // of live slots before it. When spre is odd, slot 2*s has no partner and is folded
+        // in by the extra branch below.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+        // Slot 0 now holds the block total; add it to the global gradients.
+        if (tid == 0)
+        {
+          atomicAdd(grad_sampling_loc_ptr, cache_grad_sampling_loc[0]);
+          atomicAdd(grad_sampling_loc_ptr + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_attn_weight_ptr, cache_grad_attn_weight[0]);
+        }
+        // Keep the shared slots intact until thread 0 has read them.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_ptr += grad_weight_stride;
+        grad_sampling_loc_ptr += grad_loc_stride;
+      }
+    }
+  }
+}
+/*!
+ * Backward kernel variant that uses no shared-memory staging: the global gradient pointers
+ * are handed straight to ms_deform_attn_col2im_bilinear_gm (defined outside this view;
+ * presumably it accumulates into them atomically - TODO confirm).
+ *
+ * For every flat index below n visited by CUDA_KERNEL_LOOP (macro defined elsewhere in this
+ * file) the index is decoded into channel, head, query and batch positions, and every
+ * level / sampling point whose location passes the range test is forwarded to the helper.
+ *
+ * batch_size is accepted but not read by this kernel.
+ */
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Peel the flat index apart: channel is the fastest-varying digit, then head, query, batch.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    // index / channels addresses one (batch, query, head) triple.
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    // Sampling locations store two values (w, h) per attention weight.
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    // Local cursors are rebuilt from the base pointers on every pass of the loop. Advancing
+    // the by-value parameters themselves would let the offsets of one pass leak into the
+    // next if a thread ever executes the loop body more than once.
+    scalar_t *grad_sampling_loc_ptr = grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_ptr = grad_attn_weight + grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      // data_spatial_shapes stores (h, w) pairs, one pair per level.
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the stored location by the level size and shift by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Samples that fail the range test contribute nothing.
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear_gm(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            grad_sampling_loc_ptr, grad_attn_weight_ptr);
+        }
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight_ptr += grad_weight_stride;
+        grad_sampling_loc_ptr += grad_loc_stride;
+      }
+    }
+  }
+}
+/*!
+ * Host-side launcher for ms_deformable_im2col_gpu_kernel on the given stream.
+ *
+ * One work item is requested per (batch, query, head, channel) combination, with
+ * CUDA_NUM_THREADS threads per block and no dynamic shared memory. A launch error is
+ * reported on stdout only; nothing is returned or thrown.
+ */
+template <typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream,
+                              const scalar_t* data_value,
+                              const int64_t* data_spatial_shapes,
+                              const int64_t* data_level_start_index,
+                              const scalar_t* data_sampling_loc,
+                              const scalar_t* data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* data_col)
+{
+  const int threads_per_block = CUDA_NUM_THREADS;
+  // The same count sizes the grid and is passed to the kernel as its loop bound.
+  const int total_work_items = batch_size * num_query * num_heads * channels;
+
+  ms_deformable_im2col_gpu_kernel<scalar_t>
+      <<<GET_BLOCKS(total_work_items, threads_per_block), threads_per_block, 0, stream>>>(
+          total_work_items,
+          data_value, data_spatial_shapes, data_level_start_index,
+          data_sampling_loc, data_attn_weight,
+          batch_size, spatial_size, num_heads, channels,
+          num_levels, num_query, num_point,
+          data_col);
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status == cudaSuccess)
+  {
+    return;
+  }
+  printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(launch_status));
+}
+// Launch helper private to ms_deformable_col2im_cuda: starts the kernel named by the
+// variadic part (variadic so that template argument lists containing commas pass through
+// unchanged) with `shm_bytes` of dynamic shared memory. It relies on the locals and
+// parameters of ms_deformable_col2im_cuda being in scope and is undefined again right after
+// that function.
+#define MSDA_COL2IM_LAUNCH(shm_bytes, ...)                                              \
+  __VA_ARGS__                                                                            \
+      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,                       \
+          (shm_bytes), stream>>>(                                                        \
+                    num_kernels,                                                         \
+                    grad_col,                                                            \
+                    data_value,                                                          \
+                    data_spatial_shapes,                                                 \
+                    data_level_start_index,                                              \
+                    data_sampling_loc,                                                   \
+                    data_attn_weight,                                                    \
+                    batch_size,                                                          \
+                    spatial_size,                                                        \
+                    num_heads,                                                           \
+                    channels,                                                            \
+                    num_levels,                                                          \
+                    num_query,                                                           \
+                    num_point,                                                           \
+                    grad_value,                                                          \
+                    grad_sampling_loc,                                                   \
+                    grad_attn_weight)
+/*!
+ * Host-side launcher for the backward pass: picks one of the col2im kernels from `channels`
+ * and launches it on `stream`.
+ *
+ * Block size is min(channels, CUDA_NUM_THREADS); one work item is requested per
+ * (batch, query, head, channel) combination.
+ *
+ * Kernel selection:
+ *   - channels > 1024 and a multiple of 1024 : shm_reduce_v2_multi_blocks (dynamic shared memory)
+ *   - channels > 1024 otherwise              : gm (no shared memory requested)
+ *   - channels in {1, 2, 4, 8, 16, 32}       : shm_blocksize_aware_reduce_v1<scalar_t, channels>
+ *   - channels in {64, 128, 256, 512, 1024}  : shm_blocksize_aware_reduce_v2<scalar_t, channels>
+ *   - any other channels < 64                : shm_reduce_v1 (dynamic shared memory)
+ *   - any other channels                     : shm_reduce_v2 (dynamic shared memory)
+ *
+ * Kernels marked "dynamic shared memory" get 3 * num_threads * sizeof(scalar_t) bytes.
+ * A launch error is reported on stdout only; nothing is returned or thrown.
+ */
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                              const scalar_t* grad_col,
+                              const scalar_t* data_value,
+                              const int64_t * data_spatial_shapes,
+                              const int64_t * data_level_start_index,
+                              const scalar_t * data_sampling_loc,
+                              const scalar_t * data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* grad_value,
+                              scalar_t* grad_sampling_loc,
+                              scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
+  // Three scalar_t slots per thread for the kernels that reduce in dynamic shared memory.
+  const size_t reduce_shm_bytes = num_threads*3*sizeof(scalar_t);
+  if (channels > 1024)
+  {
+    if ((channels & 1023) == 0)
+    {
+      MSDA_COL2IM_LAUNCH(reduce_shm_bytes,
+                         ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>);
+    }
+    else
+    {
+      MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_gm<scalar_t>);
+    }
+  }
+  else{
+    switch(channels)
+    {
+      case 1:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>);
+        break;
+      case 2:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>);
+        break;
+      case 4:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>);
+        break;
+      case 8:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>);
+        break;
+      case 16:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>);
+        break;
+      case 32:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>);
+        break;
+      case 64:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>);
+        break;
+      case 128:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>);
+        break;
+      case 256:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>);
+        break;
+      case 512:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>);
+        break;
+      case 1024:
+        MSDA_COL2IM_LAUNCH(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>);
+        break;
+      default:
+        // Channel counts without a compile-time specialisation.
+        if (channels < 64)
+        {
+          MSDA_COL2IM_LAUNCH(reduce_shm_bytes, ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>);
+        }
+        else
+        {
+          MSDA_COL2IM_LAUNCH(reduce_shm_bytes, ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>);
+        }
+    }
+  }
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+  }
+}
+#undef MSDA_COL2IM_LAUNCH

mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h ADDED Viewed

	@@ -0,0 +1,67 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+#pragma once
+#include "cpu/ms_deform_attn_cpu.h"
+#ifdef WITH_CUDA
+#include "cuda/ms_deform_attn_cuda.h"
+#endif
+// Forward entry point exposed to Python.
+//
+// Forwards all arguments to ms_deform_attn_cuda_forward when `value` lives on a CUDA device
+// and the extension was built with WITH_CUDA. Raises through AT_ERROR when `value` is a CUDA
+// tensor but the build has no GPU support, and for every non-CUDA tensor (there is no CPU
+// path here).
+at::Tensor
+ms_deform_attn_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    // Tensor::is_cuda() is the supported query; going through the deprecated
+    // Tensor::type() accessor is no longer needed.
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_forward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
+// Backward entry point exposed to Python.
+//
+// Forwards all arguments to ms_deform_attn_cuda_backward when `value` lives on a CUDA device
+// and the extension was built with WITH_CUDA, returning that function's vector of tensors.
+// Raises through AT_ERROR when `value` is a CUDA tensor but the build has no GPU support,
+// and for every non-CUDA tensor (there is no CPU path here).
+std::vector<at::Tensor>
+ms_deform_attn_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    // Tensor::is_cuda() is the supported query; going through the deprecated
+    // Tensor::type() accessor is no longer needed.
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_backward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}

mask2former/modeling/pixel_decoder/ops/src/vision.cpp ADDED Viewed

	@@ -0,0 +1,21 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+#include "ms_deform_attn.h"
+// Python bindings: registers the forward and backward dispatchers from ms_deform_attn.h
+// under their C++ names on the extension module named by TORCH_EXTENSION_NAME.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
+  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
+}

mask2former/modeling/pixel_decoder/ops/test.py ADDED Viewed

	@@ -0,0 +1,92 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import gradcheck
+from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
+# Problem sizes shared by the checks below: value tensors are created as (N, S, M, D),
+# sampling locations as (N, Lq, M, L, P, 2) and attention weights as (N, Lq, M, L, P).
+N, M, D = 1, 2, 2
+Lq, L, P = 2, 2, 2
+# One (H, W) row per level, moved to the GPU.
+shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
+# Offset of each level in the flattened sequence: 0 followed by the running sum of H*W
+# over all levels except the last.
+level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
+# Total number of positions over all levels.
+S = sum([(H*W).item() for H, W in shapes])
+# Seed the RNG so the random inputs are repeatable.
+torch.manual_seed(3)
+@torch.no_grad()
+def check_forward_equal_with_pytorch_double():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+@torch.no_grad()
+def check_forward_equal_with_pytorch_float():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
+    value = torch.rand(N, S, M, channels).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    func = MSDeformAttnFunction.apply
+    value.requires_grad = grad_value
+    sampling_locations.requires_grad = grad_sampling_loc
+    attention_weights.requires_grad = grad_attn_weight
+    gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
+    print(f'* {gradok} check_gradient_numerical(D={channels})')
+if __name__ == '__main__':
+    check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_float()
+    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
+        check_gradient_numerical(channels, True, True, True)

mask2former/modeling/transformer_decoder/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+from .maskformer_transformer_decoder import StandardTransformerDecoder
+from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder
+from .opd_transformer_decoder import OPDMultiScaleMaskedTransformerDecoder

mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py ADDED Viewed

	@@ -0,0 +1,461 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
+import logging
+import fvcore.nn.weight_init as weight_init
+from typing import Optional
+import torch
+from torch import nn, Tensor
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.layers import Conv2d
+from .position_encoding import PositionEmbeddingSine
+from .maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY
+class SelfAttentionLayer(nn.Module):
+    def __init__(self, d_model, nhead, dropout=0.0,
+                 activation="relu", normalize_before=False):
+        """Build the attention, normalisation and dropout sub-modules.
+
+        Args:
+            d_model: embedding size handed to ``nn.MultiheadAttention`` and ``nn.LayerNorm``.
+            nhead: number of attention heads.
+            dropout: dropout probability, used both inside the attention module and for
+                the separate ``nn.Dropout`` layer.
+            activation: name resolved through ``_get_activation_fn``.
+            normalize_before: stored flag that ``forward`` checks to choose
+                ``forward_pre``.
+        """
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        self.norm = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+        # Re-initialise the weights once every sub-module has been registered.
+        self._reset_parameters()
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        return tensor if pos is None else tensor + pos
+    def forward_post(self, tgt,
+                     tgt_mask: Optional[Tensor] = None,
+                     tgt_key_padding_mask: Optional[Tensor] = None,
+                     query_pos: Optional[Tensor] = None):
+        q = k = self.with_pos_embed(tgt, query_pos)
+        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
+                              key_padding_mask=tgt_key_padding_mask)[0]
+        tgt = tgt + self.dropout(tgt2)
+        tgt = self.norm(tgt)
+        return tgt
+    def forward_pre(self, tgt,
+                    tgt_mask: Optional[Tensor] = None,
+                    tgt_key_padding_mask: Optional[Tensor] = None,
+                    query_pos: Optional[Tensor] = None):
+        tgt2 = self.norm(tgt)
+        q = k = self.with_pos_embed(tgt2, query_pos)
+        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
+                              key_padding_mask=tgt_key_padding_mask)[0]
+        tgt = tgt + self.dropout(tgt2)
+        return tgt
+    def forward(self, tgt,
+                tgt_mask: Optional[Tensor] = None,
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                query_pos: Optional[Tensor] = None):
+        if self.normalize_before:
+            return self.forward_pre(tgt, tgt_mask,
+                                    tgt_key_padding_mask, query_pos)
+        return self.forward_post(tgt, tgt_mask,
+                                 tgt_key_padding_mask, query_pos)
+class CrossAttentionLayer(nn.Module):
+    """Multi-head cross-attention with a residual connection and one LayerNorm.
+    Queries come from `tgt`, keys and values from `memory`. `normalize_before`
+    picks the LayerNorm placement (`forward_pre` vs `forward_post`).
+    """
+    def __init__(self, d_model, nhead, dropout=0.0,
+                 activation="relu", normalize_before=False):
+        super().__init__()
+        self.normalize_before = normalize_before
+        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        self.norm = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        # Resolved and stored, but no method of this class calls it.
+        self.activation = _get_activation_fn(activation)
+        self._reset_parameters()
+    def _reset_parameters(self):
+        """Xavier-initialise every parameter that has more than one dimension."""
+        for param in self.parameters():
+            if param.dim() <= 1:
+                continue
+            nn.init.xavier_uniform_(param)
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        """Return `tensor + pos`, or `tensor` untouched when `pos` is None."""
+        if pos is None:
+            return tensor
+        return tensor + pos
+    def forward_post(self, tgt, memory,
+                     memory_mask: Optional[Tensor] = None,
+                     memory_key_padding_mask: Optional[Tensor] = None,
+                     pos: Optional[Tensor] = None,
+                     query_pos: Optional[Tensor] = None):
+        """Attention, residual add, then LayerNorm on the sum."""
+        # Positional terms are added to queries and keys only; values are the
+        # raw memory.
+        attn_result = self.multihead_attn(
+            query=self.with_pos_embed(tgt, query_pos),
+            key=self.with_pos_embed(memory, pos),
+            value=memory,
+            attn_mask=memory_mask,
+            key_padding_mask=memory_key_padding_mask,
+        )
+        return self.norm(tgt + self.dropout(attn_result[0]))
+    def forward_pre(self, tgt, memory,
+                    memory_mask: Optional[Tensor] = None,
+                    memory_key_padding_mask: Optional[Tensor] = None,
+                    pos: Optional[Tensor] = None,
+                    query_pos: Optional[Tensor] = None):
+        """LayerNorm on `tgt`, attention, then residual add (no final norm)."""
+        normed = self.norm(tgt)
+        attn_result = self.multihead_attn(
+            query=self.with_pos_embed(normed, query_pos),
+            key=self.with_pos_embed(memory, pos),
+            value=memory,
+            attn_mask=memory_mask,
+            key_padding_mask=memory_key_padding_mask,
+        )
+        return tgt + self.dropout(attn_result[0])
+    def forward(self, tgt, memory,
+                memory_mask: Optional[Tensor] = None,
+                memory_key_padding_mask: Optional[Tensor] = None,
+                pos: Optional[Tensor] = None,
+                query_pos: Optional[Tensor] = None):
+        """Dispatch to the pre-norm or post-norm variant."""
+        call_args = (tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos)
+        if not self.normalize_before:
+            return self.forward_post(*call_args)
+        return self.forward_pre(*call_args)
+class FFNLayer(nn.Module):
+    """Two-layer feed-forward block with a residual connection and one LayerNorm.
+    `normalize_before` picks the LayerNorm placement: applied to the input of
+    the feed-forward path (`forward_pre`) or to the residual sum
+    (`forward_post`).
+    """
+    def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
+                 activation="relu", normalize_before=False):
+        super().__init__()
+        self.normalize_before = normalize_before
+        # Feed-forward path: d_model -> dim_feedforward -> d_model.
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.norm = nn.LayerNorm(d_model)
+        self.activation = _get_activation_fn(activation)
+        self._reset_parameters()
+    def _reset_parameters(self):
+        """Xavier-initialise every parameter that has more than one dimension."""
+        for param in self.parameters():
+            if param.dim() <= 1:
+                continue
+            nn.init.xavier_uniform_(param)
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        """Return `tensor + pos`, or `tensor` untouched when `pos` is None."""
+        if pos is None:
+            return tensor
+        return tensor + pos
+    def forward_post(self, tgt):
+        """Feed-forward, residual add, then LayerNorm on the sum."""
+        hidden = self.activation(self.linear1(tgt))
+        update = self.linear2(self.dropout(hidden))
+        return self.norm(tgt + self.dropout(update))
+    def forward_pre(self, tgt):
+        """LayerNorm on the input, feed-forward, then residual add (no final norm)."""
+        hidden = self.activation(self.linear1(self.norm(tgt)))
+        update = self.linear2(self.dropout(hidden))
+        return tgt + self.dropout(update)
+    def forward(self, tgt):
+        """Dispatch to the pre-norm or post-norm variant."""
+        branch = self.forward_pre if self.normalize_before else self.forward_post
+        return branch(tgt)
+def _get_activation_fn(activation):
+    """Return the `torch.nn.functional` activation named by `activation`.
+    Args:
+        activation: one of "relu", "gelu" or "glu".
+    Returns:
+        `F.relu`, `F.gelu` or `F.glu` respectively.
+    Raises:
+        RuntimeError: if `activation` is not one of the three supported names.
+    """
+    if activation == "relu":
+        return F.relu
+    if activation == "gelu":
+        return F.gelu
+    if activation == "glu":
+        return F.glu
+    # The message lists every accepted name, including "glu".
+    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
+class MLP(nn.Module):
+    """Stack of linear layers with ReLU between them (also called FFN).
+    The stack maps `input_dim` to `output_dim` through `num_layers - 1`
+    hidden layers of width `hidden_dim`; the last layer has no activation.
+    """
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        # Consecutive entries of `widths` are the (in, out) sizes of each layer.
+        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
+        self.layers = nn.ModuleList(
+            nn.Linear(fan_in, fan_out)
+            for fan_in, fan_out in zip(widths[:-1], widths[1:])
+        )
+    def forward(self, x):
+        """Apply every layer in order, with ReLU after all but the last."""
+        last_index = self.num_layers - 1
+        for index, layer in enumerate(self.layers):
+            x = layer(x)
+            if index < last_index:
+                x = F.relu(x)
+        return x
+@TRANSFORMER_DECODER_REGISTRY.register()
+class MultiScaleMaskedTransformerDecoder(nn.Module):
+    """Transformer decoder that cross-attends to three feature levels in turn.
+    Each decoder layer runs cross-attention (restricted by a mask derived from
+    the previous mask prediction), then self-attention, then a feed-forward
+    block. Class logits and masks are predicted before the first layer and
+    after every layer.
+    """
+    # Checkpoint format version consulted by `_load_from_state_dict`.
+    _version = 2
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        """Rename legacy "static_query" keys to "query_feat" in `state_dict`.
+        Only runs when the checkpoint metadata has no version or a version
+        below 2. The dict is modified in place and a warning is logged when at
+        least one key was renamed. This override does not call the parent
+        implementation.
+        """
+        version = local_metadata.get("version", None)
+        if version is None or version < 2:
+            # Do not warn if train from scratch
+            scratch = True
+            logger = logging.getLogger(__name__)
+            # Iterate over a snapshot of the keys because the dict is mutated.
+            for k in list(state_dict.keys()):
+                newk = k
+                if "static_query" in k:
+                    newk = k.replace("static_query", "query_feat")
+                if newk != k:
+                    state_dict[newk] = state_dict[k]
+                    del state_dict[k]
+                    scratch = False
+            if not scratch:
+                logger.warning(
+                    f"Weight format of {self.__class__.__name__} have changed! "
+                    "Please upgrade your models. Applying automatic conversion now ..."
+                )
+    @configurable
+    def __init__(
+        self,
+        in_channels,
+        mask_classification=True,
+        *,
+        num_classes: int,
+        hidden_dim: int,
+        num_queries: int,
+        nheads: int,
+        dim_feedforward: int,
+        dec_layers: int,
+        pre_norm: bool,
+        mask_dim: int,
+        enforce_input_project: bool,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            in_channels: channels of the input features
+            mask_classification: whether to add mask classifier or not
+                (must be truthy; asserted below)
+            num_classes: number of classes
+            hidden_dim: Transformer feature dimension
+            num_queries: number of queries
+            nheads: number of heads
+            dim_feedforward: feature dimension in feedforward network
+            dec_layers: number of Transformer decoder layers
+            pre_norm: whether to use pre-LayerNorm or not
+            mask_dim: mask feature dimension
+            enforce_input_project: add input project 1x1 conv even if input
+                channels and hidden dim is identical
+        """
+        super().__init__()
+        assert mask_classification, "Only support mask classification model"
+        self.mask_classification = mask_classification
+        # positional encoding
+        N_steps = hidden_dim // 2
+        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
+        # define Transformer decoder here
+        self.num_heads = nheads
+        self.num_layers = dec_layers
+        self.transformer_self_attention_layers = nn.ModuleList()
+        self.transformer_cross_attention_layers = nn.ModuleList()
+        self.transformer_ffn_layers = nn.ModuleList()
+        # One (self-attention, cross-attention, FFN) triple per decoder layer;
+        # dropout is fixed to 0.0 for all of them.
+        for _ in range(self.num_layers):
+            self.transformer_self_attention_layers.append(
+                SelfAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm,
+                )
+            )
+            self.transformer_cross_attention_layers.append(
+                CrossAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm,
+                )
+            )
+            self.transformer_ffn_layers.append(
+                FFNLayer(
+                    d_model=hidden_dim,
+                    dim_feedforward=dim_feedforward,
+                    dropout=0.0,
+                    normalize_before=pre_norm,
+                )
+            )
+        # Shared LayerNorm applied in `forward_prediction_heads`.
+        self.decoder_norm = nn.LayerNorm(hidden_dim)
+        self.num_queries = num_queries
+        # learnable query features
+        self.query_feat = nn.Embedding(num_queries, hidden_dim)
+        # learnable query p.e.
+        self.query_embed = nn.Embedding(num_queries, hidden_dim)
+        # level embedding (we always use 3 scales)
+        self.num_feature_levels = 3
+        self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)
+        # One input projection per feature level: a 1x1 conv when the channel
+        # count differs from hidden_dim (or when forced), otherwise an empty
+        # nn.Sequential.
+        self.input_proj = nn.ModuleList()
+        for _ in range(self.num_feature_levels):
+            if in_channels != hidden_dim or enforce_input_project:
+                self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1))
+                weight_init.c2_xavier_fill(self.input_proj[-1])
+            else:
+                self.input_proj.append(nn.Sequential())
+        # output FFNs
+        if self.mask_classification:
+            # One extra logit beyond num_classes.
+            self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
+        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
+    @classmethod
+    def from_config(cls, cfg, in_channels, mask_classification):
+        """Translate `cfg` into the keyword arguments of `__init__`."""
+        ret = {}
+        ret["in_channels"] = in_channels
+        ret["mask_classification"] = mask_classification
+        ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
+        ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
+        ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
+        # Transformer parameters:
+        ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
+        ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
+        # NOTE: because we add learnable query features which requires supervision,
+        # we add minus 1 to decoder layers to be consistent with our loss
+        # implementation: that is, number of auxiliary losses is always
+        # equal to number of decoder layers. With learnable query features, the number of
+        # auxiliary losses equals number of decoders plus 1.
+        assert cfg.MODEL.MASK_FORMER.DEC_LAYERS >= 1
+        ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS - 1
+        ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
+        ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
+        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
+        return ret
+    def forward(self, x, mask_features, mask = None):
+        """Run the decoder and return the final and auxiliary predictions.
+        Args:
+            x: sequence of `self.num_feature_levels` feature maps.
+            mask_features: features combined with the mask embeddings to
+                produce mask logits.
+            mask: accepted for interface compatibility and discarded.
+        Returns:
+            dict with 'pred_logits' and 'pred_masks' from the last prediction
+            step, and 'aux_outputs' holding the earlier steps.
+        """
+        # x is a list of multi-scale feature
+        assert len(x) == self.num_feature_levels
+        src = []
+        pos = []
+        size_list = []
+        # disable mask, it does not affect performance
+        del mask
+        for i in range(self.num_feature_levels):
+            size_list.append(x[i].shape[-2:])
+            pos.append(self.pe_layer(x[i], None).flatten(2))
+            # Projected features plus the per-level embedding, broadcast over
+            # batch and flattened spatial positions.
+            src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None])
+            # flatten NxCxHxW to HWxNxC
+            pos[-1] = pos[-1].permute(2, 0, 1)
+            src[-1] = src[-1].permute(2, 0, 1)
+        _, bs, _ = src[0].shape
+        # QxNxC
+        query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
+        output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
+        predictions_class = []
+        predictions_mask = []
+        # prediction heads on learnable query features
+        outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0])
+        predictions_class.append(outputs_class)
+        predictions_mask.append(outputs_mask)
+        for i in range(self.num_layers):
+            # Feature levels are visited round-robin.
+            level_index = i % self.num_feature_levels
+            # A row that masks every position is cleared entirely, so that
+            # query can attend everywhere instead of nowhere.
+            attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
+            # attention: cross-attention first
+            output = self.transformer_cross_attention_layers[i](
+                output, src[level_index],
+                memory_mask=attn_mask,
+                memory_key_padding_mask=None,  # here we do not apply masking on padded region
+                pos=pos[level_index], query_pos=query_embed
+            )
+            output = self.transformer_self_attention_layers[i](
+                output, tgt_mask=None,
+                tgt_key_padding_mask=None,
+                query_pos=query_embed
+            )
+            # FFN
+            output = self.transformer_ffn_layers[i](
+                output
+            )
+            # The new attention mask is sized for the level used by the next layer.
+            outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels])
+            predictions_class.append(outputs_class)
+            predictions_mask.append(outputs_mask)
+        # One prediction before the first layer plus one per layer.
+        assert len(predictions_class) == self.num_layers + 1
+        out = {
+            'pred_logits': predictions_class[-1],
+            'pred_masks': predictions_mask[-1],
+            'aux_outputs': self._set_aux_loss(
+                predictions_class if self.mask_classification else None, predictions_mask
+            )
+        }
+        return out
+    def forward_prediction_heads(self, output, mask_features, attn_mask_target_size):
+        """Predict class logits, mask logits and the next attention mask.
+        Returns:
+            (outputs_class, outputs_mask, attn_mask), where `attn_mask` is a
+            detached boolean tensor that is True where the resized mask
+            probability is below 0.5.
+        """
+        decoder_output = self.decoder_norm(output)
+        # Swap the first two axes so the einsum below sees "bqc".
+        decoder_output = decoder_output.transpose(0, 1)
+        outputs_class = self.class_embed(decoder_output)
+        mask_embed = self.mask_embed(decoder_output)
+        outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
+        # NOTE: prediction is of higher-resolution
+        # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW]
+        attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False)
+        # must use bool type
+        # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged.
+        attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool()
+        # No gradient flows through the attention mask.
+        attn_mask = attn_mask.detach()
+        return outputs_class, outputs_mask, attn_mask
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_seg_masks):
+        """Pair up all but the last prediction step as a list of dicts."""
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        if self.mask_classification:
+            return [
+                {"pred_logits": a, "pred_masks": b}
+                for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])
+            ]
+        else:
+            return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]

mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py ADDED Viewed

	@@ -0,0 +1,188 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.layers import Conv2d
+from detectron2.utils.registry import Registry
+from .position_encoding import PositionEmbeddingSine
+from .transformer import Transformer
+# Registry of transformer decoder classes. Classes add themselves with the
+# `@TRANSFORMER_DECODER_REGISTRY.register()` decorator and are looked up by
+# name in `build_transformer_decoder`.
+TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE")
+TRANSFORMER_DECODER_REGISTRY.__doc__ = """
+Registry for transformer module in MaskFormer.
+"""
+def build_transformer_decoder(cfg, in_channels, mask_classification=True):
+    """
+    Build the transformer decoder registered under
+    `cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME`.
+    The registered entry is called with `(cfg, in_channels,
+    mask_classification)` and its result is returned.
+    """
+    decoder_factory = TRANSFORMER_DECODER_REGISTRY.get(
+        cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME
+    )
+    return decoder_factory(cfg, in_channels, mask_classification)
+@TRANSFORMER_DECODER_REGISTRY.register()
+class StandardTransformerDecoder(nn.Module):
+    """Decoder head built around a single `Transformer` module.
+    The transformer output is turned into class logits (when mask
+    classification is enabled) and into mask logits by combining a per-query
+    mask embedding with `mask_features`.
+    """
+    @configurable
+    def __init__(
+        self,
+        in_channels,
+        mask_classification=True,
+        *,
+        num_classes: int,
+        hidden_dim: int,
+        num_queries: int,
+        nheads: int,
+        dropout: float,
+        dim_feedforward: int,
+        enc_layers: int,
+        dec_layers: int,
+        pre_norm: bool,
+        deep_supervision: bool,
+        mask_dim: int,
+        enforce_input_project: bool,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            in_channels: channels of the input features
+            mask_classification: whether to add mask classifier or not
+            num_classes: number of classes
+            hidden_dim: Transformer feature dimension
+            num_queries: number of queries
+            nheads: number of heads
+            dropout: dropout in Transformer
+            dim_feedforward: feature dimension in feedforward network
+            enc_layers: number of Transformer encoder layers
+            dec_layers: number of Transformer decoder layers
+            pre_norm: whether to use pre-LayerNorm or not
+            deep_supervision: whether to add supervision to every decoder layers
+            mask_dim: mask feature dimension
+            enforce_input_project: add input project 1x1 conv even if input
+                channels and hidden dim is identical
+        """
+        super().__init__()
+        self.mask_classification = mask_classification
+        # positional encoding
+        N_steps = hidden_dim // 2
+        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
+        transformer = Transformer(
+            d_model=hidden_dim,
+            dropout=dropout,
+            nhead=nheads,
+            dim_feedforward=dim_feedforward,
+            num_encoder_layers=enc_layers,
+            num_decoder_layers=dec_layers,
+            normalize_before=pre_norm,
+            return_intermediate_dec=deep_supervision,
+        )
+        self.num_queries = num_queries
+        self.transformer = transformer
+        # From here on the width is read back from the transformer itself.
+        hidden_dim = transformer.d_model
+        self.query_embed = nn.Embedding(num_queries, hidden_dim)
+        # 1x1 conv when the channel count differs from hidden_dim (or when
+        # forced), otherwise an empty nn.Sequential.
+        if in_channels != hidden_dim or enforce_input_project:
+            self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
+            weight_init.c2_xavier_fill(self.input_proj)
+        else:
+            self.input_proj = nn.Sequential()
+        # Controls whether `forward` also returns per-layer auxiliary outputs.
+        self.aux_loss = deep_supervision
+        # output FFNs
+        if self.mask_classification:
+            # One extra logit beyond num_classes.
+            self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
+        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
+    @classmethod
+    def from_config(cls, cfg, in_channels, mask_classification):
+        """Translate `cfg` into the keyword arguments of `__init__`."""
+        ret = {}
+        ret["in_channels"] = in_channels
+        ret["mask_classification"] = mask_classification
+        ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
+        ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
+        ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
+        # Transformer parameters:
+        ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
+        ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
+        ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
+        ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
+        ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
+        ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
+        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
+        ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
+        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
+        return ret
+    def forward(self, x, mask_features, mask=None):
+        """Run the transformer and build the prediction dict.
+        Args:
+            x: input feature map; its last two dimensions give the size the
+                optional `mask` is resized to.
+            mask_features: features combined with the mask embeddings to
+                produce mask logits.
+            mask: optional mask, resized and cast to bool before use.
+        Returns:
+            dict with "pred_masks"; also "pred_logits" when mask
+            classification is enabled and "aux_outputs" when deep supervision
+            is enabled.
+        """
+        if mask is not None:
+            # Resize to the spatial size of x, then convert back to bool.
+            mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
+        pos = self.pe_layer(x, mask)
+        src = x
+        # `memory` is returned by the transformer but not used below.
+        hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos)
+        if self.mask_classification:
+            outputs_class = self.class_embed(hs)
+            out = {"pred_logits": outputs_class[-1]}
+        else:
+            out = {}
+        if self.aux_loss:
+            # [l, bs, queries, embed]
+            mask_embed = self.mask_embed(hs)
+            outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features)
+            out["pred_masks"] = outputs_seg_masks[-1]
+            out["aux_outputs"] = self._set_aux_loss(
+                outputs_class if self.mask_classification else None, outputs_seg_masks
+            )
+        else:
+            # FIXME h_boxes takes the last one computed, keep this in mind
+            # [bs, queries, embed]
+            mask_embed = self.mask_embed(hs[-1])
+            outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
+            out["pred_masks"] = outputs_seg_masks
+        return out
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_seg_masks):
+        """Pair up all but the last prediction step as a list of dicts."""
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        if self.mask_classification:
+            return [
+                {"pred_logits": a, "pred_masks": b}
+                for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])
+            ]
+        else:
+            return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]
+class MLP(nn.Module):
+    """Stack of linear layers with ReLU between them (also called FFN).
+    The stack maps `input_dim` to `output_dim` through `num_layers - 1`
+    hidden layers of width `hidden_dim`; the last layer has no activation.
+    """
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        # Consecutive entries of `widths` are the (in, out) sizes of each layer.
+        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
+        self.layers = nn.ModuleList(
+            nn.Linear(fan_in, fan_out)
+            for fan_in, fan_out in zip(widths[:-1], widths[1:])
+        )
+    def forward(self, x):
+        """Apply every layer in order, with ReLU after all but the last."""
+        last_index = self.num_layers - 1
+        for index, layer in enumerate(self.layers):
+            x = layer(x)
+            if index < last_index:
+                x = F.relu(x)
+        return x

mask2former/modeling/transformer_decoder/opd_transformer_decoder.py ADDED Viewed

	@@ -0,0 +1,520 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
+import logging
+import fvcore.nn.weight_init as weight_init
+from typing import Optional
+import torch
+from torch import nn, Tensor
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.layers import Conv2d
+from .position_encoding import PositionEmbeddingSine
+from .maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY
+from .mask2former_transformer_decoder import (
+    SelfAttentionLayer,
+    CrossAttentionLayer,
+    FFNLayer,
+    MLP,
+)
+from ..criterion import convert_to_filled_tensor
+@TRANSFORMER_DECODER_REGISTRY.register()
+class OPDMultiScaleMaskedTransformerDecoder(nn.Module):
+    _version = 2
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        version = local_metadata.get("version", None)
+        if version is None or version < 2:
+            # Do not warn if train from scratch
+            scratch = True
+            logger = logging.getLogger(__name__)
+            for k in list(state_dict.keys()):
+                newk = k
+                if "static_query" in k:
+                    newk = k.replace("static_query", "query_feat")
+                if newk != k:
+                    state_dict[newk] = state_dict[k]
+                    del state_dict[k]
+                    scratch = False
+            if not scratch:
+                logger.warning(
+                    f"Weight format of {self.__class__.__name__} have changed! "
+                    "Please upgrade your models. Applying automatic conversion now ..."
+                )
+    @configurable
+    def __init__(
+        self,
+        in_channels,
+        mask_classification=True,
+        *,
+        num_classes: int,
+        hidden_dim: int,
+        num_queries: int,
+        nheads: int,
+        dim_feedforward: int,
+        dec_layers: int,
+        pre_norm: bool,
+        mask_dim: int,
+        enforce_input_project: bool,
+        # OPD
+        motionnet_type,
+        obj_method
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            in_channels: channels of the input features
+            mask_classification: whether to add mask classifier or not
+            num_classes: number of classes
+            hidden_dim: Transformer feature dimension
+            num_queries: number of queries
+            nheads: number of heads
+            dim_feedforward: feature dimension in feedforward network
+            enc_layers: number of Transformer encoder layers
+            dec_layers: number of Transformer decoder layers
+            pre_norm: whether to use pre-LayerNorm or not
+            mask_dim: mask feature dimension
+            enforce_input_project: add input project 1x1 conv even if input
+                channels and hidden dim is identical
+        """
+        super().__init__()
+        # OPD
+        self.motionnet_type = motionnet_type
+        self.num_classes = num_classes
+        self.obj_method = obj_method
+        assert mask_classification, "Only support mask classification model"
+        self.mask_classification = mask_classification
+        # positional encoding
+        N_steps = hidden_dim // 2
+        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
+        # define Transformer decoder here
+        self.num_heads = nheads
+        self.num_layers = dec_layers
+        self.transformer_self_attention_layers = nn.ModuleList()
+        self.transformer_cross_attention_layers = nn.ModuleList()
+        self.transformer_ffn_layers = nn.ModuleList()
+        for _ in range(self.num_layers):
+            self.transformer_self_attention_layers.append(
+                SelfAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm,
+                )
+            )
+            self.transformer_cross_attention_layers.append(
+                CrossAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm,
+                )
+            )
+            self.transformer_ffn_layers.append(
+                FFNLayer(
+                    d_model=hidden_dim,
+                    dim_feedforward=dim_feedforward,
+                    dropout=0.0,
+                    normalize_before=pre_norm,
+                )
+            )
+        self.decoder_norm = nn.LayerNorm(hidden_dim)
+        self.num_queries = num_queries
+        # learnable query features
+        self.query_feat = nn.Embedding(num_queries, hidden_dim)
+        # learnable query p.e.
+        self.query_embed = nn.Embedding(num_queries, hidden_dim)
+        # level embedding (we always use 3 scales)
+        self.num_feature_levels = 3
+        self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)
+        self.input_proj = nn.ModuleList()
+        for _ in range(self.num_feature_levels):
+            if in_channels != hidden_dim or enforce_input_project:
+                self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1))
+                weight_init.c2_xavier_fill(self.input_proj[-1])
+            else:
+                self.input_proj.append(nn.Sequential())
+        # output FFNs
+        if self.mask_classification:
+            self.class_embed = nn.Sequential(
+                nn.Linear(hidden_dim, 32),
+                nn.ReLU(inplace=True),
+                nn.Linear(32, num_classes + 1),
+            )
+            # OPD Changes
+            self.mtype_embed = nn.Sequential(
+                nn.Linear(hidden_dim, 32),
+                nn.ReLU(inplace=True),
+                nn.Linear(32, 2),
+            )
+            self.morigin_embed = nn.Sequential(
+                nn.Linear(hidden_dim, 32),
+                nn.ReLU(inplace=True),
+                nn.Linear(32, 3),
+            )
+            self.maxis_embed = nn.Sequential(
+                nn.Linear(hidden_dim, 32),
+                nn.ReLU(inplace=True),
+                nn.Linear(32, 3),
+            )
+            self.mstate_embed = nn.Sequential(
+                nn.Linear(hidden_dim, 32),
+                nn.ReLU(inplace=True),
+                nn.Linear(32, 1),
+            )
+            self.mstatemax_embed = nn.Sequential(
+                nn.Linear(hidden_dim, 32),
+                nn.ReLU(inplace=True),
+                nn.Linear(32, 1),
+            )
+            if self.motionnet_type == "BMOC_V0":
+                # Define the layers for the extrinsic prediction
+                self.extrinsic_feature_layer = nn.Sequential(
+                    # 16 * 256 * 64 * 64
+                    nn.Conv2d(256, 256, 3, 2, 1), # 16 * 256 * 32 * 32
+                    nn.BatchNorm2d(256),
+                    nn.ReLU(inplace=True),
+                    nn.MaxPool2d(2, 2), # 16 * 256 * 16 * 16
+                    nn.Conv2d(256, 256, 3, 2, 1), # 16 * 256 * 8 * 8
+                    nn.BatchNorm2d(256),
+                    nn.ReLU(inplace=True),
+                    nn.MaxPool2d(2, 2), # 16 * 256 * 4 * 4
+                    nn.Conv2d(256, 64, 1), # 16 * 64 * 4 * 4
+                    nn.BatchNorm2d(64),
+                    nn.ReLU(inplace=True),
+                    nn.Flatten() # 16 * 1024
+                )
+                for layer in self.extrinsic_feature_layer:
+                    if isinstance(layer, nn.Conv2d):
+                        nn.init.kaiming_normal_(
+                            layer.weight, mode="fan_out", nonlinearity="relu"
+                        )
+                self.extrinsic_pred_layer = nn.Sequential(
+                    nn.Linear(768, 512),
+                    # nn.Linear(768, 512),
+                    nn.ReLU(inplace=True),
+                    nn.Linear(512, 128),
+                    nn.ReLU(inplace=True),
+                    nn.Linear(128, 32),
+                    nn.ReLU(inplace=True),
+                    nn.Linear(32, 12), # 16 * 12
+                )
+            elif self.motionnet_type == "BMOC_V1":
+                self.extrinsic_embed = nn.Sequential(
+                    nn.Linear(hidden_dim, 32),
+                    nn.ReLU(inplace=True),
+                    nn.Linear(32, 12),
+                )
+            elif self.motionnet_type == "BMOC_V2":
+                self.extrinsic_embed = nn.Sequential(
+                    nn.Linear(hidden_dim, 32),
+                    nn.ReLU(inplace=True),
+                    nn.Linear(32, 7),
+                )
+            elif self.motionnet_type == "BMOC_V3":
+                self.extrinsic_embed = nn.Sequential(
+                    nn.Linear(hidden_dim, 32),
+                    nn.ReLU(inplace=True),
+                    nn.Linear(32, 9),
+                )
+            elif self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5" or self.motionnet_type == "BMOC_V6":
+                if self.motionnet_type == "BMOC_V5":
+                    self.mask_weight_layer = SelfAttentionLayer(
+                        d_model=hidden_dim,
+                        nhead=nheads,
+                        dropout=0.0,
+                        normalize_before=pre_norm,
+                    )
+                # Define the layers for the extrinsic prediction
+                self.extrinsic_feature_layer = nn.Sequential(
+                    nn.BatchNorm2d(256),
+                    # 16 * 256 * 64 * 64
+                    nn.Conv2d(256, 256, 3, 2, 1), # 16 * 256 * 32 * 32
+                    nn.BatchNorm2d(256),
+                    nn.ReLU(inplace=True),
+                    nn.MaxPool2d(2, 2), # 16 * 256 * 16 * 16
+                    nn.Conv2d(256, 256, 3, 2, 1), # 16 * 256 * 8 * 8
+                    nn.BatchNorm2d(256),
+                    nn.ReLU(inplace=True),
+                    nn.MaxPool2d(2, 2), # 16 * 256 * 4 * 4
+                    nn.Conv2d(256, 64, 1), # 16 * 64 * 4 * 4
+                    nn.BatchNorm2d(64),
+                    nn.ReLU(inplace=True),
+                    nn.Flatten() # 16 * 1024
+                )
+                for layer in self.extrinsic_feature_layer:
+                    if isinstance(layer, nn.Conv2d):
+                        nn.init.kaiming_normal_(
+                            layer.weight, mode="fan_out", nonlinearity="relu"
+                        )
+                if self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5":
+                    self.extrinsic_pred_layer = nn.Sequential(
+                        nn.Linear(1024, 512),
+                        nn.ReLU(inplace=True),
+                        nn.Linear(512, 128),
+                        nn.ReLU(inplace=True),
+                        nn.Linear(128, 32),
+                        nn.ReLU(inplace=True),
+                        nn.Linear(32, 7), # 16 * 7
+                    )
+                elif self.motionnet_type == "BMOC_V6":
+                    self.extrinsic_pred_layer = nn.Sequential(
+                        # nn.Linear(1024, 512),
+                        nn.Linear(768, 512),
+                        nn.ReLU(inplace=True),
+                        nn.Linear(512, 128),
+                        nn.ReLU(inplace=True),
+                        nn.Linear(128, 32),
+                        nn.ReLU(inplace=True),
+                        nn.Linear(32, 12), # 16 * 12
+                    )
+        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
+    @classmethod
+    def from_config(cls, cfg, in_channels, mask_classification):
+        ret = {}
+        ret["in_channels"] = in_channels
+        ret["mask_classification"] = mask_classification
+        ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
+        ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
+        ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
+        # Transformer parameters:
+        ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
+        ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
+        # NOTE: because we add learnable query features which requires supervision,
+        # we add minus 1 to decoder layers to be consistent with our loss
+        # implementation: that is, number of auxiliary losses is always
+        # equal to number of decoder layers. With learnable query features, the number of
+        # auxiliary losses equals number of decoders plus 1.
+        assert cfg.MODEL.MASK_FORMER.DEC_LAYERS >= 1
+        ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS - 1
+        ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
+        ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ
+        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
+        # OPD
+        ret["motionnet_type"] = cfg.MODEL.MOTIONNET.TYPE
+        ret['obj_method'] = cfg.OBJ_DETECT
+        return ret
+    def forward(self, x, mask_features, mask = None):
+        # x is a list of multi-scale feature
+        assert len(x) == self.num_feature_levels
+        src = []
+        pos = []
+        size_list = []
+        # disable mask, it does not affect performance
+        # if not self.obj_method:
+        #     del mask
+        # import pdb
+        # pdb.set_trace()
+        for i in range(self.num_feature_levels):
+            size_list.append(x[i].shape[-2:])
+            pos.append(self.pe_layer(x[i], None).flatten(2))
+            src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None])
+            # flatten NxCxHxW to HWxNxC
+            pos[-1] = pos[-1].permute(2, 0, 1)
+            src[-1] = src[-1].permute(2, 0, 1)
+        _, bs, _ = src[0].shape
+        # QxNxC
+        query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
+        output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
+        predictions_class = []
+        predictions_mask = []
+        # OPD
+        predictions_mtype = []
+        predictions_morigin = []
+        predictions_maxis = []
+        predictions_mstate = []
+        predictions_mstatemax = []
+        if self.motionnet_type == "BMOC_V1" or self.motionnet_type == "BMOC_V2" or self.motionnet_type == "BMOC_V3" or self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5" or self.motionnet_type == "BMOC_V6":
+            predictions_extrinsic = []
+        # prediction heads on learnable query features
+        outputs_class, outputs_mask, attn_mask, outputs_mtype, outputs_morigin, outputs_maxis, outputs_extrinsic, outputs_mstate, outputs_mstatemax = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0], query_embed=query_embed, mask=mask)
+        predictions_class.append(outputs_class)
+        predictions_mask.append(outputs_mask)
+        # OPD
+        predictions_mtype.append(outputs_mtype)
+        predictions_morigin.append(outputs_morigin)
+        predictions_maxis.append(outputs_maxis)
+        predictions_mstate.append(outputs_mstate)
+        predictions_mstatemax.append(outputs_mstatemax)
+        if self.motionnet_type == "BMOC_V1" or self.motionnet_type == "BMOC_V2" or self.motionnet_type == "BMOC_V3" or self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5" or self.motionnet_type == "BMOC_V6":
+            predictions_extrinsic.append(outputs_extrinsic)
+        for i in range(self.num_layers):
+            level_index = i % self.num_feature_levels
+            attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False
+            # attention: cross-attention first
+            output = self.transformer_cross_attention_layers[i](
+                output, src[level_index],
+                memory_mask=attn_mask,
+                memory_key_padding_mask=None,  # here we do not apply masking on padded region
+                pos=pos[level_index], query_pos=query_embed
+            )
+            output = self.transformer_self_attention_layers[i](
+                output, tgt_mask=None,
+                tgt_key_padding_mask=None,
+                query_pos=query_embed
+            )
+            # FFN
+            output = self.transformer_ffn_layers[i](
+                output
+            )
+            outputs_class, outputs_mask, attn_mask, outputs_mtype, outputs_morigin, outputs_maxis, outputs_extrinsic, outputs_mstate, outputs_mstatemax = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels], query_embed=query_embed)
+            predictions_class.append(outputs_class)
+            predictions_mask.append(outputs_mask)
+            # OPD
+            predictions_mtype.append(outputs_mtype)
+            predictions_morigin.append(outputs_morigin)
+            predictions_maxis.append(outputs_maxis)
+            predictions_mstate.append(outputs_mstate)
+            predictions_mstatemax.append(outputs_mstatemax)
+            if self.motionnet_type == "BMOC_V1" or self.motionnet_type == "BMOC_V2" or self.motionnet_type == "BMOC_V3" or self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5" or self.motionnet_type == "BMOC_V6":
+                predictions_extrinsic.append(outputs_extrinsic)
+        assert len(predictions_class) == self.num_layers + 1
+        if self.mask_classification:
+            if self.motionnet_type == "BMOC_V0" or self.motionnet_type == "BMCC":
+                aux_outputs = self._set_aux_loss(
+                        predictions_class, predictions_mask, predictions_mtype, predictions_morigin, predictions_maxis, None, predictions_mstate, predictions_mstatemax
+                    )
+            elif self.motionnet_type == "BMOC_V1" or self.motionnet_type == "BMOC_V2"  or self.motionnet_type == "BMOC_V3" or self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5" or self.motionnet_type == "BMOC_V6":
+                aux_outputs = self._set_aux_loss(
+                        predictions_class, predictions_mask, predictions_mtype, predictions_morigin, predictions_maxis, predictions_extrinsic, predictions_mstate, predictions_mstatemax
+                    )
+        else:
+            aux_outputs = self._set_aux_loss(
+                    None, predictions_mask, None, None, None, None, None
+                )
+        # OPD
+        if self.motionnet_type == "BMOC_V0":
+            extrinsic_feature = self.extrinsic_feature_layer(mask_features)
+            predictions_extrinsic = self.extrinsic_pred_layer(extrinsic_feature)
+        out = {
+            'pred_logits': predictions_class[-1],
+            'pred_masks': predictions_mask[-1],
+            # OPD
+            'pred_mtypes': predictions_mtype[-1],
+            'pred_morigins': predictions_morigin[-1],
+            'pred_maxises': predictions_maxis[-1],
+            'aux_outputs': aux_outputs,
+            'pred_mstates': predictions_mstate[-1],
+            'pred_mstatemaxs': predictions_mstatemax[-1],
+        }
+        if self.motionnet_type == "BMOC_V0":
+            out['pred_extrinsics'] = predictions_extrinsic
+        elif self.motionnet_type == "BMOC_V1" or self.motionnet_type == "BMOC_V2"  or self.motionnet_type == "BMOC_V3" or self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5" or self.motionnet_type == "BMOC_V6":
+            out['pred_extrinsics'] = predictions_extrinsic[-1]
+        return out
+    def forward_prediction_heads(self, output, mask_features, attn_mask_target_size, query_embed, mask = None):
+        decoder_output = self.decoder_norm(output)
+        decoder_output = decoder_output.transpose(0, 1)
+        outputs_class = self.class_embed(decoder_output)
+        # OPD Changes
+        outputs_mtype = self.mtype_embed(decoder_output)
+        outputs_morigin = self.morigin_embed(decoder_output)
+        outputs_maxis = self.maxis_embed(decoder_output)
+        outputs_mstate = self.mstate_embed(decoder_output)
+        outputs_mstatemax = self.mstatemax_embed(decoder_output)
+        if self.motionnet_type == "BMOC_V1" or self.motionnet_type == "BMOC_V2"  or self.motionnet_type == "BMOC_V3":
+            outputs_extrinsic = self.extrinsic_embed(decoder_output)
+        elif self.motionnet_type == "BMOC_V0" or self.motionnet_type == "BMCC":
+            outputs_extrinsic = None
+        mask_embed = self.mask_embed(decoder_output)
+        outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
+        # import pdb
+        # pdb.set_trace()
+        # TODO: Add different variants of using object mask to get the extrinsic
+        if self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V6":
+            binary_mask = (outputs_mask > 0).float()
+            weighted_masked_feature = mask_features + torch.einsum("bqhw,bchw->bchw", binary_mask, mask_features)
+            extrinsic_feature = self.extrinsic_feature_layer(weighted_masked_feature)
+            outputs_extrinsic = self.extrinsic_pred_layer(extrinsic_feature)
+        elif  self.motionnet_type == "BMOC_V5":
+            # Get one weight for each query
+            mask_weights = torch.transpose(self.mask_weight_layer(
+                torch.transpose(mask_embed, 0, 1), tgt_mask=None,
+                tgt_key_padding_mask=None,
+                query_pos=query_embed
+            ), 0, 1).mean(2)
+            binary_mask = (outputs_mask > 0).float()
+            weighted_mask = torch.einsum("bq,bqhw->bqhw", mask_weights, binary_mask)
+            weighted_masked_feature = mask_features + torch.einsum("bqhw,bchw->bchw", weighted_mask, mask_features)
+            extrinsic_feature = self.extrinsic_feature_layer(weighted_masked_feature)
+            outputs_extrinsic = self.extrinsic_pred_layer(extrinsic_feature)
+        # NOTE: prediction is of higher-resolution
+        # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW]
+        attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False)
+        # must use bool type
+        # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged.
+        attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool()
+        attn_mask = attn_mask.detach()
+        return outputs_class, outputs_mask, attn_mask, outputs_mtype, outputs_morigin, outputs_maxis, outputs_extrinsic, outputs_mstate, outputs_mstatemax
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_seg_masks, predictions_mtype, predictions_morigin, predictions_maxis, predictions_extrinsic, predictions_mstate, predictions_mstatemax):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        if self.mask_classification:
+            if self.motionnet_type == "BMOC_V0" or self.motionnet_type == "BMCC":
+                return [
+                    {"pred_logits": a, "pred_masks": b, "pred_mtypes": c, "pred_morigins": d, "pred_maxises": e, "pred_mstates": f, "pred_mstatemaxs": g}
+                    for a, b, c, d, e, f, g in zip(outputs_class[:-1], outputs_seg_masks[:-1], predictions_mtype[:-1], predictions_morigin[:-1], predictions_maxis[:-1], predictions_mstate[:-1], predictions_mstatemax[:-1])
+                ]
+            elif self.motionnet_type == "BMOC_V1" or self.motionnet_type == "BMOC_V2"  or self.motionnet_type == "BMOC_V3" or self.motionnet_type == "BMOC_V4" or self.motionnet_type == "BMOC_V5" or self.motionnet_type == "BMOC_V6":
+                return [
+                    {"pred_logits": a, "pred_masks": b, "pred_mtypes": c, "pred_morigins": d, "pred_maxises": e, "pred_extrinsics": f, "pred_mstates": g, "pred_mstatemaxs": h}
+                    for a, b, c, d, e, f, g, h in zip(outputs_class[:-1], outputs_seg_masks[:-1], predictions_mtype[:-1], predictions_morigin[:-1], predictions_maxis[:-1], predictions_extrinsic[:-1], predictions_mstate[:-1], predictions_mstatemax[:-1])
+                ]
+        else:
+            return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]

mask2former/modeling/transformer_decoder/position_encoding.py ADDED Viewed

	@@ -0,0 +1,64 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
+"""
+Various positional encodings for the transformer.
+"""
+import math
+import torch
+from torch import nn
+class PositionEmbeddingSine(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention is all you need paper, generalized to work on images.
+    """
+    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+    def forward(self, x, mask=None):
+        if mask is None:
+            mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
+        not_mask = ~mask
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack(
+            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
+        ).flatten(3)
+        pos_y = torch.stack(
+            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
+        ).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+    def __repr__(self, _repr_indent=4):
+        head = "Positional encoding " + self.__class__.__name__
+        body = [
+            "num_pos_feats: {}".format(self.num_pos_feats),
+            "temperature: {}".format(self.temperature),
+            "normalize: {}".format(self.normalize),
+            "scale: {}".format(self.scale),
+        ]
+        # _repr_indent = 4
+        lines = [head] + [" " * _repr_indent + line for line in body]
+        return "\n".join(lines)

mask2former/modeling/transformer_decoder/transformer.py ADDED Viewed

	@@ -0,0 +1,369 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py
+"""
+Transformer class.
+Copy-paste from torch.nn.Transformer with modifications:
+    * positional encodings are passed in MHattention
+    * extra LN at the end of encoder is removed
+    * decoder returns a stack of activations from all decoding layers
+"""
+import copy
+from typing import List, Optional
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        d_model=512,
+        nhead=8,
+        num_encoder_layers=6,
+        num_decoder_layers=6,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+        return_intermediate_dec=False,
+    ):
+        super().__init__()
+        encoder_layer = TransformerEncoderLayer(
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+        )
+        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+        decoder_layer = TransformerDecoderLayer(
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+        )
+        decoder_norm = nn.LayerNorm(d_model)
+        self.decoder = TransformerDecoder(
+            decoder_layer,
+            num_decoder_layers,
+            decoder_norm,
+            return_intermediate=return_intermediate_dec,
+        )
+        self._reset_parameters()
+        self.d_model = d_model
+        self.nhead = nhead
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+    def forward(self, src, mask, query_embed, pos_embed):
+        # flatten NxCxHxW to HWxNxC
+        bs, c, h, w = src.shape
+        src = src.flatten(2).permute(2, 0, 1)
+        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
+        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
+        if mask is not None:
+            mask = mask.flatten(1)
+        tgt = torch.zeros_like(query_embed)
+        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
+        hs = self.decoder(
+            tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed
+        )
+        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
+class TransformerEncoder(nn.Module):
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super().__init__()
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+    def forward(
+        self,
+        src,
+        mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        output = src
+        for layer in self.layers:
+            output = layer(
+                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos
+            )
+        if self.norm is not None:
+            output = self.norm(output)
+        return output
+class TransformerDecoder(nn.Module):
+    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
+        super().__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+        self.return_intermediate = return_intermediate
+    def forward(
+        self,
+        tgt,
+        memory,
+        tgt_mask: Optional[Tensor] = None,
+        memory_mask: Optional[Tensor] = None,
+        tgt_key_padding_mask: Optional[Tensor] = None,
+        memory_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+        query_pos: Optional[Tensor] = None,
+    ):
+        output = tgt
+        intermediate = []
+        for layer in self.layers:
+            output = layer(
+                output,
+                memory,
+                tgt_mask=tgt_mask,
+                memory_mask=memory_mask,
+                tgt_key_padding_mask=tgt_key_padding_mask,
+                memory_key_padding_mask=memory_key_padding_mask,
+                pos=pos,
+                query_pos=query_pos,
+            )
+            if self.return_intermediate:
+                intermediate.append(self.norm(output))
+        if self.norm is not None:
+            output = self.norm(output)
+            if self.return_intermediate:
+                intermediate.pop()
+                intermediate.append(output)
+        if self.return_intermediate:
+            return torch.stack(intermediate)
+        return output.unsqueeze(0)
+class TransformerEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        d_model,
+        nhead,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+    ):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        return tensor if pos is None else tensor + pos
+    def forward_post(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        q = k = self.with_pos_embed(src, pos)
+        src2 = self.self_attn(
+            q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
+        )[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src
+    def forward_pre(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        src2 = self.norm1(src)
+        q = k = self.with_pos_embed(src2, pos)
+        src2 = self.self_attn(
+            q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
+        )[0]
+        src = src + self.dropout1(src2)
+        src2 = self.norm2(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
+        src = src + self.dropout2(src2)
+        return src
+    def forward(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        if self.normalize_before:
+            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
+        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
+class TransformerDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        d_model,
+        nhead,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+    ):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.dropout3 = nn.Dropout(dropout)
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        return tensor if pos is None else tensor + pos
+    def forward_post(
+        self,
+        tgt,
+        memory,
+        tgt_mask: Optional[Tensor] = None,
+        memory_mask: Optional[Tensor] = None,
+        tgt_key_padding_mask: Optional[Tensor] = None,
+        memory_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+        query_pos: Optional[Tensor] = None,
+    ):
+        q = k = self.with_pos_embed(tgt, query_pos)
+        tgt2 = self.self_attn(
+            q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
+        )[0]
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+        tgt2 = self.multihead_attn(
+            query=self.with_pos_embed(tgt, query_pos),
+            key=self.with_pos_embed(memory, pos),
+            value=memory,
+            attn_mask=memory_mask,
+            key_padding_mask=memory_key_padding_mask,
+        )[0]
+        tgt = tgt + self.dropout2(tgt2)
+        tgt = self.norm2(tgt)
+        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+        tgt = tgt + self.dropout3(tgt2)
+        tgt = self.norm3(tgt)
+        return tgt
+    def forward_pre(
+        self,
+        tgt,
+        memory,
+        tgt_mask: Optional[Tensor] = None,
+        memory_mask: Optional[Tensor] = None,
+        tgt_key_padding_mask: Optional[Tensor] = None,
+        memory_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+        query_pos: Optional[Tensor] = None,
+    ):
+        tgt2 = self.norm1(tgt)
+        q = k = self.with_pos_embed(tgt2, query_pos)
+        tgt2 = self.self_attn(
+            q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
+        )[0]
+        tgt = tgt + self.dropout1(tgt2)
+        tgt2 = self.norm2(tgt)
+        tgt2 = self.multihead_attn(
+            query=self.with_pos_embed(tgt2, query_pos),
+            key=self.with_pos_embed(memory, pos),
+            value=memory,
+            attn_mask=memory_mask,
+            key_padding_mask=memory_key_padding_mask,
+        )[0]
+        tgt = tgt + self.dropout2(tgt2)
+        tgt2 = self.norm3(tgt)
+        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
+        tgt = tgt + self.dropout3(tgt2)
+        return tgt
+    def forward(
+        self,
+        tgt,
+        memory,
+        tgt_mask: Optional[Tensor] = None,
+        memory_mask: Optional[Tensor] = None,
+        tgt_key_padding_mask: Optional[Tensor] = None,
+        memory_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+        query_pos: Optional[Tensor] = None,
+    ):
+        if self.normalize_before:
+            return self.forward_pre(
+                tgt,
+                memory,
+                tgt_mask,
+                memory_mask,
+                tgt_key_padding_mask,
+                memory_key_padding_mask,
+                pos,
+                query_pos,
+            )
+        return self.forward_post(
+            tgt,
+            memory,
+            tgt_mask,
+            memory_mask,
+            tgt_key_padding_mask,
+            memory_key_padding_mask,
+            pos,
+            query_pos,
+        )
+def _get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+def _get_activation_fn(activation):
+    """Return an activation function given a string"""
+    if activation == "relu":
+        return F.relu
+    if activation == "gelu":
+        return F.gelu
+    if activation == "glu":
+        return F.glu
+    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")

mask2former/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copyright (c) Facebook, Inc. and its affiliates.
2	+ from .motion_visualizer import MotionVisualizer

mask2former/utils/misc.py ADDED Viewed

	@@ -0,0 +1,111 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
+"""
+Misc functions, including distributed helpers.
+Mostly copy-paste from torchvision references.
+"""
+from typing import List, Optional
+import torch
+import torch.distributed as dist
+import torchvision
+from torch import Tensor
+def _max_by_axis(the_list):
+    # type: (List[List[int]]) -> List[int]
+    maxes = the_list[0]
+    for sublist in the_list[1:]:
+        for index, item in enumerate(sublist):
+            maxes[index] = max(maxes[index], item)
+    return maxes
+class NestedTensor(object):
+    def __init__(self, tensors, mask: Optional[Tensor]):
+        self.tensors = tensors
+        self.mask = mask
+    def to(self, device):
+        # type: (Device) -> NestedTensor # noqa
+        cast_tensor = self.tensors.to(device)
+        mask = self.mask
+        if mask is not None:
+            assert mask is not None
+            cast_mask = mask.to(device)
+        else:
+            cast_mask = None
+        return NestedTensor(cast_tensor, cast_mask)
+    def decompose(self):
+        return self.tensors, self.mask
+    def __repr__(self):
+        return str(self.tensors)
+def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
+    # TODO make this more general
+    if tensor_list[0].ndim == 3:
+        if torchvision._is_tracing():
+            # nested_tensor_from_tensor_list() does not export well to ONNX
+            # call _onnx_nested_tensor_from_tensor_list() instead
+            return _onnx_nested_tensor_from_tensor_list(tensor_list)
+        # TODO make it support different-sized images
+        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
+        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
+        batch_shape = [len(tensor_list)] + max_size
+        b, c, h, w = batch_shape
+        dtype = tensor_list[0].dtype
+        device = tensor_list[0].device
+        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
+        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
+        for img, pad_img, m in zip(tensor_list, tensor, mask):
+            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+            m[: img.shape[1], : img.shape[2]] = False
+    else:
+        raise ValueError("not supported")
+    return NestedTensor(tensor, mask)
+# _onnx_nested_tensor_from_tensor_list() is an implementation of
+# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
+@torch.jit.unused
+def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
+    """Variant of ``nested_tensor_from_tensor_list`` written with ``pad``/``stack`` only.
+    Args:
+        tensor_list: tensors to batch; ``padding[0..2]`` is used below, so they
+            are treated as 3-D.
+    Returns:
+        NestedTensor with the padded, stacked tensors and a boolean mask that
+        is True in the padded region.
+    """
+    # Per-dimension maximum size over all inputs.
+    # NOTE(review): torch.stack needs tensor elements, so img.shape[i] is presumably
+    # a tensor while tracing -- confirm this is never called in plain eager mode.
+    max_size = []
+    for i in range(tensor_list[0].dim()):
+        max_size_i = torch.max(
+            torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
+        ).to(torch.int64)
+        max_size.append(max_size_i)
+    max_size = tuple(max_size)
+    # work around for
+    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+    # m[: img.shape[1], :img.shape[2]] = False
+    # which is not yet supported in onnx
+    padded_imgs = []
+    padded_masks = []
+    for img in tensor_list:
+        # Amount of trailing padding needed on each dimension.
+        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
+        # F.pad takes (left, right) pairs starting from the last dimension.
+        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
+        padded_imgs.append(padded_img)
+        # Integer zeros over the image area, padded with ones, then cast to bool.
+        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
+        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
+        padded_masks.append(padded_mask.to(torch.bool))
+    tensor = torch.stack(padded_imgs)
+    mask = torch.stack(padded_masks)
+    return NestedTensor(tensor, mask=mask)
+def is_dist_avail_and_initialized():
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True

mask2former/utils/motion_visualizer.py ADDED Viewed

	@@ -0,0 +1,676 @@

+from fvcore.common.file_io import PathManager
+from detectron2.utils.visualizer import (
+    Visualizer,
+    ColorMode,
+    _create_text_labels,
+    GenericMask,
+)
+from detectron2.structures import (
+    BitMasks,
+    Boxes,
+    BoxMode,
+    Keypoints,
+    PolygonMasks,
+    RotatedBoxes,
+)
+from detectron2.utils.colormap import random_color
+from PIL import Image
+import numpy as np
+from numpy.linalg import norm
+import math
+# Motion-type id -> human-readable name.
+MOTION_TYPE = {0: "rotation", 1: "translation"}
+# RGB colors scaled to [0, 1]; looked up in this file with the key
+# ``category_id * 2 + motion_type``.
+_COLORS_CAT = {
+    0: np.array([166, 206, 227]) / 255,
+    1: np.array([31, 120, 180]) / 255,
+    2: np.array([202, 178, 214]) / 255,
+    3: np.array([106, 61, 154]) / 255,
+    4: np.array([178, 223, 138]) / 255,
+    5: np.array([51, 160, 44]) / 255,
+}
+# RGB colors scaled to [0, 1] for three levels (0 green, 1 orange, 2 red);
+# the draw methods choose a level by thresholding an error value.
+_COLORS_LEVEL = {
+    0: np.array([0, 255, 0]) / 255,
+    1: np.array([255, 128, 0]) / 255,
+    2: np.array([255, 0, 0]) / 255,
+}
+def getFocalLength(FOV, height, width=None):
+    # FOV is in radius, should be vertical angle
+    if width == None:
+        f = height / (2 * math.tan(FOV / 2))
+        return f
+    else:
+        fx = height / (2 * math.tan(FOV / 2))
+        fy = fx / height * width
+        return (fx, fy)
+def camera_to_image(point, is_real=False, intrinsic_matrix=None):
+    point_camera = np.array(point)
+    # Calculate the camera intrinsic parameters (they are fixed in this project)
+    if not is_real:
+        # Below is for the MoionNet synthetic dataset intrinsic
+        FOV = 50
+        img_width = img_height = 256
+        fx, fy = getFocalLength(FOV / 180 * math.pi, img_height, img_width)
+        cy = img_height / 2
+        cx = img_width / 2
+        x = point_camera[0] * fx / (-point_camera[2]) + cx
+        y = -(point_camera[1] * fy / (-point_camera[2])) + cy
+    else:
+        # Below is the for MotionREAL dataset
+        point_2d = np.dot(intrinsic_matrix, point_camera[:3])
+        x = point_2d[0] / point_2d[2]
+        y = point_2d[1] / point_2d[2]
+    return (x, y)
+def rotation_from_vectors(source, dest):
+    a, b = (source / np.linalg.norm(source)).reshape(3), (
+        dest / np.linalg.norm(dest)
+    ).reshape(3)
+    v = np.cross(a, b)
+    c = np.dot(a, b)
+    s = np.linalg.norm(v)
+    kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
+    rmat = np.eye(3) + kmat + np.matmul(kmat, kmat) * ((1 - c) / (s ** 2))
+    return rmat
+def rotatePoint(x, y, angle, scale):
+    rad = np.pi * angle / 180
+    x2 = np.cos(rad) * x - np.sin(rad) * y
+    y2 = np.sin(rad) * x + np.cos(rad) * y
+    return [x2 * scale, y2 * scale]
+def circlePoints(axis, radius=0.5, num=50):
+    angles = np.linspace(0, 2 * np.pi, num, endpoint=False)
+    x_vec = np.cos(angles) * radius
+    y_vec = np.sin(angles) * radius
+    z_vec = np.zeros_like(x_vec) + 0.5
+    points = np.stack((x_vec, y_vec, z_vec), axis=0)
+    rot = rotation_from_vectors(np.array([0, 0, 1]), np.asarray(axis))
+    points = np.matmul(rot, points)
+    return points
+def get_iou(bb1, bb2):
+    x_left = max(bb1[0], bb2[0])
+    y_top = max(bb1[1], bb2[1])
+    x_right = min(bb1[0] + bb1[2], bb2[0] + bb2[2])
+    y_bottom = min(bb1[1] + bb1[3], bb2[1] + bb2[3])
+    if x_right < x_left or y_bottom < y_top:
+        return 0.0
+    area = (x_right - x_left) * (y_bottom - y_top)
+    bb1_area = bb1[2] * bb1[3]
+    bb2_area = bb2[2] * bb2[3]
+    iou = area / float(bb1_area + bb2_area - area)
+    return iou
+class MotionVisualizer(Visualizer):
+    def draw_gt_instance(self, anno, part_id_json, is_real=False, intrinsic_matrix=None, line_length=1):
+        """Draw one ground-truth annotation: motion axis arrow, circle, mask and box.
+        Args:
+            anno: annotation dict; read for "segmentation", "bbox", "bbox_mode",
+                "category_id", optional "iscrowd", and "motion" with
+                "current_origin", "current_axis", "type" and "partId".
+            part_id_json: dict that is filled in place with "partId", "type"
+                and "category_id" of the annotation.
+            is_real: forwarded to ``camera_to_image``.
+            intrinsic_matrix: forwarded to ``camera_to_image``.
+            line_length: scale applied to the axis arrow and to the circle.
+        Returns:
+            ``self.output``.
+        """
+        # All annotations have been in the camera coordinate
+        masks = [anno["segmentation"]]
+        boxes = [BoxMode.convert(anno["bbox"], anno["bbox_mode"], BoxMode.XYXY_ABS)]
+        labels = [anno["category_id"]]
+        # NOTE(review): ``colors`` is computed here but never passed on; the overlay
+        # below uses _COLORS_CAT instead.
+        colors = None
+        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get(
+            "thing_colors"
+        ):
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
+                for c in labels
+            ]
+        origins = [anno["motion"]["current_origin"]]
+        # Calculate the 2d origin (Only consider draw only one origin)
+        origins_4d = [origin[:] + [1] for origin in origins]
+        origin_2d = [camera_to_image(origin, is_real, intrinsic_matrix) for origin in origins_4d]
+        axises = [anno["motion"]["current_axis"]]
+        # Arrow tip: origin moved by line_length along the axis, projected to 2-D.
+        new_point = list(np.array(origins[0]) + line_length * np.array(axises[0]))
+        new_point = new_point[:] + [1]
+        new_point = camera_to_image(new_point, is_real, intrinsic_matrix)
+        # Arrow-head strokes: the 2-D arrow direction rotated by +/-30 degrees, scaled to 10%.
+        arrow_p0 = rotatePoint(
+            new_point[0] - origin_2d[0][0], new_point[1] - origin_2d[0][1], 30, 0.1
+        )
+        arrow_p1 = rotatePoint(
+            new_point[0] - origin_2d[0][0], new_point[1] - origin_2d[0][1], -30, 0.1
+        )
+        # 50 circle points around the axis, translated to the origin and projected.
+        circle_p = circlePoints(axises[0], 0.1, 50)
+        circle_p = line_length * circle_p + np.repeat(
+            np.asarray(origins[0])[:, np.newaxis], 50, axis=1
+        )
+        circle_p = circle_p.transpose()
+        circle_p_2d = np.asarray([camera_to_image(p, is_real, intrinsic_matrix) for p in circle_p])
+        # Arrow shaft.
+        self.draw_line(
+            [origin_2d[0][0], new_point[0]],
+            [origin_2d[0][1], new_point[1]],
+            color=_COLORS_LEVEL[0],
+            linewidth=2,
+        )
+        # Two arrow-head strokes.
+        self.draw_line(
+            [new_point[0] - arrow_p0[0], new_point[0]],
+            [new_point[1] - arrow_p0[1], new_point[1]],
+            color=_COLORS_LEVEL[0],
+            linewidth=2,
+        )
+        self.draw_line(
+            [new_point[0] - arrow_p1[0], new_point[0]],
+            [new_point[1] - arrow_p1[1], new_point[1]],
+            color=_COLORS_LEVEL[0],
+            linewidth=2,
+        )
+        self.draw_polygon(
+            circle_p_2d, color=_COLORS_LEVEL[0], edge_color=_COLORS_LEVEL[0], alpha=0.0
+        )
+        # mtype: 0 for "rotation", 1 for anything else; the origin dot is drawn for rotations only.
+        mtype = 0 if anno["motion"]["type"] == "rotation" else 1
+        if not mtype:
+            self.draw_circle(origin_2d[0], color=_COLORS_LEVEL[0], radius=5)
+        names = self.metadata.get("thing_classes", None)
+        if names:
+            labels = [names[i] + "_" + anno["motion"]["type"] for i in labels]
+        labels = [
+            "{}".format(i) + ("|crowd" if a.get("iscrowd", 0) else "")
+            for i, a in zip(labels, [anno])
+        ]
+        cat_id = anno["category_id"]
+        # Color key combines category and motion type (see _COLORS_CAT).
+        self.overlay_instances(
+            labels=labels,
+            boxes=boxes,
+            masks=masks,
+            assigned_colors=[_COLORS_CAT[cat_id * 2 + mtype]],
+        )
+        # Report which part was drawn back to the caller.
+        part_id_json["partId"] = anno["motion"]["partId"]
+        part_id_json["type"] = anno["motion"]["type"]
+        part_id_json["category_id"] = anno["category_id"]
+        return self.output
+    def draw_prior(self, anno):
+        # All annotations have been in the camera coordinate
+        labels = [0]
+        origin = anno["start"]
+        origin_2d = anno["start_2d"]
+        new_point = anno["end_2d"]
+        axises = [anno["axises"]]
+        print(axises)
+        projection = anno["projMat"]
+        arrow_p0 = rotatePoint(
+            new_point[0] - origin_2d[0], new_point[1] - origin_2d[1], 30, 0.1
+        )
+        arrow_p1 = rotatePoint(
+            new_point[0] - origin_2d[0], new_point[1] - origin_2d[1], -30, 0.1
+        )
+        circle_p = circlePoints(axises[0], 0.1, 50)
+        circle_p = circle_p + np.repeat(np.asarray(origin)[:, np.newaxis], 50, axis=1)
+        # circle_p = circle_p.transpose()
+        circle_p = np.vstack((circle_p, np.ones(circle_p.shape[1])))
+        circle_p_2d = np.dot(projection, circle_p)
+        circle_p_2d = circle_p_2d / circle_p_2d[3, :]
+        circle_p_2d = circle_p_2d[:2, :]
+        circle_p_2d[0, :] = (circle_p_2d[0, :] + 1) / 2 * anno["img_size"]
+        circle_p_2d[1, :] = (-circle_p_2d[1, :] + 1) / 2 * anno["img_size"]
+        circle_p_2d = circle_p_2d.transpose()
+        axis_diff = anno["error"]
+        if axis_diff <= 2:
+            axis_color = _COLORS_LEVEL[0]
+        elif axis_diff > 2 and axis_diff <= 10:
+            axis_color = _COLORS_LEVEL[1]
+        elif axis_diff > 10:
+            axis_color = _COLORS_LEVEL[2]
+        print(axis_diff)
+        self.draw_line(
+            [origin_2d[0], new_point[0]],
+            [origin_2d[1], new_point[1]],
+            color=axis_color,
+            linewidth=2,
+        )
+        self.draw_line(
+            [new_point[0] - arrow_p0[0], new_point[0]],
+            [new_point[1] - arrow_p0[1], new_point[1]],
+            color=axis_color,
+            linewidth=2,
+        )
+        self.draw_line(
+            [new_point[0] - arrow_p1[0], new_point[0]],
+            [new_point[1] - arrow_p1[1], new_point[1]],
+            color=axis_color,
+            linewidth=2,
+        )
+        self.draw_polygon(
+            circle_p_2d, color=axis_color, edge_color=axis_color, alpha=0.0
+        )
+        mtype = 1
+        if not mtype:
+            self.draw_circle(origin_2d, color=_COLORS_LEVEL[0], radius=5)
+        cat_id = 0
+        labels = [
+            "{}".format(i) + ("|crowd" if a.get("iscrowd", 0) else "")
+            for i, a in zip(labels, [anno])
+        ]
+        # self.overlay_instances(
+        #     labels=labels, boxes=None, masks=None, assigned_colors=[_COLORS_CAT[cat_id*2+mtype]]
+        # )
+        return self.output
+    def draw_pred_instance(self, prediction, d, match, is_real=False, intrinsic_matrix=None, line_length=1, no_mask=False, diagonal_length=-1):
+        """Draw one prediction next to its best-matching ground truth and record the comparison.
+        Args:
+            prediction: dict read for "bbox", "mtype", "morigin", "maxis",
+                "category_id", "score", "keypoints" and "segmentation".
+            d: either a dict with an "annotations" list (the annotation with the
+                highest box IoU is used) or a single annotation dict.
+            match: dict filled in place with "iou", "gt", "pred", "axis_error",
+                "origin_error" and "match".
+            is_real: forwarded to ``camera_to_image``.
+            intrinsic_matrix: forwarded to ``camera_to_image``.
+            line_length: scale applied to the axis arrows and circles.
+            no_mask: when truthy, no mask is passed to the overlay.
+            diagonal_length: divisor used to normalize the origin error; the
+                default -1 is rejected.
+        Returns:
+            ``self.output``; ``None`` when ``d["annotations"]`` is empty;
+            ``False`` when an IoU evaluates to NaN.
+        Raises:
+            ValueError: if ``diagonal_length`` is -1.
+        """
+        if "annotations" in d:
+            boxes = prediction.get("bbox", None)
+            anno = None
+            annos = d["annotations"]
+            max_iou = -1
+            if not len(annos):
+                return None
+            # Keep the ground-truth annotation whose box overlaps the prediction most.
+            for gt_anno in annos:
+                iou = get_iou(gt_anno["bbox"], boxes)
+                if np.isnan(iou):
+                    return False
+                if iou > max_iou:
+                    max_iou = iou
+                    anno = gt_anno
+        else:
+            # ``d`` is itself the annotation to compare against.
+            max_iou = -1
+            boxes = prediction.get("bbox", None)
+            anno = d
+            boxes = prediction.get("bbox", None)
+            iou = get_iou(anno["bbox"], boxes)
+            if iou > max_iou:
+                max_iou = iou
+        boxes = [BoxMode.convert(boxes, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)]
+        # Based on the motion type, determine to visualize the predicted motion origin or gt motion origin
+        # For translation joint, the motion origin is meaningless
+        pred_type = prediction["mtype"]
+        if pred_type == 1:
+            pred_origin = anno["motion"]["current_origin"]
+        else:
+            pred_origin = prediction["morigin"]
+        # Prepare the predicted origin and predicted axis
+        pred_origin_4d = pred_origin + [1]
+        pred_origin_2d = camera_to_image(pred_origin_4d, is_real, intrinsic_matrix)
+        pred_axis = np.array(prediction["maxis"])
+        pred_axis = list(pred_axis / norm(pred_axis))
+        pred_new_point = list(np.array(pred_origin) + line_length * np.array(pred_axis))
+        pred_new_point = pred_new_point + [1]
+        pred_new_point = camera_to_image(pred_new_point, is_real, intrinsic_matrix)
+        # Prepare the gt origin and gt axis
+        gt_origin = anno["motion"]["current_origin"]
+        gt_origin_4d = gt_origin + [1]
+        gt_origin_2d = camera_to_image(gt_origin_4d, is_real, intrinsic_matrix)
+        gt_axis = anno["motion"][
+            "current_axis"
+        ]  # gt_axis has been normalized in the annotation
+        gt_new_point = list(np.array(gt_origin) + line_length * np.array(gt_axis))
+        gt_new_point = gt_new_point + [1]
+        gt_new_point = camera_to_image(gt_new_point, is_real, intrinsic_matrix)
+        # Calculate the axis and origin error to determine the color for the visualization of axis and origin
+        # axis_diff: angle in degrees between the two axes; the abs() ignores the axis sign.
+        axis_diff = (
+            np.arccos(
+                np.abs(
+                    np.dot(np.array(gt_axis), np.array(pred_axis))
+                    / (norm(pred_axis) * norm(gt_axis))
+                )
+            )
+            / np.pi
+            * 180.0
+        )
+        # NOTE(review): a NaN axis_diff matches none of these branches and leaves
+        # axis_color unassigned.
+        if axis_diff <= 5:
+            axis_color = _COLORS_LEVEL[0]
+        elif axis_diff > 5 and axis_diff <= 10:
+            axis_color = _COLORS_LEVEL[1]
+        elif axis_diff > 10:
+            axis_color = _COLORS_LEVEL[2]
+        if diagonal_length == -1:
+            raise ValueError("diagonal length error")
+        # origin_diff: distance from the predicted origin to the gt axis line,
+        # divided by diagonal_length.
+        origin_diff = np.linalg.norm(
+            np.cross(np.array(pred_origin) - np.array(gt_origin), np.array(gt_axis))
+        ) / np.linalg.norm(gt_axis) / diagonal_length
+        # NOTE(review): same gap as above for a NaN origin_diff / origin_color.
+        if origin_diff <= 0.1:
+            origin_color = _COLORS_LEVEL[0]
+        elif origin_diff > 0.1 and origin_diff <= 0.25:
+            origin_color = _COLORS_LEVEL[1]
+        elif origin_diff > 0.25:
+            origin_color = _COLORS_LEVEL[2]
+        # Visualize gt
+        gt_color = np.array([0, 0, 255]) / 255
+        gt_arrow_p0 = rotatePoint(
+            gt_new_point[0] - gt_origin_2d[0],
+            gt_new_point[1] - gt_origin_2d[1],
+            30,
+            0.1,
+        )
+        gt_arrow_p1 = rotatePoint(
+            gt_new_point[0] - gt_origin_2d[0],
+            gt_new_point[1] - gt_origin_2d[1],
+            -30,
+            0.1,
+        )
+        gt_circle_p = circlePoints(gt_axis, 0.1, 50)
+        gt_circle_p = line_length * gt_circle_p + np.repeat(
+            np.asarray(gt_origin)[:, np.newaxis], 50, axis=1
+        )
+        gt_circle_p = gt_circle_p.transpose()
+        gt_circle_p_2d = np.asarray([camera_to_image(p, is_real, intrinsic_matrix) for p in gt_circle_p])
+        # gt arrow shaft, then the two arrow-head strokes, then the circle.
+        self.draw_line(
+            [gt_origin_2d[0], gt_new_point[0]],
+            [gt_origin_2d[1], gt_new_point[1]],
+            color=gt_color,
+            linewidth=2,
+        )
+        self.draw_line(
+            [gt_new_point[0] - gt_arrow_p0[0], gt_new_point[0]],
+            [gt_new_point[1] - gt_arrow_p0[1], gt_new_point[1]],
+            color=gt_color,
+            linewidth=2,
+        )
+        self.draw_line(
+            [gt_new_point[0] - gt_arrow_p1[0], gt_new_point[0]],
+            [gt_new_point[1] - gt_arrow_p1[1], gt_new_point[1]],
+            color=gt_color,
+            linewidth=2,
+        )
+        self.draw_polygon(
+            gt_circle_p_2d, color=gt_color, edge_color=gt_color, alpha=0.0
+        )
+        # The origin dot is only drawn when the predicted type is 0 (rotation).
+        if pred_type == 0:
+            # self.draw_text("origin_error: {:.3f}".format(origin_diff), (origin_2d[0][0], origin_2d[0][1]-10*text_y_offset), color="c")
+            self.draw_circle(gt_origin_2d, color=gt_color, radius=5)
+        # Visualize the predicted axis
+        pred_arrow_p0 = rotatePoint(
+            pred_new_point[0] - pred_origin_2d[0],
+            pred_new_point[1] - pred_origin_2d[1],
+            30,
+            0.1,
+        )
+        pred_arrow_p1 = rotatePoint(
+            pred_new_point[0] - pred_origin_2d[0],
+            pred_new_point[1] - pred_origin_2d[1],
+            -30,
+            0.1,
+        )
+        pred_circle_p = circlePoints(pred_axis, 0.1, 50)
+        pred_circle_p = line_length * pred_circle_p + np.repeat(
+            np.asarray(pred_origin)[:, np.newaxis], 50, axis=1
+        )
+        pred_circle_p = pred_circle_p.transpose()
+        pred_circle_p_2d = np.asarray([camera_to_image(p, is_real, intrinsic_matrix) for p in pred_circle_p])
+        # text_y_offset = 1 if (new_point[1]-origin_2d[0][1]) > 0 else -1
+        # self.draw_text("axis_error: {:.3f}".format(axis_diff), (origin_2d[0][0], origin_2d[0][1]-20*text_y_offset), color="tan")
+        self.draw_line(
+            [pred_origin_2d[0], pred_new_point[0]],
+            [pred_origin_2d[1], pred_new_point[1]],
+            color=axis_color,
+            linewidth=2,
+        )
+        self.draw_line(
+            [pred_new_point[0] - pred_arrow_p0[0], pred_new_point[0]],
+            [pred_new_point[1] - pred_arrow_p0[1], pred_new_point[1]],
+            color=axis_color,
+            linewidth=2,
+        )
+        self.draw_line(
+            [pred_new_point[0] - pred_arrow_p1[0], pred_new_point[0]],
+            [pred_new_point[1] - pred_arrow_p1[1], pred_new_point[1]],
+            color=axis_color,
+            linewidth=2,
+        )
+        self.draw_polygon(
+            pred_circle_p_2d, color=axis_color, edge_color=axis_color, alpha=0.0
+        )
+        if pred_type == 0:
+            # self.draw_text("origin_error: {:.3f}".format(origin_diff), (origin_2d[0][0], origin_2d[0][1]-10*text_y_offset), color="c")
+            self.draw_circle(pred_origin_2d, color=origin_color, radius=5)
+        # Assign color to the segmentation
+        cat_id = prediction.get("category_id", None)
+        color_cat = _COLORS_CAT[cat_id * 2 + pred_type]
+        scores = [prediction.get("score", None)]
+        classes = [prediction.get("category_id", None)]
+        labels = _create_text_labels_motion(
+            classes,
+            scores,
+            self.metadata.get("thing_classes", None),
+            MOTION_TYPE[pred_type],
+        )
+        keypoints = prediction.get("keypoints", None)
+        # NOTE(review): mask_util is only imported when a segmentation is present,
+        # yet the IMAGE_BW branch below uses it unconditionally.
+        if prediction.get("segmentation"):
+            import pycocotools.mask as mask_util
+            masks = [prediction.get("segmentation")]
+        else:
+            masks = None
+        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get(
+            "thing_colors"
+        ):
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
+                for c in classes
+            ]
+            alpha = 0.8
+        else:
+            colors = [color_cat]
+            alpha = 0.5
+        # NOTE(review): ``.any()`` is applied before ``.numpy()`` here -- confirm the
+        # decoded mask type supports this chain.
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.img = self._create_grayscale_image(
+                (mask_util.decode(prediction.get("segmentation")).any() > 0).numpy()
+            )
+            alpha = 0.3
+        # import pdb
+        # pdb.set_trace()
+        match["iou"] = max_iou
+        # Add the gt information
+        match["gt"] = {}
+        match["gt"]["partId"] = anno["motion"]["partId"]
+        match["gt"]["label"] = anno["motion"]["part_label"]
+        match["gt"]["type"] = anno["motion"]["type"]
+        match["gt"]["category_id"] = anno["category_id"]
+        match["gt"]["origin"] = gt_origin
+        match["gt"]["axis"] = gt_axis
+        # add the prediction information
+        match["pred"] = {}
+        match["pred"]["score"] = scores[0]
+        match["pred"]["type"] = pred_type
+        match["pred"]["category_id"] = cat_id
+        match["pred"]["origin"] = pred_origin
+        match["pred"]["axis"] = pred_axis
+        # add additional information
+        match["axis_error"] = axis_diff
+        match["origin_error"] = origin_diff
+        # "match" is True when both the motion type (gt name mapped back to its
+        # MOTION_TYPE id) and the category agree.
+        match["match"] = (
+            int(pred_type)
+            == int(
+                list(MOTION_TYPE.keys())[
+                    list(MOTION_TYPE.values()).index(anno["motion"]["type"])
+                ]
+            )
+        ) and (cat_id == anno["category_id"])
+        if no_mask:
+            masks = None
+        self.overlay_instances(
+            masks=masks,
+            boxes=boxes,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return self.output
+    def draw_pred_only(self, prediction, prob):
+        """Draw a prediction (axis arrow, circle, mask, box) without any ground truth.
+        Args:
+            prediction: object queried with ``has(...)`` and read for ``scores``,
+                ``morigin``, ``maxis``, ``mtype``, ``pred_classes``,
+                ``pred_boxes``, ``pred_keypoints`` and ``pred_masks``; only the
+                first entry of the motion fields is drawn.
+            prob: score threshold; nothing is drawn when the first score is below it.
+        Returns:
+            ``self.output``, or ``None`` when the first score is below ``prob``.
+        """
+        # NOTE(review): scores (and classes below) fall back to None but are then
+        # dereferenced with ``.numpy()`` unconditionally.
+        scores = prediction.scores if prediction.has("scores") else None
+        if scores.numpy()[0] < prob:
+            return None
+        origins = list(prediction.morigin.numpy())
+        origins = [list(origin) for origin in origins]
+        axises = list(prediction.maxis.numpy())
+        axises = [list(axis) for axis in axises]
+        types = list(prediction.mtype.numpy())
+        classes = prediction.pred_classes if prediction.has("pred_classes") else None
+        color_cat = _COLORS_CAT[classes.numpy()[0] * 2 + types[0]]
+        # camera_to_image is called with its defaults here (is_real=False).
+        origins_4d = [origin[:] + [1] for origin in origins]
+        origin_2d = [camera_to_image(origin) for origin in origins_4d]
+        # Arrow tip: first origin moved one unit along the first axis.
+        new_point = list(np.array(origins[0]) + np.array(axises[0]))
+        new_point = new_point[:] + [1]
+        new_point = camera_to_image(new_point)
+        # No ground truth to compare with, so both colors use level 0.
+        axis_color = _COLORS_LEVEL[0]
+        origin_color = _COLORS_LEVEL[0]
+        arrow_p0 = rotatePoint(
+            new_point[0] - origin_2d[0][0], new_point[1] - origin_2d[0][1], 30, 0.1
+        )
+        arrow_p1 = rotatePoint(
+            new_point[0] - origin_2d[0][0], new_point[1] - origin_2d[0][1], -30, 0.1
+        )
+        circle_p = circlePoints(axises[0], 0.1, 50)
+        circle_p = circle_p + np.repeat(
+            np.asarray(origins[0])[:, np.newaxis], 50, axis=1
+        )
+        circle_p = circle_p.transpose()
+        circle_p_2d = np.asarray([camera_to_image(p) for p in circle_p])
+        # text_y_offset = 1 if (new_point[1]-origin_2d[0][1]) > 0 else -1
+        # self.draw_text("axis_error: {:.3f}".format(axis_diff), (origin_2d[0][0], origin_2d[0][1]-20*text_y_offset), color="tan")
+        # Arrow shaft, then the two arrow-head strokes, then the circle.
+        self.draw_line(
+            [origin_2d[0][0], new_point[0]],
+            [origin_2d[0][1], new_point[1]],
+            color=axis_color,
+            linewidth=2,
+        )
+        self.draw_line(
+            [new_point[0] - arrow_p0[0], new_point[0]],
+            [new_point[1] - arrow_p0[1], new_point[1]],
+            color=axis_color,
+            linewidth=2,
+        )
+        self.draw_line(
+            [new_point[0] - arrow_p1[0], new_point[0]],
+            [new_point[1] - arrow_p1[1], new_point[1]],
+            color=axis_color,
+            linewidth=2,
+        )
+        self.draw_polygon(
+            circle_p_2d, color=axis_color, edge_color=axis_color, alpha=0.0
+        )
+        # The origin dot is only drawn when the first motion type is 0 (rotation).
+        if types[0] == 0:
+            # self.draw_text("origin_error: {:.3f}".format(origin_diff), (origin_2d[0][0], origin_2d[0][1]-10*text_y_offset), color="c")
+            self.draw_circle(origin_2d[0], color=origin_color, radius=5)
+        boxes = prediction.pred_boxes if prediction.has("pred_boxes") else None
+        labels = _create_text_labels_motion(
+            classes,
+            scores,
+            self.metadata.get("thing_classes", None),
+            MOTION_TYPE[types[0]],
+        )
+        keypoints = (
+            prediction.pred_keypoints if prediction.has("pred_keypoints") else None
+        )
+        if prediction.has("pred_masks"):
+            masks = np.asarray(prediction.pred_masks)
+            masks = [
+                GenericMask(x, self.output.height, self.output.width) for x in masks
+            ]
+        else:
+            masks = None
+        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get(
+            "thing_colors"
+        ):
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
+                for c in classes
+            ]
+            alpha = 0.8
+        else:
+            colors = [color_cat]
+            alpha = 0.5
+        # NOTE(review): this branch reads pred_masks even when has("pred_masks") was False.
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.img = self._create_grayscale_image(
+                (prediction.pred_masks.any(dim=0) > 0).numpy()
+            )
+            alpha = 0.3
+        self.overlay_instances(
+            masks=masks,
+            boxes=boxes,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return self.output
+def _create_text_labels_motion(classes, scores, class_names, motion_type):
+    """
+    Args:
+        classes (list[int] or None):
+        scores (list[float] or None):
+        class_names (list[str] or None):
+    Returns:
+        list[str] or None
+    """
+    labels = None
+    if classes is not None and class_names is not None and len(class_names) > 1:
+        labels = [class_names[i] for i in classes]
+        labels = [label + "_" + motion_type for label in labels]
+    if scores is not None:
+        if labels is None:
+            labels = ["{:.0f}%".format(s * 100) for s in scores]
+        else:
+            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
+    return labels

mask2former/utils/tranform.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import torch
+from torch.nn import functional as F
+import numpy as np
+from scipy.spatial.distance import cdist, euclidean
+def geometric_median(X, eps=1e-5):
+    y = np.mean(X, 0)
+    while True:
+        D = cdist(X, [y])
+        nonzeros = (D != 0)[:, 0]
+        Dinv = 1 / D[nonzeros]
+        Dinvs = np.sum(Dinv)
+        W = Dinv / Dinvs
+        T = np.sum(W * X[nonzeros], 0)
+        num_zeros = len(X) - np.sum(nonzeros)
+        if num_zeros == 0:
+            y1 = T
+        elif num_zeros == len(X):
+            return y
+        else:
+            R = (T - y) * Dinvs
+            r = np.linalg.norm(R)
+            rinv = 0 if r == 0 else num_zeros/r
+            y1 = max(0, 1-rinv)*T + min(1, rinv)*y
+        if euclidean(y, y1) < eps:
+            return y1
+        y = y1
+#  Transformation code from pytorch3d https://pytorch3d.readthedocs.io/en/latest/_modules/pytorch3d/transforms/rotation_conversions.html#matrix_to_quaternion
+def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
+    """
+    Converts 6D rotation representation by Zhou et al. [1] to rotation matrix
+    using Gram--Schmidt orthogonalization per Section B of [1].
+    Args:
+        d6: 6D rotation representation, of size (*, 6)
+    Returns:
+        batch of rotation matrices of size (*, 3, 3)
+    [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+    On the Continuity of Rotation Representations in Neural Networks.
+    IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+    Retrieved from http://arxiv.org/abs/1812.07035
+    """
+    a1, a2 = d6[..., :3], d6[..., 3:]
+    b1 = F.normalize(a1, dim=-1)
+    b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
+    b2 = F.normalize(b2, dim=-1)
+    b3 = torch.cross(b1, b2, dim=-1)
+    return torch.stack((b1, b2, b3), dim=-2)
+def matrix_to_rotation_6d(matrix: torch.Tensor) -> torch.Tensor:
+    """
+    Converts rotation matrices to 6D rotation representation by Zhou et al. [1]
+    by dropping the last row. Note that 6D representation is not unique.
+    Args:
+        matrix: batch of rotation matrices of size (*, 3, 3)
+    Returns:
+        6D rotation representation, of size (*, 6)
+    [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+    On the Continuity of Rotation Representations in Neural Networks.
+    IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+    Retrieved from http://arxiv.org/abs/1812.07035
+    """
+    batch_dim = matrix.size()[:-2]
+    return matrix[..., :2, :].clone().reshape(batch_dim + (6,))
+def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
+    """
+    Elementwise square root of the strictly positive entries of x; every
+    other entry maps to 0. Matches torch.sqrt(torch.max(0, x)) in value,
+    but with a zero subgradient where x is 0.
+    """
+    is_positive = x > 0
+    out = torch.zeros_like(x)
+    # sqrt is only evaluated on the positive entries; the rest keep the
+    # zeros they were initialised with.
+    out[is_positive] = torch.sqrt(x[is_positive])
+    return out
+def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
+    """
+    Convert rotations given as rotation matrices to quaternions.
+    Args:
+        matrix: Rotation matrices as tensor of shape (..., 3, 3).
+    Returns:
+        quaternions with real part first, as tensor of shape (..., 4).
+    Raises:
+        ValueError: if the last two dimensions of matrix are not (3, 3).
+    """
+    if not (matrix.size(-1) == 3 and matrix.size(-2) == 3):
+        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
+    batch_dim = matrix.shape[:-2]
+    flat = matrix.reshape(batch_dim + (9,))
+    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(flat, dim=-1)
+    # One non-negative magnitude per quaternion component (r, i, j, k).
+    squared = torch.stack(
+        [
+            1.0 + m00 + m11 + m22,
+            1.0 + m00 - m11 - m22,
+            1.0 - m00 + m11 - m22,
+            1.0 - m00 - m11 + m22,
+        ],
+        dim=-1,
+    )
+    q_abs = _sqrt_positive_part(squared)
+    abs_r, abs_i, abs_j, abs_k = torch.unbind(q_abs, dim=-1)
+    # Off-diagonal differences and sums shared between the candidate rows.
+    diff_x, diff_y, diff_z = m21 - m12, m02 - m20, m10 - m01
+    sum_xy, sum_xz, sum_yz = m10 + m01, m02 + m20, m12 + m21
+    # Each row is the desired quaternion multiplied by one of r, i, j, k.
+    quat_by_rijk = torch.stack(
+        [
+            torch.stack([abs_r ** 2, diff_x, diff_y, diff_z], dim=-1),
+            torch.stack([diff_x, abs_i ** 2, sum_xy, sum_xz], dim=-1),
+            torch.stack([diff_y, sum_xy, abs_j ** 2, sum_yz], dim=-1),
+            torch.stack([diff_z, sum_xz, sum_yz, abs_k ** 2], dim=-1),
+        ],
+        dim=-2,
+    )
+    # The denominator is floored at 0.1; the exact level is not important
+    # because a candidate with a small q_abs is never the one selected below.
+    floor = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
+    denominator = 2.0 * torch.maximum(q_abs[..., None], floor)
+    quat_candidates = quat_by_rijk / denominator
+    # Up to sign and numerical error all four candidates agree; keep the
+    # best-conditioned one, i.e. the row with the largest denominator.
+    best_row = F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5  # pyre-ignore[16]
+    return quat_candidates[best_row, :].reshape(batch_dim + (4,))
+def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
+    """
+    Convert rotations given as quaternions to rotation matrices.
+    Args:
+        quaternions: quaternions with real part first,
+            as tensor of shape (..., 4).
+    Returns:
+        Rotation matrices as tensor of shape (..., 3, 3).
+    """
+    w, x, y, z = torch.unbind(quaternions, -1)
+    # 2 / |q|^2: every product below is scaled by the squared norm of the
+    # quaternion it came from.
+    scale = 2.0 / (quaternions * quaternions).sum(-1)
+    row0 = (
+        1 - scale * (y * y + z * z),
+        scale * (x * y - z * w),
+        scale * (x * z + y * w),
+    )
+    row1 = (
+        scale * (x * y + z * w),
+        1 - scale * (x * x + z * z),
+        scale * (y * z - x * w),
+    )
+    row2 = (
+        scale * (x * z - y * w),
+        scale * (y * z + x * w),
+        1 - scale * (x * x + y * y),
+    )
+    # Stack the nine entries in row-major order, then fold them into 3x3.
+    entries = torch.stack(row0 + row1 + row2, -1)
+    return entries.reshape(quaternions.shape[:-1] + (3, 3))

pre-requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+numpy==1.25.2
+Pillow==10.0.1
+torch==2.0.1
+torchaudio==2.0.2
+torchvision==0.15.2
+urllib3==1.26.16

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+h5py==3.9.0
+imageio==2.31.3
+open3d==0.17.0
+opencv-python==4.8.0.76
+pandas==2.1.0
+pycocotools==2.0.7
+scikit-image==0.21.0
+scikit-learn==1.3.0
+scipy==1.11.2
+timm==0.9.7
+detectron2 @ git+https://github.com/facebookresearch/detectron2.git@fc9c33b1f6e5d4c37bbb46dde19af41afc1ddb2a