Spaces:

RoyYang0714
/

3D-MOOD

Running on Zero

File size: 15,822 Bytes

9b33fca

"""CC-3DT graph."""

from __future__ import annotations

import torch
import torch.nn.functional as F
from torch import Tensor

from vis4d.op.box.box2d import bbox_iou
from vis4d.op.geometry.rotation import (
    euler_angles_to_matrix,
    matrix_to_quaternion,
    rotate_orientation,
    rotate_velocities,
)
from vis4d.op.geometry.transform import transform_points
from vis4d.op.track.assignment import TrackIDCounter, greedy_assign
from vis4d.op.track.matching import calc_bisoftmax_affinity

from .common import Track3DOut


def get_track_3d_out(
    boxes_3d: Tensor, class_ids: Tensor, scores_3d: Tensor, track_ids: Tensor
) -> Track3DOut:
    """Pack per-object 3D tracking results into a ``Track3DOut``.

    The Euler angles in columns 6:9 are converted to a quaternion, the
    dimensions are reordered from HWL to WLH, and the velocity columns are
    split off into their own field. Every field of the result is a
    single-element list wrapping the corresponding tensor.

    Args:
        boxes_3d (Tensor): (N, 12): x,y,z,h,w,l,rx,ry,rz,vx,vy,vz
        class_ids (Tensor): (N,)
        scores_3d (Tensor): (N,)
        track_ids (Tensor): (N,)

    Returns:
        Track3DOut: boxes as center (3) + WLH dims (3) + quaternion,
            together with velocities, class ids, 3D scores and track ids.
    """
    # Euler angles -> rotation matrix -> quaternion.
    rotation = euler_angles_to_matrix(boxes_3d[:, 6:9])
    quaternion = matrix_to_quaternion(rotation)

    # Columns 3:6 are stored as H, W, L; the output expects W, L, H.
    wlh = boxes_3d[:, [4, 5, 3]]
    xyz = boxes_3d[:, :3]
    packed_boxes = torch.cat([xyz, wlh, quaternion], dim=1)

    velocity = boxes_3d[:, 9:12]

    return Track3DOut(
        boxes_3d=[packed_boxes],
        velocities=[velocity],
        class_ids=[class_ids],
        scores_3d=[scores_3d],
        track_ids=[track_ids],
    )


class CC3DTrackAssociation:
    """Data association relying on quasi-dense instance similarity and 3D clue.

    This class assigns detection candidates to a given memory of existing
    tracks and backdrops.
    Backdrops are low-score detections kept in case they have high
    similarity with a high-score detection in succeeding frames.
    """

    def __init__(
        self,
        init_score_thr: float = 0.8,
        obj_score_thr: float = 0.5,
        match_score_thr: float = 0.5,
        nms_backdrop_iou_thr: float = 0.3,
        nms_class_iou_thr: float = 0.7,
        nms_conf_thr: float = 0.5,
        with_cats: bool = True,
        with_velocities: bool = False,
        bbox_affinity_weight: float = 0.5,
    ) -> None:
        """Creates an instance of the class.

        Args:
            init_score_thr (float): Confidence threshold for initializing a new
                track.
            obj_score_thr (float): Confidence treshold s.t. a detection is
                considered in the track / det matching process.
            match_score_thr (float): Similarity score threshold for matching a
                detection to an existing track.
            nms_backdrop_iou_thr (float): Maximum IoU of a backdrop with
                another detection.
            nms_class_iou_thr (float): Maximum IoU of a high score detection
                with another of a different class.
            nms_conf_thr (float): Confidence threshold for NMS.
            with_cats (bool): If to consider category information for
                tracking (i.e. all detections within a track must have
                consistent category labels).
            with_velocities (bool): If to use predicted velocities for
                matching.
            bbox_affinity_weight (float): Weight of bbox affinity in the
                overall affinity score.
        """
        super().__init__()
        self.init_score_thr = init_score_thr
        self.obj_score_thr = obj_score_thr
        self.match_score_thr = match_score_thr
        self.nms_backdrop_iou_thr = nms_backdrop_iou_thr
        self.nms_class_iou_thr = nms_class_iou_thr
        self.nms_conf_thr = nms_conf_thr
        self.with_cats = with_cats
        self.with_velocities = with_velocities
        self.bbox_affinity_weight = bbox_affinity_weight
        self.feat_affinity_weight = 1 - bbox_affinity_weight

    def _filter_detections(
        self,
        detections: Tensor,
        camera_ids: Tensor,
        scores: Tensor,
        detections_3d: Tensor,
        scores_3d: Tensor,
        class_ids: Tensor,
        embeddings: Tensor,
        velocities: Tensor | None = None,
    ) -> tuple[
        Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor | None, Tensor
    ]:
        """Remove overlapping objects across classes via nms.

        Args:
            detections (Tensor): [N, 4] Tensor of boxes.
            camera_ids (Tensor): [N,] Tensor of camera ids.
            scores (Tensor): [N,] Tensor of confidence scores.
            detections_3d (Tensor): [N, 7] Tensor of 3D boxes.
            scores_3d (Tensor): [N,] Tensor of 3D confidence scores.
            class_ids (Tensor): [N,] Tensor of class ids.
            embeddings (Tensor): [N, C] tensor of appearance embeddings.
            velocities (Tensor | None): [N, 3] Tensor of velocities.

        Returns:
            tuple[Tensor]: filtered detections, scores, class_ids,
                embeddings, and filtered indices.
        """
        scores, inds = scores.sort(descending=True)
        (
            detections,
            camera_ids,
            embeddings,
            class_ids,
            detections_3d,
            scores_3d,
        ) = (
            detections[inds],
            camera_ids[inds],
            embeddings[inds],
            class_ids[inds],
            detections_3d[inds],
            scores_3d[inds],
        )

        if velocities is not None:
            velocities = velocities[inds]

        valids = embeddings.new_ones((len(detections),), dtype=torch.bool)

        ious = bbox_iou(detections, detections)
        valid_ious = torch.eq(
            camera_ids.unsqueeze(1), camera_ids.unsqueeze(0)
        ).int()
        ious *= valid_ious

        for i in range(1, len(detections)):
            if scores[i] < self.obj_score_thr:
                thr = self.nms_backdrop_iou_thr
            else:
                thr = self.nms_class_iou_thr

            if (ious[i, :i] > thr).any():
                valids[i] = False

        detections = detections[valids]
        scores = scores[valids]
        detections_3d = detections_3d[valids]
        scores_3d = scores_3d[valids]
        class_ids = class_ids[valids]
        embeddings = embeddings[valids]

        if velocities is not None:
            velocities = velocities[valids]

        return (
            detections,
            scores,
            detections_3d,
            scores_3d,
            class_ids,
            embeddings,
            velocities,
            inds[valids],
        )

    def depth_ordering(
        self,
        obsv_boxes_3d: Tensor,
        obsv_velocities: Tensor | None,
        memory_boxes_3d_predict: Tensor,
        memory_boxes_3d: Tensor,
        memory_velocities: Tensor,
    ) -> Tensor:
        """Depth ordering matching."""
        # Centroid
        centroid_weight_list = []
        for memory_box_3d_predict in memory_boxes_3d_predict:
            centroid_weight_list.append(
                F.pairwise_distance(  # pylint: disable=not-callable
                    obsv_boxes_3d[:, :3],
                    memory_box_3d_predict[:3],
                    keepdim=True,
                )
            )
        centroid_weight = torch.cat(centroid_weight_list, dim=1)
        centroid_weight = torch.exp(-torch.div(centroid_weight, 10.0))

        # Moving distance should be aligned
        motion_weight_list = []
        moving_dist = (
            obsv_boxes_3d[:, :3, None]
            - memory_boxes_3d[:, :3, None].transpose(2, 0)
        ).transpose(1, 2)
        for v in moving_dist:
            motion_weight_list.append(
                F.pairwise_distance(  # pylint: disable=not-callable
                    v, memory_velocities[:, :3]
                ).unsqueeze(0)
            )
        motion_weight = torch.cat(motion_weight_list, dim=0)
        motion_weight = torch.exp(-torch.div(motion_weight, 5.0))

        # Velocity scores
        if self.with_velocities:
            assert (
                obsv_velocities is not None
            ), "Please provide velocities if with_velocities=True!"

            velsim_weight_list = []
            obsvvv_velocities = obsv_velocities.unsqueeze(1).expand_as(
                moving_dist
            )
            for v in obsvvv_velocities:
                velsim_weight_list.append(
                    F.pairwise_distance(  # pylint: disable=not-callable
                        v, memory_velocities[:, -3:]
                    ).unsqueeze(0)
                )
            velsim_weight = torch.cat(velsim_weight_list, dim=0)
            cos_sim = torch.exp(-velsim_weight / 5.0)
        else:
            # Moving direction should be aligned
            # Set to 0.5 when two vector not within +-90 degree
            cos_sim_list = []
            obsv_direct = (
                obsv_boxes_3d[:, :2, None]
                - memory_boxes_3d[:, :2, None].transpose(2, 0)
            ).transpose(1, 2)
            for d in obsv_direct:
                cos_sim_list.append(
                    F.cosine_similarity(  # pylint: disable=not-callable
                        d, memory_velocities[:, :2]
                    ).unsqueeze(0)
                )
            cos_sim = torch.cat(cos_sim_list, dim=0)
            cos_sim = torch.add(cos_sim, 1.0)
            cos_sim = torch.div(cos_sim, 2.0)

        scores_depth = (
            cos_sim * centroid_weight + (1.0 - cos_sim) * motion_weight
        )

        return scores_depth

    def __call__(
        self,
        detections: Tensor,
        camera_ids: Tensor,
        detection_scores: Tensor,
        detections_3d: Tensor,
        detection_scores_3d: Tensor,
        detection_class_ids: Tensor,
        detection_embeddings: Tensor,
        obs_velocities: Tensor | None = None,
        memory_boxes_3d: Tensor | None = None,
        memory_track_ids: Tensor | None = None,
        memory_class_ids: Tensor | None = None,
        memory_embeddings: Tensor | None = None,
        memory_boxes_3d_predict: Tensor | None = None,
        memory_velocities: Tensor | None = None,
        with_depth_confidence: bool = True,
    ) -> tuple[Tensor, Tensor]:
        """Process inputs, match detections with existing tracks.

        Args:
            detections (Tensor): [N, 4] detected boxes.
            camera_ids (Tensor): [N,] camera ids.
            detection_scores (Tensor): [N,] confidence scores.
            detections_3d (Tensor): [N, 7] detected boxes in 3D.
            detection_scores_3d (Tensor): [N,] confidence scores in 3D.
            detection_class_ids (Tensor): [N,] class indices.
            detection_embeddings (Tensor): [N, C] appearance embeddings.
            obs_velocities (Tensor | None): [N, 3] velocities of detections.
            memory_boxes_3d (Tensor): [M, 7] boxes in memory.
            memory_track_ids (Tensor): [M,] track ids in memory.
            memory_class_ids (Tensor): [M,] class indices in memory.
            memory_embeddings (Tensor): [M, C] appearance embeddings in
                memory.
            memory_boxes_3d_predict (Tensor): [M, 7] predicted boxes in
                memory.
            memory_velocities (Tensor): [M, 7] velocities in memory.

        Returns:
            tuple[Tensor, Tensor]: track ids of active tracks and selected
                detection indices corresponding to tracks.
        """
        (
            detections,
            detection_scores,
            detections_3d,
            detection_scores_3d,
            detection_class_ids,
            detection_embeddings,
            obs_velocities,
            permute_inds,
        ) = self._filter_detections(
            detections,
            camera_ids,
            detection_scores,
            detections_3d,
            detection_scores_3d,
            detection_class_ids,
            detection_embeddings,
            obs_velocities,
        )

        if with_depth_confidence:
            depth_confidence = detection_scores_3d
        else:
            depth_confidence = detection_scores_3d.new_ones(
                len(detection_scores_3d)
            )

        # match if buffer is not empty
        if len(detections) > 0 and memory_boxes_3d is not None:
            assert (
                memory_track_ids is not None
                and memory_class_ids is not None
                and memory_embeddings is not None
                and memory_boxes_3d_predict is not None
                and memory_velocities is not None
            )

            # Box 3D
            bbox3d_weight_list = []
            for memory_box_3d_predict in memory_boxes_3d_predict:
                bbox3d_weight_list.append(
                    F.pairwise_distance(  # pylint: disable=not-callable
                        detections_3d,
                        memory_box_3d_predict,
                        keepdim=True,
                    )
                )
            bbox3d_weight = torch.cat(bbox3d_weight_list, dim=1)
            scores_iou = torch.exp(-torch.div(bbox3d_weight, 10.0))

            # Depth Ordering
            scores_depth = self.depth_ordering(
                detections_3d,
                obs_velocities,
                memory_boxes_3d_predict,
                memory_boxes_3d,
                memory_velocities,
            )

            # match using bisoftmax metric
            similarity_scores = calc_bisoftmax_affinity(
                detection_embeddings,
                memory_embeddings,
                detection_class_ids,
                memory_class_ids,
            )

            if self.with_cats:
                assert (
                    detection_class_ids is not None
                    and memory_class_ids is not None
                ), "Please provide class ids if with_categories=True!"
                cat_same = detection_class_ids.view(
                    -1, 1
                ) == memory_class_ids.view(1, -1)
                scores_cats = cat_same.float()

            affinity_scores = (
                self.bbox_affinity_weight * scores_iou * scores_depth
                + self.feat_affinity_weight * similarity_scores
            )
            affinity_scores /= (
                self.bbox_affinity_weight + self.feat_affinity_weight
            )
            affinity_scores = torch.mul(
                affinity_scores, torch.greater(scores_iou, 0.0).float()
            )
            affinity_scores = torch.mul(
                affinity_scores, torch.greater(scores_depth, 0.0).float()
            )
            if self.with_cats:
                affinity_scores = torch.mul(affinity_scores, scores_cats)

            ids = greedy_assign(
                detection_scores * depth_confidence,
                memory_track_ids,
                affinity_scores,
                self.match_score_thr,
                self.obj_score_thr,
                self.nms_conf_thr,
            )
        else:
            ids = torch.full(
                (len(detections),),
                -1,
                dtype=torch.long,
                device=detections.device,
            )
        new_inds = (ids == -1) & (detection_scores > self.init_score_thr)
        ids[new_inds] = TrackIDCounter.get_ids(
            new_inds.sum(), device=ids.device  # type: ignore
        )
        return ids, permute_inds


def cam_to_global(
    boxes_3d_list: list[Tensor], extrinsics: Tensor
) -> list[Tensor]:
    """Convert camera coordinates to global coordinates.

    For every non-empty entry of ``boxes_3d_list`` the box tensor is
    updated in place using the extrinsics at the same index: columns 0:3
    go through ``transform_points``, columns 6:9 through
    ``rotate_orientation`` and columns 9:12 through ``rotate_velocities``.
    Empty entries are left untouched.

    Args:
        boxes_3d_list (list[Tensor]): Per-camera box tensors, modified in
            place.
        extrinsics (Tensor): Extrinsics indexed along the first dimension
            in the same order as ``boxes_3d_list``.

    Returns:
        list[Tensor]: The same list object that was passed in.
    """
    for cam_idx, cam_boxes in enumerate(boxes_3d_list):
        if len(cam_boxes) == 0:
            continue

        cam_extrinsics = extrinsics[cam_idx]
        cam_boxes[:, :3] = transform_points(cam_boxes[:, :3], cam_extrinsics)
        cam_boxes[:, 6:9] = rotate_orientation(
            cam_boxes[:, 6:9], cam_extrinsics
        )
        cam_boxes[:, 9:12] = rotate_velocities(
            cam_boxes[:, 9:12], cam_extrinsics
        )
    return boxes_3d_list