Spaces:

RoyYang0714
/

3D-MOOD

Running on Zero

App Files Files Community

3D-MOOD/vis4d/op/track3d/cc_3dt.py

RoyYang0714

feat: Try to build everything locally.

9b33fca 3 months ago

raw

history blame contribute delete

15.8 kB

	"""CC-3DT graph."""

	from __future__ import annotations

	import torch
	import torch.nn.functional as F
	from torch import Tensor

	from vis4d.op.box.box2d import bbox_iou
	from vis4d.op.geometry.rotation import (
	euler_angles_to_matrix,
	matrix_to_quaternion,
	rotate_orientation,
	rotate_velocities,
	)
	from vis4d.op.geometry.transform import transform_points
	from vis4d.op.track.assignment import TrackIDCounter, greedy_assign
	from vis4d.op.track.matching import calc_bisoftmax_affinity

	from .common import Track3DOut


	def get_track_3d_out(
	    boxes_3d: Tensor, class_ids: Tensor, scores_3d: Tensor, track_ids: Tensor
	) -> Track3DOut:
	    """Pack per-object tracking results into a ``Track3DOut``.

	    Args:
	        boxes_3d (Tensor): (N, 12): x,y,z,h,w,l,rx,ry,rz,vx,vy,vz
	        class_ids (Tensor): (N,)
	        scores_3d (Tensor): (N,)
	        track_ids (Tensor): (N,)

	    Returns:
	        Track3DOut: Every field is a single-element list. The boxes are
	            re-encoded as center, dimensions reordered to WLH and the
	            orientation converted from Euler angles to a quaternion;
	            velocities are columns 9:12 of the input.
	    """
	    xyz = boxes_3d[:, :3]
	    hwl = boxes_3d[:, 3:6]
	    euler = boxes_3d[:, 6:9]
	    velocity = boxes_3d[:, 9:12]

	    # HWL -> WLH
	    wlh = hwl[:, [1, 2, 0]]
	    quaternion = matrix_to_quaternion(euler_angles_to_matrix(euler))
	    packed_boxes = torch.cat((xyz, wlh, quaternion), dim=1)

	    return Track3DOut(
	        boxes_3d=[packed_boxes],
	        velocities=[velocity],
	        class_ids=[class_ids],
	        scores_3d=[scores_3d],
	        track_ids=[track_ids],
	    )


	class CC3DTrackAssociation:
	    """Data association relying on quasi-dense instance similarity and 3D clue.

	    This class assigns detection candidates to a given memory of existing
	    tracks and backdrops.
	    Backdrops are low-score detections kept in case they have high
	    similarity with a high-score detection in succeeding frames.
	    """

	    def __init__(
	        self,
	        init_score_thr: float = 0.8,
	        obj_score_thr: float = 0.5,
	        match_score_thr: float = 0.5,
	        nms_backdrop_iou_thr: float = 0.3,
	        nms_class_iou_thr: float = 0.7,
	        nms_conf_thr: float = 0.5,
	        with_cats: bool = True,
	        with_velocities: bool = False,
	        bbox_affinity_weight: float = 0.5,
	    ) -> None:
	        """Creates an instance of the class.

	        Args:
	            init_score_thr (float): Confidence threshold for initializing a new
	                track.
	            obj_score_thr (float): Confidence threshold s.t. a detection is
	                considered in the track / det matching process.
	            match_score_thr (float): Similarity score threshold for matching a
	                detection to an existing track.
	            nms_backdrop_iou_thr (float): Maximum IoU of a backdrop with
	                another detection.
	            nms_class_iou_thr (float): Maximum IoU of a high score detection
	                with another of a different class.
	            nms_conf_thr (float): Confidence threshold for NMS.
	            with_cats (bool): If to consider category information for
	                tracking (i.e. all detections within a track must have
	                consistent category labels).
	            with_velocities (bool): If to use predicted velocities for
	                matching.
	            bbox_affinity_weight (float): Weight of bbox affinity in the
	                overall affinity score. The appearance (feature) affinity
	                gets the complementary weight ``1 - bbox_affinity_weight``.
	        """
	        super().__init__()
	        self.init_score_thr = init_score_thr
	        self.obj_score_thr = obj_score_thr
	        self.match_score_thr = match_score_thr
	        self.nms_backdrop_iou_thr = nms_backdrop_iou_thr
	        self.nms_class_iou_thr = nms_class_iou_thr
	        self.nms_conf_thr = nms_conf_thr
	        self.with_cats = with_cats
	        self.with_velocities = with_velocities
	        self.bbox_affinity_weight = bbox_affinity_weight
	        self.feat_affinity_weight = 1 - bbox_affinity_weight

	    @staticmethod
	    def _pairwise_l2(x1: Tensor, x2: Tensor, eps: float = 1e-6) -> Tensor:
	        """Broadcasted L2 distance over the last dimension.

	        Evaluates ``||x1 - x2 + eps||_2``, the formula documented for
	        ``F.pairwise_distance`` with its default ``p`` and ``eps``, but for
	        any pair of broadcastable inputs. This allows building a full
	        [N, M] distance matrix in one call, including when N or M is zero.

	        Args:
	            x1 (Tensor): [..., D] first operand.
	            x2 (Tensor): [..., D] second operand, broadcastable with x1.
	            eps (float): Constant added to the difference before the norm.

	        Returns:
	            Tensor: Distances with the last dimension reduced.
	        """
	        return torch.linalg.vector_norm(x1 - x2 + eps, ord=2, dim=-1)

	    def _filter_detections(
	        self,
	        detections: Tensor,
	        camera_ids: Tensor,
	        scores: Tensor,
	        detections_3d: Tensor,
	        scores_3d: Tensor,
	        class_ids: Tensor,
	        embeddings: Tensor,
	        velocities: Tensor | None = None,
	    ) -> tuple[
	        Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor | None, Tensor
	    ]:
	        """Remove overlapping objects across classes via nms.

	        Detections are sorted by descending score. A detection is dropped
	        when its IoU with any higher-ranked detection from the same camera
	        exceeds a threshold: ``nms_backdrop_iou_thr`` if its own score is
	        below ``obj_score_thr``, otherwise ``nms_class_iou_thr``.

	        Args:
	            detections (Tensor): [N, 4] Tensor of boxes.
	            camera_ids (Tensor): [N,] Tensor of camera ids.
	            scores (Tensor): [N,] Tensor of confidence scores.
	            detections_3d (Tensor): [N, 7] Tensor of 3D boxes.
	            scores_3d (Tensor): [N,] Tensor of 3D confidence scores.
	            class_ids (Tensor): [N,] Tensor of class ids.
	            embeddings (Tensor): [N, C] tensor of appearance embeddings.
	            velocities (Tensor | None): [N, 3] Tensor of velocities.

	        Returns:
	            tuple: filtered detections, scores, detections_3d, scores_3d,
	                class_ids, embeddings, velocities (None if not given) and
	                the indices of the kept detections w.r.t. the input order.
	        """
	        scores, inds = scores.sort(descending=True)
	        detections = detections[inds]
	        camera_ids = camera_ids[inds]
	        embeddings = embeddings[inds]
	        class_ids = class_ids[inds]
	        detections_3d = detections_3d[inds]
	        scores_3d = scores_3d[inds]
	        if velocities is not None:
	            velocities = velocities[inds]

	        # Zero the IoU of boxes from different cameras so that they can
	        # never suppress each other.
	        ious = bbox_iou(detections, detections)
	        same_camera = torch.eq(
	            camera_ids.unsqueeze(1), camera_ids.unsqueeze(0)
	        ).int()
	        ious *= same_camera

	        # Per-detection IoU threshold: stricter for backdrops (low score).
	        iou_thr = torch.where(
	            scores < self.obj_score_thr,
	            ious.new_tensor(self.nms_backdrop_iou_thr),
	            ious.new_tensor(self.nms_class_iou_thr),
	        )

	        # Row i is suppressed if it overlaps too much with ANY higher-ranked
	        # row j < i (kept or not), so the check is independent across rows
	        # and needs no sequential loop.
	        rank = torch.arange(len(detections), device=ious.device)
	        higher_ranked = rank.unsqueeze(0) < rank.unsqueeze(1)
	        suppressed = ((ious > iou_thr.unsqueeze(1)) & higher_ranked).any(
	            dim=1
	        )
	        valids = ~suppressed

	        if velocities is not None:
	            velocities = velocities[valids]

	        return (
	            detections[valids],
	            scores[valids],
	            detections_3d[valids],
	            scores_3d[valids],
	            class_ids[valids],
	            embeddings[valids],
	            velocities,
	            inds[valids],
	        )

	    def depth_ordering(
	        self,
	        obsv_boxes_3d: Tensor,
	        obsv_velocities: Tensor | None,
	        memory_boxes_3d_predict: Tensor,
	        memory_boxes_3d: Tensor,
	        memory_velocities: Tensor,
	    ) -> Tensor:
	        """Depth ordering matching.

	        Scores every (observation, memory track) pair by blending a
	        centroid term (distance to the predicted track position) and a
	        motion term (agreement between the displacement from the last
	        track position and the track velocity). The blending factor is a
	        velocity similarity if ``with_velocities`` is set, otherwise the
	        rescaled cosine between the displacement and the track velocity
	        in the first two coordinates.

	        Args:
	            obsv_boxes_3d (Tensor): [N, D] observed boxes; the first three
	                columns are used as the center.
	            obsv_velocities (Tensor | None): [N, 3] observed velocities.
	                Required if ``with_velocities`` is True.
	            memory_boxes_3d_predict (Tensor): [M, D] predicted track boxes;
	                the first three columns are used as the center.
	            memory_boxes_3d (Tensor): [M, D] last track boxes; the first
	                three columns are used as the center.
	            memory_velocities (Tensor): [M, K] track velocities; the first
	                three columns are compared against the displacement and
	                the last three against the observed velocities.

	        Returns:
	            Tensor: [N, M] depth ordering scores.
	        """
	        obsv_centers = obsv_boxes_3d[:, None, :3]  # [N, 1, 3]

	        # Centroid
	        centroid_weight = self._pairwise_l2(
	            obsv_centers, memory_boxes_3d_predict[None, :, :3]
	        )
	        centroid_weight = torch.exp(-torch.div(centroid_weight, 10.0))

	        # Moving distance should be aligned
	        moving_dist = obsv_centers - memory_boxes_3d[None, :, :3]  # [N, M, 3]
	        motion_weight = self._pairwise_l2(
	            moving_dist, memory_velocities[None, :, :3]
	        )
	        motion_weight = torch.exp(-torch.div(motion_weight, 5.0))

	        if self.with_velocities:
	            # Velocity scores
	            assert (
	                obsv_velocities is not None
	            ), "Please provide velocities if with_velocities=True!"
	            velsim_weight = self._pairwise_l2(
	                obsv_velocities[:, None, :], memory_velocities[None, :, -3:]
	            )
	            cos_sim = torch.exp(-velsim_weight / 5.0)
	        else:
	            # Moving direction should be aligned: cosine between the
	            # displacement and the track velocity, mapped from [-1, 1]
	            # to [0, 1].
	            cos_sim = F.cosine_similarity(  # pylint: disable=not-callable
	                moving_dist[..., :2], memory_velocities[None, :, :2], dim=-1
	            )
	            cos_sim = torch.div(torch.add(cos_sim, 1.0), 2.0)

	        scores_depth = (
	            cos_sim * centroid_weight + (1.0 - cos_sim) * motion_weight
	        )

	        return scores_depth

	    def __call__(
	        self,
	        detections: Tensor,
	        camera_ids: Tensor,
	        detection_scores: Tensor,
	        detections_3d: Tensor,
	        detection_scores_3d: Tensor,
	        detection_class_ids: Tensor,
	        detection_embeddings: Tensor,
	        obs_velocities: Tensor | None = None,
	        memory_boxes_3d: Tensor | None = None,
	        memory_track_ids: Tensor | None = None,
	        memory_class_ids: Tensor | None = None,
	        memory_embeddings: Tensor | None = None,
	        memory_boxes_3d_predict: Tensor | None = None,
	        memory_velocities: Tensor | None = None,
	        with_depth_confidence: bool = True,
	    ) -> tuple[Tensor, Tensor]:
	        """Process inputs, match detections with existing tracks.

	        Args:
	            detections (Tensor): [N, 4] detected boxes.
	            camera_ids (Tensor): [N,] camera ids.
	            detection_scores (Tensor): [N,] confidence scores.
	            detections_3d (Tensor): [N, 7] detected boxes in 3D.
	            detection_scores_3d (Tensor): [N,] confidence scores in 3D.
	            detection_class_ids (Tensor): [N,] class indices.
	            detection_embeddings (Tensor): [N, C] appearance embeddings.
	            obs_velocities (Tensor | None): [N, 3] velocities of detections.
	            memory_boxes_3d (Tensor): [M, 7] boxes in memory.
	            memory_track_ids (Tensor): [M,] track ids in memory.
	            memory_class_ids (Tensor): [M,] class indices in memory.
	            memory_embeddings (Tensor): [M, C] appearance embeddings in
	                memory.
	            memory_boxes_3d_predict (Tensor): [M, 7] predicted boxes in
	                memory.
	            memory_velocities (Tensor): [M, 7] velocities in memory.
	            with_depth_confidence (bool): If True, the detection scores
	                passed to the assignment are multiplied by the 3D
	                confidence scores; otherwise they are used unchanged.

	        Returns:
	            tuple[Tensor, Tensor]: track ids of active tracks and selected
	                detection indices corresponding to tracks.
	        """
	        (
	            detections,
	            detection_scores,
	            detections_3d,
	            detection_scores_3d,
	            detection_class_ids,
	            detection_embeddings,
	            obs_velocities,
	            permute_inds,
	        ) = self._filter_detections(
	            detections,
	            camera_ids,
	            detection_scores,
	            detections_3d,
	            detection_scores_3d,
	            detection_class_ids,
	            detection_embeddings,
	            obs_velocities,
	        )

	        if with_depth_confidence:
	            depth_confidence = detection_scores_3d
	        else:
	            depth_confidence = detection_scores_3d.new_ones(
	                len(detection_scores_3d)
	            )

	        # Match only if there are detections and the memory holds at least
	        # one track; an empty memory tensor is treated like no memory.
	        if (
	            len(detections) > 0
	            and memory_boxes_3d is not None
	            and len(memory_boxes_3d) > 0
	        ):
	            assert (
	                memory_track_ids is not None
	                and memory_class_ids is not None
	                and memory_embeddings is not None
	                and memory_boxes_3d_predict is not None
	                and memory_velocities is not None
	            )

	            # Box 3D: distance between full box vectors, [N, M]
	            bbox3d_weight = self._pairwise_l2(
	                detections_3d[:, None, :], memory_boxes_3d_predict[None, :, :]
	            )
	            scores_iou = torch.exp(-torch.div(bbox3d_weight, 10.0))

	            # Depth Ordering
	            scores_depth = self.depth_ordering(
	                detections_3d,
	                obs_velocities,
	                memory_boxes_3d_predict,
	                memory_boxes_3d,
	                memory_velocities,
	            )

	            # match using bisoftmax metric
	            similarity_scores = calc_bisoftmax_affinity(
	                detection_embeddings,
	                memory_embeddings,
	                detection_class_ids,
	                memory_class_ids,
	            )

	            affinity_scores = (
	                self.bbox_affinity_weight * scores_iou * scores_depth
	                + self.feat_affinity_weight * similarity_scores
	            )
	            affinity_scores /= (
	                self.bbox_affinity_weight + self.feat_affinity_weight
	            )
	            # Zero out pairs whose geometric scores are not positive.
	            affinity_scores = torch.mul(
	                affinity_scores, torch.greater(scores_iou, 0.0).float()
	            )
	            affinity_scores = torch.mul(
	                affinity_scores, torch.greater(scores_depth, 0.0).float()
	            )
	            if self.with_cats:
	                # Only allow matches between identical categories.
	                cat_same = detection_class_ids.view(
	                    -1, 1
	                ) == memory_class_ids.view(1, -1)
	                affinity_scores = torch.mul(affinity_scores, cat_same.float())

	            ids = greedy_assign(
	                detection_scores * depth_confidence,
	                memory_track_ids,
	                affinity_scores,
	                self.match_score_thr,
	                self.obj_score_thr,
	                self.nms_conf_thr,
	            )
	        else:
	            ids = torch.full(
	                (len(detections),),
	                -1,
	                dtype=torch.long,
	                device=detections.device,
	            )

	        # Start a new track for every detection still carrying id -1 whose
	        # score exceeds the initialization threshold.
	        new_inds = (ids == -1) & (detection_scores > self.init_score_thr)
	        ids[new_inds] = TrackIDCounter.get_ids(
	            new_inds.sum(), device=ids.device  # type: ignore
	        )
	        return ids, permute_inds


	def cam_to_global(
	    boxes_3d_list: list[Tensor], extrinsics: Tensor
	) -> list[Tensor]:
	    """Convert camera coordinates to global coordinates.

	    Each non-empty box tensor is updated in place using the extrinsics at
	    the same index: columns 0:3 go through ``transform_points``, columns
	    6:9 through ``rotate_orientation`` and columns 9:12 through
	    ``rotate_velocities``. Empty tensors are left untouched.

	    Args:
	        boxes_3d_list (list[Tensor]): One box tensor per camera.
	        extrinsics (Tensor): Extrinsics, indexed by list position.

	    Returns:
	        list[Tensor]: The same list object that was passed in.
	    """
	    for cam_idx, cam_boxes in enumerate(boxes_3d_list):
	        if len(cam_boxes) == 0:
	            continue

	        cam_extrinsics = extrinsics[cam_idx]
	        cam_boxes[:, :3] = transform_points(cam_boxes[:, :3], cam_extrinsics)
	        cam_boxes[:, 6:9] = rotate_orientation(
	            cam_boxes[:, 6:9], cam_extrinsics
	        )
	        cam_boxes[:, 9:12] = rotate_velocities(
	            cam_boxes[:, 9:12], cam_extrinsics
	        )
	    return boxes_3d_list