# Copyright (c) Meta Platforms, Inc. and affiliates
import math

import numpy as np
import pandas as pd
import torch

from detectron2.data import MetadataCatalog
from detectron2.structures import BoxMode
from pytorch3d.renderer import (
    MeshRasterizer,
    MeshRenderer as MR,
    PerspectiveCameras,
    RasterizationSettings,
    SoftSilhouetteShader,
    TexturesVertex,
)
from pytorch3d.renderer.lighting import PointLights
from pytorch3d.renderer.mesh.shader import SoftPhongShader
from pytorch3d.structures import Meshes
from pytorch3d.transforms import axis_angle_to_matrix
from pytorch3d.transforms.math import acos_linear_extrapolation

UNIT_CUBE = np.array([
    [-0.5, -0.5, -0.5],
    [ 0.5, -0.5, -0.5],
    [ 0.5,  0.5, -0.5],
    [-0.5,  0.5, -0.5],
    [-0.5, -0.5,  0.5],
    [ 0.5, -0.5,  0.5],
    [ 0.5,  0.5,  0.5],
    [-0.5,  0.5,  0.5]
])

def upto_2Pi(val):

    out = val

    # constrain between [0, 2pi)
    while out >= 2 * math.pi: out -= math.pi * 2
    while out < 0: out += math.pi * 2

    return out


def upto_Pi(val):

    out = val

    # constrain between [0, pi)
    while out >= math.pi: out -= math.pi
    while out < 0: out += math.pi

    return out

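# Quick sanity check (illustrative only, values rounded):
#   upto_2Pi(7.0)  -> 7.0 - 2*pi ~= 0.7168   (wrapped into [0, 2*pi))
#   upto_Pi(-0.5)  -> -0.5 + pi  ~= 2.6416   (wrapped into [0, pi))
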
# Calculates rotation matrix to euler angles
# The result is the same as MATLAB except the order
# of the euler angles ( x and z are swapped ).
# adopted from https://www.learnopencv.com/rotation-matrix-to-euler-angles/
def mat2euler(R):

    sy = math.sqrt(R[0, 0] * R[0, 0] + R[1, 0] * R[1, 0])
    #singular = sy < 1e-6

    x = math.atan2(R[2, 1], R[2, 2])
    y = math.atan2(-R[2, 0], sy)
    z = math.atan2(R[1, 0], R[0, 0])

    return np.array([x, y, z])

# Calculates Rotation Matrix given euler angles.
# adopted from https://www.learnopencv.com/rotation-matrix-to-euler-angles/
def euler2mat(euler):

    R_x = np.array([[1, 0, 0],
                    [0, math.cos(euler[0]), -math.sin(euler[0])],
                    [0, math.sin(euler[0]), math.cos(euler[0])]
                    ])

    R_y = np.array([[math.cos(euler[1]), 0, math.sin(euler[1])],
                    [0, 1, 0],
                    [-math.sin(euler[1]), 0, math.cos(euler[1])]
                    ])

    R_z = np.array([[math.cos(euler[2]), -math.sin(euler[2]), 0],
                    [math.sin(euler[2]), math.cos(euler[2]), 0],
                    [0, 0, 1]
                    ])

    R = np.dot(R_z, np.dot(R_y, R_x))
    return R

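# Round-trip sketch (illustrative only): euler2mat and mat2euler are inverses for
# angles inside the usual atan2 ranges, e.g.
#   R = euler2mat([0.1, 0.2, 0.3])
#   mat2euler(R)   # -> approximately array([0.1, 0.2, 0.3])
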
def euler2mat_torch(euler):

    R_x = torch.stack([
        torch.tensor([[1, 0, 0],
                      [0, torch.cos(angle), -torch.sin(angle)],
                      [0, torch.sin(angle), torch.cos(angle)]])
        for angle in euler[:, 0]
    ])

    R_y = torch.stack([
        torch.tensor([[torch.cos(angle), 0, torch.sin(angle)],
                      [0, 1, 0],
                      [-torch.sin(angle), 0, torch.cos(angle)]])
        for angle in euler[:, 1]
    ])

    R_z = torch.stack([
        torch.tensor([[torch.cos(angle), -torch.sin(angle), 0],
                      [torch.sin(angle), torch.cos(angle), 0],
                      [0, 0, 1]])
        for angle in euler[:, 2]
    ])

    R = torch.matmul(R_z, torch.matmul(R_y, R_x))

    # (n x 3 x 3 out tensor)
    return R

def to_float_tensor(input):

    data_type = type(input)

    if data_type != torch.Tensor:
        input = torch.tensor(input)

    return input.float()

def get_cuboid_verts_faces(box3d=None, R=None):
    """
    Computes vertices and faces from a 3D cuboid representation.
    Args:
        box3d (flexible): [[X Y Z W H L]]
        R (flexible): [np.array(3x3)]
    Returns:
        verts: the 3D vertices of the cuboid in camera space
        faces: the vertex indices per face
    """
    if box3d is None:
        box3d = [0, 0, 0, 1, 1, 1]

    # make sure types are correct
    box3d = to_float_tensor(box3d)

    if R is not None:
        R = to_float_tensor(R)

    squeeze = len(box3d.shape) == 1

    if squeeze:
        box3d = box3d.unsqueeze(0)
        if R is not None:
            R = R.unsqueeze(0)

    n = len(box3d)

    x3d = box3d[:, 0].unsqueeze(1)
    y3d = box3d[:, 1].unsqueeze(1)
    z3d = box3d[:, 2].unsqueeze(1)

    w3d = box3d[:, 3].unsqueeze(1)
    h3d = box3d[:, 4].unsqueeze(1)
    l3d = box3d[:, 5].unsqueeze(1)

    '''
              v4_____________________v5
              /|                    /|
             / |                   / |
            /  |                  /  |
           /___|_________________/   |
        v0|    |                 |v1 |
          |    |                 |   |
          |    |                 |   |
          |    |                 |   |
          |    |_________________|___|
          |   / v7               |   /v6
          |  /                   |  /
          | /                    | /
          |/_____________________|/
          v3                     v2
    '''

    verts = to_float_tensor(torch.zeros([n, 3, 8], device=box3d.device))

    # setup X
    verts[:, 0, [0, 3, 4, 7]] = -l3d / 2
    verts[:, 0, [1, 2, 5, 6]] = l3d / 2

    # setup Y
    verts[:, 1, [0, 1, 4, 5]] = -h3d / 2
    verts[:, 1, [2, 3, 6, 7]] = h3d / 2

    # setup Z
    verts[:, 2, [0, 1, 2, 3]] = -w3d / 2
    verts[:, 2, [4, 5, 6, 7]] = w3d / 2

    if R is not None:
        # rotate
        verts = R @ verts

    # translate
    verts[:, 0, :] += x3d
    verts[:, 1, :] += y3d
    verts[:, 2, :] += z3d

    verts = verts.transpose(1, 2)

    faces = torch.tensor([
        [0, 1, 2],  # front TR
        [2, 3, 0],  # front BL
        [1, 5, 6],  # right TR
        [6, 2, 1],  # right BL
        [4, 0, 3],  # left TR
        [3, 7, 4],  # left BL
        [5, 4, 7],  # back TR
        [7, 6, 5],  # back BL
        [4, 5, 1],  # top TR
        [1, 0, 4],  # top BL
        [3, 2, 6],  # bottom TR
        [6, 7, 3],  # bottom BL
    ]).float().unsqueeze(0).repeat([n, 1, 1])

    if squeeze:
        verts = verts.squeeze()
        faces = faces.squeeze()

    return verts, faces.to(verts.device)

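# Example (illustrative sketch): a 2m-wide, 1m-tall, 3m-long cuboid centered 5m
# in front of the camera, with no rotation.
#   verts, faces = get_cuboid_verts_faces([0.0, 0.0, 5.0, 2.0, 1.0, 3.0])
#   verts.shape  # torch.Size([8, 3])
#   faces.shape  # torch.Size([12, 3])
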
def get_cuboid_verts(K, box3d, R=None, view_R=None, view_T=None):

    # make sure types are correct
    K = to_float_tensor(K)
    box3d = to_float_tensor(box3d)

    if R is not None:
        R = to_float_tensor(R)

    squeeze = len(box3d.shape) == 1

    if squeeze:
        box3d = box3d.unsqueeze(0)
        if R is not None:
            R = R.unsqueeze(0)

    n = len(box3d)

    if len(K.shape) == 2:
        K = K.unsqueeze(0).repeat([n, 1, 1])

    corners_3d, _ = get_cuboid_verts_faces(box3d, R)

    if view_T is not None:
        corners_3d -= view_T.view(1, 1, 3)
    if view_R is not None:
        corners_3d = (view_R @ corners_3d[0].T).T.unsqueeze(0)
    if view_T is not None:
        corners_3d[:, :, -1] += view_T.view(1, 1, 3)[:, :, -1] * 1.25

    # project to 2D
    corners_2d = K @ corners_3d.transpose(1, 2)
    corners_2d[:, :2, :] = corners_2d[:, :2, :] / corners_2d[:, 2, :].unsqueeze(1)
    corners_2d = corners_2d.transpose(1, 2)

    if squeeze:
        corners_3d = corners_3d.squeeze()
        corners_2d = corners_2d.squeeze()

    return corners_2d, corners_3d

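# Example (illustrative sketch, with a hypothetical intrinsics matrix):
#   K = np.array([[500., 0., 320.], [0., 500., 240.], [0., 0., 1.]])
#   corners_2d, corners_3d = get_cuboid_verts(K, [0.0, 0.0, 5.0, 2.0, 1.0, 3.0])
#   corners_2d[:, :2] are pixel coordinates; corners_2d[:, 2] keeps the depth.
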
def approx_eval_resolution(h, w, scale_min=0, scale_max=1e10):
    """
    Approximates the resolution that an h x w image would be resized to when
    run through a model, constraining its scale to a minimum and maximum.
    Args:
        h (int): input resolution height
        w (int): input resolution width
        scale_min (int): minimum scale allowed to resize to
        scale_max (int): maximum scale allowed to resize to
    Returns:
        h (int): output resolution height
        w (int): output resolution width
        sf (float): scaling factor that was applied,
            which can convert from original --> network resolution.
    """
    orig_h = h

    # first resize to min
    sf = scale_min / min(h, w)
    h *= sf
    w *= sf

    # next resize to max
    sf = min(scale_max / max(h, w), 1.0)
    h *= sf
    w *= sf

    return h, w, h / orig_h

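# Worked example (illustrative only): approx_eval_resolution(480, 640, 512, 1024)
# first scales by 512/480 ~= 1.067 (-> about 512 x 683); the max constraint of 1024
# then leaves it unchanged, so it returns roughly (512.0, 682.7, 1.067).
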
def compute_priors(cfg, datasets, max_cluster_rounds=1000, min_points_for_std=5, n_bins=None):
    """
    Computes priors via simple averaging or a custom K-Means clustering.
    """

    annIds = datasets.getAnnIds()
    anns = datasets.loadAnns(annIds)

    data_raw = []

    category_names = MetadataCatalog.get('omni3d_model').thing_classes

    virtual_depth = cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH
    virtual_focal = cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL
    test_scale_min = cfg.INPUT.MIN_SIZE_TEST
    test_scale_max = cfg.INPUT.MAX_SIZE_TEST

    '''
    Accumulate the annotations while discarding the 2D center information
    (hence, keeping only the 2D and 3D scale information, and properties.)
    '''
    for ann_idx, ann in enumerate(anns):

        category_name = ann['category_name'].lower()

        ignore = ann['ignore']
        dataset_id = ann['dataset_id']
        image_id = ann['image_id']

        fy = datasets.imgs[image_id]['K'][1][1]
        im_h = datasets.imgs[image_id]['height']
        im_w = datasets.imgs[image_id]['width']
        f = 2 * fy / im_h

        if cfg.DATASETS.MODAL_2D_BOXES and 'bbox2D_tight' in ann and ann['bbox2D_tight'][0] != -1:
            x, y, w, h = BoxMode.convert(ann['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)

        elif cfg.DATASETS.TRUNC_2D_BOXES and 'bbox2D_trunc' in ann and not np.all([val == -1 for val in ann['bbox2D_trunc']]):
            x, y, w, h = BoxMode.convert(ann['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)

        elif 'bbox2D_proj' in ann:
            x, y, w, h = BoxMode.convert(ann['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)

        else:
            continue

        x3d, y3d, z3d = ann['center_cam']
        w3d, h3d, l3d = ann['dimensions']

        test_h, test_w, sf = approx_eval_resolution(im_h, im_w, test_scale_min, test_scale_max)

        # scale everything to test resolution
        h *= sf
        w *= sf

        if virtual_depth:
            virtual_to_real = compute_virtual_scale_from_focal_spaces(fy, im_h, virtual_focal, test_h)
            real_to_virtual = 1 / virtual_to_real
            z3d *= real_to_virtual

        scale = np.sqrt(h**2 + w**2)

        if (not ignore) and category_name in category_names:
            data_raw.append([category_name, w, h, x3d, y3d, z3d, w3d, h3d, l3d, w3d*h3d*l3d, dataset_id, image_id, fy, f, scale])

    # TODO pandas is fairly inefficient to rely on for large scale.
    df_raw = pd.DataFrame(data_raw, columns=[
        'name',
        'w', 'h', 'x3d', 'y3d', 'z3d',
        'w3d', 'h3d', 'l3d', 'volume',
        'dataset', 'image',
        'fy', 'f', 'scale'
    ])
    # ^ the elements ending in w/h/l3d are the actual sizes, while the x/y/z3d are the camera perspective sizes.

    priors_bins = []
    priors_dims_per_cat = []
    priors_z3d_per_cat = []
    priors_y3d_per_cat = []

    # compute priors for z and y globally
    priors_z3d = [df_raw.z3d.mean(), df_raw.z3d.std()]
    priors_y3d = [df_raw.y3d.mean(), df_raw.y3d.std()]

    if n_bins is None:
        n_bins = cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS

    # Each prior is pre-computed per category
    for cat in category_names:

        df_cat = df_raw[df_raw.name == cat]

        '''
        First compute static variable statistics
        '''
        scales = torch.FloatTensor(np.array(df_cat.scale))
        n = len(scales)

        if n > 0:
            priors_dims_per_cat.append([[df_cat.w3d.mean(), df_cat.h3d.mean(), df_cat.l3d.mean()], [df_cat.w3d.std(), df_cat.h3d.std(), df_cat.l3d.std()]])
            priors_z3d_per_cat.append([df_cat.z3d.mean(), df_cat.z3d.std()])
            priors_y3d_per_cat.append([df_cat.y3d.mean(), df_cat.y3d.std()])
        else:
            # dummy data.
            priors_dims_per_cat.append([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])
            priors_z3d_per_cat.append([50, 50])
            priors_y3d_per_cat.append([1, 10])

        '''
        Next compute Z cluster statistics based on y and area
        '''
        def compute_cluster_scale_mean(scales, assignments, n_bins, match_quality):

            cluster_scales = []

            for bin in range(n_bins):

                in_cluster = assignments == bin

                if in_cluster.sum() < min_points_for_std:
                    in_cluster[match_quality[:, bin].topk(min_points_for_std)[1]] = True

                scale = scales[in_cluster].mean()
                cluster_scales.append(scale.item())

            return torch.FloatTensor(cluster_scales)

        if n_bins > 1:

            if n < min_points_for_std:

                print('Warning {} category has only {} valid samples...'.format(cat, n))

                # dummy data since category doesn't have available samples.
                max_scale = cfg.MODEL.ANCHOR_GENERATOR.SIZES[-1][-1]
                min_scale = cfg.MODEL.ANCHOR_GENERATOR.SIZES[0][0]
                base = (max_scale / min_scale) ** (1 / (n_bins - 1))
                cluster_scales = np.array([min_scale * (base ** i) for i in range(0, n_bins)])

                # default values are unused anyways in training. but range linearly
                # from 100 to 1 and ascend with 2D scale.
                bin_priors_z = [[b, 15] for b in np.arange(100, 1, -(100 - 1) / n_bins)]
                priors_bins.append((cat, cluster_scales.tolist(), bin_priors_z))

                assert len(bin_priors_z) == n_bins, 'Broken default bin scaling.'

            else:
                max_scale = scales.max()
                min_scale = scales.min()
                base = (max_scale / min_scale) ** (1 / (n_bins - 1))
                cluster_scales = torch.FloatTensor([min_scale * (base ** i) for i in range(0, n_bins)])

                best_score = -np.inf

                for round in range(max_cluster_rounds):

                    # quality scores for gts and clusters (n x n_bins)
                    match_quality = -(cluster_scales.unsqueeze(0) - scales.unsqueeze(1)).abs()

                    # assign to best clusters
                    scores, assignments_round = match_quality.max(1)
                    round_score = scores.mean().item()

                    if np.round(round_score, 5) > best_score:
                        best_score = round_score
                        assignments = assignments_round

                        # make new clusters
                        cluster_scales = compute_cluster_scale_mean(scales, assignments, n_bins, match_quality)

                    else:
                        break

                bin_priors_z = []

                for bin in range(n_bins):

                    in_cluster = assignments == bin

                    # not enough in the cluster to compute reliable stats?
                    # fill it with the topk others
                    if in_cluster.sum() < min_points_for_std:
                        in_cluster[match_quality[:, bin].topk(min_points_for_std)[1]] = True

                    # move to numpy for indexing pandas
                    in_cluster = in_cluster.numpy()

                    z3d_mean = df_cat.z3d[in_cluster].mean()
                    z3d_std = df_cat.z3d[in_cluster].std()

                    bin_priors_z.append([z3d_mean, z3d_std])

                priors_bins.append((cat, cluster_scales.numpy().tolist(), bin_priors_z))

    priors = {
        'priors_dims_per_cat': priors_dims_per_cat,
        'priors_z3d_per_cat': priors_z3d_per_cat,
        'priors_y3d_per_cat': priors_y3d_per_cat,
        'priors_bins': priors_bins,
        'priors_y3d': priors_y3d,
        'priors_z3d': priors_z3d,
    }

    return priors

def compute_priors_custom(cfg, datasets, max_cluster_rounds=1000, min_points_for_std=5):
    """
    Simplification of the standard compute_priors function.
    Computes priors via simple averaging.
    """
    annIds = datasets.getAnnIds()
    anns = datasets.loadAnns(annIds)

    data_raw = []

    category_names = MetadataCatalog.get('omni3d_model').thing_classes

    virtual_depth = cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH
    virtual_focal = cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL
    test_scale_min = cfg.INPUT.MIN_SIZE_TEST
    test_scale_max = cfg.INPUT.MAX_SIZE_TEST

    '''
    Accumulate the annotations while discarding the 2D center information
    (hence, keeping only the 2D and 3D scale information, and properties.)
    '''
    for ann_idx, ann in enumerate(anns):

        category_name = ann['category_name'].lower()

        ignore = ann['ignore']
        dataset_id = ann['dataset_id']
        image_id = ann['image_id']

        fy = datasets.imgs[image_id]['K'][1][1]
        im_h = datasets.imgs[image_id]['height']
        im_w = datasets.imgs[image_id]['width']
        f = 2 * fy / im_h

        if cfg.DATASETS.MODAL_2D_BOXES and 'bbox2D_tight' in ann and ann['bbox2D_tight'][0] != -1:
            x, y, w, h = BoxMode.convert(ann['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)

        elif cfg.DATASETS.TRUNC_2D_BOXES and 'bbox2D_trunc' in ann and not np.all([val == -1 for val in ann['bbox2D_trunc']]):
            x, y, w, h = BoxMode.convert(ann['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)

        elif 'bbox2D_proj' in ann:
            x, y, w, h = BoxMode.convert(ann['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)

        else:
            continue

        x3d, y3d, z3d = ann['center_cam']
        w3d, h3d, l3d = ann['dimensions']

        test_h, test_w, sf = approx_eval_resolution(im_h, im_w, test_scale_min, test_scale_max)

        # scale everything to test resolution
        h *= sf
        w *= sf

        if virtual_depth:
            virtual_to_real = compute_virtual_scale_from_focal_spaces(fy, im_h, virtual_focal, test_h)
            real_to_virtual = 1 / virtual_to_real
            z3d *= real_to_virtual

        scale = np.sqrt(h**2 + w**2)

        if (not ignore) and category_name in category_names:
            data_raw.append([category_name, w, h, x3d, y3d, z3d, w3d, h3d, l3d, w3d*h3d*l3d, dataset_id, image_id, fy, f, scale])

    # TODO pandas is fairly inefficient to rely on for large scale.
    df_raw = pd.DataFrame(data_raw, columns=[
        'name',
        'w', 'h', 'x3d', 'y3d', 'z3d',
        'w3d', 'h3d', 'l3d', 'volume',
        'dataset', 'image',
        'fy', 'f', 'scale'
    ])
    # ^ the elements ending in w/h/l3d are the actual sizes, while the x/y/z3d are the camera perspective sizes.

    priors_bins = []
    priors_dims_per_cat = []
    priors_z3d_per_cat = []
    priors_y3d_per_cat = []

    # compute priors for z and y globally
    priors_z3d = [df_raw.z3d.mean(), df_raw.z3d.std()]
    priors_y3d = [df_raw.y3d.mean(), df_raw.y3d.std()]

    # Each prior is pre-computed per category
    for cat in category_names:

        df_cat = df_raw[df_raw.name == cat]

        '''
        First compute static variable statistics
        '''
        scales = torch.FloatTensor(np.array(df_cat.scale))
        n = len(scales)

        # NOTE: this condition is always False, so the per-category statistics
        # below are never computed and zero-valued priors are used throughout.
        if None:
            priors_dims_per_cat.append([[df_cat.w3d.mean(), df_cat.h3d.mean(), df_cat.l3d.mean()], [df_cat.w3d.std(), df_cat.h3d.std(), df_cat.l3d.std()]])
            priors_z3d_per_cat.append([df_cat.z3d.mean(), df_cat.z3d.std()])
            priors_y3d_per_cat.append([df_cat.y3d.mean(), df_cat.y3d.std()])
        else:
            # dummy data.
            priors_dims_per_cat.append([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
            priors_z3d_per_cat.append([0, 0])
            priors_y3d_per_cat.append([0, 0])

    priors = {
        'priors_dims_per_cat': priors_dims_per_cat,
        'priors_z3d_per_cat': priors_z3d_per_cat,
        'priors_y3d_per_cat': priors_y3d_per_cat,
        'priors_bins': priors_bins,
        'priors_y3d': priors_y3d,
        'priors_z3d': priors_z3d,
    }

    return priors

def convert_3d_box_to_2d(K, box3d, R=None, clipw=0, cliph=0, XYWH=True, min_z=0.20):
    """
    Converts a 3D box to a 2D box via projection.
    Args:
        K (np.array): intrinsics matrix 3x3
        box3d (flexible): [[X Y Z W H L]]
        R (flexible): [np.array(3x3)]
        clipw (int): clip invalid X to the image bounds. Image width is usually used here.
        cliph (int): clip invalid Y to the image bounds. Image height is usually used here.
        XYWH (bool): returns in XYWH if true, otherwise XYXY format.
        min_z: the threshold for how close a vertex is allowed to be before being
            considered as invalid for projection purposes.
    Returns:
        box2d (flexible): the 2D box results.
        behind_camera (bool): whether the projection has any points behind the camera plane.
        fully_behind (bool): all points are behind the camera plane.
    """

    # bounds used for vertices behind image plane
    topL_bound = torch.tensor([[0, 0, 0]]).float()
    topR_bound = torch.tensor([[clipw - 1, 0, 0]]).float()
    botL_bound = torch.tensor([[0, cliph - 1, 0]]).float()
    botR_bound = torch.tensor([[clipw - 1, cliph - 1, 0]]).float()

    # make sure types are correct
    K = to_float_tensor(K)
    box3d = to_float_tensor(box3d)

    if R is not None:
        R = to_float_tensor(R)

    squeeze = len(box3d.shape) == 1

    if squeeze:
        box3d = box3d.unsqueeze(0)
        if R is not None:
            R = R.unsqueeze(0)

    n = len(box3d)

    verts2d, verts3d = get_cuboid_verts(K, box3d, R)

    # any boxes behind camera plane?
    verts_behind = verts2d[:, :, 2] <= min_z
    behind_camera = verts_behind.any(1)

    verts_signs = torch.sign(verts3d)

    # check for any boxes projected behind image plane corners
    topL = verts_behind & (verts_signs[:, :, 0] < 0) & (verts_signs[:, :, 1] < 0)
    topR = verts_behind & (verts_signs[:, :, 0] > 0) & (verts_signs[:, :, 1] < 0)
    botL = verts_behind & (verts_signs[:, :, 0] < 0) & (verts_signs[:, :, 1] > 0)
    botR = verts_behind & (verts_signs[:, :, 0] > 0) & (verts_signs[:, :, 1] > 0)

    # clip values to be in bounds for invalid points
    verts2d[topL] = topL_bound
    verts2d[topR] = topR_bound
    verts2d[botL] = botL_bound
    verts2d[botR] = botR_bound

    x, xi = verts2d[:, :, 0].min(1)
    y, yi = verts2d[:, :, 1].min(1)
    x2, x2i = verts2d[:, :, 0].max(1)
    y2, y2i = verts2d[:, :, 1].max(1)

    fully_behind = verts_behind.all(1)

    width = x2 - x
    height = y2 - y

    if XYWH:
        box2d = torch.cat((x.unsqueeze(1), y.unsqueeze(1), width.unsqueeze(1), height.unsqueeze(1)), dim=1)
    else:
        box2d = torch.cat((x.unsqueeze(1), y.unsqueeze(1), x2.unsqueeze(1), y2.unsqueeze(1)), dim=1)

    if squeeze:
        box2d = box2d.squeeze()
        behind_camera = behind_camera.squeeze()
        fully_behind = fully_behind.squeeze()

    return box2d, behind_camera, fully_behind

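# Example (illustrative sketch, hypothetical intrinsics): project a 3D box into a
# 640x480 image and read back an [x, y, w, h] box (XYWH=True is the default).
#   K = np.array([[500., 0., 320.], [0., 500., 240.], [0., 0., 1.]])
#   box2d, behind, fully_behind = convert_3d_box_to_2d(
#       K, [0.0, 0.0, 5.0, 2.0, 1.0, 3.0], clipw=640, cliph=480)
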
def compute_virtual_scale_from_focal_spaces(f, H, f0, H0):
    """
    Computes the scaling factor of depth from f0, H0 to f, H
    Args:
        f (float): the desired [virtual] focal length (px)
        H (float): the desired [virtual] height (px)
        f0 (float): the initial [real] focal length (px)
        H0 (float): the initial [real] height (px)
    Returns:
        the scaling factor (float) to convert from (f0, H0) --> (f, H)
    """
    return (H0 * f) / (f0 * H)

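# Worked example (illustrative only): with f0=1000, H0=1000 and a virtual camera of
# f=512, H=512, the factor is (1000*512)/(1000*512) = 1.0, i.e. depth is unchanged
# when the focal length and image height scale together.
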
def R_to_allocentric(K, R, u=None, v=None):
    """
    Convert a rotation matrix or series of rotation matrices to allocentric
    representation given a 2D location (u, v) in pixels.
    When u or v are not available, we fall back on the principal point of K.
    """
    if type(K) == torch.Tensor:
        fx = K[:, 0, 0]
        fy = K[:, 1, 1]
        sx = K[:, 0, 2]
        sy = K[:, 1, 2]

        n = len(K)

        oray = torch.stack(((u - sx) / fx, (v - sy) / fy, torch.ones_like(u))).T
        oray = oray / torch.linalg.norm(oray, dim=1).unsqueeze(1)
        angle = torch.acos(oray[:, -1])

        axis = torch.zeros_like(oray)
        axis[:, 0] = axis[:, 0] - oray[:, 1]
        axis[:, 1] = axis[:, 1] + oray[:, 0]
        norms = torch.linalg.norm(axis, dim=1)

        valid_angle = angle > 0

        M = axis_angle_to_matrix(angle.unsqueeze(1) * axis / norms.unsqueeze(1))

        R_view = R.clone()
        R_view[valid_angle] = torch.bmm(M[valid_angle].transpose(2, 1), R[valid_angle])

    else:
        fx = K[0][0]
        fy = K[1][1]
        sx = K[0][2]
        sy = K[1][2]

        if u is None:
            u = sx
        if v is None:
            v = sy

        oray = np.array([(u - sx) / fx, (v - sy) / fy, 1])
        oray = oray / np.linalg.norm(oray)
        cray = np.array([0, 0, 1])
        angle = math.acos(cray.dot(oray))
        if angle != 0:
            axis = np.cross(cray, oray)
            axis_torch = torch.from_numpy(angle * axis / np.linalg.norm(axis)).float()
            R_view = np.dot(axis_angle_to_matrix(axis_torch).numpy().T, R)
        else:
            R_view = R

    return R_view

def R_from_allocentric(K, R_view, u=None, v=None):
    """
    Convert a rotation matrix or series of rotation matrices to egocentric
    representation given a 2D location (u, v) in pixels.
    When u or v are not available, we fall back on the principal point of K.
    """
    if type(K) == torch.Tensor:
        fx = K[:, 0, 0]
        fy = K[:, 1, 1]
        sx = K[:, 0, 2]
        sy = K[:, 1, 2]

        n = len(K)

        oray = torch.stack(((u - sx) / fx, (v - sy) / fy, torch.ones_like(u))).T
        oray = oray / torch.linalg.norm(oray, dim=1).unsqueeze(1)
        angle = torch.acos(oray[:, -1])

        axis = torch.zeros_like(oray)
        axis[:, 0] = axis[:, 0] - oray[:, 1]
        axis[:, 1] = axis[:, 1] + oray[:, 0]
        norms = torch.linalg.norm(axis, dim=1)

        valid_angle = angle > 0

        M = axis_angle_to_matrix(angle.unsqueeze(1) * axis / norms.unsqueeze(1))

        R = R_view.clone()
        R[valid_angle] = torch.bmm(M[valid_angle], R_view[valid_angle])

    else:
        fx = K[0][0]
        fy = K[1][1]
        sx = K[0][2]
        sy = K[1][2]

        if u is None:
            u = sx
        if v is None:
            v = sy

        oray = np.array([(u - sx) / fx, (v - sy) / fy, 1])
        oray = oray / np.linalg.norm(oray)
        cray = np.array([0, 0, 1])
        angle = math.acos(cray.dot(oray))
        if angle != 0:
            #axis = np.cross(cray, oray)
            axis = np.array([-oray[1], oray[0], 0])
            axis_torch = torch.from_numpy(angle * axis / np.linalg.norm(axis)).float()
            R = np.dot(axis_angle_to_matrix(axis_torch).numpy(), R_view)
        else:
            R = R_view

    return R

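# Round-trip sketch (illustrative only, numpy branch with a hypothetical K):
#   R_alloc = R_to_allocentric(K, R, u=150.0, v=200.0)
#   R_back  = R_from_allocentric(K, R_alloc, u=150.0, v=200.0)   # R_back ~= R
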
def render_depth_map(K, box3d, pose, width, height, device=None):

    cameras = get_camera(K, width, height)
    renderer = get_basic_renderer(cameras, width, height)

    mesh = mesh_cuboid(box3d, pose)

    if device is not None:
        cameras = cameras.to(device)
        renderer = renderer.to(device)
        mesh = mesh.to(device)

    im_rendered, fragment = renderer(mesh)
    silhouettes = im_rendered[:, :, :, -1] > 0

    zbuf = fragment.zbuf[:, :, :, 0]
    zbuf[zbuf == -1] = math.inf
    depth_map, depth_map_inds = zbuf.min(dim=0)

    return silhouettes, depth_map, depth_map_inds

def estimate_visibility(K, box3d, pose, width, height, device=None):

    silhouettes, depth_map, depth_map_inds = render_depth_map(K, box3d, pose, width, height, device=device)

    n = silhouettes.shape[0]

    visibilities = []

    for annidx in range(n):
        area = silhouettes[annidx].sum()
        visible = (depth_map_inds[silhouettes[annidx]] == annidx).sum()
        visibilities.append((visible / area).item())

    return visibilities

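# Usage sketch (illustrative; `boxes_3d` and `poses` are hypothetical stacked
# per-instance inputs accepted by mesh_cuboid):
#   vis = estimate_visibility(K, boxes_3d, poses, width=640, height=480)
#   vis[i] is the fraction of instance i's rendered silhouette that is closest
#   to the camera, i.e. not occluded by the other boxes.
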
def estimate_truncation(K, box3d, R, imW, imH):

    box2d, out_of_bounds, fully_behind = convert_3d_box_to_2d(K, box3d, R, imW, imH)

    if fully_behind:
        return 1.0

    box2d = box2d.detach().cpu().numpy().tolist()
    box2d_XYXY = BoxMode.convert(box2d, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
    image_box = np.array([0, 0, imW - 1, imH - 1])

    truncation = 1 - iou(np.array(box2d_XYXY)[np.newaxis], image_box[np.newaxis], ign_area_b=True)
    return truncation.item()

def mesh_cuboid(box3d=None, R=None, color=None):

    verts, faces = get_cuboid_verts_faces(box3d, R)

    if verts.ndim == 2:
        verts = to_float_tensor(verts).unsqueeze(0)
        faces = to_float_tensor(faces).unsqueeze(0)

    ninstances = len(verts)

    if isinstance(color, (tuple, list)) and len(color) == 3:
        color = torch.tensor(color).view(1, 1, 3).expand(ninstances, 8, 3).float()

    # pass in a tensor of colors per box
    elif color is not None and color.ndim == 2:
        color = to_float_tensor(color).unsqueeze(1).expand(ninstances, 8, 3).float()

    device = verts.device

    mesh = Meshes(verts=verts, faces=faces, textures=None if color is None else TexturesVertex(verts_features=color).to(device))

    return mesh

def get_camera(K, width, height, switch_hands=True, R=None, T=None):

    K = to_float_tensor(K)

    if switch_hands:
        K = K @ torch.tensor([
            [-1, 0, 0],
            [0, -1, 0],
            [0, 0, 1]
        ]).float()

    fx = K[0, 0]
    fy = K[1, 1]
    px = K[0, 2]
    py = K[1, 2]

    if R is None:
        camera = PerspectiveCameras(
            focal_length=((fx, fy),), principal_point=((px, py),),
            image_size=((height, width),), in_ndc=False
        )
    else:
        camera = PerspectiveCameras(
            focal_length=((fx, fy),), principal_point=((px, py),),
            image_size=((height, width),), in_ndc=False, R=R, T=T
        )

    return camera

def get_basic_renderer(cameras, width, height, use_color=False):

    raster_settings = RasterizationSettings(
        image_size=(height, width),
        blur_radius=0 if use_color else np.log(1. / 1e-4 - 1.) * 1e-4,
        faces_per_pixel=1,
        perspective_correct=False,
    )

    if use_color:
        # SoftPhongShader, HardPhongShader, HardFlatShader, SoftGouraudShader
        lights = PointLights(location=[[0.0, 0.0, 0.0]])
        shader = SoftPhongShader(cameras=cameras, lights=lights)
    else:
        shader = SoftSilhouetteShader()

    renderer = MeshRenderer(
        rasterizer=MeshRasterizer(
            cameras=cameras,
            raster_settings=raster_settings,
        ),
        shader=shader
    )

    return renderer

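# Usage sketch (illustrative only): build a silhouette renderer for a hypothetical
# 640x480 camera and render a single cuboid mesh; the (images, fragments) pair comes
# from the MeshRenderer subclass defined below.
#   cameras = get_camera(K, width=640, height=480)
#   renderer = get_basic_renderer(cameras, 640, 480)
#   images, fragments = renderer(mesh_cuboid([0.0, 0.0, 5.0, 1.0, 1.0, 1.0], torch.eye(3)))
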
class MeshRenderer(MR):
    def __init__(self, rasterizer, shader):
        super().__init__(rasterizer, shader)

    def forward(self, meshes_world, **kwargs) -> torch.Tensor:
        fragments = self.rasterizer(meshes_world, **kwargs)
        images = self.shader(fragments, meshes_world, **kwargs)
        return images, fragments

def iou(box_a, box_b, mode='cross', ign_area_b=False):
    """
    Computes the amount of Intersection over Union (IoU) between two different sets of boxes.
    Args:
        box_a (array or tensor): Mx4 boxes, defined by [x1, y1, x2, y2]
        box_b (array or tensor): Nx4 boxes, defined by [x1, y1, x2, y2]
        mode (str): either 'cross' or 'list', where cross will check all combinations of box_a and
            box_b hence MxN array, and list expects the same size list M == N, hence returns Mx1 array.
        ign_area_b (bool): if true then we ignore area of b. e.g., checking % box a is inside b
    """
    data_type = type(box_a)

    # this mode computes the IoU in the sense of cross.
    # i.e., box_a = M x 4, box_b = N x 4 then the output is M x N
    if mode == 'cross':

        inter = intersect(box_a, box_b, mode=mode)
        area_a = ((box_a[:, 2] - box_a[:, 0]) *
                  (box_a[:, 3] - box_a[:, 1]))
        area_b = ((box_b[:, 2] - box_b[:, 0]) *
                  (box_b[:, 3] - box_b[:, 1]))

        # torch.Tensor
        if data_type == torch.Tensor:
            union = area_a.unsqueeze(0)
            if not ign_area_b:
                union = union + area_b.unsqueeze(1) - inter
            return (inter / union).permute(1, 0)

        # np.ndarray
        elif data_type == np.ndarray:
            union = np.expand_dims(area_a, 0)
            if not ign_area_b:
                union = union + np.expand_dims(area_b, 1) - inter
            return (inter / union).T

        # unknown type
        else:
            raise ValueError('unknown data type {}'.format(data_type))

    # this mode compares every box in box_a with target in box_b
    # i.e., box_a = M x 4 and box_b = M x 4 then output is M x 1
    elif mode == 'list':

        inter = intersect(box_a, box_b, mode=mode)
        area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])
        area_b = (box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])
        union = area_a + area_b - inter

        return inter / union

    else:
        raise ValueError('unknown mode {}'.format(mode))

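# Worked example (illustrative only): two 10x10 boxes overlapping on a 5x5 patch.
#   a = np.array([[0., 0., 10., 10.]])
#   b = np.array([[5., 5., 15., 15.]])
#   iou(a, b)   # -> [[25 / (100 + 100 - 25)]] ~= [[0.1429]]
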
def intersect(box_a, box_b, mode='cross'):
    """
    Computes the amount of intersect between two different sets of boxes.
    Args:
        box_a (nparray): Mx4 boxes, defined by [x1, y1, x2, y2]
        box_b (nparray): Nx4 boxes, defined by [x1, y1, x2, y2]
        mode (str): either 'cross' or 'list', where cross will check all combinations of box_a and
            box_b hence MxN array, and list expects the same size list M == N, hence returns Mx1 array.
        data_type (type): either torch.Tensor or np.ndarray, we automatically determine otherwise
    """

    # determine type
    data_type = type(box_a)

    # this mode computes the intersect in the sense of cross.
    # i.e., box_a = M x 4, box_b = N x 4 then the output is M x N
    if mode == 'cross':

        # np.ndarray
        if data_type == np.ndarray:
            max_xy = np.minimum(box_a[:, 2:4], np.expand_dims(box_b[:, 2:4], axis=1))
            min_xy = np.maximum(box_a[:, 0:2], np.expand_dims(box_b[:, 0:2], axis=1))
            inter = np.clip((max_xy - min_xy), a_min=0, a_max=None)

        elif data_type == torch.Tensor:
            max_xy = torch.min(box_a[:, 2:4], box_b[:, 2:4].unsqueeze(1))
            min_xy = torch.max(box_a[:, 0:2], box_b[:, 0:2].unsqueeze(1))
            inter = torch.clamp((max_xy - min_xy), 0)

        # unknown type
        else:
            raise ValueError('type {} is not implemented'.format(data_type))

        return inter[:, :, 0] * inter[:, :, 1]

    # this mode computes the intersect in the sense of list_a vs. list_b.
    # i.e., box_a = M x 4, box_b = M x 4 then the output is Mx1
    elif mode == 'list':

        # torch.Tensor
        if data_type == torch.Tensor:
            max_xy = torch.min(box_a[:, 2:], box_b[:, 2:])
            min_xy = torch.max(box_a[:, :2], box_b[:, :2])
            inter = torch.clamp((max_xy - min_xy), 0)

        # np.ndarray
        elif data_type == np.ndarray:
            max_xy = np.minimum(box_a[:, 2:], box_b[:, 2:])
            min_xy = np.maximum(box_a[:, :2], box_b[:, :2])
            inter = np.clip((max_xy - min_xy), a_min=0, a_max=None)

        # unknown type
        else:
            raise ValueError('unknown data type {}'.format(data_type))

        return inter[:, 0] * inter[:, 1]

    else:
        raise ValueError('unknown mode {}'.format(mode))

def scaled_sigmoid(vals, min=0.0, max=1.0):
    """
    Simple helper function for a scaled sigmoid.
    The output is bounded by (min, max).
    Args:
        vals (Tensor): input logits to scale
        min (Tensor or float): the minimum value to scale to.
        max (Tensor or float): the maximum value to scale to.
    """
    return min + (max - min) * torch.sigmoid(vals)

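# Example (illustrative only): logits of 0 map to the midpoint of the range.
#   scaled_sigmoid(torch.zeros(3), min=0.5, max=2.0)   # -> tensor([1.25, 1.25, 1.25])
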
def so3_relative_angle_batched(
    R: torch.Tensor,
    cos_angle: bool = False,
    cos_bound: float = 1e-4,
    eps: float = 1e-4,
) -> torch.Tensor:
    """
    Calculates the relative angle (in radians) between all pairs of rotation
    matrices within the batch `R`, using
    `angle = acos(0.5 * (Trace(R_i R_j^T)-1))` for each pair (i, j) with i > j.
    .. note::
        This corresponds to a geodesic distance on the 3D manifold of rotation
        matrices.
    Args:
        R: Batch of rotation matrices of shape `(minibatch, 3, 3)`.
        cos_angle: If==True return cosine of the relative angle rather than
            the angle itself. This can avoid the unstable calculation of `acos`.
        cos_bound: Clamps the cosine of the relative rotation angle to
            [-1 + cos_bound, 1 - cos_bound] to avoid non-finite outputs/gradients
            of the `acos` call. Note that the non-finite outputs/gradients
            are returned when the angle is requested (i.e. `cos_angle==False`)
            and the rotation angle is close to 0 or π.
        eps: Tolerance for the valid trace check of the relative rotation matrix
            in `so3_rotation_angle`.
    Returns:
        Rotation angles of shape `(minibatch * (minibatch - 1) / 2,)`, one per
        unordered pair of matrices in `R`.
        If `cos_angle==True`, returns the cosine of the angles.
    Raises:
        ValueError if `R` is of incorrect shape.
        ValueError if `R` has an unexpected trace.
    """
    N = R.shape[0]
    n_pairs = N * (N - 1) // 2

    Rleft = torch.zeros((n_pairs, 3, 3), dtype=R.dtype, device=R.device)
    Rright = torch.zeros((n_pairs, 3, 3), dtype=R.dtype, device=R.device)

    global_idx = 0
    for i in range(1, N):
        for j in range(i):
            p1 = R[i]
            p2 = R[j]
            Rleft[global_idx] = p1
            Rright[global_idx] = p2
            global_idx += 1

    # gather up the pairs
    R12 = torch.matmul(Rleft, Rright.permute(0, 2, 1))

    return so3_rotation_angle(R12, cos_angle=cos_angle, cos_bound=cos_bound, eps=eps)

def so3_rotation_angle(
    R: torch.Tensor,
    eps: float = 1e-4,
    cos_angle: bool = False,
    cos_bound: float = 1e-4,
) -> torch.Tensor:
    """
    Calculates angles (in radians) of a batch of rotation matrices `R` with
    `angle = acos(0.5 * (Trace(R)-1))`. The trace of the
    input matrices is checked to be in the valid range `[-1-eps,3+eps]`.
    The `eps` argument is a small constant that allows for small errors
    caused by limited machine precision.
    Args:
        R: Batch of rotation matrices of shape `(minibatch, 3, 3)`.
        eps: Tolerance for the valid trace check.
        cos_angle: If==True return cosine of the rotation angles rather than
            the angle itself. This can avoid the unstable
            calculation of `acos`.
        cos_bound: Clamps the cosine of the rotation angle to
            [-1 + cos_bound, 1 - cos_bound] to avoid non-finite outputs/gradients
            of the `acos` call. Note that the non-finite outputs/gradients
            are returned when the angle is requested (i.e. `cos_angle==False`)
            and the rotation angle is close to 0 or π.
    Returns:
        Corresponding rotation angles of shape `(minibatch,)`.
        If `cos_angle==True`, returns the cosine of the angles.
    Raises:
        ValueError if `R` is of incorrect shape.
        ValueError if `R` has an unexpected trace.
    """
    N, dim1, dim2 = R.shape
    if dim1 != 3 or dim2 != 3:
        raise ValueError("Input has to be a batch of 3x3 Tensors.")

    rot_trace = R[:, 0, 0] + R[:, 1, 1] + R[:, 2, 2]

    if ((rot_trace < -1.0 - eps) + (rot_trace > 3.0 + eps)).any():
        raise ValueError("A matrix has trace outside valid range [-1-eps,3+eps].")

    # phi ... rotation angle
    phi_cos = (rot_trace - 1.0) * 0.5

    if cos_angle:
        return phi_cos
    else:
        if cos_bound > 0.0:
            bound = 1.0 - cos_bound
            return acos_linear_extrapolation(phi_cos, (-bound, bound))
        else:
            return torch.acos(phi_cos)

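# Example (illustrative sketch): pairwise angles for a batch of two rotations,
# the identity and a 0.5 rad rotation about z.
#   Rs = axis_angle_to_matrix(torch.tensor([[0., 0., 0.], [0., 0., 0.5]]))
#   so3_relative_angle_batched(Rs)   # -> tensor([0.5000]) (approximately)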