Spaces:
Running
on
Zero
Running
on
Zero
| # Copyright (c) OpenMMLab. All rights reserved. | |
| from copy import deepcopy | |
| from typing import Optional, Tuple | |
| import numpy as np | |
| from mmpose.registry import KEYPOINT_CODECS | |
| from .base import BaseKeypointCodec | |
| from .utils import camera_to_image_coord | |
class MotionBERTLabel(BaseKeypointCodec):
    r"""Generate keypoint and label coordinates for `MotionBERT`_ by Zhu et al
    (2022).

    Note:

        - instance number: N
        - keypoint number: K
        - keypoint dimension: D
        - pose-lifting target dimension: C

    Args:
        num_keypoints (int): The number of keypoints in the dataset.
        root_index (int): Root keypoint index in the pose. Default: 0.
        remove_root (bool): If true, remove the root keypoint from the pose.
            Stored on the instance; not read by ``encode``/``decode`` in this
            class. Default: ``False``.
        save_index (bool): If true, store the root position separated from the
            original pose, only takes effect if ``remove_root`` is ``True``.
            Stored on the instance; not read by ``encode``/``decode`` in this
            class. Default: ``False``.
        concat_vis (bool): If true, concat the visibility item of keypoints.
            Default: ``False``.
        rootrel (bool): If true, the root keypoint will be set to the
            coordinate origin. Default: ``False``.
        mode (str): Indicating whether the current mode is 'train' or 'test'.
            Matched case-insensitively. Default: ``'test'``.
    """

    # Names of the extra inputs accepted by ``encode`` besides the keypoints.
    # NOTE(review): presumably consumed by the surrounding data pipeline to
    # decide which fields to forward to ``encode`` -- confirm in the base class.
    auxiliary_encode_keys = {
        'lifting_target', 'lifting_target_visible', 'camera_param', 'factor'
    }

    # Mappings from keys of the dict returned by ``encode`` to output field
    # names. NOTE(review): how these tables are consumed is defined outside
    # this class -- confirm against ``BaseKeypointCodec`` and its callers.
    instance_mapping_table = dict(
        lifting_target='lifting_target',
        lifting_target_visible='lifting_target_visible',
    )
    label_mapping_table = dict(
        trajectory_weights='trajectory_weights',
        lifting_target_label='lifting_target_label',
        lifting_target_weight='lifting_target_weight')

    def __init__(self,
                 num_keypoints: int,
                 root_index: int = 0,
                 remove_root: bool = False,
                 save_index: bool = False,
                 concat_vis: bool = False,
                 rootrel: bool = False,
                 mode: str = 'test'):
        """Store the codec configuration and validate ``mode``.

        Raises:
            AssertionError: If ``mode`` (lower-cased) is neither ``'train'``
                nor ``'test'``.
        """
        super().__init__()

        self.num_keypoints = num_keypoints
        self.root_index = root_index
        self.remove_root = remove_root
        self.save_index = save_index
        self.concat_vis = concat_vis
        self.rootrel = rootrel
        assert mode.lower() in {'train', 'test'
                                }, (f'Unsupported mode {mode}, '
                                    'mode should be one of ("train", "test").')
        self.mode = mode.lower()

    def encode(self,
               keypoints: np.ndarray,
               keypoints_visible: Optional[np.ndarray] = None,
               lifting_target: Optional[np.ndarray] = None,
               lifting_target_visible: Optional[np.ndarray] = None,
               camera_param: Optional[dict] = None,
               factor: Optional[np.ndarray] = None) -> dict:
        """Encoding keypoints from input image space to normalized space.

        Args:
            keypoints (np.ndarray): Keypoint coordinates in shape (T, K, D),
                or (K, D) which is treated as a single frame (T = 1).
            keypoints_visible (np.ndarray, optional): Keypoint visibilities in
                shape (T, K). Defaults to all ones.
            lifting_target (np.ndarray): 3d target coordinate in shape
                (T, K, C). Required despite the ``None`` default.
            lifting_target_visible (np.ndarray, optional): Target visibility
                in shape (T, K, ). Defaults to all ones.
            camera_param (dict): The camera parameter dictionary. Must contain
                ``'w'`` and ``'h'``; when both ``'f'`` and ``'c'`` are present
                the target is additionally passed through
                ``camera_to_image_coord``.
            factor (np.ndarray, optional): The factor mapping camera and image
                coordinate in shape (T, ). When ``None``, empty, or when its
                first element is 0, the factor computed here is used instead.

        Returns:
            encoded (dict): Contains the following items:

                - keypoint_labels (np.ndarray): The processed keypoints in
                  shape (T, K, D), with the visibility appended along the
                  last axis when ``concat_vis`` is set.
                - keypoint_labels_visible (np.ndarray): The keypoint
                  visibilities (as given, or the all-ones default in shape
                  (T, K)).
                - lifting_target_label (np.ndarray): The processed,
                  root-relative target coordinate.
                - lifting_target_weight (np.ndarray): The target weights, 1
                  where the target visibility exceeds 0.5 and 0 elsewhere.
                - lifting_target (np.ndarray): The same array object as
                  ``lifting_target_label``.
                - lifting_target_visible (np.ndarray): The target
                  visibilities.
                - factor (np.ndarray): The factor mapping camera and image
                  coordinate in shape (T, 1).

        Raises:
            AssertionError: If ``lifting_target`` is ``None``, if
                ``keypoints`` is not 2- or 3-dimensional, or if
                ``camera_param`` lacks ``'w'``/``'h'``.
        """
        # Validate before the first dereference below, so a missing target
        # fails with this assertion rather than an AttributeError.
        assert lifting_target is not None, (
            '`lifting_target` is required for encoding.')

        # set initial value for `lifting_target_weight`
        if lifting_target_visible is None:
            lifting_target_visible = np.ones(
                lifting_target.shape[:-1], dtype=np.float32)
            # Note: the weight aliases the visibility array in this branch.
            lifting_target_weight = lifting_target_visible
        else:
            valid = lifting_target_visible > 0.5
            lifting_target_weight = np.where(valid, 1., 0.).astype(np.float32)

        if camera_param is None:
            camera_param = dict()

        encoded = dict()

        # Work on copies so the caller's arrays are left untouched.
        lifting_target_label = lifting_target.copy()
        keypoint_labels = keypoints.copy()

        assert keypoint_labels.ndim in {
            2, 3
        }, (f'Keypoint labels should have 2 or 3 dimensions, '
            f'but got {keypoint_labels.ndim}.')
        if keypoint_labels.ndim == 2:
            keypoint_labels = keypoint_labels[None, ...]

        # Build the default visibility from the 3-D labels so that a (K, D)
        # input yields a (1, K) mask instead of a (K, D)-shaped one.
        if keypoints_visible is None:
            keypoints_visible = np.ones(
                keypoint_labels.shape[:2], dtype=np.float32)

        # Normalize the 2D keypoint coordinate with image width and height:
        # x in [0, w] -> [-1, 1] and y in [0, h] -> [-h / w, h / w].
        _camera_param = deepcopy(camera_param)
        assert 'w' in _camera_param and 'h' in _camera_param, (
            'Camera parameters should contain "w" and "h".')
        w, h = _camera_param['w'], _camera_param['h']
        keypoint_labels[
            ..., :2] = keypoint_labels[..., :2] / w * 2 - [1, h / w]

        # convert target to image coordinate
        T = keypoint_labels.shape[0]
        # Fallback per-frame factor, used when no camera intrinsics are given.
        factor_ = np.array([4] * T, dtype=np.float32).reshape(T, )
        if 'f' in _camera_param and 'c' in _camera_param:
            lifting_target_label, factor_ = camera_to_image_coord(
                self.root_index, lifting_target_label, _camera_param)

        if self.mode == 'train':
            # Same normalization as for the 2D keypoints, but with the image
            # size divided by 1000; the third coordinate is scaled only.
            w_scaled, h_scaled = w / 1000, h / 1000
            lifting_target_label[..., :2] = (
                lifting_target_label[..., :2] / w_scaled * 2 -
                [1, h_scaled / w_scaled])
            lifting_target_label[..., 2] = (
                lifting_target_label[..., 2] / w_scaled * 2)

        # Express the target relative to the root keypoint.
        lifting_target_label[..., :, :] = lifting_target_label[
            ..., :, :] - lifting_target_label[...,
                                              self.root_index:self.root_index +
                                              1, :]

        # An empty factor is treated as absent (as ``decode`` does) instead
        # of raising an IndexError on ``factor[0]``.
        if factor is None or factor.size == 0 or factor[0] == 0:
            factor = factor_
        if factor.ndim == 1:
            factor = factor[:, None]

        if self.mode == 'test':
            lifting_target_label *= factor[..., None]

        if self.concat_vis:
            keypoints_visible_ = keypoints_visible
            if keypoints_visible_.ndim == 1:
                # (K, ) visibility supplied together with a single frame.
                keypoints_visible_ = keypoints_visible_[None, :]
            if keypoints_visible_.ndim == 2:
                keypoints_visible_ = keypoints_visible_[..., None]
            keypoint_labels = np.concatenate(
                (keypoint_labels, keypoints_visible_), axis=2)

        encoded['keypoint_labels'] = keypoint_labels
        encoded['keypoint_labels_visible'] = keypoints_visible
        encoded['lifting_target_label'] = lifting_target_label
        encoded['lifting_target_weight'] = lifting_target_weight
        encoded['lifting_target'] = lifting_target_label
        encoded['lifting_target_visible'] = lifting_target_visible
        encoded['factor'] = factor

        return encoded

    def decode(
        self,
        encoded: np.ndarray,
        w: Optional[np.ndarray] = None,
        h: Optional[np.ndarray] = None,
        factor: Optional[np.ndarray] = None,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Decode keypoint coordinates from normalized space to input image
        space.

        Args:
            encoded (np.ndarray): Coordinates in shape (N, K, C).
            w (np.ndarray, optional): The image widths in shape (N, ) or
                (N, 1). Default: ``None``.
            h (np.ndarray, optional): The image heights, same shape as ``w``.
                Required whenever a non-empty ``w`` is given.
                Default: ``None``.
            factor (np.ndarray, optional): The factor for projection in shape
                (N, ) or (N, 1). Default: ``None``.

        Returns:
            keypoints (np.ndarray): Decoded coordinates in shape (N, K, C),
                relative to the root keypoint and divided by 1000.
            scores (np.ndarray): The keypoint scores in shape (N, K), all
                ones.

        Raises:
            AssertionError: If ``w``/``h``/``factor`` are inconsistent with
                each other or with the batch size of ``encoded``.
        """
        keypoints = encoded.copy()
        scores = np.ones(keypoints.shape[:-1], dtype=np.float32)

        if self.rootrel:
            # Zero the configured root keypoint rather than a hard-coded
            # index 0, so non-default ``root_index`` values decode correctly.
            keypoints[..., self.root_index, :] = 0

        if w is not None and w.size > 0:
            assert h is not None, '`h` should be provided together with `w`.'
            assert w.shape == h.shape, (f'w and h should have the same shape, '
                                        f'but got {w.shape} and {h.shape}.')
            assert w.shape[0] == keypoints.shape[0], (
                f'w and h should have the same batch size, '
                f'but got {w.shape[0]} and {keypoints.shape[0]}.')
            assert w.ndim in {1,
                              2}, (f'w and h should have 1 or 2 dimensions, '
                                   f'but got {w.ndim}.')
            if w.ndim == 1:
                w = w[:, None]
                h = h[:, None]
            # Inverse of the encode-time normalization:
            # (x + 1) * w / 2 and (y + h / w) * w / 2.
            trans = np.append(
                np.ones((w.shape[0], 1)), h / w, axis=1)[:, None, :]
            keypoints[..., :2] = (keypoints[..., :2] + trans) * w[:, None] / 2
            keypoints[..., 2:] = keypoints[..., 2:] * w[:, None] / 2

        if factor is not None and factor.size > 0:
            assert factor.shape[0] == keypoints.shape[0], (
                f'factor should have the same batch size, '
                f'but got {factor.shape[0]} and {keypoints.shape[0]}.')
            if factor.ndim == 1:
                # (N, ) -> (N, 1) so that the expansion below yields
                # (N, 1, 1) and scales per instance, not per keypoint.
                factor = factor[:, None]
            keypoints *= factor[..., None]

        # Express the pose relative to the root keypoint.
        keypoints[..., :, :] = keypoints[..., :, :] - keypoints[
            ..., self.root_index:self.root_index + 1, :]
        # NOTE(review): presumably a millimetre -> metre conversion; confirm.
        keypoints /= 1000.
        return keypoints, scores