Spaces:
Running
on
Zero
Running
on
Zero
| # Copyright (c) OpenMMLab. All rights reserved. | |
| from copy import deepcopy | |
| from typing import List, Optional, Tuple, Union | |
| import numpy as np | |
| from mmpose.registry import KEYPOINT_CODECS | |
| from .base import BaseKeypointCodec | |
class VideoPoseLifting(BaseKeypointCodec):
    r"""Generate keypoint coordinates for pose lifter.

    Note:

        - instance number: N
        - keypoint number: K
        - keypoint dimension: D
        - pose-lifting target dimension: C

    Args:
        num_keypoints (int): The number of keypoints in the dataset.
        zero_center (bool): Whether to zero-center the target around root.
            Default: ``True``.
        root_index (Union[int, List]): Root keypoint index in the pose.
            An int is wrapped into a one-element list. Default: 0.
        remove_root (bool): If true, remove the root keypoint from the pose.
            Only takes effect if ``zero_center`` is ``True`` and exactly one
            root index is configured. Default: ``False``.
        save_index (bool): If true, store the root position separated from the
            original pose, only takes effect if ``remove_root`` is ``True``.
            Default: ``False``.
        reshape_keypoints (bool): If true, reshape the keypoints into shape
            (-1, N). Default: ``True``.
        concat_vis (bool): If true, concat the visibility item of keypoints.
            Default: ``False``.
        normalize_camera (bool): Whether to normalize camera intrinsics.
            Default: ``False``.
    """

    auxiliary_encode_keys = {
        'lifting_target', 'lifting_target_visible', 'camera_param'
    }

    instance_mapping_table = dict(
        lifting_target='lifting_target',
        lifting_target_visible='lifting_target_visible',
    )
    label_mapping_table = dict(
        trajectory_weights='trajectory_weights',
        lifting_target_label='lifting_target_label',
        lifting_target_weight='lifting_target_weight')

    def __init__(self,
                 num_keypoints: int,
                 zero_center: bool = True,
                 root_index: Union[int, List] = 0,
                 remove_root: bool = False,
                 save_index: bool = False,
                 reshape_keypoints: bool = True,
                 concat_vis: bool = False,
                 normalize_camera: bool = False):
        super().__init__()

        self.num_keypoints = num_keypoints
        self.zero_center = zero_center
        # Always keep the root indices as a list so that a single root and
        # several roots (averaged in ``encode``) share one code path.
        if isinstance(root_index, int):
            root_index = [root_index]
        self.root_index = root_index
        self.remove_root = remove_root
        self.save_index = save_index
        self.reshape_keypoints = reshape_keypoints
        self.concat_vis = concat_vis
        self.normalize_camera = normalize_camera

    def encode(self,
               keypoints: np.ndarray,
               keypoints_visible: Optional[np.ndarray] = None,
               lifting_target: Optional[np.ndarray] = None,
               lifting_target_visible: Optional[np.ndarray] = None,
               camera_param: Optional[dict] = None) -> dict:
        """Encoding keypoints from input image space to normalized space.

        Args:
            keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D).
            keypoints_visible (np.ndarray, optional): Keypoint visibilities in
                shape (N, K). Defaults to all ones.
            lifting_target (np.ndarray, optional): 3d target coordinate in
                shape (T, K, C). Defaults to the first instance of
                ``keypoints`` with a leading axis of length 1.
            lifting_target_visible (np.ndarray, optional): Target coordinate in
                shape (T, K, ). Defaults to all ones.
            camera_param (dict, optional): The camera parameter dictionary.
                Must contain ``w`` and ``h``; ``f`` and ``c`` are also
                required if ``normalize_camera`` is ``True``.

        Returns:
            encoded (dict): Contains the following items:

                - keypoint_labels (np.ndarray): The processed keypoints in
                  shape like (N, K, D) or (K * D, N).
                - keypoints_visible (np.ndarray): The keypoint visibilities
                  as given, or all ones in shape (N, K) if not provided.
                - lifting_target_label: The processed target coordinate, a
                  copy of ``lifting_target`` that is optionally zero-centered
                  and has the root keypoint removed along axis -2.
                - lifting_target_weight (np.ndarray): The target weights,
                  with the root keypoint removed if ``remove_root`` applies.
                - trajectory_weights (np.ndarray): The trajectory weights.
                  Equal to the target weights (before root removal) when
                  ``lifting_target_visible`` is given, otherwise
                  ``1 / lifting_target[:, 2]``.

                In addition, there are some optional items it may contain:

                - target_root (np.ndarray): The root coordinate of target.
                  Exists if ``zero_center`` is ``True``.
                - target_root_removed (bool): Indicate whether the root of
                  pose-lifting target is removed. Exists if the root was
                  actually removed.
                - target_root_index (int): An integer indicating the index of
                  root. Exists if the root was removed and ``save_index``
                  is ``True``.
                - camera_param (dict): The updated camera parameter dictionary.
                  Exists if ``normalize_camera`` is ``True``.

        Raises:
            AssertionError: If the target shape does not contain the root
                index, if the weights/keypoints have an unsupported number
                of dimensions, or if required camera parameters are missing.
        """
        if keypoints_visible is None:
            keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)

        if lifting_target is None:
            # The fallback must be an ndarray (not a list): the code below
            # relies on ``.shape``, ``.ndim`` and multi-axis indexing.
            lifting_target = np.asarray([keypoints[0]])

        # set initial value for `lifting_target_weight`
        # and `trajectory_weights`
        if lifting_target_visible is None:
            lifting_target_visible = np.ones(
                lifting_target.shape[:-1], dtype=np.float32)
            lifting_target_weight = lifting_target_visible
            # NOTE(review): this takes index 2 along axis 1 of the target;
            # confirm it selects the intended (depth) component for the
            # target layout used by callers.
            trajectory_weights = (1 / lifting_target[:, 2])
        else:
            valid = lifting_target_visible > 0.5
            lifting_target_weight = np.where(valid, 1., 0.).astype(np.float32)
            trajectory_weights = lifting_target_weight

        if camera_param is None:
            camera_param = dict()

        encoded = dict()

        # Work on a copy so the caller's target array is never modified.
        lifting_target_label = lifting_target.copy()

        # Zero-center the target pose around a given root keypoint
        if self.zero_center:
            assert (lifting_target.ndim >= 2 and
                    lifting_target.shape[-2] > max(self.root_index)), \
                f'Got invalid joint shape {lifting_target.shape}'

            # With several root indices the root is their mean position.
            root = np.mean(lifting_target[..., self.root_index, :], axis=-2)
            lifting_target_label -= root[..., np.newaxis, :]
            encoded['target_root'] = root

            if self.remove_root and len(self.root_index) == 1:
                root_index = self.root_index[0]
                lifting_target_label = np.delete(
                    lifting_target_label, root_index, axis=-2)

                assert lifting_target_weight.ndim in {
                    2, 3
                }, (f'Got invalid lifting target weights shape '
                    f'{lifting_target_weight.shape}')

                # The keypoint axis is the last one for 2-D weights and the
                # second-to-last one for 3-D weights. The visibility array
                # has the same layout as the weights, so the root is removed
                # from both along the same axis.
                axis_to_remove = -2 if lifting_target_weight.ndim == 3 else -1
                lifting_target_visible = np.delete(
                    lifting_target_visible, root_index, axis=axis_to_remove)
                lifting_target_weight = np.delete(
                    lifting_target_weight, root_index, axis=axis_to_remove)

                # Add a flag to avoid latter transforms that rely on the root
                # joint or the original joint index
                encoded['target_root_removed'] = True

                # Save the root index for restoring the global pose
                if self.save_index:
                    encoded['target_root_index'] = root_index

        # Normalize the 2D keypoint coordinate with image width and height
        _camera_param = deepcopy(camera_param)
        assert 'w' in _camera_param and 'h' in _camera_param, (
            'Camera parameter `w` and `h` should be provided.')

        center = np.array([0.5 * _camera_param['w'], 0.5 * _camera_param['h']],
                          dtype=np.float32)
        # Both axes are scaled by half the image width.
        scale = np.array(0.5 * _camera_param['w'], dtype=np.float32)

        keypoint_labels = (keypoints - center) / scale

        assert keypoint_labels.ndim in {
            2, 3
        }, (f'Got invalid keypoint labels shape {keypoint_labels.shape}')
        if keypoint_labels.ndim == 2:
            # Add the leading instance axis for a single (K, D) pose.
            keypoint_labels = keypoint_labels[None, ...]

        if self.normalize_camera:
            assert 'f' in _camera_param and 'c' in _camera_param, (
                'Camera parameter `f` and `c` should be provided.')
            _camera_param['f'] = _camera_param['f'] / scale
            # NOTE(review): ``center[:, None]`` has shape (2, 1), so ``c`` is
            # presumably stored as a (2, 1) column vector; confirm.
            _camera_param['c'] = (_camera_param['c'] - center[:, None]) / scale
            encoded['camera_param'] = _camera_param

        if self.concat_vis:
            keypoints_visible_ = keypoints_visible
            if keypoints_visible.ndim == 2:
                keypoints_visible_ = keypoints_visible[..., None]
            keypoint_labels = np.concatenate(
                (keypoint_labels, keypoints_visible_), axis=2)

        if self.reshape_keypoints:
            N = keypoint_labels.shape[0]
            # Move the instance axis last, then flatten keypoints and
            # coordinates into a single leading axis.
            keypoint_labels = keypoint_labels.transpose(1, 2, 0).reshape(-1, N)

        encoded['keypoint_labels'] = keypoint_labels
        encoded['keypoints_visible'] = keypoints_visible
        encoded['lifting_target_label'] = lifting_target_label
        encoded['lifting_target_weight'] = lifting_target_weight
        encoded['trajectory_weights'] = trajectory_weights

        return encoded

    def decode(self,
               encoded: np.ndarray,
               target_root: Optional[np.ndarray] = None
               ) -> Tuple[np.ndarray, np.ndarray]:
        """Decode keypoint coordinates from normalized space to input image
        space.

        Args:
            encoded (np.ndarray): Coordinates in shape (N, K, C).
            target_root (np.ndarray, optional): The pose-lifting target root
                coordinate. If ``None`` or empty, the coordinates are
                returned unchanged. Default: ``None``.

        Returns:
            keypoints (np.ndarray): Decoded coordinates in shape (N, K, C).
            scores (np.ndarray): The keypoint scores in shape (N, K). All
                scores are set to one.
        """
        keypoints = encoded.copy()

        if target_root is not None and target_root.size > 0:
            # Undo the zero-centering applied in ``encode``.
            keypoints = keypoints + target_root
            if self.remove_root and len(self.root_index) == 1:
                # Put the removed root keypoint back at its original index.
                keypoints = np.insert(
                    keypoints, self.root_index, target_root, axis=1)
        scores = np.ones(keypoints.shape[:-1], dtype=np.float32)

        return keypoints, scores