# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This file contains code to get a sample from a dataset."""

import functools

import numpy as np
import tensorflow as tf

from deeplab2 import common
from deeplab2.data import dataset_utils
from deeplab2.data.preprocessing import input_preprocessing as preprocessing


def _compute_gaussian_from_std(sigma):
  """Computes the Gaussian and its size from a given standard deviation."""
  size = int(6 * sigma + 3)
  # `np.float` was removed in recent NumPy releases; use the builtin float.
  x = np.arange(size, dtype=float)
  y = x[:, np.newaxis]
  x0, y0 = 3 * sigma + 1, 3 * sigma + 1
  return np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)), size
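
# Illustrative check of the formula above (an added note, not part of the
# original file): with the default sigma=8, size = 6 * 8 + 3 = 51, and the
# Gaussian peaks at (x0, y0) = (25, 25), i.e. the kernel is a 51x51 patch
# whose maximum sits 3 * sigma + 1 pixels from its top-left corner.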


class PanopticSampleGenerator:
  """This class generates samples from images and labels."""

  def __init__(self,
               dataset_info,
               is_training,
               crop_size,
               min_resize_value=None,
               max_resize_value=None,
               resize_factor=None,
               min_scale_factor=1.,
               max_scale_factor=1.,
               scale_factor_step_size=0,
               autoaugment_policy_name=None,
               only_semantic_annotations=False,
               thing_id_mask_annotations=False,
               max_thing_id=128,
               sigma=8,
               focus_small_instances=None):
| """Initializes the panoptic segmentation generator. | |
| Args: | |
| dataset_info: A dictionary with the following keys. | |
| - `name`: String, dataset name. | |
| - `ignore_label`: Integer, ignore label. | |
| - `class_has_instances_list`: A list of integers indicating which | |
| class has instance annotations. | |
| - `panoptic_label_divisor`: Integer, panoptic label divisor. | |
| - `num_classes`: Integer, number of classes. | |
| - `is_video_dataset`: Boolean, is video dataset or not. | |
| is_training: Boolean, is training mode or not. | |
| crop_size: Image crop size [height, width]. | |
| min_resize_value: A 2-tuple of (height, width), desired minimum value | |
| after resize. If a single element is given, then height and width share | |
| the same value. None, empty or having 0 indicates no minimum value will | |
| be used. | |
| max_resize_value: A 2-tuple of (height, width), maximum allowed value | |
| after resize. If a single element is given, then height and width | |
| share the same value. None, empty or having 0 indicates no maximum | |
| value will be used. | |
| resize_factor: Resized dimensions are multiple of factor plus one. | |
| min_scale_factor: Minimum scale factor for random scale augmentation. | |
| max_scale_factor: Maximum scale factor for random scale augmentation. | |
| scale_factor_step_size: The step size from min scale factor to max scale | |
| factor. The input is randomly scaled based on the value of | |
| (min_scale_factor, max_scale_factor, scale_factor_step_size). | |
| autoaugment_policy_name: String, autoaugment policy name. See | |
| autoaugment_policy.py for available policies. | |
| only_semantic_annotations: An optional flag indicating whether the model | |
| needs only semantic annotations (default: False). | |
| thing_id_mask_annotations: An optional flag indicating whether the model | |
| needs thing_id_mask annotations. When `thing_id_mask_annotations` is | |
| True, we will additionally return mask annotation for each `thing` | |
| instance, encoded with a unique thing_id. This ground-truth annotation | |
| could be used to learn a better segmentation mask for each instance. | |
| `thing_id` indicates the number of unique thing-ID to each instance in | |
| an image, starting the counting from 0 (default: False). | |
| max_thing_id: The maximum number of possible thing instances per image. It | |
| is used together when thing_id_mask_annotations = True, representing the | |
| maximum thing ID encoded in the thing_id_mask. (default: 128). | |
| sigma: The standard deviation of the Gaussian used to encode the center | |
| keypoint (default: 8). | |
| focus_small_instances: An optional dict that defines how to deal with | |
| small instances (default: None): | |
| -`threshold`: An integer defining the threshold pixel number for an | |
| instance to be considered small. | |
| -`weight`: A number that defines the loss weight for small instances. | |
| """ | |
    self._dataset_info = dataset_info
    self._ignore_label = self._dataset_info['ignore_label']
    self._only_semantic_annotations = only_semantic_annotations
    self._sigma = sigma
    self._instance_area_threshold = 0
    self._small_instance_weight = 1.0
    self._thing_id_mask_annotations = thing_id_mask_annotations
    self._max_thing_id = max_thing_id
    self._is_training = is_training
    self._preprocessing_fn = functools.partial(
        preprocessing.preprocess_image_and_label,
        crop_height=crop_size[0],
        crop_width=crop_size[1],
        min_resize_value=min_resize_value,
        max_resize_value=max_resize_value,
        resize_factor=resize_factor,
        min_scale_factor=min_scale_factor,
        max_scale_factor=max_scale_factor,
        scale_factor_step_size=scale_factor_step_size,
        autoaugment_policy_name=autoaugment_policy_name,
        ignore_label=self._ignore_label *
        self._dataset_info['panoptic_label_divisor'],
        is_training=self._is_training)

    if focus_small_instances is not None:
      self._instance_area_threshold = focus_small_instances['threshold']
      self._small_instance_weight = focus_small_instances['weight']

    self._gaussian, self._gaussian_size = _compute_gaussian_from_std(
        self._sigma)
    self._gaussian = tf.cast(tf.reshape(self._gaussian, [-1]), tf.float32)
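
  # A minimal construction sketch (added for illustration; the dataset values
  # below are hypothetical placeholders, not from the original file):
  #
  #   generator = PanopticSampleGenerator(
  #       dataset_info={
  #           'name': 'my_dataset',
  #           'ignore_label': 255,
  #           'class_has_instances_list': [1, 2],
  #           'panoptic_label_divisor': 1000,
  #           'num_classes': 19,
  #           'is_video_dataset': False,
  #       },
  #       is_training=True,
  #       crop_size=[513, 513],
  #       focus_small_instances={'threshold': 4096, 'weight': 3.0})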

  def __call__(self, sample_dict):
    """Gets a sample.

    Args:
      sample_dict: A dictionary with the following keys and values:
        - `image`: A tensor of shape [image_height, image_width, 3].
        - `image_name`: String, image name.
        - `label`: A tensor of shape [label_height, label_width, 1] or None.
        - `height`: An integer specifying the height of the image.
        - `width`: An integer specifying the width of the image.
        - `sequence`: An optional string specifying the sequence name.
        - `prev_image`: An optional tensor of the same shape as `image`.
        - `prev_label`: An optional tensor of the same shape as `label`.
        - `next_image`: An optional next-frame tensor of the shape of `image`.
        - `next_label`: An optional next-frame tensor of the shape of `label`.

    Returns:
      sample: A dictionary storing required data for panoptic segmentation.
    """
    return self.call(**sample_dict)
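
  # Typical usage sketch (assumed, not stated in the original file): each
  # parsed example dict is mapped through the generator to produce training
  # samples, e.g.
  #
  #   dataset = dataset.map(generator, num_parallel_calls=tf.data.AUTOTUNE)
  #
  # where `dataset` is assumed to yield dicts with the keys documented above.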

  def call(self,
           image,
           image_name,
           label,
           height,
           width,
           sequence='',
           prev_image=None,
           prev_label=None,
           next_image=None,
           next_label=None):
    """Gets a sample.

    Args:
      image: A tensor of shape [image_height, image_width, 3].
      image_name: String, image name.
      label: A tensor of shape [label_height, label_width, 1] or None.
      height: An integer specifying the height of the image.
      width: An integer specifying the width of the image.
      sequence: An optional string specifying the sequence name.
      prev_image: An optional tensor of shape [image_height, image_width, 3].
      prev_label: An optional tensor of shape [label_height, label_width, 1].
      next_image: An optional tensor of shape [image_height, image_width, 3].
      next_label: An optional tensor of shape [label_height, label_width, 1].

    Returns:
      sample: A dictionary storing required data for panoptic segmentation.

    Raises:
      ValueError: An error occurs when the label shape is invalid.
      NotImplementedError: An error occurs when thing_id_mask_annotations is
        used together with prev_image, prev_label, next_image or next_label,
        which is not currently implemented.
    """
    if label is not None:
      label.get_shape().assert_is_compatible_with(
          tf.TensorShape([None, None, 1]))
      original_label = tf.cast(label, dtype=tf.int32, name='original_label')
      if next_label is not None:
        original_next_label = tf.cast(
            next_label, dtype=tf.int32, name='original_next_label')
    # Reusing the preprocessing function for both next and prev samples.
    if next_image is not None:
      resized_image, image, label, next_image, next_label = (
          self._preprocessing_fn(
              image, label, prev_image=next_image, prev_label=next_label))
    else:
      resized_image, image, label, prev_image, prev_label = (
          self._preprocessing_fn(
              image, label, prev_image=prev_image, prev_label=prev_label))

    sample = {
        common.IMAGE: image
    }
    if prev_image is not None:
      sample[common.IMAGE] = tf.concat([image, prev_image], axis=2)
    if next_image is not None:
      sample[common.NEXT_IMAGE] = next_image
      sample[common.IMAGE] = tf.concat([image, next_image], axis=2)

    if label is not None:
      # Panoptic label for crowd regions will be ignore_label.
      semantic_label, panoptic_label, thing_mask, crowd_region = (
          dataset_utils.get_semantic_and_panoptic_label(
              self._dataset_info, label, self._ignore_label))
      sample[common.GT_SEMANTIC_KEY] = tf.squeeze(semantic_label, axis=2)
      semantic_weights = tf.ones_like(semantic_label, dtype=tf.float32)
      sample[common.SEMANTIC_LOSS_WEIGHT_KEY] = tf.squeeze(
          semantic_weights, axis=2)
      sample[common.GT_IS_CROWD] = tf.squeeze(crowd_region, axis=2)

      if not self._only_semantic_annotations:
        # The sample will have the original label including crowd regions.
        sample[common.GT_PANOPTIC_KEY] = tf.squeeze(label, axis=2)
        # Compute center loss for all non-crowd and non-ignore pixels.
        non_crowd_and_non_ignore_regions = tf.logical_and(
            tf.logical_not(crowd_region),
            tf.not_equal(semantic_label, self._ignore_label))
        sample[common.CENTER_LOSS_WEIGHT_KEY] = tf.squeeze(tf.cast(
            non_crowd_and_non_ignore_regions, tf.float32), axis=2)
        # Compute regression loss only for thing pixels that are not crowd.
        non_crowd_things = tf.logical_and(
            tf.logical_not(crowd_region), thing_mask)
        sample[common.REGRESSION_LOSS_WEIGHT_KEY] = tf.squeeze(tf.cast(
            non_crowd_things, tf.float32), axis=2)

        prev_panoptic_label = None
        next_panoptic_label = None
        if prev_label is not None:
          _, prev_panoptic_label, _, _ = (
              dataset_utils.get_semantic_and_panoptic_label(
                  self._dataset_info, prev_label, self._ignore_label))
        if next_label is not None:
          _, next_panoptic_label, _, _ = (
              dataset_utils.get_semantic_and_panoptic_label(
                  self._dataset_info, next_label, self._ignore_label))
        (sample[common.GT_INSTANCE_CENTER_KEY],
         sample[common.GT_INSTANCE_REGRESSION_KEY],
         sample[common.SEMANTIC_LOSS_WEIGHT_KEY],
         prev_center_map,
         frame_center_offsets,
         next_offset) = self._generate_gt_center_and_offset(
             panoptic_label, semantic_weights, prev_panoptic_label,
             next_panoptic_label)

        sample[common.GT_INSTANCE_REGRESSION_KEY] = tf.cast(
            sample[common.GT_INSTANCE_REGRESSION_KEY], tf.float32)

        if next_label is not None:
          sample[common.GT_NEXT_INSTANCE_REGRESSION_KEY] = tf.cast(
              next_offset, tf.float32)
          sample[common.NEXT_REGRESSION_LOSS_WEIGHT_KEY] = tf.cast(
              tf.greater(tf.reduce_sum(tf.abs(next_offset), axis=2), 0),
              tf.float32)

        # Only squeeze center map and semantic loss weights, as regression map
        # has two channels (x and y offsets).
        sample[common.GT_INSTANCE_CENTER_KEY] = tf.squeeze(
            sample[common.GT_INSTANCE_CENTER_KEY], axis=2)
        sample[common.SEMANTIC_LOSS_WEIGHT_KEY] = tf.squeeze(
            sample[common.SEMANTIC_LOSS_WEIGHT_KEY], axis=2)

        if prev_label is not None:
          sample[common.GT_FRAME_OFFSET_KEY] = frame_center_offsets
          sample[common.GT_FRAME_OFFSET_KEY] = tf.cast(
              sample[common.GT_FRAME_OFFSET_KEY], tf.float32)
          frame_offsets_present = tf.logical_or(
              tf.not_equal(frame_center_offsets[..., 0], 0),
              tf.not_equal(frame_center_offsets[..., 1], 0))
          sample[common.FRAME_REGRESSION_LOSS_WEIGHT_KEY] = tf.cast(
              frame_offsets_present, tf.float32)
          if self._is_training:
            sample[common.IMAGE] = tf.concat(
                [sample[common.IMAGE], prev_center_map], axis=2)

        if self._thing_id_mask_annotations:
          if any([prev_image is not None,
                  prev_label is not None,
                  next_image is not None,
                  next_label is not None]):
            raise NotImplementedError(
                'Current implementation of Max-DeepLab does not support '
                + 'prev_image, prev_label, next_image, or next_label.')
          thing_id_mask, thing_id_class = (
              self._generate_thing_id_mask_and_class(
                  panoptic_label, non_crowd_things))
          sample[common.GT_THING_ID_MASK_KEY] = tf.squeeze(
              thing_id_mask, axis=2)
          sample[common.GT_THING_ID_CLASS_KEY] = thing_id_class

    if not self._is_training:
      # Resized image is only used during visualization.
      sample[common.RESIZED_IMAGE] = resized_image
      sample[common.IMAGE_NAME] = image_name
      sample[common.GT_SIZE_RAW] = tf.stack([height, width], axis=0)
      if self._dataset_info['is_video_dataset']:
        sample[common.SEQUENCE_ID] = sequence
      # Keep original labels for evaluation.
      if label is not None:
        orig_semantic_label, _, _, orig_crowd_region = (
            dataset_utils.get_semantic_and_panoptic_label(
                self._dataset_info, original_label, self._ignore_label))
        sample[common.GT_SEMANTIC_RAW] = tf.squeeze(orig_semantic_label, axis=2)
        if not self._only_semantic_annotations:
          sample[common.GT_PANOPTIC_RAW] = tf.squeeze(original_label, axis=2)
          sample[common.GT_IS_CROWD_RAW] = tf.squeeze(orig_crowd_region)
          if next_label is not None:
            sample[common.GT_NEXT_PANOPTIC_RAW] = tf.squeeze(
                original_next_label, axis=2)

    return sample

  def _generate_thing_id_mask_and_class(self,
                                        panoptic_label,
                                        non_crowd_things):
    """Generates the ground-truth thing-ID masks and their class labels.

    It computes the thing-ID mask and class with a unique ID for each thing
    instance. `thing_id` is a unique index assigned to each instance in an
    image, counting from 0. Each pixel in thing_id_mask is labeled with the
    corresponding thing-ID.

    Args:
      panoptic_label: A tf.Tensor of shape [height, width, 1].
      non_crowd_things: A tf.Tensor of shape [height, width, 1], indicating
        non-crowd and thing-class regions.

    Returns:
      thing_id_mask: A tf.Tensor of shape [height, width, 1]. It assigns each
        non-crowd thing instance a unique mask-ID label, starting from 0.
        Unassigned pixels are set to -1.
      thing_id_class: A tf.Tensor of shape [max_thing_id]. It contains the
        semantic ID of each instance assigned to thing_id_mask. The remaining
        (max_thing_id - num_things) elements are set to -1.

    Raises:
      ValueError: An error occurs when the thing-ID mask contains stuff or
        crowd regions.
      ValueError: An error occurs when thing_count is greater than or equal to
        self._max_thing_id.
    """
    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
    thing_id_mask = -tf.ones_like(panoptic_label)
    thing_id_class = -tf.ones(self._max_thing_id)
    thing_count = 0
    for panoptic_id in unique_ids:
      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
      # Filter out IDs that are not thing instances (i.e., IDs for
      # ignore_label, stuff classes or crowd). Stuff classes and crowd regions
      # both have IDs of the form panoptic_id = semantic_id * label_divisor
      # (i.e., instance_id = 0).
      if (semantic_id == self._dataset_info['ignore_label'] or
          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
        continue
      assert_stuff_crowd = tf.debugging.Assert(
          tf.reduce_all(non_crowd_things[panoptic_label == panoptic_id]),
          ['thing-ID mask here must not contain stuff or crowd region.'])
      with tf.control_dependencies([assert_stuff_crowd]):
        panoptic_id = tf.identity(panoptic_id)
      thing_id_mask = tf.where(panoptic_label == panoptic_id,
                               thing_count, thing_id_mask)
      assert_thing_count = tf.debugging.Assert(
          thing_count < self._max_thing_id,
          ['thing_count must be smaller than self._max_thing_id.'])
      with tf.control_dependencies([assert_thing_count]):
        thing_count = tf.identity(thing_count)
      thing_id_class = tf.tensor_scatter_nd_update(
          thing_id_class, [[thing_count]], [semantic_id])
      thing_count += 1
    return thing_id_mask, thing_id_class
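
  # For illustration (added note, not from the original file): an image with
  # two thing instances of semantic classes 7 and 11 and max_thing_id=128
  # would yield thing_id_mask values in {-1, 0, 1} and
  # thing_id_class = [7, 11, -1, -1, ..., -1] (length 128).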

  def _generate_prev_centers_with_noise(self,
                                        panoptic_label,
                                        offset_noise_factor=0.05,
                                        false_positive_rate=0.2,
                                        false_positive_noise_factor=0.05):
    """Generates noisy center predictions for the previous frame.

    Args:
      panoptic_label: A tf.Tensor of shape [height, width, 1].
      offset_noise_factor: An optional float defining the maximum fraction of
        the object size that is used to displace the previous center.
      false_positive_rate: An optional float indicating at which probability
        false positives should be added.
      false_positive_noise_factor: An optional float defining the maximum
        fraction of the object size that is used to displace the false
        positive center.

    Returns:
      A tuple (center, unique_ids, ids_to_center_x, ids_to_center_y), where
      center is a tf.Tensor of shape [height, width, 1] holding the noisy
      center heatmap, and the remaining three are tf.Tensors of shape [N] with
      N being the number of unique panoptic IDs.
    """
    height = tf.shape(panoptic_label)[0]
    width = tf.shape(panoptic_label)[1]

    # Pad center to make boundary handling easier.
    center_pad_begin = int(round(3 * self._sigma + 1))
    center_pad_end = int(round(3 * self._sigma + 2))
    center_pad = center_pad_begin + center_pad_end

    center = tf.zeros((height + center_pad, width + center_pad))
    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
    ids_to_center_x = tf.zeros_like(unique_ids, dtype=tf.int32)
    ids_to_center_y = tf.zeros_like(unique_ids, dtype=tf.int32)

    for panoptic_id in unique_ids:
      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
      # Filter out IDs that should be ignored, are stuff classes or crowd.
      # Stuff classes and crowd regions both have IDs of the form
      # panoptic_id = semantic_id * label_divisor.
      if (semantic_id == self._dataset_info['ignore_label'] or
          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
        continue

      # Convert [[y0, x0, 0], ...] to [[y0, ...], [x0, ...], [0, ...]].
      mask_index = tf.cast(
          tf.transpose(tf.where(panoptic_label == panoptic_id)), tf.float32)
      centers = tf.reduce_mean(mask_index, axis=1)
      bbox_size = (
          tf.reduce_max(mask_index, axis=1) -
          tf.reduce_min(mask_index, axis=1))

      # Add noise.
      center_y = (
          centers[0] + tf.random.normal([], dtype=tf.float32) *
          offset_noise_factor * bbox_size[0])
      center_x = (
          centers[1] + tf.random.normal([], dtype=tf.float32) *
          offset_noise_factor * bbox_size[1])

      center_x = tf.minimum(
          tf.maximum(tf.cast(tf.round(center_x), tf.int32), 0), width - 1)
      center_y = tf.minimum(
          tf.maximum(tf.cast(tf.round(center_y), tf.int32), 0), height - 1)

      id_index = tf.where(tf.equal(panoptic_id, unique_ids))
      ids_to_center_x = tf.tensor_scatter_nd_update(
          ids_to_center_x, id_index, tf.expand_dims(center_x, axis=0))
      ids_to_center_y = tf.tensor_scatter_nd_update(
          ids_to_center_y, id_index, tf.expand_dims(center_y, axis=0))

      def add_center_gaussian(center_x_coord, center_y_coord, center):
        # Due to the padding with center_pad_begin in center, the computed
        # center becomes the upper left corner in the center tensor.
        upper_left = center_x_coord, center_y_coord
        bottom_right = (upper_left[0] + self._gaussian_size,
                        upper_left[1] + self._gaussian_size)

        indices_x, indices_y = tf.meshgrid(
            tf.range(upper_left[0], bottom_right[0]),
            tf.range(upper_left[1], bottom_right[1]))
        indices = tf.transpose(
            tf.stack([tf.reshape(indices_y, [-1]),
                      tf.reshape(indices_x, [-1])]))

        return tf.tensor_scatter_nd_max(
            center, indices, self._gaussian, name='center_scatter')

      center = add_center_gaussian(center_x, center_y, center)

      # Generate false positives.
      center_y = (
          tf.cast(center_y, dtype=tf.float32) +
          tf.random.normal([], dtype=tf.float32) *
          false_positive_noise_factor * bbox_size[0])
      center_x = (
          tf.cast(center_x, dtype=tf.float32) +
          tf.random.normal([], dtype=tf.float32) *
          false_positive_noise_factor * bbox_size[1])

      center_x = tf.minimum(
          tf.maximum(tf.cast(tf.round(center_x), tf.int32), 0), width - 1)
      center_y = tf.minimum(
          tf.maximum(tf.cast(tf.round(center_y), tf.int32), 0), height - 1)
      # Draw a sample to decide whether to add a false positive or not.
      center = center + tf.cast(
          tf.random.uniform([], dtype=tf.float32) < false_positive_rate,
          tf.float32) * (
              add_center_gaussian(center_x, center_y, center) - center)

    center = center[center_pad_begin:(center_pad_begin + height),
                    center_pad_begin:(center_pad_begin + width)]
    center = tf.expand_dims(center, -1)
    return center, unique_ids, ids_to_center_x, ids_to_center_y

  def _generate_gt_center_and_offset(self,
                                     panoptic_label,
                                     semantic_weights,
                                     prev_panoptic_label=None,
                                     next_panoptic_label=None):
    """Generates the ground-truth center and offset from the panoptic labels.

    Additionally, the per-pixel weights for the semantic branch are increased
    for small instances. In case prev_panoptic_label is passed, it also
    computes the previous center heatmap with random noise and the offsets
    between center maps.

    Args:
      panoptic_label: A tf.Tensor of shape [height, width, 1].
      semantic_weights: A tf.Tensor of shape [height, width, 1].
      prev_panoptic_label: An optional tf.Tensor of shape [height, width, 1].
      next_panoptic_label: An optional tf.Tensor of shape [height, width, 1].

    Returns:
      A tuple (center, offsets, weights, prev_center, frame_offset*,
      next_offset) with each being a tf.Tensor of shape [height, width, 1 (2*)].
      If prev_panoptic_label is None, prev_center and frame_offset are None.
      If next_panoptic_label is None, next_offset is None.
    """
    height = tf.shape(panoptic_label)[0]
    width = tf.shape(panoptic_label)[1]

    # Pad center to make boundary handling easier.
    center_pad_begin = int(round(3 * self._sigma + 1))
    center_pad_end = int(round(3 * self._sigma + 2))
    center_pad = center_pad_begin + center_pad_end

    center = tf.zeros((height + center_pad, width + center_pad))
    offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
    offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))

    prev_center = None
    frame_offsets = None
    # Due to loop handling in tensorflow, these variables had to be defined
    # for all cases.
    frame_offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
    frame_offset_y = tf.zeros((height, width, 1), dtype=tf.int32)

    # Next-frame instance offsets.
    next_offset = None
    next_offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
    next_offset_x = tf.zeros((height, width, 1), dtype=tf.int32)

    if prev_panoptic_label is not None:
      (prev_center, prev_unique_ids, prev_centers_x, prev_centers_y
      ) = self._generate_prev_centers_with_noise(prev_panoptic_label)

    for panoptic_id in unique_ids:
      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
      # Filter out IDs that should be ignored, are stuff classes or crowd.
      # Stuff classes and crowd regions both have IDs of the form
      # panoptic_id = semantic_id * label_divisor.
      if (semantic_id == self._dataset_info['ignore_label'] or
          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
        continue

      # Convert [[y0, x0, 0], ...] to [[y0, ...], [x0, ...], [0, ...]].
      mask_index = tf.transpose(tf.where(panoptic_label == panoptic_id))
      mask_y_index = mask_index[0]
      mask_x_index = mask_index[1]

      next_mask_index = None
      next_mask_y_index = None
      next_mask_x_index = None
      if next_panoptic_label is not None:
        next_mask_index = tf.transpose(
            tf.where(next_panoptic_label == panoptic_id))
        next_mask_y_index = next_mask_index[0]
        next_mask_x_index = next_mask_index[1]

      instance_area = tf.shape(mask_x_index)
      if instance_area < self._instance_area_threshold:
        semantic_weights = tf.where(panoptic_label == panoptic_id,
                                    self._small_instance_weight,
                                    semantic_weights)

      centers = tf.reduce_mean(tf.cast(mask_index, tf.float32), axis=1)

      center_x = tf.cast(tf.round(centers[1]), tf.int32)
      center_y = tf.cast(tf.round(centers[0]), tf.int32)

      # Due to the padding with center_pad_begin in center, the computed
      # center becomes the upper left corner in the center tensor.
      upper_left = center_x, center_y
      bottom_right = (upper_left[0] + self._gaussian_size,
                      upper_left[1] + self._gaussian_size)

      indices_x, indices_y = tf.meshgrid(
          tf.range(upper_left[0], bottom_right[0]),
          tf.range(upper_left[1], bottom_right[1]))
      indices = tf.transpose(
          tf.stack([tf.reshape(indices_y, [-1]),
                    tf.reshape(indices_x, [-1])]))

      center = tf.tensor_scatter_nd_max(
          center, indices, self._gaussian, name='center_scatter')
      offset_y = tf.tensor_scatter_nd_update(
          offset_y,
          tf.transpose(mask_index),
          center_y - tf.cast(mask_y_index, tf.int32),
          name='offset_y_scatter')
      offset_x = tf.tensor_scatter_nd_update(
          offset_x,
          tf.transpose(mask_index),
          center_x - tf.cast(mask_x_index, tf.int32),
          name='offset_x_scatter')

      if prev_panoptic_label is not None:
        mask = tf.equal(prev_unique_ids, panoptic_id)
        if tf.math.count_nonzero(mask) > 0:
          prev_center_x = prev_centers_x[mask]
          prev_center_y = prev_centers_y[mask]
          frame_offset_y = tf.tensor_scatter_nd_update(
              frame_offset_y,
              tf.transpose(mask_index),
              prev_center_y - tf.cast(mask_y_index, tf.int32),
              name='frame_offset_y_scatter')
          frame_offset_x = tf.tensor_scatter_nd_update(
              frame_offset_x,
              tf.transpose(mask_index),
              prev_center_x - tf.cast(mask_x_index, tf.int32),
              name='frame_offset_x_scatter')

      if next_panoptic_label is not None:
        next_offset_y = tf.tensor_scatter_nd_update(
            next_offset_y,
            tf.transpose(next_mask_index),
            center_y - tf.cast(next_mask_y_index, tf.int32),
            name='next_offset_y_scatter')
        next_offset_x = tf.tensor_scatter_nd_update(
            next_offset_x,
            tf.transpose(next_mask_index),
            center_x - tf.cast(next_mask_x_index, tf.int32),
            name='next_offset_x_scatter')

    offset = tf.concat([offset_y, offset_x], axis=2)
    center = center[center_pad_begin:(center_pad_begin + height),
                    center_pad_begin:(center_pad_begin + width)]
    center = tf.expand_dims(center, -1)

    if prev_panoptic_label is not None:
      frame_offsets = tf.concat([frame_offset_y, frame_offset_x], axis=2)

    if next_panoptic_label is not None:
      next_offset = tf.concat([next_offset_y, next_offset_x], axis=2)

    return (center, offset, semantic_weights, prev_center, frame_offsets,
            next_offset)