DeepLearningExamples/TensorFlow2/Segmentation/MaskRCNN/mask_rcnn/ops/training_ops.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training specific ops, including sampling, building targets, etc."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from mask_rcnn.utils import box_utils
from mask_rcnn.ops import spatial_transform_ops

from mask_rcnn.object_detection import balanced_positive_negative_sampler

_EPSILON = 1e-8


def _add_class_assignments(iou, gt_boxes, gt_labels):
    """Computes object category assignment for each box.

  Args:
    iou: a tensor for the iou matrix with a shape of
      [batch_size, K, MAX_NUM_INSTANCES]. K is the number of post-nms RoIs
      (i.e., rpn_post_nms_topn).
    gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4].
      This tensor might have paddings with negative values. The coordinates
      of gt_boxes are in the pixel coordinates of the scaled image scale.
    gt_labels: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
      tensor might have paddings with a value of -1.
  Returns:
    max_boxes: a tensor with a shape of [batch_size, K, 4], representing
      the ground truth coordinates of each roi.
    max_classes: a int32 tensor with a shape of [batch_size, K], representing
      the ground truth class of each roi.
    max_overlap: a tensor with a shape of [batch_size, K], representing
      the maximum overlap of each roi.
    argmax_iou: a tensor with a shape of [batch_size, K], representing the iou
      argmax.
  """
    with tf.name_scope('add_class_assignments'):
        batch_size, _, _ = iou.get_shape().as_list()

        argmax_iou = tf.argmax(input=iou, axis=2, output_type=tf.int32)

        indices = tf.reshape(
            argmax_iou + tf.expand_dims(tf.range(batch_size) * tf.shape(input=gt_labels)[1], 1),
            shape=[-1]
        )

        max_classes = tf.reshape(tf.gather(tf.reshape(gt_labels, [-1, 1]), indices), [batch_size, -1])

        max_overlap = tf.reduce_max(input_tensor=iou, axis=2)

        bg_mask = tf.equal(max_overlap, tf.zeros_like(max_overlap))

        max_classes = tf.where(bg_mask, tf.zeros_like(max_classes), max_classes)

        max_boxes = tf.reshape(
            tf.gather(tf.reshape(gt_boxes, [-1, 4]), indices),
            [batch_size, -1, 4]
        )

        max_boxes = tf.where(
            tf.tile(tf.expand_dims(bg_mask, axis=2), [1, 1, 4]),
            tf.zeros_like(max_boxes),
            max_boxes
        )

    return max_boxes, max_classes, max_overlap, argmax_iou


def encode_box_targets(boxes, gt_boxes, gt_labels, bbox_reg_weights):
    """Encodes predicted boxes with respect to ground truth boxes."""
    with tf.name_scope('encode_box_targets'):
        box_targets = box_utils.encode_boxes(boxes=gt_boxes, anchors=boxes, weights=bbox_reg_weights)
        # If a target is background, the encoded box target should be zeros.
        mask = tf.tile(tf.expand_dims(tf.equal(gt_labels, tf.zeros_like(gt_labels)), axis=2), [1, 1, 4])
        box_targets = tf.where(mask, tf.zeros_like(box_targets), box_targets)

    return box_targets


def proposal_label_op(boxes, gt_boxes, gt_labels,
                      batch_size_per_im=512, fg_fraction=0.25, fg_thresh=0.5,
                      bg_thresh_hi=0.5, bg_thresh_lo=0.):
    """Assigns the proposals with ground truth labels and performs subsmpling.

    Given proposal `boxes`, `gt_boxes`, and `gt_labels`, the function uses the
    following algorithm to generate the final `batch_size_per_im` RoIs.
    1. Calculates the IoU between each proposal box and each gt_boxes.
    2. Assigns each proposal box with a ground truth class and box label by
     choosing the largest overlap.
    3. Samples `batch_size_per_im` boxes from all proposal boxes, and returns
     box_targets, class_targets, and RoIs.
    The reference implementations of #1 and #2 are here:
    https://github.com/facebookresearch/Detectron/blob/master/detectron/datasets/json_dataset.py
    The reference implementation of #3 is here:
    https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py

    Args:
    boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
      proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
      last dimension is the pixel coordinates of scaled images in
      [ymin, xmin, ymax, xmax] form.
    gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
      tensor might have paddings with a value of -1. The coordinates of gt_boxes
      are in the pixel coordinates of the scaled image.
    gt_labels: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
      tensor might have paddings with a value of -1.
    batch_size_per_im: a integer represents RoI minibatch size per image.
    fg_fraction: a float represents the target fraction of RoI minibatch that
      is labeled foreground (i.e., class > 0).
    fg_thresh: a float represents the overlap threshold for an RoI to be
      considered foreground (if >= fg_thresh).
    bg_thresh_hi: a float represents the overlap threshold for an RoI to be
      considered background (class = 0 if overlap in [LO, HI)).
    bg_thresh_lo: a float represents the overlap threshold for an RoI to be
      considered background (class = 0 if overlap in [LO, HI)).
    Returns:
    box_targets: a tensor with a shape of [batch_size, K, 4]. The tensor
      contains the ground truth pixel coordinates of the scaled images for each
      roi. K is the number of sample RoIs (e.g., batch_size_per_im).
    class_targets: a integer tensor with a shape of [batch_size, K]. The tensor
      contains the ground truth class for each roi.
    rois: a tensor with a shape of [batch_size, K, 4], representing the
      coordinates of the selected RoI.
    proposal_to_label_map: a tensor with a shape of [batch_size, K]. This tensor
      keeps the mapping between proposal to labels. proposal_to_label_map[i]
      means the index of the ground truth instance for the i-th proposal.
    """
    with tf.name_scope('proposal_label'):
        batch_size = boxes.shape[0]

        # The reference implementation intentionally includes ground truth boxes in
        # the proposals.
        # see https://github.com/facebookresearch/Detectron/blob/master/detectron/datasets/json_dataset.py#L359
        boxes = tf.concat([boxes, gt_boxes], axis=1)
        iou = box_utils.bbox_overlap(boxes, gt_boxes)

        (pre_sample_box_targets, pre_sample_class_targets, max_overlap,
         proposal_to_label_map) = _add_class_assignments(iou, gt_boxes, gt_labels)

        # Generates a random sample of RoIs comprising foreground and background
        # examples.
        # reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py#L132
        positives = tf.greater(max_overlap,
                               fg_thresh * tf.ones_like(max_overlap))
        negatives = tf.logical_and(
            tf.greater_equal(max_overlap, bg_thresh_lo * tf.ones_like(max_overlap)),
            tf.less(max_overlap, bg_thresh_hi * tf.ones_like(max_overlap))
        )

        pre_sample_class_targets = tf.where(
            negatives,
            tf.zeros_like(pre_sample_class_targets),
            pre_sample_class_targets
        )

        proposal_to_label_map = tf.where(
            negatives,
            tf.zeros_like(proposal_to_label_map),
            proposal_to_label_map
        )

        # Handles ground truth paddings.
        ignore_mask = tf.less(tf.reduce_min(input_tensor=iou, axis=2), tf.zeros_like(max_overlap))

        # indicator includes both positive and negative labels.
        # labels includes only positives labels.
        # positives = indicator & labels.
        # negatives = indicator & !labels.
        # ignore = !indicator.
        labels = positives
        pos_or_neg = tf.logical_or(positives, negatives)
        indicator = tf.logical_and(pos_or_neg, tf.logical_not(ignore_mask))

        all_samples = []
        sampler = balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
            positive_fraction=fg_fraction,
            is_static=True
        )

        # Batch-unroll the sub-sampling process.
        for i in range(batch_size):
            samples = sampler.subsample(indicator[i], batch_size_per_im, labels[i])
            all_samples.append(samples)

        all_samples = tf.stack([all_samples], axis=0)[0]
        # A workaround to get the indices from the boolean tensors.
        _, samples_indices = tf.nn.top_k(tf.cast(all_samples, dtype=tf.int32), k=batch_size_per_im, sorted=True)

        # Contructs indices for gather.
        samples_indices = tf.reshape(
            samples_indices + tf.expand_dims(tf.range(batch_size) * tf.shape(input=boxes)[1], 1),
            [-1]
        )

        rois = tf.reshape(
            tf.gather(tf.reshape(boxes, [-1, 4]), samples_indices),
            [batch_size, -1, 4]
        )

        class_targets = tf.reshape(
            tf.gather(tf.reshape(pre_sample_class_targets, [-1, 1]), samples_indices),
            [batch_size, -1]
        )

        sample_box_targets = tf.reshape(
            tf.gather(tf.reshape(pre_sample_box_targets, [-1, 4]), samples_indices),
            [batch_size, -1, 4]
        )

        sample_proposal_to_label_map = tf.reshape(
            tf.gather(tf.reshape(proposal_to_label_map, [-1, 1]), samples_indices),
            [batch_size, -1]
        )

    return sample_box_targets, class_targets, rois, sample_proposal_to_label_map


def select_fg_for_masks(class_targets, box_targets, boxes, proposal_to_label_map, max_num_fg=128):
    """Selects the fore ground objects for mask branch during training.

    Args:
    class_targets: a tensor of shape [batch_size, num_boxes]  representing the
      class label for each box.
    box_targets: a tensor with a shape of [batch_size, num_boxes, 4]. The tensor
      contains the ground truth pixel coordinates of the scaled images for each
      roi.
    boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row
      represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
    proposal_to_label_map: a tensor with a shape of [batch_size, num_boxes].
      This tensor keeps the mapping between proposal to labels.
      proposal_to_label_map[i] means the index of the ground truth instance for
      the i-th proposal.
    max_num_fg: a integer represents the number of masks per image.
    Returns:
    class_targets, boxes, proposal_to_label_map, box_targets that have
    foreground objects.
    """

    # Masks are for positive (fg) objects only.
    # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py
    batch_size = boxes.shape[0]
    _, fg_indices = tf.nn.top_k(tf.cast(tf.greater(class_targets, 0), dtype=tf.float32), k=max_num_fg)

    # Contructs indices for gather.
    indices = tf.reshape(fg_indices + tf.expand_dims(tf.range(batch_size) * tf.shape(input=class_targets)[1], 1), [-1])

    fg_class_targets = tf.reshape(
        tf.gather(tf.reshape(class_targets, [-1, 1]), indices),
        [batch_size, -1]
    )

    fg_box_targets = tf.reshape(
        tf.gather(tf.reshape(box_targets, [-1, 4]), indices),
        [batch_size, -1, 4]
    )

    fg_box_rois = tf.reshape(
        tf.gather(tf.reshape(boxes, [-1, 4]), indices), [batch_size, -1, 4]
    )

    fg_proposal_to_label_map = tf.reshape(
        tf.gather(tf.reshape(proposal_to_label_map, [-1, 1]), indices),
        [batch_size, -1]
    )

    return (fg_class_targets, fg_box_targets, fg_box_rois,
            fg_proposal_to_label_map)


def get_mask_targets(fg_boxes, fg_proposal_to_label_map, fg_box_targets, mask_gt_labels, output_size=28):
    """Crop and resize on multilevel feature pyramid.

    Args:
    fg_boxes: A 3-D tensor of shape [batch_size, num_masks, 4]. Each row
      represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
    fg_proposal_to_label_map: A tensor of shape [batch_size, num_masks].
    fg_box_targets: a float tensor representing the box label for each box
      with a shape of [batch_size, num_masks, 4].
    mask_gt_labels: A tensor with a shape of [batch_size, M, H+4, W+4]. M is
      NUM_MAX_INSTANCES (i.e., 100 in this implementation) in each image, while
      H and W are ground truth mask size. The `+4` comes from padding of two
      zeros in both directions of height and width dimension.
    output_size: A scalar to indicate the output crop size.

    Returns:
    A 4-D tensor representing feature crop of shape
    [batch_size, num_boxes, output_size, output_size].
    """

    _, _, max_feature_height, max_feature_width = mask_gt_labels.get_shape().as_list()

    # proposal_to_label_map might have a -1 paddings.
    levels = tf.maximum(fg_proposal_to_label_map, 0)

    # Projects box location and sizes to corresponding cropped ground truth
    # mask coordinates.
    bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(value=fg_boxes, num_or_size_splits=4, axis=2)
    gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(value=fg_box_targets, num_or_size_splits=4, axis=2)

    valid_feature_width = max_feature_width - 4
    valid_feature_height = max_feature_height - 4

    y_transform = (bb_y_min - gt_y_min) * valid_feature_height / (gt_y_max - gt_y_min + _EPSILON) + 2
    x_transform = (bb_x_min - gt_x_min) * valid_feature_width / (gt_x_max - gt_x_min + _EPSILON) + 2
    h_transform = (bb_y_max - bb_y_min) * valid_feature_height / (gt_y_max - gt_y_min + _EPSILON)
    w_transform = (bb_x_max - bb_x_min) * valid_feature_width / (gt_x_max - gt_x_min + _EPSILON)

    boundaries = tf.concat(
        [
            tf.cast(tf.ones_like(y_transform) * (max_feature_height - 1), dtype=tf.float32),
            tf.cast(tf.ones_like(x_transform) * (max_feature_width - 1), dtype=tf.float32)
        ],
        axis=-1
    )

    features_per_box = spatial_transform_ops.selective_crop_and_resize(
        tf.expand_dims(mask_gt_labels, -1),
        tf.concat([y_transform, x_transform, h_transform, w_transform], -1),
        tf.expand_dims(levels, -1),
        boundaries,
        output_size
    )

    features_per_box = tf.squeeze(features_per_box, axis=-1)

    # Masks are binary outputs.
    features_per_box = tf.where(
        tf.greater_equal(features_per_box, 0.5),
        tf.ones_like(features_per_box),
        tf.zeros_like(features_per_box)
    )

    # mask_targets depend on box RoIs, which have gradients. This stop_gradient
    # prevents the flow of gradient to box RoIs.
    features_per_box = tf.stop_gradient(features_per_box)

    return features_per_box