This commit is contained in:
dl-desktop
2020-02-06 16:47:03 -03:00
parent 6328265287
commit b586f22bf0
318 changed files with 25111 additions and 664 deletions

Binary file not shown.


@@ -0,0 +1,116 @@
'''
Utilities to match ground truth boxes to anchor boxes.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
def match_bipartite_greedy(weight_matrix):
'''
Returns a bipartite matching according to the given weight matrix.
The algorithm works as follows:
Let the first axis of `weight_matrix` represent ground truth boxes
and the second axis anchor boxes.
The ground truth box that has the greatest similarity with any
anchor box will be matched first, then out of the remaining ground
truth boxes, the ground truth box that has the greatest similarity
with any of the remaining anchor boxes will be matched second, and
so on. That is, the ground truth boxes will be matched in descending
order by maximum similarity with any of the respectively remaining
anchor boxes.
The runtime complexity is O(m^2 * n), where `m` is the number of
ground truth boxes and `n` is the number of anchor boxes.
Arguments:
weight_matrix (array): A 2D Numpy array that represents the weight matrix
for the matching process. If `(m,n)` is the shape of the weight matrix,
it must be `m <= n`. The weights can be integers or floating point
numbers. The matching process will maximize, i.e. larger weights are
preferred over smaller weights.
Returns:
A 1D Numpy array of length `weight_matrix.shape[0]` that represents
the matched index along the second axis of `weight_matrix` for each index
along the first axis.
'''
weight_matrix = np.copy(weight_matrix) # We'll modify this array.
num_ground_truth_boxes = weight_matrix.shape[0]
all_gt_indices = list(range(num_ground_truth_boxes)) # Only relevant for fancy-indexing below.
# This 1D array will contain for each ground truth box the index of
# the matched anchor box.
matches = np.zeros(num_ground_truth_boxes, dtype=int)
# In each iteration of the loop below, exactly one ground truth box
# will be matched to one anchor box.
for _ in range(num_ground_truth_boxes):
# Find the maximal anchor-ground truth pair in two steps: First, reduce
# over the anchor boxes and then reduce over the ground truth boxes.
anchor_indices = np.argmax(weight_matrix, axis=1) # Reduce along the anchor box axis.
overlaps = weight_matrix[all_gt_indices, anchor_indices]
ground_truth_index = np.argmax(overlaps) # Reduce along the ground truth box axis.
anchor_index = anchor_indices[ground_truth_index]
matches[ground_truth_index] = anchor_index # Set the match.
# Set the row of the matched ground truth box and the column of the matched
# anchor box to all zeros. This ensures that those boxes will not be matched again,
# because they will never be the best matches for any other boxes.
weight_matrix[ground_truth_index] = 0
weight_matrix[:,anchor_index] = 0
return matches
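# Illustrative usage (editorial addition, not part of the original file): a
# 2x3 IoU matrix for two ground truth boxes and three anchor boxes. The pair
# (gt 0, anchor 1) has the globally highest weight (0.7), so it is matched
# first; gt 1 is then matched to the best remaining anchor (anchor 0, 0.4).
#
# >>> w = np.array([[0.1, 0.7, 0.3],
# ...               [0.4, 0.6, 0.2]])
# >>> match_bipartite_greedy(w)
# array([1, 0])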
def match_multi(weight_matrix, threshold):
'''
Matches all elements along the second axis of `weight_matrix` to their best
matches along the first axis subject to the constraint that the weight of a match
must be greater than or equal to `threshold` in order to produce a match.
If the weight matrix contains elements that should be ignored, the row or column
representing the respective element should be set to a value below `threshold`.
Arguments:
weight_matrix (array): A 2D Numpy array that represents the weight matrix
for the matching process. If `(m,n)` is the shape of the weight matrix,
it must be `m <= n`. The weights can be integers or floating point
numbers. The matching process will maximize, i.e. larger weights are
preferred over smaller weights.
threshold (float): A float that represents the threshold (i.e. lower bound)
that must be met by a pair of elements to produce a match.
Returns:
Two 1D Numpy arrays of equal length that represent the matched indices. The first
array contains the indices along the first axis of `weight_matrix`, the second array
contains the indices along the second axis.
'''
num_anchor_boxes = weight_matrix.shape[1]
all_anchor_indices = list(range(num_anchor_boxes)) # Only relevant for fancy-indexing below.
# Find the best ground truth match for every anchor box.
ground_truth_indices = np.argmax(weight_matrix, axis=0) # Array of shape (weight_matrix.shape[1],)
overlaps = weight_matrix[ground_truth_indices, all_anchor_indices] # Array of shape (weight_matrix.shape[1],)
# Filter out the matches with a weight below the threshold.
anchor_indices_thresh_met = np.nonzero(overlaps >= threshold)[0]
gt_indices_thresh_met = ground_truth_indices[anchor_indices_thresh_met]
return gt_indices_thresh_met, anchor_indices_thresh_met
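# Illustrative usage (editorial addition): with the same 2x3 weight matrix as
# above and a threshold of 0.5, only anchor 1 has a best ground truth match
# (gt 0, weight 0.7) that meets the threshold; anchors 0 and 2 fall below it.
#
# >>> w = np.array([[0.1, 0.7, 0.3],
# ...               [0.4, 0.6, 0.2]])
# >>> match_multi(w, threshold=0.5)
# (array([0]), array([1]))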


@@ -0,0 +1,617 @@
'''
An encoder that converts ground truth annotations to SSD-compatible training targets.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
from bounding_box_utils.bounding_box_utils import iou, convert_coordinates
from ssd_encoder_decoder.matching_utils import match_bipartite_greedy, match_multi
class SSDInputEncoder:
'''
Transforms ground truth labels for object detection in images
(2D bounding box coordinates and class labels) to the format required for
training an SSD model.
In the process of encoding the ground truth labels, a template of anchor boxes
is built; these anchor boxes are subsequently matched to the ground truth boxes
via an intersection-over-union threshold criterion.
'''
def __init__(self,
img_height,
img_width,
n_classes,
predictor_sizes,
min_scale=0.1,
max_scale=0.9,
scales=None,
aspect_ratios_global=[0.5, 1.0, 2.0],
aspect_ratios_per_layer=None,
two_boxes_for_ar1=True,
steps=None,
offsets=None,
clip_boxes=False,
variances=[0.1, 0.1, 0.2, 0.2],
matching_type='multi',
pos_iou_threshold=0.5,
neg_iou_limit=0.3,
border_pixels='half',
coords='centroids',
normalize_coords=True,
background_id=0):
'''
Arguments:
img_height (int): The height of the input images.
img_width (int): The width of the input images.
n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
predictor_sizes (list): A list of int-tuples of the format `(height, width)`
containing the output heights and widths of the convolutional predictor layers.
min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
of the shorter side of the input images. Note that you should set the scaling factors
such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
to detect. Must be >0.
max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
of the shorter side of the input images. All scaling factors between the smallest and the
largest will be linearly interpolated. Note that the second to last of the linearly interpolated
scaling factors will actually be the scaling factor for the last predictor layer, while the last
scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
if `two_boxes_for_ar1` is `True`. Note that you should set the scaling factors
such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
to detect. Must be greater than or equal to `min_scale`.
scales (list, optional): A list of floats >0 containing scaling factors per convolutional predictor layer.
This list must be one element longer than the number of predictor layers. The first `k` elements are the
scaling factors for the `k` predictor layers, while the last element is used for the second box
for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
last scaling factor must be passed either way, even if it is not being used. If a list is passed,
this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
Note that you should set the scaling factors such that the resulting anchor box sizes correspond to
the sizes of the objects you are trying to detect.
aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
generated. This list is valid for all prediction layers. Note that you should set the aspect ratios such
that the resulting anchor box shapes roughly correspond to the shapes of the objects you are trying to detect.
aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer.
If a list is passed, it overrides `aspect_ratios_global`. Note that you should set the aspect ratios such
that the resulting anchor box shapes very roughly correspond to the shapes of the objects you are trying to detect.
two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratios lists that contain 1. Will be ignored otherwise.
If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
using the scaling factor for the respective layer, the second one will be generated using the
geometric mean of said scaling factor and the next larger scaling factor.
steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
If no steps are provided, then they will be computed such that the anchor box center points will form an
equidistant grid within the image dimensions.
offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
either floats or tuples of two floats. These numbers represent for each predictor layer how many
pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
of the step size specified in the `steps` argument. If the list contains floats, then that value will
be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
`(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size.
clip_boxes (bool, optional): If `True`, limits the anchor box coordinates to stay within image boundaries.
variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
its respective variance value.
matching_type (str, optional): Can be either 'multi' or 'bipartite'. In 'bipartite' mode, each ground truth box will
be matched only to the one anchor box with the highest IoU overlap. In 'multi' mode, in addition to the aforementioned
bipartite matching, all anchor boxes with an IoU overlap greater than or equal to the `pos_iou_threshold` will be
matched to a given ground truth box.
pos_iou_threshold (float, optional): The intersection-over-union similarity threshold that must be
met in order to match a given ground truth box to a given anchor box.
neg_iou_limit (float, optional): The maximum allowed intersection-over-union similarity of an
anchor box with any ground truth box to be labeled a negative (i.e. background) box. If an
anchor box is neither a positive, nor a negative box, it will be ignored during training.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxes, but not the other.
coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): If `True`, the encoder uses relative instead of absolute coordinates.
This means instead of using absolute target coordinates, the encoder will scale all coordinates to be within [0,1].
This way learning becomes independent of the input image size.
background_id (int, optional): Determines which class ID is for the background class.
'''
predictor_sizes = np.array(predictor_sizes)
if predictor_sizes.ndim == 1:
predictor_sizes = np.expand_dims(predictor_sizes, axis=0)
##################################################################################
# Handle exceptions.
##################################################################################
if (min_scale is None or max_scale is None) and scales is None:
raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
if scales:
if (len(scales) != predictor_sizes.shape[0] + 1): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&`
raise ValueError("It must be either scales is None or len(scales) == len(predictor_sizes)+1, but len(scales) == {} and len(predictor_sizes)+1 == {}".format(len(scales), len(predictor_sizes)+1))
scales = np.array(scales)
if np.any(scales <= 0):
raise ValueError("All values in `scales` must be greater than 0, but the passed list of scales is {}".format(scales))
else: # If no list of scales was passed, we need to make sure that `min_scale` and `max_scale` are valid values.
if not 0 < min_scale <= max_scale:
raise ValueError("It must be 0 < min_scale <= max_scale, but it is min_scale = {} and max_scale = {}".format(min_scale, max_scale))
if not (aspect_ratios_per_layer is None):
if (len(aspect_ratios_per_layer) != predictor_sizes.shape[0]): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&`
raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == len(predictor_sizes), but len(aspect_ratios_per_layer) == {} and len(predictor_sizes) == {}".format(len(aspect_ratios_per_layer), len(predictor_sizes)))
for aspect_ratios in aspect_ratios_per_layer:
if np.any(np.array(aspect_ratios) <= 0):
raise ValueError("All aspect ratios must be greater than zero.")
else:
if (aspect_ratios_global is None):
raise ValueError("At least one of `aspect_ratios_global` and `aspect_ratios_per_layer` must not be `None`.")
if np.any(np.array(aspect_ratios_global) <= 0):
raise ValueError("All aspect ratios must be greater than zero.")
if len(variances) != 4:
raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
variances = np.array(variances)
if np.any(variances <= 0):
raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
if not (coords == 'minmax' or coords == 'centroids' or coords == 'corners'):
raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")
if (not (steps is None)) and (len(steps) != predictor_sizes.shape[0]):
raise ValueError("You must provide at least one step value per predictor layer.")
if (not (offsets is None)) and (len(offsets) != predictor_sizes.shape[0]):
raise ValueError("You must provide at least one offset value per predictor layer.")
##################################################################################
# Set or compute members.
##################################################################################
self.img_height = img_height
self.img_width = img_width
self.n_classes = n_classes + 1 # + 1 for the background class
self.predictor_sizes = predictor_sizes
self.min_scale = min_scale
self.max_scale = max_scale
# If `scales` is None, compute the scaling factors by linearly interpolating between
# `min_scale` and `max_scale`. If an explicit list of `scales` is given, however,
# then it takes precedence over `min_scale` and `max_scale`.
if (scales is None):
self.scales = np.linspace(self.min_scale, self.max_scale, len(self.predictor_sizes)+1)
else:
# If a list of scales is given explicitly, we'll use that instead of computing it from `min_scale` and `max_scale`.
self.scales = scales
# If `aspect_ratios_per_layer` is None, then we use the same list of aspect ratios
# `aspect_ratios_global` for all predictor layers. If `aspect_ratios_per_layer` is given,
# however, then it takes precedence over `aspect_ratios_global`.
if (aspect_ratios_per_layer is None):
self.aspect_ratios = [aspect_ratios_global] * predictor_sizes.shape[0]
else:
# If aspect ratios are given per layer, we'll use those.
self.aspect_ratios = aspect_ratios_per_layer
self.two_boxes_for_ar1 = two_boxes_for_ar1
if not (steps is None):
self.steps = steps
else:
self.steps = [None] * predictor_sizes.shape[0]
if not (offsets is None):
self.offsets = offsets
else:
self.offsets = [None] * predictor_sizes.shape[0]
self.clip_boxes = clip_boxes
self.variances = variances
self.matching_type = matching_type
self.pos_iou_threshold = pos_iou_threshold
self.neg_iou_limit = neg_iou_limit
self.border_pixels = border_pixels
self.coords = coords
self.normalize_coords = normalize_coords
self.background_id = background_id
# Compute the number of boxes per spatial location for each predictor layer.
# For example, if a predictor layer has three different aspect ratios, [1.0, 0.5, 2.0], and is
# supposed to predict two boxes of slightly different size for aspect ratio 1.0, then that predictor
# layer predicts a total of four boxes at every spatial location across the feature map.
if not (aspect_ratios_per_layer is None):
self.n_boxes = []
for aspect_ratios in aspect_ratios_per_layer:
if (1 in aspect_ratios) & two_boxes_for_ar1:
self.n_boxes.append(len(aspect_ratios) + 1)
else:
self.n_boxes.append(len(aspect_ratios))
else:
if (1 in aspect_ratios_global) & two_boxes_for_ar1:
self.n_boxes = len(aspect_ratios_global) + 1
else:
self.n_boxes = len(aspect_ratios_global)
##################################################################################
# Compute the anchor boxes for each predictor layer.
##################################################################################
# Compute the anchor boxes for each predictor layer. We only have to do this once
# since the anchor boxes depend only on the model configuration, not on the input data.
# For each predictor layer (i.e. for each scaling factor) the tensors for that layer's
# anchor boxes will have the shape `(feature_map_height, feature_map_width, n_boxes, 4)`.
self.boxes_list = [] # This will store the anchor boxes for each predictor layer.
# The following lists just store diagnostic information. Sometimes it's handy to have the
# boxes' center points, heights, widths, etc. in a list.
self.wh_list_diag = [] # Box widths and heights for each predictor layer
self.steps_diag = [] # Horizontal and vertical distances between adjacent anchor box center points for each predictor layer
self.offsets_diag = [] # Offsets for each predictor layer
self.centers_diag = [] # Anchor box center points as `(cy, cx)` for each predictor layer
# Iterate over all predictor layers and compute the anchor boxes for each one.
for i in range(len(self.predictor_sizes)):
boxes, center, wh, step, offset = self.generate_anchor_boxes_for_layer(feature_map_size=self.predictor_sizes[i],
aspect_ratios=self.aspect_ratios[i],
this_scale=self.scales[i],
next_scale=self.scales[i+1],
this_steps=self.steps[i],
this_offsets=self.offsets[i],
diagnostics=True)
self.boxes_list.append(boxes)
self.wh_list_diag.append(wh)
self.steps_diag.append(step)
self.offsets_diag.append(offset)
self.centers_diag.append(center)
def __call__(self, ground_truth_labels, diagnostics=False):
'''
Converts ground truth bounding box data into a suitable format to train an SSD model.
Arguments:
ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
to the respective image, and the data for each ground truth bounding box has the format
`(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
boxes.
Returns:
`y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
the last axis are the box coordinates, the next four elements after that are just dummy elements, and
the last four elements are the variances.
'''
# Mapping to define which indices represent which coordinates in the ground truth.
class_id = 0
xmin = 1
ymin = 2
xmax = 3
ymax = 4
batch_size = len(ground_truth_labels)
##################################################################################
# Generate the template for y_encoded.
##################################################################################
y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)
##################################################################################
# Match ground truth boxes to anchor boxes.
##################################################################################
# Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
# a ground truth match and for which the maximal IoU overlap with any ground truth box is less
# than or equal to `neg_iou_limit` will be a negative (background) box.
y_encoded[:, :, self.background_id] = 1 # All boxes are background boxes by default.
n_boxes = y_encoded.shape[1] # The total number of boxes that the model predicts per batch item
class_vectors = np.eye(self.n_classes) # An identity matrix that we'll use as one-hot class vectors
for i in range(batch_size): # For each batch item...
if ground_truth_labels[i].size == 0: continue # If there is no ground truth for this batch item, there is nothing to match.
labels = ground_truth_labels[i].astype(float) # The labels for this batch item
# Check for degenerate ground truth bounding boxes before attempting any computations.
if np.any(labels[:,[xmax]] - labels[:,[xmin]] <= 0) or np.any(labels[:,[ymax]] - labels[:,[ymin]] <= 0):
raise DegenerateBoxError("SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, labels) +
"i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " +
"bounding boxes will lead to NaN errors during the training.")
# Maybe normalize the box coordinates.
if self.normalize_coords:
labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height
labels[:,[xmin,xmax]] /= self.img_width # Normalize xmin and xmax relative to the image width
# Maybe convert the box coordinate format.
if self.coords == 'centroids':
labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids', border_pixels=self.border_pixels)
elif self.coords == 'minmax':
labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')
classes_one_hot = class_vectors[labels[:, class_id].astype(int)] # The one-hot class IDs for the ground truth boxes of this batch item
labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin,ymin,xmax,ymax]]], axis=-1) # The one-hot version of the labels for this batch item
# Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
# This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
similarities = iou(labels[:,[xmin,ymin,xmax,ymax]], y_encoded[i,:,-12:-8], coords=self.coords, mode='outer_product', border_pixels=self.border_pixels)
# First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
# This ensures that each ground truth box will have at least one good match.
# For each ground truth box, get the anchor box to match with it.
bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)
# Write the ground truth data to the matched anchor boxes.
y_encoded[i, bipartite_matches, :-8] = labels_one_hot
# Set the columns of the matched anchor boxes to zero to indicate that they were matched.
similarities[:, bipartite_matches] = 0
# Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
# ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
# such ground truth box.
if self.matching_type == 'multi':
# Get all matches that satisfy the IoU threshold.
matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)
# Write the ground truth data to the matched anchor boxes.
y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]
# Set the columns of the matched anchor boxes to zero to indicate that they were matched.
similarities[:, matches[1]] = 0
# Third: Now after the matching is done, all negative (background) anchor boxes that have
# an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
# i.e. they will no longer be background boxes. These anchors are "too close" to a
# ground truth box to be valid background boxes.
max_background_similarities = np.amax(similarities, axis=0)
neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
y_encoded[i, neutral_boxes, self.background_id] = 0
##################################################################################
# Convert box coordinates to anchor box offsets.
##################################################################################
if self.coords == 'centroids':
y_encoded[:,:,[-12,-11]] -= y_encoded[:,:,[-8,-7]] # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
y_encoded[:,:,[-12,-11]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]] # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
y_encoded[:,:,[-10,-9]] /= y_encoded[:,:,[-6,-5]] # w(gt) / w(anchor), h(gt) / h(anchor)
y_encoded[:,:,[-10,-9]] = np.log(y_encoded[:,:,[-10,-9]]) / y_encoded[:,:,[-2,-1]] # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
elif self.coords == 'corners':
y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
y_encoded[:,:,[-12,-10]] /= np.expand_dims(y_encoded[:,:,-6] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
y_encoded[:,:,[-11,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-7], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
elif self.coords == 'minmax':
y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
y_encoded[:,:,[-12,-11]] /= np.expand_dims(y_encoded[:,:,-7] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
y_encoded[:,:,[-10,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-6], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
if diagnostics:
# Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
y_matched_anchors = np.copy(y_encoded)
y_matched_anchors[:,:,-12:-8] = 0 # Keeping the anchor box coordinates means setting the offsets to zero.
return y_encoded, y_matched_anchors
else:
return y_encoded
def generate_anchor_boxes_for_layer(self,
feature_map_size,
aspect_ratios,
this_scale,
next_scale,
this_steps=None,
this_offsets=None,
diagnostics=False):
'''
Computes an array of the spatial positions and sizes of the anchor boxes for one predictor layer
of size `feature_map_size == [feature_map_height, feature_map_width]`.
Arguments:
feature_map_size (tuple): A list or tuple `[feature_map_height, feature_map_width]` with the spatial
dimensions of the feature map for which to generate the anchor boxes.
aspect_ratios (list): A list of floats, the aspect ratios for which anchor boxes are to be generated.
All list elements must be unique.
this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes
as a fraction of the shorter side of the input image.
next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if
`self.two_boxes_for_ar1 == True`.
diagnostics (bool, optional): If `True`, the following additional outputs will be returned:
1) A list of the center point `x` and `y` coordinates for each spatial location.
2) A list containing `(width, height)` for each box aspect ratio.
3) A tuple containing `(step_height, step_width)`
4) A tuple containing `(offset_height, offset_width)`
This information can be useful to understand in just a few numbers what the generated grid of
anchor boxes actually looks like, i.e. how large the different boxes are and how dense
their spatial distribution is, in order to determine whether the box grid covers the input images
appropriately and whether the box sizes are appropriate to fit the sizes of the objects
to be detected.
Returns:
A 4D Numpy tensor of shape `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)` where the
last dimension contains the four coordinates of each anchor box in the coordinate format set by `self.coords`.
'''
# Compute box width and height for each aspect ratio.
# The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
size = min(self.img_height, self.img_width)
# Compute the box widths and heights for all aspect ratios
wh_list = []
for ar in aspect_ratios:
if (ar == 1):
# Compute the regular anchor box for aspect ratio 1.
box_height = box_width = this_scale * size
wh_list.append((box_width, box_height))
if self.two_boxes_for_ar1:
# Compute one slightly larger version using the geometric mean of this scale value and the next.
box_height = box_width = np.sqrt(this_scale * next_scale) * size
wh_list.append((box_width, box_height))
else:
box_width = this_scale * size * np.sqrt(ar)
box_height = this_scale * size / np.sqrt(ar)
wh_list.append((box_width, box_height))
wh_list = np.array(wh_list)
n_boxes = len(wh_list)
# Compute the grid of box center points. They are identical for all aspect ratios.
# Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
if (this_steps is None):
step_height = self.img_height / feature_map_size[0]
step_width = self.img_width / feature_map_size[1]
else:
if isinstance(this_steps, (list, tuple)) and (len(this_steps) == 2):
step_height = this_steps[0]
step_width = this_steps[1]
elif isinstance(this_steps, (int, float)):
step_height = this_steps
step_width = this_steps
# Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
if (this_offsets is None):
offset_height = 0.5
offset_width = 0.5
else:
if isinstance(this_offsets, (list, tuple)) and (len(this_offsets) == 2):
offset_height = this_offsets[0]
offset_width = this_offsets[1]
elif isinstance(this_offsets, (int, float)):
offset_height = this_offsets
offset_width = this_offsets
# Now that we have the offsets and step sizes, compute the grid of anchor box center points.
cy = np.linspace(offset_height * step_height, (offset_height + feature_map_size[0] - 1) * step_height, feature_map_size[0])
cx = np.linspace(offset_width * step_width, (offset_width + feature_map_size[1] - 1) * step_width, feature_map_size[1])
cx_grid, cy_grid = np.meshgrid(cx, cy)
cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down
# Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
# where the last dimension will contain `(cx, cy, w, h)`
boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4))
boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes)) # Set cx
boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes)) # Set cy
boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h
# Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')
# If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
if self.clip_boxes:
x_coords = boxes_tensor[:,:,:,[0, 2]]
x_coords[x_coords >= self.img_width] = self.img_width - 1
x_coords[x_coords < 0] = 0
boxes_tensor[:,:,:,[0, 2]] = x_coords
y_coords = boxes_tensor[:,:,:,[1, 3]]
y_coords[y_coords >= self.img_height] = self.img_height - 1
y_coords[y_coords < 0] = 0
boxes_tensor[:,:,:,[1, 3]] = y_coords
# If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
if self.normalize_coords:
boxes_tensor[:, :, :, [0, 2]] /= self.img_width
boxes_tensor[:, :, :, [1, 3]] /= self.img_height
# TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
if self.coords == 'centroids':
# Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
elif self.coords == 'minmax':
# Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')
if diagnostics:
return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width)
else:
return boxes_tensor
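# Worked example of the box size computation above (editorial addition with
# illustrative numbers, not from any particular model): for a 300x300 input
# (size = 300), this_scale = 0.2 and aspect ratio ar = 2 give
# box_width = 0.2 * 300 * sqrt(2) ~= 84.85 and
# box_height = 0.2 * 300 / sqrt(2) ~= 42.43. The box area stays that of the
# 60x60 square for ar = 1 (0.2 * 300 = 60); only the width/height ratio changes.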
def generate_encoding_template(self, batch_size, diagnostics=False):
'''
Produces an encoding template for the ground truth label tensor for a given batch.
Note that all tensor creation, reshaping and concatenation operations performed in this function
and the sub-functions it calls are identical to those performed inside the SSD model. This, of course,
must be the case in order to preserve the spatial meaning of each box prediction, but it's useful to make
yourself aware of this fact and why it is necessary.
In other words, the boxes in `y_encoded` must have a specific order in order to correspond to the right spatial
positions and scales of the boxes predicted by the model. The sequence of operations here ensures that `y_encoded`
has this specific form.
Arguments:
batch_size (int): The batch size.
diagnostics (bool, optional): See the documentation for `generate_anchor_boxes_for_layer()`. The diagnostic output
here is similar, just for all predictor conv layers.
Returns:
A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, the template into which to encode
the ground truth labels for training. The last axis has length `#classes + 12` because the model
output contains not only the 4 predicted box coordinate offsets, but also the 4 coordinates for
the anchor boxes and the 4 variance values.
'''
# Tile the anchor boxes for each predictor layer across all batch items.
boxes_batch = []
for boxes in self.boxes_list:
# Prepend one dimension to `boxes` to account for the batch size and tile along that dimension.
# The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 4)`
boxes = np.expand_dims(boxes, axis=0)
boxes = np.tile(boxes, (batch_size, 1, 1, 1, 1))
# Now reshape the 5D tensor above into a 3D tensor of shape
# `(batch, feature_map_height * feature_map_width * n_boxes, 4)`. The resulting
# order of the tensor content will be identical to the order obtained from the reshaping operation
# in our Keras model (we're using the Tensorflow backend, and tf.reshape() and np.reshape()
# use the same default index order, which is C-like index ordering)
boxes = np.reshape(boxes, (batch_size, -1, 4))
boxes_batch.append(boxes)
# Concatenate the anchor tensors from the individual layers to one.
boxes_tensor = np.concatenate(boxes_batch, axis=1)
# 3: Create a template tensor to hold the one-hot class encodings of shape `(batch, #boxes, #classes)`
# It will contain all zeros for now, the classes will be set in the matching process that follows
classes_tensor = np.zeros((batch_size, boxes_tensor.shape[1], self.n_classes))
# 4: Create a tensor to contain the variances. This tensor has the same shape as `boxes_tensor` and simply
# contains the same 4 variance values for every position in the last axis.
variances_tensor = np.zeros_like(boxes_tensor)
variances_tensor += self.variances # Long live broadcasting
# 5: Concatenate the classes, boxes and variances tensors to get our final template for y_encoded. We also need
# another tensor of the shape of `boxes_tensor` as a space filler so that `y_encoding_template` has the same
shape as the SSD model output tensor. The content of this tensor is irrelevant; we'll just use
# `boxes_tensor` a second time.
y_encoding_template = np.concatenate((classes_tensor, boxes_tensor, boxes_tensor, variances_tensor), axis=2)
if diagnostics:
return y_encoding_template, self.centers_diag, self.wh_list_diag, self.steps_diag, self.offsets_diag
else:
return y_encoding_template
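# Note on the `#boxes` dimension (editorial, with illustrative numbers): the
# template's second axis is the sum over all predictor layers of
# feature_map_height * feature_map_width * n_boxes. For example, two predictor
# layers of sizes (4, 4) and (2, 2) with 4 boxes per spatial location yield
# 4*4*4 + 2*2*4 = 80 rows per batch item.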
class DegenerateBoxError(Exception):
'''
An exception class to be raised if degenerate boxes are being detected.
'''
pass
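# Minimal usage sketch for the encoder above (editorial addition; the image
# size, class count, predictor sizes and scales are made-up values, not taken
# from any particular model):
#
# >>> encoder = SSDInputEncoder(img_height=300,
# ...                           img_width=300,
# ...                           n_classes=20,
# ...                           predictor_sizes=[(4, 4), (2, 2)],
# ...                           scales=[0.1, 0.5, 0.9])
# >>> labels = [np.array([[12, 30, 50, 120, 180]])] # one image with one box: (class_id, xmin, ymin, xmax, ymax)
# >>> y_encoded = encoder(labels)
# >>> y_encoded.shape # 4*4*4 + 2*2*4 = 80 boxes; 20 classes + 1 background + 12
# (1, 80, 33)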


@@ -0,0 +1,530 @@
'''
Includes:
* Functions to decode and filter raw SSD model output. These are only needed if the
SSD model does not have a `DecodeDetections` layer.
* Functions to perform greedy non-maximum suppression
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
from bounding_box_utils.bounding_box_utils import iou, convert_coordinates
def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
Perform greedy non-maximum suppression on the input boxes.
Greedy NMS works by selecting the box with the highest score and
removing all boxes around it that are too close to it, as measured by IoU similarity.
Out of the boxes that are left over, once again the one with the highest
score is selected and so on, until no boxes with too much overlap are left.
Arguments:
y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this
is a list of length `n` where each list element is a 2D Numpy array.
For a batch item with `k` predicted boxes this 2D Numpy array has
shape `(k, 6)`, where each row contains the coordinates of the respective
box in the format `[class_id, score, xmin, ymin, xmax, ymax]`.
Technically, the number of columns doesn't have to be 6, it can be
arbitrary as long as the second element of each row is the score
assigned to the prediction and the elements from index 2 onward are
the box coordinates. Note that this function is agnostic to the
scale of the score or what it represents.
iou_threshold (float, optional): All boxes with a Jaccard similarity of
greater than `iou_threshold` with a locally maximal box will be removed
from the set of predictions, where 'maximal' refers to the box score.
coords (str, optional): The coordinate format of `y_pred_decoded`.
Can be one of the formats supported by `iou()`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxes, but not the other.
Returns:
The predictions after removing non-maxima. The format is the same as the input format.
'''
y_pred_decoded_nms = []
for batch_item in y_pred_decoded: # For the labels of each batch item...
boxes_left = np.copy(batch_item)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
y_pred_decoded_nms.append(np.array(maxima))
return y_pred_decoded_nms
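# Illustrative usage (editorial addition): three detections in the
# `[class_id, score, xmin, ymin, xmax, ymax]` corners format. Boxes 0 and 1
# overlap heavily (IoU ~= 0.85 > 0.45), so the lower-scoring box 1 is
# suppressed; box 2 is disjoint from both and survives.
#
# >>> dets = [np.array([[1, 0.9,  10,  10,  60,  60],
# ...                   [1, 0.8,  12,  12,  62,  62],
# ...                   [1, 0.7, 200, 200, 250, 250]])]
# >>> greedy_nms(dets, iou_threshold=0.45)[0][:, :2]
# array([[1. , 0.9],
#        [1. , 0.7]])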
def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
function for per-class NMS in `decode_detections()`.
'''
boxes_left = np.copy(predictions)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,0]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,1:], maximum_box[1:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
return np.array(maxima)
def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
function in `decode_detections_fast()`.
'''
boxes_left = np.copy(predictions)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
return np.array(maxima)
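# Editorial note: the three NMS variants above differ only in which columns
# hold the score and the coordinates. `greedy_nms` and `_greedy_nms2` expect
# rows of the form `[class_id, score, coords...]` (score in column 1,
# coordinates from column 2 onward), while `_greedy_nms` operates on per-class
# arrays of the form `[score, coords...]` (score in column 0, coordinates from
# column 1 onward).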
def decode_detections(y_pred,
confidence_thresh=0.01,
iou_threshold=0.45,
top_k=200,
input_coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
border_pixels='half'):
'''
Convert model prediction output back to a format that contains only the positive box predictions
(i.e. the same format that `SSDInputEncoder` takes as input).
After the decoding, two stages of prediction filtering are performed for each class individually:
First confidence thresholding, then greedy non-maximum suppression. The filtering results for all
classes are concatenated and the `top_k` overall highest confidence results constitute the final
predictions for a given batch item. This procedure follows the original Caffe implementation.
For a slightly different and more efficient alternative to decode raw model output that performs
non-maximum suppression globally instead of per class, see `decode_detections_fast()` below.
Arguments:
y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
boxes predicted by the model per image and the last axis contains
`[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
positive class in order to be considered for the non-maximum suppression stage for the respective class.
A lower value will result in a larger part of the selection process being done by the non-maximum suppression
stage, while a larger value will result in a larger part of the selection process happening in the confidence
thresholding stage.
iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
to the box score.
top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
non-maximum suppression stage.
input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxes, but not the other.
Returns:
A python list of length `batch_size` where each list element represents the predicted boxes
for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
'''
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
# 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
if input_coords == 'centroids':
y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
y_pred_decoded_raw[:,:,[-2,-1]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
y_pred_decoded_raw[:,:,[-4,-3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
y_pred_decoded_raw[:,:,[-4,-3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
elif input_coords == 'minmax':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
elif input_coords == 'corners':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
else:
raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
# 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
if normalize_coords:
y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
# 3: Apply confidence thresholding and non-maximum suppression per class
n_classes = y_pred_decoded_raw.shape[-1] - 4 # The number of classes is the length of the last axis minus the four box coordinates
y_pred_decoded = [] # Store the final predictions in this list
for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
pred = [] # Store the final predictions for this batch item here
for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
single_class = batch_item[:,[class_id, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
threshold_met = single_class[single_class[:,0] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
if threshold_met.shape[0] > 0: # If any boxes made the threshold...
maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 6]`
maxima_output[:,0] = class_id # Write the class ID to the first column...
maxima_output[:,1:] = maxima # ...and write the maxima to the other columns...
pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
# Once we're through with all classes, keep only the `top_k` maxima with the highest scores
if pred: # If there are any predictions left after confidence-thresholding...
pred = np.concatenate(pred, axis=0)
if top_k != 'all' and pred.shape[0] > top_k: # If we have more than `top_k` results left at this point (otherwise there is nothing to filter)...
top_k_indices = np.argpartition(pred[:,1], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
else:
pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
return y_pred_decoded
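# Minimal usage sketch (editorial addition; assumes `y_pred` came from an SSD
# model trained with the default 'centroids' format and normalized coordinates,
# and that the 300x300 image size matches the model input):
#
# >>> # y_pred has shape (batch_size, #boxes, n_classes + 12)
# >>> decoded = decode_detections(y_pred,
# ...                             confidence_thresh=0.5,
# ...                             iou_threshold=0.45,
# ...                             top_k=200,
# ...                             input_coords='centroids',
# ...                             normalize_coords=True,
# ...                             img_height=300,
# ...                             img_width=300)
# >>> # decoded[i]: array of shape (k_i, 6), rows [class_id, confidence, xmin, ymin, xmax, ymax]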
def decode_detections_fast(y_pred,
confidence_thresh=0.5,
iou_threshold=0.45,
top_k='all',
input_coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
border_pixels='half'):
'''
Convert model prediction output back to a format that contains only the positive box predictions
(i.e. the same format that `SSDInputEncoder` takes as input).
Optionally performs confidence thresholding and greedy non-maximum suppression after the decoding stage.
Note that the decoding procedure used here is not the same as the procedure used in the original Caffe implementation.
For each box, the procedure used here assigns the box's highest confidence as its predicted class. Then it removes
all boxes for which the highest confidence is the background class. This results in less work for the subsequent
non-maximum suppression, because the vast majority of the predictions will be filtered out just by the fact that
their highest confidence is for the background class. It is much more efficient than the procedure of the original
implementation, but the results may also differ.
Arguments:
y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
boxes predicted by the model per image and the last axis contains
`[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in any positive
class required for a given box to be considered a positive prediction. A lower value will result
            in better recall, while a higher value will result in better precision. Do not use this parameter to
            combat the inevitably many duplicates that an SSD will produce; the subsequent non-maximum suppression
            stage will take care of those.
iou_threshold (float, optional): `None` or a float in [0,1]. If `None`, no non-maximum suppression will be
performed. If not `None`, greedy NMS will be performed after the confidence thresholding stage, meaning
all boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
from the set of predictions, where 'maximal' refers to the box score.
        top_k (int, optional): 'all' or an integer with the number of highest-scoring predictions to be kept for each batch item
after the non-maximum suppression stage. If 'all', all predictions left after the NMS stage will be kept.
input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
            to the boxes, but not the other.
Returns:
A python list of length `batch_size` where each list element represents the predicted boxes
for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
        a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
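    Usage sketch (hypothetical values: a trained SSD `model` fed 300x300 images):
        y_pred = model.predict(batch_images) # Shape `(batch_size, #boxes, #classes + 12)`.
        y_pred_decoded = decode_detections_fast(y_pred,
                                                confidence_thresh=0.5,
                                                iou_threshold=0.45,
                                                top_k=200,
                                                input_coords='centroids',
                                                normalize_coords=True,
                                                img_height=300,
                                                img_width=300)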
'''
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
# 1: Convert the classes from one-hot encoding to their class ID
    y_pred_converted = np.copy(y_pred[:,:,-14:-8]) # Slice out the four offset predictions plus two elements into which we'll write the class IDs and confidences in the next step
    y_pred_converted[:,:,0] = np.argmax(y_pred[:,:,:-12], axis=-1) # The indices of the highest confidence values in the one-hot class vectors are the class IDs
    y_pred_converted[:,:,1] = np.amax(y_pred[:,:,:-12], axis=-1) # Store the confidence values themselves, too
# 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
if input_coords == 'centroids':
y_pred_converted[:,:,[4,5]] = np.exp(y_pred_converted[:,:,[4,5]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
y_pred_converted[:,:,[4,5]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
y_pred_converted[:,:,[2,3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
y_pred_converted[:,:,[2,3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2corners')
elif input_coords == 'minmax':
y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_converted[:,:,[2,3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_converted[:,:,[4,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='minmax2corners')
elif input_coords == 'corners':
y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_converted[:,:,[2,4]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_converted[:,:,[3,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
else:
raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")
# 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
if normalize_coords:
y_pred_converted[:,:,[2,4]] *= img_width # Convert xmin, xmax back to absolute coordinates
y_pred_converted[:,:,[3,5]] *= img_height # Convert ymin, ymax back to absolute coordinates
# 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
y_pred_decoded = []
for batch_item in y_pred_converted: # For each image in the batch...
boxes = batch_item[np.nonzero(batch_item[:,0])] # ...get all boxes that don't belong to the background class,...
boxes = boxes[boxes[:,1] >= confidence_thresh] # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
if iou_threshold: # ...if an IoU threshold is set...
boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on the remaining boxes.
if top_k != 'all' and boxes.shape[0] > top_k: # If we have more than `top_k` results left at this point...
top_k_indices = np.argpartition(boxes[:,1], kth=boxes.shape[0]-top_k, axis=0)[boxes.shape[0]-top_k:] # ...get the indices of the `top_k` highest-scoring boxes...
boxes = boxes[top_k_indices] # ...and keep only those boxes...
y_pred_decoded.append(boxes) # ...and now that we're done, append the array of final predictions for this batch item to the output list
    return y_pred_decoded

################################################################################################
# Debugging tools, not relevant for normal use
################################################################################################

# The functions below are for debugging, so you won't normally need them. That is,
# unless you need to debug your model, of course.

def decode_detections_debug(y_pred,
confidence_thresh=0.01,
iou_threshold=0.45,
top_k=200,
input_coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
variance_encoded_in_target=False,
border_pixels='half'):
'''
This decoder performs the same processing as `decode_detections()`, but the output format for each left-over
predicted box is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
That is, in addition to the usual data, each predicted box has the internal index of that box within
the model (`box_id`) prepended to it. This allows you to know exactly which part of the model made a given
box prediction; in particular, it allows you to know which predictor layer made a given prediction.
This can be useful for debugging.
Arguments:
y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
boxes predicted by the model per image and the last axis contains
`[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
positive class in order to be considered for the non-maximum suppression stage for the respective class.
A lower value will result in a larger part of the selection process being done by the non-maximum suppression
stage, while a larger value will result in a larger part of the selection process happening in the confidence
thresholding stage.
iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
to the box score.
top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
non-maximum suppression stage.
input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
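        variance_encoded_in_target (bool, optional): Set to `True` if the model output contains the box
            coordinate offsets with the variances already encoded in them, in which case the offsets will
            not be multiplied by the variances again during decoding. Only relevant if `input_coords` is
            'centroids'.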
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
            to the boxes, but not the other.
Returns:
A python list of length `batch_size` where each list element represents the predicted boxes
for one image and contains a Numpy array of shape `(boxes, 7)` where each row is a box prediction for
a non-background class for the respective image in the format `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
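    Usage sketch (hypothetical values: `predictor_sizes`, `aspect_ratios`, and `two_boxes_for_ar1` are
    assumed to match the model's anchor box configuration; see the helper functions further below):
        y_pred_decoded = decode_detections_debug(y_pred, img_height=300, img_width=300)
        num_boxes_per_pred_layer = get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1)
        pred_layers = get_pred_layers(y_pred_decoded, num_boxes_per_pred_layer) # Which predictor layer made each prediction?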
'''
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
# 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
if input_coords == 'centroids':
if variance_encoded_in_target:
# Decode the predicted box center x and y coordinates.
y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] + y_pred[:,:,[-8,-7]]
            # Decode the predicted box width and height.
y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
else:
# Decode the predicted box center x and y coordinates.
y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] * y_pred[:,:,[-4,-3]] + y_pred[:,:,[-8,-7]]
            # Decode the predicted box width and height.
y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
elif input_coords == 'minmax':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
elif input_coords == 'corners':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
else:
raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
# 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
if normalize_coords:
y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
# 3: For each batch item, prepend each box's internal index to its coordinates.
y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1], y_pred_decoded_raw.shape[2] + 1)) # Expand the last axis by one.
y_pred_decoded_raw2[:,:,1:] = y_pred_decoded_raw
y_pred_decoded_raw2[:,:,0] = np.arange(y_pred_decoded_raw.shape[1]) # Put the box indices as the first element for each box via broadcasting.
y_pred_decoded_raw = y_pred_decoded_raw2
# 4: Apply confidence thresholding and non-maximum suppression per class
n_classes = y_pred_decoded_raw.shape[-1] - 5 # The number of classes is the length of the last axis minus the four box coordinates and minus the index
y_pred_decoded = [] # Store the final predictions in this list
    for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, 1 box index + n_classes + 4 coords]`
pred = [] # Store the final predictions for this batch item here
for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:,[0, class_id + 1, -4, -3, -2, -1]] # ...keep only the box index, the confidences for that class, and the box coordinates, making this an array of shape `[n_boxes, 6]` and...
threshold_met = single_class[single_class[:,1] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
if threshold_met.shape[0] > 0: # If any boxes made the threshold...
maxima = _greedy_nms_debug(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`
maxima_output[:,0] = maxima[:,0] # Write the box index to the first column...
maxima_output[:,1] = class_id # ...and write the class ID to the second column...
maxima_output[:,2:] = maxima[:,1:] # ...and write the rest of the maxima data to the other columns...
pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
# Once we're through with all classes, keep only the `top_k` maxima with the highest scores
        if pred: # If there are any predictions left after confidence-thresholding...
            pred = np.concatenate(pred, axis=0)
            if pred.shape[0] > top_k: # If we have more than `top_k` results left at this point (otherwise there is nothing to filter)...
                top_k_indices = np.argpartition(pred[:,2], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
                pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
        else:
            pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
        y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
    return y_pred_decoded

def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of all
left-over boxes for each batch item, which allows you to know which predictor layer predicted a given output
box and is thus useful for debugging.
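    A minimal sketch with made-up values, where each row is `[box_id, confidence, xmin, ymin, xmax, ymax]`:
        predictions = np.array([[  0, 0.9,  10,  10,  50,  50],
                                [  1, 0.8,  12,  12,  52,  52],   # Overlaps box 0 heavily -> suppressed.
                                [  2, 0.7, 100, 100, 150, 150]])  # No overlap with box 0 -> kept.
        _greedy_nms_debug(predictions, iou_threshold=0.45) # Returns the rows for box IDs 0 and 2.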
'''
boxes_left = np.copy(predictions)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)

def get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1):
'''
Returns a list of the number of boxes that each predictor layer predicts.
`aspect_ratios` must be a nested list, containing a list of aspect ratios
for each predictor layer.
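    Example with made-up values: for `predictor_sizes = [(37, 37), (18, 18)]`,
    `aspect_ratios = [[1.0, 2.0, 0.5], [1.0, 2.0, 0.5, 3.0, 1.0/3.0]]`, and
    `two_boxes_for_ar1 == True`, the result is `[37*37*4, 18*18*6] == [5476, 1944]`.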
'''
num_boxes_per_pred_layer = []
for i in range(len(predictor_sizes)):
if two_boxes_for_ar1:
num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * (len(aspect_ratios[i]) + 1))
else:
num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * len(aspect_ratios[i]))
    return num_boxes_per_pred_layer

def get_pred_layers(y_pred_decoded, num_boxes_per_pred_layer):
'''
For a given prediction tensor decoded with `decode_detections_debug()`, returns a list
    with the indices of the predictor layers that made each prediction.
That is, this function lets you know which predictor layer is responsible
for a given prediction.
Arguments:
y_pred_decoded (array): The decoded model output tensor. Must have been
decoded with `decode_detections_debug()` so that it contains the internal box index
for each predicted box.
num_boxes_per_pred_layer (list): A list that contains the total number
of boxes that each predictor layer predicts.
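    For example (made-up values): with `num_boxes_per_pred_layer = [5476, 1944]`, the cumulative sums
    are `[5476, 7420]`, so a box index of 6000 maps to predictor layer 1 (the second layer), because
    `5476 <= 6000 < 7420`.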
'''
pred_layers_all = []
cum_boxes_per_pred_layer = np.cumsum(num_boxes_per_pred_layer)
for batch_item in y_pred_decoded:
pred_layers = []
for prediction in batch_item:
if (prediction[0] < 0) or (prediction[0] >= cum_boxes_per_pred_layer[-1]):
raise ValueError("Box index is out of bounds of the possible indices as given by the values in `num_boxes_per_pred_layer`.")
for i in range(len(cum_boxes_per_pred_layer)):
if prediction[0] < cum_boxes_per_pred_layer[i]:
pred_layers.append(i)
break
pred_layers_all.append(pred_layers)
return pred_layers_all