Summary

0	ssd_keras-master/ssd_encoder_decoder/__init__.py	(new file)
BIN	ssd_keras-master/ssd_encoder_decoder/__init__.pyc	(new binary file, not shown)

116	ssd_keras-master/ssd_encoder_decoder/matching_utils.py	(new file)
@@ -0,0 +1,116 @@
'''
Utilities to match ground truth boxes to anchor boxes.

Copyright (C) 2018 Pierluigi Ferrari

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

from __future__ import division
import numpy as np

def match_bipartite_greedy(weight_matrix):
    '''
    Returns a bipartite matching according to the given weight matrix.

    The algorithm works as follows:

    Let the first axis of `weight_matrix` represent ground truth boxes
    and the second axis anchor boxes.
    The ground truth box that has the greatest similarity with any
    anchor box will be matched first, then out of the remaining ground
    truth boxes, the ground truth box that has the greatest similarity
    with any of the remaining anchor boxes will be matched second, and
    so on. That is, the ground truth boxes will be matched in descending
    order by maximum similarity with any of the respectively remaining
    anchor boxes.
    The runtime complexity is O(m^2 * n), where `m` is the number of
    ground truth boxes and `n` is the number of anchor boxes.

    Arguments:
        weight_matrix (array): A 2D Numpy array that represents the weight matrix
            for the matching process. If `(m,n)` is the shape of the weight matrix,
            it must be `m <= n`. The weights can be integers or floating point
            numbers. The matching process will maximize, i.e. larger weights are
            preferred over smaller weights.

    Returns:
        A 1D Numpy array of length `weight_matrix.shape[0]` that represents
        the matched index along the second axis of `weight_matrix` for each index
        along the first axis.
    '''

    weight_matrix = np.copy(weight_matrix) # We'll modify this array.
    num_ground_truth_boxes = weight_matrix.shape[0]
    all_gt_indices = list(range(num_ground_truth_boxes)) # Only relevant for fancy-indexing below.

    # This 1D array will contain for each ground truth box the index of
    # the matched anchor box.
    matches = np.zeros(num_ground_truth_boxes, dtype=int)

    # In each iteration of the loop below, exactly one ground truth box
    # will be matched to one anchor box.
    for _ in range(num_ground_truth_boxes):

        # Find the maximal anchor-ground truth pair in two steps: First, reduce
        # over the anchor boxes and then reduce over the ground truth boxes.
        anchor_indices = np.argmax(weight_matrix, axis=1) # Reduce along the anchor box axis.
        overlaps = weight_matrix[all_gt_indices, anchor_indices]
        ground_truth_index = np.argmax(overlaps) # Reduce along the ground truth box axis.
        anchor_index = anchor_indices[ground_truth_index]
        matches[ground_truth_index] = anchor_index # Set the match.

        # Set the row of the matched ground truth box and the column of the matched
        # anchor box to all zeros. This ensures that those boxes will not be matched again,
        # because they will never be the best matches for any other boxes.
        weight_matrix[ground_truth_index] = 0
        weight_matrix[:,anchor_index] = 0

    return matches
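
# A minimal usage sketch (illustration only, not part of the original module):
# for a toy 2x3 IoU matrix, ground truth box 0 is matched first because it has
# the single highest weight (0.8, with anchor 1); anchor 1's column is then
# zeroed out, so ground truth box 1 falls back to anchor 0.
#
#   >>> weights = np.array([[0.1, 0.8, 0.3],
#   ...                     [0.7, 0.6, 0.2]])
#   >>> match_bipartite_greedy(weights)
#   array([1, 0])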

def match_multi(weight_matrix, threshold):
    '''
    Matches all elements along the second axis of `weight_matrix` to their best
    matches along the first axis subject to the constraint that the weight of a match
    must be greater than or equal to `threshold` in order to produce a match.

    If the weight matrix contains elements that should be ignored, the row or column
    representing the respective element should be set to a value below `threshold`.

    Arguments:
        weight_matrix (array): A 2D Numpy array that represents the weight matrix
            for the matching process. If `(m,n)` is the shape of the weight matrix,
            it must be `m <= n`. The weights can be integers or floating point
            numbers. The matching process will maximize, i.e. larger weights are
            preferred over smaller weights.
        threshold (float): A float that represents the threshold (i.e. lower bound)
            that must be met by a pair of elements to produce a match.

    Returns:
        Two 1D Numpy arrays of equal length that represent the matched indices. The first
        array contains the indices along the first axis of `weight_matrix`, the second array
        contains the indices along the second axis.
    '''

    num_anchor_boxes = weight_matrix.shape[1]
    all_anchor_indices = list(range(num_anchor_boxes)) # Only relevant for fancy-indexing below.

    # Find the best ground truth match for every anchor box.
    ground_truth_indices = np.argmax(weight_matrix, axis=0) # Array of shape (weight_matrix.shape[1],)
    overlaps = weight_matrix[ground_truth_indices, all_anchor_indices] # Array of shape (weight_matrix.shape[1],)

    # Filter out the matches with a weight below the threshold.
    anchor_indices_thresh_met = np.nonzero(overlaps >= threshold)[0]
    gt_indices_thresh_met = ground_truth_indices[anchor_indices_thresh_met]

    return gt_indices_thresh_met, anchor_indices_thresh_met
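
# A minimal usage sketch (illustration only, not part of the original module):
# with the same toy matrix as above and a threshold of 0.5, anchors 0 and 1
# meet the threshold (best overlaps 0.7 and 0.8), while anchor 2 does not (0.3).
#
#   >>> weights = np.array([[0.1, 0.8, 0.3],
#   ...                     [0.7, 0.6, 0.2]])
#   >>> match_multi(weights, threshold=0.5)
#   (array([1, 0]), array([0, 1]))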
BIN	ssd_keras-master/ssd_encoder_decoder/matching_utils.pyc	(new binary file, not shown)

617	ssd_keras-master/ssd_encoder_decoder/ssd_input_encoder.py	(new file)
@@ -0,0 +1,617 @@
'''
An encoder that converts ground truth annotations to SSD-compatible training targets.

Copyright (C) 2018 Pierluigi Ferrari

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

from __future__ import division
import numpy as np

from bounding_box_utils.bounding_box_utils import iou, convert_coordinates
from ssd_encoder_decoder.matching_utils import match_bipartite_greedy, match_multi

class SSDInputEncoder:
    '''
    Transforms ground truth labels for object detection in images
    (2D bounding box coordinates and class labels) to the format required for
    training an SSD model.

    In the process of encoding the ground truth labels, a template of anchor boxes
    is built, which are subsequently matched to the ground truth boxes
    via an intersection-over-union threshold criterion.
    '''

    def __init__(self,
                 img_height,
                 img_width,
                 n_classes,
                 predictor_sizes,
                 min_scale=0.1,
                 max_scale=0.9,
                 scales=None,
                 aspect_ratios_global=[0.5, 1.0, 2.0],
                 aspect_ratios_per_layer=None,
                 two_boxes_for_ar1=True,
                 steps=None,
                 offsets=None,
                 clip_boxes=False,
                 variances=[0.1, 0.1, 0.2, 0.2],
                 matching_type='multi',
                 pos_iou_threshold=0.5,
                 neg_iou_limit=0.3,
                 border_pixels='half',
                 coords='centroids',
                 normalize_coords=True,
                 background_id=0):
        '''
        Arguments:
            img_height (int): The height of the input images.
            img_width (int): The width of the input images.
            n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
            predictor_sizes (list): A list of int-tuples of the format `(height, width)`
                containing the output heights and widths of the convolutional predictor layers.
            min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
                of the shorter side of the input images. Note that you should set the scaling factors
                such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
                to detect. Must be >0.
            max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
                of the shorter side of the input images. All scaling factors between the smallest and the
                largest will be linearly interpolated. Note that the second to last of the linearly interpolated
                scaling factors will actually be the scaling factor for the last predictor layer, while the last
                scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
                if `two_boxes_for_ar1` is `True`. Note that you should set the scaling factors
                such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
                to detect. Must be greater than or equal to `min_scale`.
            scales (list, optional): A list of floats >0 containing scaling factors per convolutional predictor layer.
                This list must be one element longer than the number of predictor layers. The first `k` elements are the
                scaling factors for the `k` predictor layers, while the last element is used for the second box
                for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
                last scaling factor must be passed either way, even if it is not being used. If a list is passed,
                this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
                Note that you should set the scaling factors such that the resulting anchor box sizes correspond to
                the sizes of the objects you are trying to detect.
            aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
                generated. This list is valid for all prediction layers. Note that you should set the aspect ratios such
                that the resulting anchor box shapes roughly correspond to the shapes of the objects you are trying to detect.
            aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer.
                If a list is passed, it overrides `aspect_ratios_global`. Note that you should set the aspect ratios such
                that the resulting anchor box shapes very roughly correspond to the shapes of the objects you are trying to detect.
            two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise.
                If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
                using the scaling factor for the respective layer, the second one will be generated using the
                geometric mean of said scaling factor and the next bigger scaling factor.
            steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
                either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
                pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
                the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
                If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
                If no steps are provided, then they will be computed such that the anchor box center points will form an
                equidistant grid within the image dimensions.
            offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
                either floats or tuples of two floats. These numbers represent for each predictor layer how far
                from the top and left borders of the image the top-most and left-most anchor box center points should be
                as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
                of the step size specified in the `steps` argument. If the list contains floats, then that value will
                be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
                `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size.
            clip_boxes (bool, optional): If `True`, limits the anchor box coordinates to stay within image boundaries.
            variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
                its respective variance value.
            matching_type (str, optional): Can be either 'multi' or 'bipartite'. In 'bipartite' mode, each ground truth box will
                be matched only to the one anchor box with the highest IoU overlap. In 'multi' mode, in addition to the aforementioned
                bipartite matching, all anchor boxes with an IoU overlap greater than or equal to the `pos_iou_threshold` will be
                matched to a given ground truth box.
            pos_iou_threshold (float, optional): The intersection-over-union similarity threshold that must be
                met in order to match a given ground truth box to a given anchor box.
            neg_iou_limit (float, optional): The maximum allowed intersection-over-union similarity of an
                anchor box with any ground truth box to be labeled a negative (i.e. background) box. If an
                anchor box is neither a positive, nor a negative box, it will be ignored during training.
            border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
                Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
                to the boxes. If 'exclude', the border pixels do not belong to the boxes.
                If 'half', then one of each of the two horizontal and vertical borders belongs
                to the boxes, but not the other.
            coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
                of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
                and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
            normalize_coords (bool, optional): If `True`, the encoder uses relative instead of absolute coordinates.
                This means instead of using absolute target coordinates, the encoder will scale all coordinates to be within [0,1].
                This way learning becomes independent of the input image size.
            background_id (int, optional): Determines which class ID is for the background class.
        '''
        predictor_sizes = np.array(predictor_sizes)
        if predictor_sizes.ndim == 1:
            predictor_sizes = np.expand_dims(predictor_sizes, axis=0)

        ##################################################################################
        # Handle exceptions.
        ##################################################################################

        if (min_scale is None or max_scale is None) and scales is None:
            raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")

        if scales:
            if (len(scales) != predictor_sizes.shape[0] + 1): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&`.
                raise ValueError("It must be either scales is None or len(scales) == len(predictor_sizes)+1, but len(scales) == {} and len(predictor_sizes)+1 == {}".format(len(scales), len(predictor_sizes)+1))
            scales = np.array(scales)
            if np.any(scales <= 0):
                raise ValueError("All values in `scales` must be greater than 0, but the passed list of scales is {}".format(scales))
        else: # If no list of scales was passed, we need to make sure that `min_scale` and `max_scale` are valid values.
            if not 0 < min_scale <= max_scale:
                raise ValueError("It must be 0 < min_scale <= max_scale, but it is min_scale = {} and max_scale = {}".format(min_scale, max_scale))

        if not (aspect_ratios_per_layer is None):
            if (len(aspect_ratios_per_layer) != predictor_sizes.shape[0]): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&`.
                raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == len(predictor_sizes), but len(aspect_ratios_per_layer) == {} and len(predictor_sizes) == {}".format(len(aspect_ratios_per_layer), len(predictor_sizes)))
            for aspect_ratios in aspect_ratios_per_layer:
                if np.any(np.array(aspect_ratios) <= 0):
                    raise ValueError("All aspect ratios must be greater than zero.")
        else:
            if (aspect_ratios_global is None):
                raise ValueError("At least one of `aspect_ratios_global` and `aspect_ratios_per_layer` must not be `None`.")
            if np.any(np.array(aspect_ratios_global) <= 0):
                raise ValueError("All aspect ratios must be greater than zero.")

        if len(variances) != 4:
            raise ValueError("4 variance values must be passed, but {} values were received.".format(len(variances)))
        variances = np.array(variances)
        if np.any(variances <= 0):
            raise ValueError("All variances must be >0, but the variances given are {}".format(variances))

        if not (coords == 'minmax' or coords == 'centroids' or coords == 'corners'):
            raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")

        if (not (steps is None)) and (len(steps) != predictor_sizes.shape[0]):
            raise ValueError("You must provide exactly one step value per predictor layer.")

        if (not (offsets is None)) and (len(offsets) != predictor_sizes.shape[0]):
            raise ValueError("You must provide exactly one offset value per predictor layer.")

        ##################################################################################
        # Set or compute members.
        ##################################################################################

        self.img_height = img_height
        self.img_width = img_width
        self.n_classes = n_classes + 1 # + 1 for the background class
        self.predictor_sizes = predictor_sizes
        self.min_scale = min_scale
        self.max_scale = max_scale
        # If `scales` is None, compute the scaling factors by linearly interpolating between
        # `min_scale` and `max_scale`. If an explicit list of `scales` is given, however,
        # then it takes precedence over `min_scale` and `max_scale`.
        if (scales is None):
            self.scales = np.linspace(self.min_scale, self.max_scale, len(self.predictor_sizes)+1)
        else:
            # If a list of scales is given explicitly, we'll use that instead of computing it from `min_scale` and `max_scale`.
            self.scales = scales
        # If `aspect_ratios_per_layer` is None, then we use the same list of aspect ratios
        # `aspect_ratios_global` for all predictor layers. If `aspect_ratios_per_layer` is given,
        # however, then it takes precedence over `aspect_ratios_global`.
        if (aspect_ratios_per_layer is None):
            self.aspect_ratios = [aspect_ratios_global] * predictor_sizes.shape[0]
        else:
            # If aspect ratios are given per layer, we'll use those.
            self.aspect_ratios = aspect_ratios_per_layer
        self.two_boxes_for_ar1 = two_boxes_for_ar1
        if not (steps is None):
            self.steps = steps
        else:
            self.steps = [None] * predictor_sizes.shape[0]
        if not (offsets is None):
            self.offsets = offsets
        else:
            self.offsets = [None] * predictor_sizes.shape[0]
        self.clip_boxes = clip_boxes
        self.variances = variances
        self.matching_type = matching_type
        self.pos_iou_threshold = pos_iou_threshold
        self.neg_iou_limit = neg_iou_limit
        self.border_pixels = border_pixels
        self.coords = coords
        self.normalize_coords = normalize_coords
        self.background_id = background_id

        # Compute the number of boxes per spatial location for each predictor layer.
        # For example, if a predictor layer has three different aspect ratios, [1.0, 0.5, 2.0], and is
        # supposed to predict two boxes of slightly different size for aspect ratio 1.0, then that predictor
        # layer predicts a total of four boxes at every spatial location across the feature map.
        if not (aspect_ratios_per_layer is None):
            self.n_boxes = []
            for aspect_ratios in aspect_ratios_per_layer:
                if (1 in aspect_ratios) & two_boxes_for_ar1:
                    self.n_boxes.append(len(aspect_ratios) + 1)
                else:
                    self.n_boxes.append(len(aspect_ratios))
        else:
            if (1 in aspect_ratios_global) & two_boxes_for_ar1:
                self.n_boxes = len(aspect_ratios_global) + 1
            else:
                self.n_boxes = len(aspect_ratios_global)

        ##################################################################################
        # Compute the anchor boxes for each predictor layer.
        ##################################################################################

        # Compute the anchor boxes for each predictor layer. We only have to do this once
        # since the anchor boxes depend only on the model configuration, not on the input data.
        # For each predictor layer (i.e. for each scaling factor) the tensors for that layer's
        # anchor boxes will have the shape `(feature_map_height, feature_map_width, n_boxes, 4)`.

        self.boxes_list = [] # This will store the anchor boxes for each predictor layer.

        # The following lists just store diagnostic information. Sometimes it's handy to have the
        # boxes' center points, heights, widths, etc. in a list.
        self.wh_list_diag = [] # Box widths and heights for each predictor layer
        self.steps_diag = [] # Horizontal and vertical distances between any two boxes for each predictor layer
        self.offsets_diag = [] # Offsets for each predictor layer
        self.centers_diag = [] # Anchor box center points as `(cy, cx)` for each predictor layer

        # Iterate over all predictor layers and compute the anchor boxes for each one.
        for i in range(len(self.predictor_sizes)):
            boxes, center, wh, step, offset = self.generate_anchor_boxes_for_layer(feature_map_size=self.predictor_sizes[i],
                                                                                   aspect_ratios=self.aspect_ratios[i],
                                                                                   this_scale=self.scales[i],
                                                                                   next_scale=self.scales[i+1],
                                                                                   this_steps=self.steps[i],
                                                                                   this_offsets=self.offsets[i],
                                                                                   diagnostics=True)
            self.boxes_list.append(boxes)
            self.wh_list_diag.append(wh)
            self.steps_diag.append(step)
            self.offsets_diag.append(offset)
            self.centers_diag.append(center)
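
    # A minimal construction sketch (illustration only; the three-layer setup
    # below uses made-up values, not a configuration from this commit):
    #
    #   >>> encoder = SSDInputEncoder(img_height=300, img_width=300, n_classes=20,
    #   ...                           predictor_sizes=[(38, 38), (19, 19), (10, 10)],
    #   ...                           scales=[0.1, 0.2, 0.37, 0.54],
    #   ...                           aspect_ratios_global=[0.5, 1.0, 2.0])
    #
    # Note that `scales` has one more element than `predictor_sizes`: the extra
    # value feeds the second aspect-ratio-1 box of the last predictor layer.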

    def __call__(self, ground_truth_labels, diagnostics=False):
        '''
        Converts ground truth bounding box data into a suitable format to train an SSD model.

        Arguments:
            ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
                for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
                to the respective image, and the data for each ground truth bounding box has the format
                `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
                an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
            diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
                but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
                This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
                boxes.

        Returns:
            `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
            ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
            model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
            the last axis are the box coordinates, the next four elements after that are just dummy elements, and
            the last four elements are the variances.
        '''

        # Mapping to define which indices represent which coordinates in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################

        # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
        # a ground truth match and for which the maximal IoU overlap with any ground truth box is less
        # than or equal to `neg_iou_limit` will be a negative (background) box.

        y_encoded[:, :, self.background_id] = 1 # All boxes are background boxes by default.
        n_boxes = y_encoded.shape[1] # The total number of boxes that the model predicts per batch item
        class_vectors = np.eye(self.n_classes) # An identity matrix that we'll use as one-hot class vectors

        for i in range(batch_size): # For each batch item...

            if ground_truth_labels[i].size == 0: continue # If there is no ground truth for this batch item, there is nothing to match.
            labels = ground_truth_labels[i].astype(float) # The labels for this batch item

            # Check for degenerate ground truth bounding boxes before attempting any computations.
            if np.any(labels[:,[xmax]] - labels[:,[xmin]] <= 0) or np.any(labels[:,[ymax]] - labels[:,[ymin]] <= 0):
                raise DegenerateBoxError("SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, labels) +
                                         "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " +
                                         "bounding boxes will lead to NaN errors during the training.")

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height
                labels[:,[xmin,xmax]] /= self.img_width # Normalize xmin and xmax relative to the image width

            # Maybe convert the box coordinate format.
            if self.coords == 'centroids':
                labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids', border_pixels=self.border_pixels)
            elif self.coords == 'minmax':
                labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')

            classes_one_hot = class_vectors[labels[:, class_id].astype(int)] # The one-hot class IDs for the ground truth boxes of this batch item
            labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin,ymin,xmax,ymax]]], axis=-1) # The one-hot version of the labels for this batch item

            # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
            # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
            similarities = iou(labels[:,[xmin,ymin,xmax,ymax]], y_encoded[i,:,-12:-8], coords=self.coords, mode='outer_product', border_pixels=self.border_pixels)

            # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
            #        This ensures that each ground truth box will have at least one good match.

            # For each ground truth box, get the anchor box to match with it.
            bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, bipartite_matches] = 0

            # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
            #         ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
            #         such ground truth box.

            if self.matching_type == 'multi':

                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)

                # Write the ground truth data to the matched anchor boxes.
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
                similarities[:, matches[1]] = 0

            # Third: Now after the matching is done, all negative (background) anchor boxes that have
            #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
            #        i.e. they will no longer be background boxes. These anchors are "too close" to a
            #        ground truth box to be valid background boxes.

            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################

        if self.coords == 'centroids':
            y_encoded[:,:,[-12,-11]] -= y_encoded[:,:,[-8,-7]] # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:,:,[-12,-11]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]] # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:,:,[-10,-9]] /= y_encoded[:,:,[-6,-5]] # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:,:,[-10,-9]] = np.log(y_encoded[:,:,[-10,-9]]) / y_encoded[:,:,[-2,-1]] # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        elif self.coords == 'corners':
            y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
            y_encoded[:,:,[-12,-10]] /= np.expand_dims(y_encoded[:,:,-6] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:,:,[-11,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-7], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
        elif self.coords == 'minmax':
            y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
            y_encoded[:,:,[-12,-11]] /= np.expand_dims(y_encoded[:,:,-7] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:,:,[-10,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-6], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

        if diagnostics:
            # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:,:,-12:-8] = 0 # Keeping the anchor box coordinates means setting the offsets to zero.
            return y_encoded, y_matched_anchors
        else:
            return y_encoded
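
    # A minimal encoding sketch (illustration only; `encoder` is the hypothetical
    # instance from the construction sketch above). Each batch item is one
    # `(k, 5)` array of `(class_id, xmin, ymin, xmax, ymax)` rows:
    #
    #   >>> gt = [np.array([[1, 10, 20, 110, 170]]),            # one box in image 0
    #   ...       np.array([], dtype=float).reshape(0, 5)]      # no boxes in image 1
    #   >>> y_encoded = encoder(gt)
    #   >>> y_encoded.shape                                     # (batch_size, #boxes, #classes + 12)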

    def generate_anchor_boxes_for_layer(self,
                                        feature_map_size,
                                        aspect_ratios,
                                        this_scale,
                                        next_scale,
                                        this_steps=None,
                                        this_offsets=None,
                                        diagnostics=False):
        '''
        Computes an array of the spatial positions and sizes of the anchor boxes for one predictor layer
        of size `feature_map_size == [feature_map_height, feature_map_width]`.

        Arguments:
            feature_map_size (tuple): A list or tuple `[feature_map_height, feature_map_width]` with the spatial
                dimensions of the feature map for which to generate the anchor boxes.
            aspect_ratios (list): A list of floats, the aspect ratios for which anchor boxes are to be generated.
                All list elements must be unique.
            this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes
                as a fraction of the shorter side of the input image.
            next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if
                `self.two_boxes_for_ar1 == True`.
            diagnostics (bool, optional): If true, the following additional outputs will be returned:
                1) A list of the center point `x` and `y` coordinates for each spatial location.
                2) A list containing `(width, height)` for each box aspect ratio.
                3) A tuple containing `(step_height, step_width)`
                4) A tuple containing `(offset_height, offset_width)`
                This information can be useful to understand in just a few numbers what the generated grid of
                anchor boxes actually looks like, i.e. how large the different boxes are and how dense
                their spatial distribution is, in order to determine whether the box grid covers the input images
                appropriately and whether the box sizes are appropriate to fit the sizes of the objects
                to be detected.

        Returns:
            A 4D Numpy tensor of shape `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)` where the
            last dimension contains the four coordinates of each anchor box in each cell of the feature map,
            in the coordinate format specified by `self.coords`.
        '''
        # Compute box width and height for each aspect ratio.

        # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
        size = min(self.img_height, self.img_width)
        # Compute the box widths and heights for all aspect ratios.
        wh_list = []
        for ar in aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    # Compute one slightly larger version using the geometric mean of this scale value and the next.
                    box_height = box_width = np.sqrt(this_scale * next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_width = this_scale * size * np.sqrt(ar)
                box_height = this_scale * size / np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)
        n_boxes = len(wh_list)

        # Compute the grid of box center points. They are identical for all aspect ratios.

        # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
        if (this_steps is None):
            step_height = self.img_height / feature_map_size[0]
            step_width = self.img_width / feature_map_size[1]
        else:
            if isinstance(this_steps, (list, tuple)) and (len(this_steps) == 2):
                step_height = this_steps[0]
                step_width = this_steps[1]
            elif isinstance(this_steps, (int, float)):
                step_height = this_steps
                step_width = this_steps
        # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
        if (this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(this_offsets, (list, tuple)) and (len(this_offsets) == 2):
                offset_height = this_offsets[0]
                offset_width = this_offsets[1]
            elif isinstance(this_offsets, (int, float)):
                offset_height = this_offsets
                offset_width = this_offsets
        # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        cy = np.linspace(offset_height * step_height, (offset_height + feature_map_size[0] - 1) * step_height, feature_map_size[0])
        cx = np.linspace(offset_width * step_width, (offset_width + feature_map_size[1] - 1) * step_width, feature_map_size[1])
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`.
        boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes)) # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes)) # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`.
        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')

        # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries.
        if self.clip_boxes:
            x_coords = boxes_tensor[:,:,:,[0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:,:,:,[0, 2]] = x_coords
            y_coords = boxes_tensor[:,:,:,[1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:,:,:,[1, 3]] = y_coords

        # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1].
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
        elif self.coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')

        if diagnostics:
            return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width)
        else:
            return boxes_tensor
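
    # A worked size example (illustration only, with assumed values): for a
    # 300x300 input (`size = 300`), `this_scale = 0.2` and aspect ratio 2, the
    # loop above yields
    #
    #   box_width  = 0.2 * 300 * sqrt(2) ≈ 84.85
    #   box_height = 0.2 * 300 / sqrt(2) ≈ 42.43
    #
    # so the box area stays (0.2 * 300)^2 while the width/height ratio is 2.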

    def generate_encoding_template(self, batch_size, diagnostics=False):
        '''
        Produces an encoding template for the ground truth label tensor for a given batch.

        Note that all tensor creation, reshaping and concatenation operations performed in this function
        and the sub-functions it calls are identical to those performed inside the SSD model. This, of course,
        must be the case in order to preserve the spatial meaning of each box prediction, but it's useful to make
        yourself aware of this fact and why it is necessary.

        In other words, the boxes in `y_encoded` must have a specific order in order to correspond to the right spatial
        positions and scales of the boxes predicted by the model. The sequence of operations here ensures that `y_encoded`
        has this specific form.

        Arguments:
            batch_size (int): The batch size.
            diagnostics (bool, optional): See the documentation for `generate_anchor_boxes_for_layer()`. The diagnostic
                output here is similar, just for all predictor conv layers.

        Returns:
            A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, the template into which to encode
            the ground truth labels for training. The last axis has length `#classes + 12` because the model
            output contains not only the 4 predicted box coordinate offsets, but also the 4 coordinates for
            the anchor boxes and the 4 variance values.
        '''
        # 1: Tile the anchor boxes for each predictor layer across all batch items.
        boxes_batch = []
        for boxes in self.boxes_list:
            # Prepend one dimension to `self.boxes_list` to account for the batch size and tile it along.
            # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 4)`.
            boxes = np.expand_dims(boxes, axis=0)
            boxes = np.tile(boxes, (batch_size, 1, 1, 1, 1))

            # Now reshape the 5D tensor above into a 3D tensor of shape
            # `(batch, feature_map_height * feature_map_width * n_boxes, 4)`. The resulting
            # order of the tensor content will be identical to the order obtained from the reshaping operation
            # in our Keras model (we're using the TensorFlow backend, and tf.reshape() and np.reshape()
            # use the same default index order, which is C-like index ordering).
            boxes = np.reshape(boxes, (batch_size, -1, 4))
            boxes_batch.append(boxes)

        # 2: Concatenate the anchor tensors from the individual layers into one.
        boxes_tensor = np.concatenate(boxes_batch, axis=1)

        # 3: Create a template tensor to hold the one-hot class encodings of shape `(batch, #boxes, #classes)`.
        #    It will contain all zeros for now, the classes will be set in the matching process that follows.
        classes_tensor = np.zeros((batch_size, boxes_tensor.shape[1], self.n_classes))

        # 4: Create a tensor to contain the variances. This tensor has the same shape as `boxes_tensor` and simply
        #    contains the same 4 variance values for every position in the last axis.
        variances_tensor = np.zeros_like(boxes_tensor)
        variances_tensor += self.variances # Long live broadcasting

        # 5: Concatenate the classes, boxes and variances tensors to get our final template for y_encoded. We also need
        #    another tensor of the shape of `boxes_tensor` as a space filler so that `y_encoding_template` has the same
        #    shape as the SSD model output tensor. The content of this tensor is irrelevant, we'll just use
        #    `boxes_tensor` a second time.
        y_encoding_template = np.concatenate((classes_tensor, boxes_tensor, boxes_tensor, variances_tensor), axis=2)

        if diagnostics:
            return y_encoding_template, self.centers_diag, self.wh_list_diag, self.steps_diag, self.offsets_diag
        else:
            return y_encoding_template
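
    # For orientation (illustration only), the layout of the last axis of the
    # template, and hence of `y_encoded`, is:
    #
    #   [ one-hot classes (#classes) | 4 box coordinates | 4 anchor coordinates | 4 variances ]
    #
    # which is why slices like `[:,:,-12:-8]` (box coordinates) and
    # `[:,:,-8:-4]` (anchor coordinates) appear throughout `__call__`.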

class DegenerateBoxError(Exception):
    '''
    An exception class to be raised if degenerate boxes are detected.
    '''
    pass
BIN	ssd_keras-master/ssd_encoder_decoder/ssd_input_encoder.pyc	(new binary file, not shown)

530	ssd_keras-master/ssd_encoder_decoder/ssd_output_decoder.py	(new file)
@@ -0,0 +1,530 @@
'''
Includes:
* Functions to decode and filter raw SSD model output. These are only needed if the
  SSD model does not have a `DecodeDetections` layer.
* Functions to perform greedy non-maximum suppression

Copyright (C) 2018 Pierluigi Ferrari

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

from __future__ import division
import numpy as np

from bounding_box_utils.bounding_box_utils import iou, convert_coordinates

def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    Perform greedy non-maximum suppression on the input boxes.

    Greedy NMS works by selecting the box with the highest score and
    removing all boxes around it that are too close to it measured by IoU-similarity.
    Out of the boxes that are left over, once again the one with the highest
    score is selected and so on, until no boxes with too much overlap are left.

    Arguments:
        y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this
            is a list of length `n` where each list element is a 2D Numpy array.
            For a batch item with `k` predicted boxes this 2D Numpy array has
            shape `(k, 6)`, where each row contains the data of the respective
            box in the format `[class_id, score, xmin, ymin, xmax, ymax]`
            (for the default `coords='corners'`). Technically, the number of
            columns doesn't have to be 6, it can be arbitrary as long as the
            second element of each row is the score assigned to the prediction
            and the elements from the third onward are the box coordinates in
            the format given by `coords`. Note that this function is agnostic
            to the scale of the score or what it represents.
        iou_threshold (float, optional): All boxes with a Jaccard similarity of
            greater than `iou_threshold` with a locally maximal box will be removed
            from the set of predictions, where 'maximal' refers to the box score.
        coords (str, optional): The coordinate format of `y_pred_decoded`.
            Can be one of the formats supported by `iou()`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belongs
            to the boxes, but not the other.

    Returns:
        The predictions after removing non-maxima. The format is the same as the input format.
    '''
    y_pred_decoded_nms = []
    for batch_item in y_pred_decoded: # For the labels of each batch item...
        boxes_left = np.copy(batch_item)
        maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
        while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
            maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
            maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
            maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
            boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
            if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
            similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
            boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
        y_pred_decoded_nms.append(np.array(maxima))

    return y_pred_decoded_nms
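
# A minimal usage sketch (illustration only): two heavily overlapping
# predictions and one distant one; with the default `iou_threshold=0.45`,
# the lower-scoring overlapping box is suppressed.
#
#   >>> batch = [np.array([[1, 0.9,  10,  10,  60,  60],
#   ...                    [1, 0.8,  12,  12,  62,  62],
#   ...                    [2, 0.7, 200, 200, 250, 250]])]
#   >>> [item[:, :2] for item in greedy_nms(batch)]   # keep class_id and score columns
#   [array([[1. , 0.9],
#           [2. , 0.7]])]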

def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
    function for per-class NMS in `decode_detections()`. Expects rows in the format `[score, 4 box coordinates]`.
    '''
    boxes_left = np.copy(predictions)
    maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:,0]) # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
        maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:,1:], maximum_box[1:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
        boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)

def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
    function in `decode_detections_fast()`. Expects rows in the format `[class_id, score, 4 box coordinates]`.
    '''
    boxes_left = np.copy(predictions)
    maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
        maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
        boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)
|
||||
def decode_detections(y_pred,
|
||||
confidence_thresh=0.01,
|
||||
iou_threshold=0.45,
|
||||
top_k=200,
|
||||
input_coords='centroids',
|
||||
normalize_coords=True,
|
||||
img_height=None,
|
||||
img_width=None,
|
||||
border_pixels='half'):
|
||||
'''
|
||||
Convert model prediction output back to a format that contains only the positive box predictions
|
||||
(i.e. the same format that `SSDInputEncoder` takes as input).
|
||||
|
||||
After the decoding, two stages of prediction filtering are performed for each class individually:
|
||||
First confidence thresholding, then greedy non-maximum suppression. The filtering results for all
|
||||
classes are concatenated and the `top_k` overall highest confidence results constitute the final
|
||||
predictions for a given batch item. This procedure follows the original Caffe implementation.
|
||||
For a slightly different and more efficient alternative to decode raw model output that performs
|
||||
non-maximum suppresion globally instead of per class, see `decode_detections_fast()` below.
|
||||
|
||||
    Arguments:
        y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
            of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
            boxes predicted by the model per image and the last axis contains
            `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class required for a box to be considered for the non-maximum suppression stage for the
            respective class. A lower value will result in a larger part of the selection process being done by
            the non-maximum suppression stage, while a larger value will result in a larger part of the selection
            process happening in the confidence thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
            for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
            and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
            relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
            Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.
        img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belongs
            to the boxes, but not the other.

    Returns:
        A Python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
        a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`

    if input_coords == 'centroids':
        y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        y_pred_decoded_raw[:,:,[-2,-1]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        y_pred_decoded_raw[:,:,[-4,-3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        y_pred_decoded_raw[:,:,[-4,-3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates

    # 3: Apply confidence thresholding and non-maximum suppression per class

    n_classes = y_pred_decoded_raw.shape[-1] - 4 # The number of classes is the length of the last axis minus the four box coordinates

    y_pred_decoded = [] # Store the final predictions in this list
    for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
        pred = [] # Store the final predictions for this batch item here
        for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:,[class_id, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
            threshold_met = single_class[single_class[:,0] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
            if threshold_met.shape[0] > 0: # If any boxes made the threshold...
                maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 6]`.
                maxima_output[:,0] = class_id # Write the class ID to the first column...
                maxima_output[:,1:] = maxima # ...and write the maxima to the other columns...
                pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores.
        if pred: # If there are any predictions left after confidence-thresholding...
            pred = np.concatenate(pred, axis=0)
            if top_k != 'all' and pred.shape[0] > top_k: # If there are more than `top_k` results left at this point (otherwise there is nothing to filter)...
                top_k_indices = np.argpartition(pred[:,1], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
                pred = pred[top_k_indices] # ...and keep only those entries of `pred`.
        else:
            pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
        y_pred_decoded.append(pred) # Now that we're done, append the array of final predictions for this batch item to the output list.

    return y_pred_decoded
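
# A minimal usage sketch for `decode_detections()` (illustrative only: `model`, the batch
# `batch_images`, and the 300x300 image size are hypothetical and not defined in this module):
#
#   y_pred = model.predict(batch_images) # Shape `(batch_size, #boxes, #classes + 12)`.
#   y_pred_decoded = decode_detections(y_pred,
#                                      confidence_thresh=0.5,
#                                      iou_threshold=0.45,
#                                      top_k=200,
#                                      input_coords='centroids',
#                                      normalize_coords=True,
#                                      img_height=300,
#                                      img_width=300)
#   # `y_pred_decoded[i]` is an array of shape `(n_kept, 6)` holding
#   # `[class_id, confidence, xmin, ymin, xmax, ymax]` rows for batch item `i`.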


def decode_detections_fast(y_pred,
                           confidence_thresh=0.5,
                           iou_threshold=0.45,
                           top_k='all',
                           input_coords='centroids',
                           normalize_coords=True,
                           img_height=None,
                           img_width=None,
                           border_pixels='half'):
    '''
    Convert model prediction output back to a format that contains only the positive box predictions
    (i.e. the same format that `encode_y()` takes as input).

    Optionally performs confidence thresholding and greedy non-maximum suppression after the decoding stage.

    Note that the decoding procedure used here is not the same as the procedure used in the original Caffe implementation.
    For each box, the procedure used here assigns the box's highest confidence as its predicted class. Then it removes
    all boxes for which the highest confidence is the background class. This results in less work for the subsequent
    non-maximum suppression, since the vast majority of the predictions are filtered out merely by virtue of their
    highest confidence being for the background class. It is much more efficient than the procedure of the original
    implementation, but the results may also differ.

    Arguments:
        y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
            of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
            boxes predicted by the model per image and the last axis contains
            `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in any positive
            class required for a given box to be considered a positive prediction. A lower value will result
            in better recall, while a higher value will result in better precision. Do not use this parameter with the
            goal of combating the inevitably many duplicates that an SSD will produce; the subsequent non-maximum
            suppression stage will take care of those.
        iou_threshold (float, optional): `None` or a float in [0,1]. If `None`, no non-maximum suppression will be
            performed. If not `None`, greedy NMS will be performed after the confidence thresholding stage, meaning
            all boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
            from the set of predictions, where 'maximal' refers to the box score.
        top_k (int, optional): 'all' or an integer with the number of highest scoring predictions to be kept for each
            batch item after the non-maximum suppression stage. If 'all', all predictions left after the NMS stage
            will be kept.
        input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
            for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
            and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
            relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
            Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.
        img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belongs
            to the boxes, but not the other.

    Returns:
        A Python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
        a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))

    # 1: Convert the classes from one-hot encoding to their class ID
    y_pred_converted = np.copy(y_pred[:,:,-14:-8]) # Slice out the four offset predictions plus two elements into which we'll write the class IDs and confidences in the next step
    y_pred_converted[:,:,0] = np.argmax(y_pred[:,:,:-12], axis=-1) # The indices of the highest confidence values in the one-hot class vectors are the class IDs
    y_pred_converted[:,:,1] = np.amax(y_pred[:,:,:-12], axis=-1) # Store the confidence values themselves, too

    # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
    if input_coords == 'centroids':
        y_pred_converted[:,:,[4,5]] = np.exp(y_pred_converted[:,:,[4,5]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        y_pred_converted[:,:,[4,5]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        y_pred_converted[:,:,[2,3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        y_pred_converted[:,:,[2,3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:,:,[2,3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:,:,[4,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
        y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:,:,[2,4]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:,:,[3,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")

    # 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
    if normalize_coords:
        y_pred_converted[:,:,[2,4]] *= img_width # Convert xmin, xmax back to absolute coordinates
        y_pred_converted[:,:,[3,5]] *= img_height # Convert ymin, ymax back to absolute coordinates

    # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
    y_pred_decoded = []
    for batch_item in y_pred_converted: # For each image in the batch...
        boxes = batch_item[np.nonzero(batch_item[:,0])] # ...get all boxes that don't belong to the background class,...
        boxes = boxes[boxes[:,1] >= confidence_thresh] # ...then filter out those positive boxes for which the prediction confidence is too low, and after that...
        if iou_threshold: # ...if an IoU threshold is set...
            boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on the remaining boxes.
        if top_k != 'all' and boxes.shape[0] > top_k: # If we have more than `top_k` results left at this point...
            top_k_indices = np.argpartition(boxes[:,1], kth=boxes.shape[0]-top_k, axis=0)[boxes.shape[0]-top_k:] # ...get the indices of the `top_k` highest-scoring boxes...
            boxes = boxes[top_k_indices] # ...and keep only those boxes.
        y_pred_decoded.append(boxes) # Now that we're done, append the array of final predictions for this batch item to the output list.

    return y_pred_decoded
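
# A hedged comparison sketch: the call below mirrors the `decode_detections()` example above
# (same hypothetical `y_pred` and image size). Because each box commits to its single
# highest-confidence class before NMS, NMS here runs once globally rather than once per class,
# so the two decoders can return slightly different results.
#
#   y_pred_decoded = decode_detections_fast(y_pred,
#                                           confidence_thresh=0.5,
#                                           iou_threshold=0.45,
#                                           top_k='all',
#                                           input_coords='centroids',
#                                           normalize_coords=True,
#                                           img_height=300,
#                                           img_width=300)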


################################################################################################
# Debugging tools, not relevant for normal use
################################################################################################

# The functions below are for debugging, so you won't normally need them. That is,
# unless you need to debug your model, of course.

def decode_detections_debug(y_pred,
                            confidence_thresh=0.01,
                            iou_threshold=0.45,
                            top_k=200,
                            input_coords='centroids',
                            normalize_coords=True,
                            img_height=None,
                            img_width=None,
                            variance_encoded_in_target=False,
                            border_pixels='half'):
    '''
    This decoder performs the same processing as `decode_detections()`, but the output format for each left-over
    predicted box is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.

    That is, in addition to the usual data, each predicted box has the internal index of that box within
    the model (`box_id`) prepended to it. This allows you to know exactly which part of the model made a given
    box prediction; in particular, it allows you to know which predictor layer made a given prediction.
    This can be useful for debugging.

    Arguments:
        y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
            of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
            boxes predicted by the model per image and the last axis contains
            `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class required for a box to be considered for the non-maximum suppression stage for the
            respective class. A lower value will result in a larger part of the selection process being done by
            the non-maximum suppression stage, while a larger value will result in a larger part of the selection
            process happening in the confidence thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
            for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
            and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
            relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
            Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.
        img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
        variance_encoded_in_target (bool, optional): Set to `True` if the variances are already encoded in the
            target offsets, in which case they will not be applied again during the decoding.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belongs
            to the boxes, but not the other.

    Returns:
        A Python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 7)` where each row is a box prediction for
        a non-background class for the respective image in the format `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`

    if input_coords == 'centroids':
        if variance_encoded_in_target:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] + y_pred[:,:,[-8,-7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
        else:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] * y_pred[:,:,[-4,-3]] + y_pred[:,:,[-8,-7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates

    # 3: For each batch item, prepend each box's internal index to its coordinates.

    y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1], y_pred_decoded_raw.shape[2] + 1)) # Expand the last axis by one.
    y_pred_decoded_raw2[:,:,1:] = y_pred_decoded_raw
    y_pred_decoded_raw2[:,:,0] = np.arange(y_pred_decoded_raw.shape[1]) # Put the box indices as the first element for each box via broadcasting.
    y_pred_decoded_raw = y_pred_decoded_raw2

    # 4: Apply confidence thresholding and non-maximum suppression per class

    n_classes = y_pred_decoded_raw.shape[-1] - 5 # The number of classes is the length of the last axis minus the four box coordinates and minus the index

    y_pred_decoded = [] # Store the final predictions in this list
    for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, 1 box index + n_classes + 4 coords]`
        pred = [] # Store the final predictions for this batch item here
        for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:,[0, class_id + 1, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 6]` and...
            threshold_met = single_class[single_class[:,1] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
            if threshold_met.shape[0] > 0: # If any boxes made the threshold...
                maxima = _greedy_nms_debug(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`.
                maxima_output[:,0] = maxima[:,0] # Write the box index to the first column...
                maxima_output[:,1] = class_id # ...and write the class ID to the second column...
                maxima_output[:,2:] = maxima[:,1:] # ...and write the rest of the maxima data to the other columns...
                pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores.
        if pred: # If there are any predictions left after confidence-thresholding... (guard added, since `np.concatenate` would fail on an empty list)
            pred = np.concatenate(pred, axis=0)
            if pred.shape[0] > top_k: # If there are more than `top_k` results left at this point (otherwise there is nothing to filter)...
                top_k_indices = np.argpartition(pred[:,2], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
                pred = pred[top_k_indices] # ...and keep only those entries of `pred`.
        else:
            pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
        y_pred_decoded.append(pred) # Now that we're done, append the array of final predictions for this batch item to the output list.

    return y_pred_decoded
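
# A hedged usage sketch (hypothetical `y_pred`, as in the examples above):
#
#   y_pred_decoded = decode_detections_debug(y_pred, normalize_coords=True,
#                                            img_height=300, img_width=300)
#   # Each row of `y_pred_decoded[i]` is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`;
#   # `box_id` indexes into the model's flattened list of predicted boxes and can be mapped to a
#   # predictor layer with `get_pred_layers()` below (see the sketch after that function).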


def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
    function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of all
    left-over boxes for each batch item, which allows you to know which predictor layer predicted a given output
    box and is thus useful for debugging.
    '''
    boxes_left = np.copy(predictions)
    maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
        maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left-over boxes to the maximum box...
        boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)


def get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1):
    '''
    Returns a list of the number of boxes that each predictor layer predicts.

    `aspect_ratios` must be a nested list, containing a list of aspect ratios
    for each predictor layer.
    '''
    num_boxes_per_pred_layer = []
    for i in range(len(predictor_sizes)):
        if two_boxes_for_ar1:
            num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * (len(aspect_ratios[i]) + 1))
        else:
            num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * len(aspect_ratios[i]))
    return num_boxes_per_pred_layer
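
# Worked sketch (hedged; the predictor sizes and aspect ratios below are the usual SSD300
# configuration, assumed here purely for illustration):
#
#   predictor_sizes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
#   aspect_ratios = [[1.0, 2.0, 0.5],
#                    [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
#                    [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
#                    [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
#                    [1.0, 2.0, 0.5],
#                    [1.0, 2.0, 0.5]]
#   get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1=True)
#   # -> [5776, 2166, 600, 150, 36, 4], i.e. 8732 boxes in total.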


def get_pred_layers(y_pred_decoded, num_boxes_per_pred_layer):
    '''
    For a given prediction tensor decoded with `decode_detections_debug()`, returns a list
    with the indices of the predictor layers that made each prediction.

    That is, this function lets you know which predictor layer is responsible
    for a given prediction.

    Arguments:
        y_pred_decoded (array): The decoded model output tensor. Must have been
            decoded with `decode_detections_debug()` so that it contains the internal box index
            for each predicted box.
        num_boxes_per_pred_layer (list): A list that contains the total number
            of boxes that each predictor layer predicts.
    '''
    pred_layers_all = []
    cum_boxes_per_pred_layer = np.cumsum(num_boxes_per_pred_layer)
    for batch_item in y_pred_decoded:
        pred_layers = []
        for prediction in batch_item:
            if (prediction[0] < 0) or (prediction[0] >= cum_boxes_per_pred_layer[-1]):
                raise ValueError("Box index is out of bounds of the possible indices as given by the values in `num_boxes_per_pred_layer`.")
            for i in range(len(cum_boxes_per_pred_layer)):
                if prediction[0] < cum_boxes_per_pred_layer[i]:
                    pred_layers.append(i)
                    break
        pred_layers_all.append(pred_layers)
    return pred_layers_all
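
# Continuing the `decode_detections_debug()` sketch above (hedged; `num_boxes` would come from
# `get_num_boxes_per_pred_layer()` with your model's hypothetical predictor configuration):
#
#   pred_layers = get_pred_layers(y_pred_decoded, num_boxes)
#   # `pred_layers[i][j]` is the index of the predictor layer that produced the j-th
#   # detection for the i-th batch item.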