This commit is contained in:
dl-desktop
2020-02-06 16:47:03 -03:00
parent 6328265287
commit b586f22bf0
318 changed files with 25111 additions and 664 deletions

@@ -0,0 +1,278 @@
'''
A custom Keras layer to generate anchor boxes.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
import keras.backend as K
from keras.engine.topology import InputSpec
from keras.engine.topology import Layer
from bounding_box_utils.bounding_box_utils import convert_coordinates
class AnchorBoxes(Layer):
'''
A Keras layer to create an output tensor containing anchor box coordinates
and variances based on the input tensor and the passed arguments.
A set of 2D anchor boxes of different aspect ratios is created for each spatial unit of
the input tensor. The number of anchor boxes created per unit depends on the arguments
`aspect_ratios` and `two_boxes_for_ar1`; in the default case it is 4. The boxes
are parameterized by four coordinates whose format is determined by the `coords` argument.
The logic implemented by this layer is identical to the logic in the module
`ssd_box_encode_decode_utils.py`.
The purpose of having this layer in the network is to make the model self-sufficient
at inference time. Since the model is predicting offsets to the anchor boxes
(rather than predicting absolute box coordinates directly), one needs to know the anchor
box coordinates in order to construct the final prediction boxes from the predicted offsets.
If the model's output tensor did not contain the anchor box coordinates, the necessary
information to convert the predicted offsets back to absolute coordinates would be missing
in the model output. The reason why it is necessary to predict offsets to the anchor boxes
rather than to predict absolute box coordinates directly is explained in `README.md`.
Input shape:
4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
or `(batch, height, width, channels)` if `dim_ordering = 'tf'`.
Output shape:
5D tensor of shape `(batch, height, width, n_boxes, 8)`. The last axis contains
the four anchor box coordinates and the four variance values for each box.
'''
def __init__(self,
img_height,
img_width,
this_scale,
next_scale,
aspect_ratios=[0.5, 1.0, 2.0],
two_boxes_for_ar1=True,
this_steps=None,
this_offsets=None,
clip_boxes=False,
variances=[0.1, 0.1, 0.2, 0.2],
coords='centroids',
normalize_coords=False,
**kwargs):
'''
All arguments need to be set to the same values as in the box encoding process, otherwise the behavior is undefined.
Some of these arguments are explained in more detail in the documentation of the `SSDBoxEncoder` class.
Arguments:
img_height (int): The height of the input images.
img_width (int): The width of the input images.
this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes
as a fraction of the shorter side of the input image.
next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if
`self.two_boxes_for_ar1 == True`.
aspect_ratios (list, optional): The list of aspect ratios for which default boxes are to be
generated for this layer.
two_boxes_for_ar1 (bool, optional): Only relevant if `aspect_ratios` contains 1.
If `True`, two default boxes will be generated for aspect ratio 1. The first will be generated
using the scaling factor for the respective layer, the second one will be generated using the
geometric mean of that scaling factor and the next larger scaling factor.
clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
its respective variance value.
coords (str, optional): The box coordinate format to be used internally in the model (i.e. this is not the input format
of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
'corners' for the format `(xmin, ymin, xmax, ymax)`, or 'minmax' for the format `(xmin, xmax, ymin, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model uses relative instead of absolute coordinates,
i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
'''
if K.backend() != 'tensorflow':
raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend()))
if (this_scale < 0) or (next_scale < 0) or (this_scale > 1):
raise ValueError("`this_scale` must be in [0, 1] and `next_scale` must be >0, but `this_scale` == {}, `next_scale` == {}".format(this_scale, next_scale))
if len(variances) != 4:
raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
variances = np.array(variances)
if np.any(variances <= 0):
raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
self.img_height = img_height
self.img_width = img_width
self.this_scale = this_scale
self.next_scale = next_scale
self.aspect_ratios = aspect_ratios
self.two_boxes_for_ar1 = two_boxes_for_ar1
self.this_steps = this_steps
self.this_offsets = this_offsets
self.clip_boxes = clip_boxes
self.variances = variances
self.coords = coords
self.normalize_coords = normalize_coords
# Compute the number of boxes per cell
if (1 in aspect_ratios) and two_boxes_for_ar1:
self.n_boxes = len(aspect_ratios) + 1
else:
self.n_boxes = len(aspect_ratios)
super(AnchorBoxes, self).__init__(**kwargs)
def build(self, input_shape):
self.input_spec = [InputSpec(shape=input_shape)]
super(AnchorBoxes, self).build(input_shape)
def call(self, x, mask=None):
'''
Return an anchor box tensor based on the shape of the input tensor.
The logic implemented here is identical to the logic in the module `ssd_box_encode_decode_utils.py`.
Note that this tensor does not participate in any graph computations at runtime. It is created
as a constant once during graph creation and is simply output along with the rest of the model
output at runtime. Because of this, all logic is implemented as Numpy array operations and it is sufficient
to convert the resulting Numpy array into a Keras tensor at the very end before outputting it.
Arguments:
x (tensor): 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. The input for this
layer must be the output of the localization predictor layer.
'''
# Compute box width and height for each aspect ratio
# The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
size = min(self.img_height, self.img_width)
# Compute the box widths and heights for all aspect ratios
wh_list = []
for ar in self.aspect_ratios:
if (ar == 1):
# Compute the regular anchor box for aspect ratio 1.
box_height = box_width = self.this_scale * size
wh_list.append((box_width, box_height))
if self.two_boxes_for_ar1:
# Compute one slightly larger version using the geometric mean of this scale value and the next.
box_height = box_width = np.sqrt(self.this_scale * self.next_scale) * size
wh_list.append((box_width, box_height))
else:
box_height = self.this_scale * size / np.sqrt(ar)
box_width = self.this_scale * size * np.sqrt(ar)
wh_list.append((box_width, box_height))
wh_list = np.array(wh_list)
# We need the shape of the input tensor
if K.image_dim_ordering() == 'tf':
batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape
else: # Not yet relevant since TensorFlow is the only supported backend right now, but it does no harm to keep this here for the future
batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape
# Compute the grid of box center points. They are identical for all aspect ratios.
# Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
if (self.this_steps is None):
step_height = self.img_height / feature_map_height
step_width = self.img_width / feature_map_width
else:
if isinstance(self.this_steps, (list, tuple)) and (len(self.this_steps) == 2):
step_height = self.this_steps[0]
step_width = self.this_steps[1]
elif isinstance(self.this_steps, (int, float)):
step_height = self.this_steps
step_width = self.this_steps
# Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
if (self.this_offsets is None):
offset_height = 0.5
offset_width = 0.5
else:
if isinstance(self.this_offsets, (list, tuple)) and (len(self.this_offsets) == 2):
offset_height = self.this_offsets[0]
offset_width = self.this_offsets[1]
elif isinstance(self.this_offsets, (int, float)):
offset_height = self.this_offsets
offset_width = self.this_offsets
# Now that we have the offsets and step sizes, compute the grid of anchor box center points.
cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height)
cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width)
cx_grid, cy_grid = np.meshgrid(cx, cy)
cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down
# Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
# where the last dimension will contain `(cx, cy, w, h)`
boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4))
boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes)) # Set cx
boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes)) # Set cy
boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h
# Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')
# If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
if self.clip_boxes:
x_coords = boxes_tensor[:,:,:,[0, 2]]
x_coords[x_coords >= self.img_width] = self.img_width - 1
x_coords[x_coords < 0] = 0
boxes_tensor[:,:,:,[0, 2]] = x_coords
y_coords = boxes_tensor[:,:,:,[1, 3]]
y_coords[y_coords >= self.img_height] = self.img_height - 1
y_coords[y_coords < 0] = 0
boxes_tensor[:,:,:,[1, 3]] = y_coords
# If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
if self.normalize_coords:
boxes_tensor[:, :, :, [0, 2]] /= self.img_width
boxes_tensor[:, :, :, [1, 3]] /= self.img_height
# TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
if self.coords == 'centroids':
# Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
elif self.coords == 'minmax':
# Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')
# Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape
# as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
variances_tensor = np.zeros_like(boxes_tensor) # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`
variances_tensor += self.variances # Long live broadcasting
# Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1)
# Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along the batch dimension.
# The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'), (K.shape(x)[0], 1, 1, 1, 1))
return boxes_tensor
def compute_output_shape(self, input_shape):
if K.image_dim_ordering() == 'tf':
batch_size, feature_map_height, feature_map_width, feature_map_channels = input_shape
else: # Not yet relevant since TensorFlow is the only supported backend right now, but it does no harm to keep this here for the future
batch_size, feature_map_channels, feature_map_height, feature_map_width = input_shape
return (batch_size, feature_map_height, feature_map_width, self.n_boxes, 8)
def get_config(self):
config = {
'img_height': self.img_height,
'img_width': self.img_width,
'this_scale': self.this_scale,
'next_scale': self.next_scale,
'aspect_ratios': list(self.aspect_ratios),
'two_boxes_for_ar1': self.two_boxes_for_ar1,
'clip_boxes': self.clip_boxes,
'variances': list(self.variances),
'coords': self.coords,
'normalize_coords': self.normalize_coords
}
base_config = super(AnchorBoxes, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
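
A minimal usage sketch for this layer (illustrative only, not part of this commit): the input size, scales, and the Conv2D stand-in for a localization predictor below are assumptions chosen to mirror an SSD300-style setup.

# Sketch under assumed values: image size, scales, and the predictor stand-in
# are not taken from this commit.
from keras.layers import Input, Conv2D

img_height, img_width = 300, 300
x = Input(shape=(img_height, img_width, 3))
loc = Conv2D(4 * 4, (3, 3), padding='same')(x)  # stand-in predictor: 4 boxes * 4 offsets per cell
anchors = AnchorBoxes(img_height, img_width,
                      this_scale=0.2, next_scale=0.37,
                      aspect_ratios=[0.5, 1.0, 2.0],
                      two_boxes_for_ar1=True)(loc)
# With aspect ratios {0.5, 1.0, 2.0} and `two_boxes_for_ar1=True`, n_boxes == 4,
# so `anchors` has shape (batch, 300, 300, 4, 8): 4 coordinates + 4 variances per box.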

@@ -0,0 +1,283 @@
'''
A custom Keras layer to decode the raw SSD prediction output. Corresponds to the
`DetectionOutput` layer type in the original Caffe implementation of SSD.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
import tensorflow as tf
import keras.backend as K
from keras.engine.topology import InputSpec
from keras.engine.topology import Layer
class DecodeDetections(Layer):
'''
A Keras layer to decode the raw SSD prediction output.
Input shape:
3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`.
Output shape:
3D tensor of shape `(batch_size, top_k, 6)`.
'''
def __init__(self,
confidence_thresh=0.01,
iou_threshold=0.45,
top_k=200,
nms_max_output_size=400,
coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
**kwargs):
'''
All default argument values follow the Caffe implementation.
Arguments:
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
positive class in order to be considered for the non-maximum suppression stage for the respective class.
A lower value will result in a larger part of the selection process being done by the non-maximum suppression
stage, while a larger value will result in a larger part of the selection process happening in the confidence
thresholding stage.
iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
to the box score.
top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
non-maximum suppression stage.
nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum
suppression.
coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids'
i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are
currently not supported.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
'''
if K.backend() != 'tensorflow':
raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend()))
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
if coords != 'centroids':
raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.")
# We need these members for the config.
self.confidence_thresh = confidence_thresh
self.iou_threshold = iou_threshold
self.top_k = top_k
self.normalize_coords = normalize_coords
self.img_height = img_height
self.img_width = img_width
self.coords = coords
self.nms_max_output_size = nms_max_output_size
# We need these members for TensorFlow.
self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh')
self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold')
self.tf_top_k = tf.constant(self.top_k, name='top_k')
self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords')
self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height')
self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width')
self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size')
super(DecodeDetections, self).__init__(**kwargs)
def build(self, input_shape):
self.input_spec = [InputSpec(shape=input_shape)]
super(DecodeDetections, self).build(input_shape)
def call(self, y_pred, mask=None):
'''
Returns:
3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded
to always yield `top_k` predictions per batch item. The last axis contains
the class ID, confidence, and coordinates for each predicted box in the format
`[class_id, confidence, xmin, ymin, xmax, ymax]`.
'''
#####################################################################################
# 1. Convert the box coordinates from predicted anchor box offsets to predicted
# absolute coordinates
#####################################################################################
# Convert anchor box offsets to image offsets.
cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor
cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor
w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor
h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor
# Convert 'centroids' to 'corners'.
xmin = cx - 0.5 * w
ymin = cy - 0.5 * h
xmax = cx + 0.5 * w
ymax = cy + 0.5 * h
# If the model predicts box coordinates relative to the image dimensions and they are supposed
# to be converted back to absolute coordinates, do that.
def normalized_coords():
xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1)
ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1)
xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1)
ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1)
return xmin1, ymin1, xmax1, ymax1
def non_normalized_coords():
return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1)
xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords)
# Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor.
y_pred = tf.concat(values=[y_pred[...,:-12], xmin, ymin, xmax, ymax], axis=-1)
#####################################################################################
# 2. Perform confidence thresholding, per-class non-maximum suppression, and
# top-k filtering.
#####################################################################################
batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32
n_boxes = tf.shape(y_pred)[1]
n_classes = y_pred.shape[2] - 4
class_indices = tf.range(1, n_classes)
# Create a function that filters the predictions for the given batch item. Specifically, it performs:
# - confidence thresholding
# - non-maximum suppression (NMS)
# - top-k filtering
def filter_predictions(batch_item):
# Create a function that filters the predictions for one single class.
def filter_single_class(index):
# From a tensor of shape (n_boxes, n_classes + 4 coordinates) extract
# a tensor of shape (n_boxes, 1 + 4 coordinates) that contains the
# confidence values for just one class, determined by `index`.
confidences = tf.expand_dims(batch_item[..., index], axis=-1)
class_id = tf.fill(dims=tf.shape(confidences), value=tf.to_float(index))
box_coordinates = batch_item[...,-4:]
single_class = tf.concat([class_id, confidences, box_coordinates], axis=-1)
# Apply confidence thresholding with respect to the class defined by `index`.
threshold_met = single_class[:,1] > self.tf_confidence_thresh
single_class = tf.boolean_mask(tensor=single_class,
mask=threshold_met)
# If any boxes made the threshold, perform NMS.
def perform_nms():
scores = single_class[...,1]
# `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`.
xmin = tf.expand_dims(single_class[...,-4], axis=-1)
ymin = tf.expand_dims(single_class[...,-3], axis=-1)
xmax = tf.expand_dims(single_class[...,-2], axis=-1)
ymax = tf.expand_dims(single_class[...,-1], axis=-1)
boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1)
maxima_indices = tf.image.non_max_suppression(boxes=boxes,
scores=scores,
max_output_size=self.tf_nms_max_output_size,
iou_threshold=self.iou_threshold,
name='non_maximum_suppression')
maxima = tf.gather(params=single_class,
indices=maxima_indices,
axis=0)
return maxima
def no_confident_predictions():
return tf.constant(value=0.0, shape=(1,6))
single_class_nms = tf.cond(tf.equal(tf.size(single_class), 0), no_confident_predictions, perform_nms)
# Make sure `single_class_nms` is exactly `self.nms_max_output_size` elements long.
padded_single_class = tf.pad(tensor=single_class_nms,
paddings=[[0, self.tf_nms_max_output_size - tf.shape(single_class_nms)[0]], [0, 0]],
mode='CONSTANT',
constant_values=0.0)
return padded_single_class
# Iterate `filter_single_class()` over all class indices.
filtered_single_classes = tf.map_fn(fn=lambda i: filter_single_class(i),
elems=tf.range(1,n_classes),
dtype=tf.float32,
parallel_iterations=128,
back_prop=False,
swap_memory=False,
infer_shape=True,
name='loop_over_classes')
# Concatenate the filtered results for all individual classes to one tensor.
filtered_predictions = tf.reshape(tensor=filtered_single_classes, shape=(-1,6))
# Perform top-k filtering for this batch item or pad it in case there are
# fewer than `self.top_k` boxes left at this point. Either way, produce a
# tensor of length `self.top_k`. By the time we return the final results tensor
# for the whole batch, all batch items must have the same number of predicted
boxes so that the tensor dimensions are homogeneous. If fewer than `self.top_k`
# predictions are left after the filtering process above, we pad the missing
# predictions with zeros as dummy entries.
def top_k():
return tf.gather(params=filtered_predictions,
indices=tf.nn.top_k(filtered_predictions[:, 1], k=self.tf_top_k, sorted=True).indices,
axis=0)
def pad_and_top_k():
padded_predictions = tf.pad(tensor=filtered_predictions,
paddings=[[0, self.tf_top_k - tf.shape(filtered_predictions)[0]], [0, 0]],
mode='CONSTANT',
constant_values=0.0)
return tf.gather(params=padded_predictions,
indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices,
axis=0)
top_k_boxes = tf.cond(tf.greater_equal(tf.shape(filtered_predictions)[0], self.tf_top_k), top_k, pad_and_top_k)
return top_k_boxes
# Iterate `filter_predictions()` over all batch items.
output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x),
elems=y_pred,
dtype=None,
parallel_iterations=128,
back_prop=False,
swap_memory=False,
infer_shape=True,
name='loop_over_batch')
return output_tensor
def compute_output_shape(self, input_shape):
batch_size, n_boxes, last_axis = input_shape
return (batch_size, self.top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates)
def get_config(self):
config = {
'confidence_thresh': self.confidence_thresh,
'iou_threshold': self.iou_threshold,
'top_k': self.top_k,
'nms_max_output_size': self.nms_max_output_size,
'coords': self.coords,
'normalize_coords': self.normalize_coords,
'img_height': self.img_height,
'img_width': self.img_width,
}
base_config = super(DecodeDetections, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
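
A hedged sketch of how this decoder is typically attached (not part of this commit; the box count and class count below are assumed SSD300/Pascal VOC values):

# Sketch under assumed sizes: 8732 boxes and 21 classes are illustrative.
from keras.layers import Input
from keras.models import Model

n_boxes, n_classes = 8732, 21
y_pred_raw = Input(shape=(n_boxes, n_classes + 12))
decoded = DecodeDetections(confidence_thresh=0.5,
                           iou_threshold=0.45,
                           top_k=200,
                           normalize_coords=True,
                           img_height=300, img_width=300)(y_pred_raw)
decoder = Model(inputs=y_pred_raw, outputs=decoded)
# `decoder.predict(...)` returns a (batch, 200, 6) array of
# [class_id, confidence, xmin, ymin, xmax, ymax], zero-padded up to `top_k`.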

@@ -0,0 +1,266 @@
'''
A custom Keras layer to decode the raw SSD prediction output. This is a modified
and more efficient version of the `DetectionOutput` layer type in the original Caffe
implementation of SSD. For a faithful replication of the original layer, please
refer to the `DecodeDetections` layer.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
import tensorflow as tf
import keras.backend as K
from keras.engine.topology import InputSpec
from keras.engine.topology import Layer
class DecodeDetectionsFast(Layer):
'''
A Keras layer to decode the raw SSD prediction output.
Input shape:
3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`.
Output shape:
3D tensor of shape `(batch_size, top_k, 6)`.
'''
def __init__(self,
confidence_thresh=0.01,
iou_threshold=0.45,
top_k=200,
nms_max_output_size=400,
coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
**kwargs):
'''
All default argument values follow the Caffe implementation.
Arguments:
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
positive class in order to be considered for the non-maximum suppression stage for the respective class.
A lower value will result in a larger part of the selection process being done by the non-maximum suppression
stage, while a larger value will result in a larger part of the selection process happening in the confidence
thresholding stage.
iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
to the box score.
top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
non-maximum suppression stage.
nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum
suppression.
coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids'
i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are
currently not supported.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
'''
if K.backend() != 'tensorflow':
raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend()))
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
if coords != 'centroids':
raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.")
# We need these members for the config.
self.confidence_thresh = confidence_thresh
self.iou_threshold = iou_threshold
self.top_k = top_k
self.normalize_coords = normalize_coords
self.img_height = img_height
self.img_width = img_width
self.coords = coords
self.nms_max_output_size = nms_max_output_size
# We need these members for TensorFlow.
self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh')
self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold')
self.tf_top_k = tf.constant(self.top_k, name='top_k')
self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords')
self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height')
self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width')
self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size')
super(DecodeDetectionsFast, self).__init__(**kwargs)
def build(self, input_shape):
self.input_spec = [InputSpec(shape=input_shape)]
super(DecodeDetectionsFast, self).build(input_shape)
def call(self, y_pred, mask=None):
'''
Returns:
3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded
to always yield `top_k` predictions per batch item. The last axis contains
the class ID, confidence, and coordinates for each predicted box in the format
`[class_id, confidence, xmin, ymin, xmax, ymax]`.
'''
#####################################################################################
# 1. Convert the box coordinates from predicted anchor box offsets to predicted
# absolute coordinates
#####################################################################################
# Extract the predicted class IDs as the indices of the highest confidence values.
class_ids = tf.expand_dims(tf.to_float(tf.argmax(y_pred[...,:-12], axis=-1)), axis=-1)
# Extract the confidences of the maximal classes.
confidences = tf.reduce_max(y_pred[...,:-12], axis=-1, keep_dims=True)
# Convert anchor box offsets to image offsets.
cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor
cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor
w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor
h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor
# Convert 'centroids' to 'corners'.
xmin = cx - 0.5 * w
ymin = cy - 0.5 * h
xmax = cx + 0.5 * w
ymax = cy + 0.5 * h
# If the model predicts box coordinates relative to the image dimensions and they are supposed
# to be converted back to absolute coordinates, do that.
def normalized_coords():
xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1)
ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1)
xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1)
ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1)
return xmin1, ymin1, xmax1, ymax1
def non_normalized_coords():
return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1)
xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords)
# Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor.
y_pred = tf.concat(values=[class_ids, confidences, xmin, ymin, xmax, ymax], axis=-1)
#####################################################################################
# 2. Perform confidence thresholding, non-maximum suppression, and top-k filtering.
#####################################################################################
batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32
n_boxes = tf.shape(y_pred)[1]
n_classes = y_pred.shape[2] - 4
class_indices = tf.range(1, n_classes)
# Create a function that filters the predictions for the given batch item. Specifically, it performs:
# - confidence thresholding
# - non-maximum suppression (NMS)
# - top-k filtering
def filter_predictions(batch_item):
# Keep only the non-background boxes.
positive_boxes = tf.not_equal(batch_item[...,0], 0.0)
predictions = tf.boolean_mask(tensor=batch_item,
mask=positive_boxes)
def perform_confidence_thresholding():
# Apply confidence thresholding.
threshold_met = predictions[:,1] > self.tf_confidence_thresh
return tf.boolean_mask(tensor=predictions,
mask=threshold_met)
def no_positive_boxes():
return tf.constant(value=0.0, shape=(1,6))
# If there are any positive predictions, perform confidence thresholding.
predictions_conf_thresh = tf.cond(tf.equal(tf.size(predictions), 0), no_positive_boxes, perform_confidence_thresholding)
def perform_nms():
scores = predictions_conf_thresh[...,1]
# `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`.
xmin = tf.expand_dims(predictions_conf_thresh[...,-4], axis=-1)
ymin = tf.expand_dims(predictions_conf_thresh[...,-3], axis=-1)
xmax = tf.expand_dims(predictions_conf_thresh[...,-2], axis=-1)
ymax = tf.expand_dims(predictions_conf_thresh[...,-1], axis=-1)
boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1)
maxima_indices = tf.image.non_max_suppression(boxes=boxes,
scores=scores,
max_output_size=self.tf_nms_max_output_size,
iou_threshold=self.iou_threshold,
name='non_maximum_suppression')
maxima = tf.gather(params=predictions_conf_thresh,
indices=maxima_indices,
axis=0)
return maxima
def no_confident_predictions():
return tf.constant(value=0.0, shape=(1,6))
# If any boxes made the threshold, perform NMS.
predictions_nms = tf.cond(tf.equal(tf.size(predictions_conf_thresh), 0), no_confident_predictions, perform_nms)
# Perform top-k filtering for this batch item or pad it in case there are
# fewer than `self.top_k` boxes left at this point. Either way, produce a
# tensor of length `self.top_k`. By the time we return the final results tensor
# for the whole batch, all batch items must have the same number of predicted
boxes so that the tensor dimensions are homogeneous. If fewer than `self.top_k`
# predictions are left after the filtering process above, we pad the missing
# predictions with zeros as dummy entries.
def top_k():
return tf.gather(params=predictions_nms,
indices=tf.nn.top_k(predictions_nms[:, 1], k=self.tf_top_k, sorted=True).indices,
axis=0)
def pad_and_top_k():
padded_predictions = tf.pad(tensor=predictions_nms,
paddings=[[0, self.tf_top_k - tf.shape(predictions_nms)[0]], [0, 0]],
mode='CONSTANT',
constant_values=0.0)
return tf.gather(params=padded_predictions,
indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices,
axis=0)
top_k_boxes = tf.cond(tf.greater_equal(tf.shape(predictions_nms)[0], self.tf_top_k), top_k, pad_and_top_k)
return top_k_boxes
# Iterate `filter_predictions()` over all batch items.
output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x),
elems=y_pred,
dtype=None,
parallel_iterations=128,
back_prop=False,
swap_memory=False,
infer_shape=True,
name='loop_over_batch')
return output_tensor
def compute_output_shape(self, input_shape):
batch_size, n_boxes, last_axis = input_shape
return (batch_size, self.top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates)
def get_config(self):
config = {
'confidence_thresh': self.confidence_thresh,
'iou_threshold': self.iou_threshold,
'top_k': self.top_k,
'nms_max_output_size': self.nms_max_output_size,
'coords': self.coords,
'normalize_coords': self.normalize_coords,
'img_height': self.img_height,
'img_width': self.img_width,
}
base_config = super(DecodeDetectionsFast, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
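
The offset decoding in step 1 of both decoders can be sanity-checked in isolation. The following NumPy round trip (illustrative, with made-up numbers) encodes a box the way the training-time encoder divides anchor offsets by the variances, then applies the decoding formulas from `call()`:

import numpy as np

# All values below are made up for illustration.
cx_a, cy_a, w_a, h_a = 150.0, 100.0, 60.0, 40.0      # an anchor box in centroids format
var = np.array([0.1, 0.1, 0.2, 0.2])                 # the default variances
gt = np.array([160.0, 95.0, 80.0, 50.0])             # a ground truth box (cx, cy, w, h)

# Encode: offsets relative to the anchor, divided by the variances.
offsets = np.array([(gt[0] - cx_a) / w_a,
                    (gt[1] - cy_a) / h_a,
                    np.log(gt[2] / w_a),
                    np.log(gt[3] / h_a)]) / var

# Decode with the formulas used above, e.g. cx = cx_pred * cx_variance * w_anchor + cx_anchor.
cx = offsets[0] * var[0] * w_a + cx_a
cy = offsets[1] * var[1] * h_a + cy_a
w = np.exp(offsets[2] * var[2]) * w_a
h = np.exp(offsets[3] * var[3]) * h_a
assert np.allclose([cx, cy, w, h], gt)               # the round trip recovers the box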

@@ -0,0 +1,70 @@
'''
A custom Keras layer to perform L2-normalization.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
import keras.backend as K
from keras.engine.topology import InputSpec
from keras.engine.topology import Layer
class L2Normalization(Layer):
'''
Performs L2 normalization on the input tensor with a learnable scaling parameter
as described in the paper "ParseNet: Looking Wider to See Better" (see references)
and as used in the original SSD model.
Arguments:
gamma_init (int): The initial scaling parameter. Defaults to 20 following the
SSD paper.
Input shape:
4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
or `(batch, height, width, channels)` if `dim_ordering = 'tf'`.
Returns:
The scaled tensor. Same shape as the input tensor.
References:
http://cs.unc.edu/~wliu/papers/parsenet.pdf
'''
def __init__(self, gamma_init=20, **kwargs):
if K.image_dim_ordering() == 'tf':
self.axis = 3
else:
self.axis = 1
self.gamma_init = gamma_init
super(L2Normalization, self).__init__(**kwargs)
def build(self, input_shape):
self.input_spec = [InputSpec(shape=input_shape)]
gamma = self.gamma_init * np.ones((input_shape[self.axis],))
self.gamma = K.variable(gamma, name='{}_gamma'.format(self.name))
self.trainable_weights = [self.gamma]
super(L2Normalization, self).build(input_shape)
def call(self, x, mask=None):
output = K.l2_normalize(x, self.axis)
return output * self.gamma
def get_config(self):
config = {
'gamma_init': self.gamma_init
}
base_config = super(L2Normalization, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
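
A short sketch of the layer's effect (illustrative, not part of this commit; the 38x38x512 shape is an assumed conv4_3-sized feature map, which is where the original SSD applies this layer):

from keras.layers import Input
from keras.models import Model
import numpy as np

x = Input(shape=(38, 38, 512))            # assumed conv4_3-sized feature map
m = Model(inputs=x, outputs=L2Normalization(gamma_init=20)(x))

feat = np.random.randn(1, 38, 38, 512).astype('float32')
out = m.predict(feat)
# Every spatial position is normalized to unit length along the channel axis and
# then scaled by gamma, so the channel-wise L2 norm at each position equals gamma_init.
assert np.allclose(np.linalg.norm(out, axis=-1), 20.0, atol=1e-3)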