This commit is contained in:
dl-desktop
2020-02-06 16:47:03 -03:00
parent 6328265287
commit b586f22bf0
318 changed files with 25111 additions and 664 deletions

Binary file not shown.


@@ -0,0 +1,116 @@
'''
Utilities to match ground truth boxes to anchor boxes.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
def match_bipartite_greedy(weight_matrix):
'''
Returns a bipartite matching according to the given weight matrix.
The algorithm works as follows:
Let the first axis of `weight_matrix` represent ground truth boxes
and the second axis anchor boxes.
The ground truth box that has the greatest similarity with any
anchor box will be matched first, then out of the remaining ground
truth boxes, the ground truth box that has the greatest similarity
with any of the remaining anchor boxes will be matched second, and
so on. That is, the ground truth boxes will be matched in descending
order by maximum similarity with any of the respectively remaining
anchor boxes.
The runtime complexity is O(m^2 * n), where `m` is the number of
ground truth boxes and `n` is the number of anchor boxes.
Arguments:
weight_matrix (array): A 2D Numpy array that represents the weight matrix
for the matching process. If `(m,n)` is the shape of the weight matrix,
it must be `m <= n`. The weights can be integers or floating point
numbers. The matching process will maximize, i.e. larger weights are
preferred over smaller weights.
Returns:
A 1D Numpy array of length `weight_matrix.shape[0]` that represents
the matched index along the second axis of `weight_matrix` for each index
along the first axis.
'''
weight_matrix = np.copy(weight_matrix) # We'll modify this array.
num_ground_truth_boxes = weight_matrix.shape[0]
all_gt_indices = list(range(num_ground_truth_boxes)) # Only relevant for fancy-indexing below.
# This 1D array will contain for each ground truth box the index of
# the matched anchor box.
matches = np.zeros(num_ground_truth_boxes, dtype=int)
# In each iteration of the loop below, exactly one ground truth box
# will be matched to one anchor box.
for _ in range(num_ground_truth_boxes):
# Find the maximal anchor-ground truth pair in two steps: First, reduce
# over the anchor boxes and then reduce over the ground truth boxes.
anchor_indices = np.argmax(weight_matrix, axis=1) # Reduce along the anchor box axis.
overlaps = weight_matrix[all_gt_indices, anchor_indices]
ground_truth_index = np.argmax(overlaps) # Reduce along the ground truth box axis.
anchor_index = anchor_indices[ground_truth_index]
matches[ground_truth_index] = anchor_index # Set the match.
# Set the row of the matched ground truth box and the column of the matched
# anchor box to all zeros. This ensures that those boxes will not be matched again,
# because they will never be the best matches for any other boxes.
weight_matrix[ground_truth_index] = 0
weight_matrix[:,anchor_index] = 0
return matches
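# Illustrative usage (editorial addition, not part of the original file): a
# 2x3 IoU matrix for two ground truth boxes and three anchor boxes. The pair
# (gt 0, anchor 1) has the globally highest weight (0.7), so it is matched
# first; gt 1 is then matched to the best remaining anchor (anchor 0, 0.4).
#
# >>> w = np.array([[0.1, 0.7, 0.3],
# ...               [0.4, 0.6, 0.2]])
# >>> match_bipartite_greedy(w)
# array([1, 0])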
def match_multi(weight_matrix, threshold):
'''
Matches all elements along the second axis of `weight_matrix` to their best
matches along the first axis subject to the constraint that the weight of a match
must be greater than or equal to `threshold` in order to produce a match.
If the weight matrix contains elements that should be ignored, the row or column
representing the respective element should be set to a value below `threshold`.
Arguments:
weight_matrix (array): A 2D Numpy array that represents the weight matrix
for the matching process. If `(m,n)` is the shape of the weight matrix,
it must be `m <= n`. The weights can be integers or floating point
numbers. The matching process will maximize, i.e. larger weights are
preferred over smaller weights.
threshold (float): A float that represents the threshold (i.e. lower bound)
that must be met by a pair of elements to produce a match.
Returns:
Two 1D Numpy arrays of equal length that represent the matched indices. The first
array contains the indices along the first axis of `weight_matrix`, the second array
contains the indices along the second axis.
'''
num_anchor_boxes = weight_matrix.shape[1]
all_anchor_indices = list(range(num_anchor_boxes)) # Only relevant for fancy-indexing below.
# Find the best ground truth match for every anchor box.
ground_truth_indices = np.argmax(weight_matrix, axis=0) # Array of shape (weight_matrix.shape[1],)
overlaps = weight_matrix[ground_truth_indices, all_anchor_indices] # Array of shape (weight_matrix.shape[1],)
# Filter out the matches with a weight below the threshold.
anchor_indices_thresh_met = np.nonzero(overlaps >= threshold)[0]
gt_indices_thresh_met = ground_truth_indices[anchor_indices_thresh_met]
return gt_indices_thresh_met, anchor_indices_thresh_met
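# Illustrative usage (editorial addition): with the same 2x3 weight matrix as
# above and a threshold of 0.5, only anchor 1 has a best ground truth match
# (gt 0, weight 0.7) that meets the threshold; anchors 0 and 2 fall below it.
#
# >>> w = np.array([[0.1, 0.7, 0.3],
# ...               [0.4, 0.6, 0.2]])
# >>> match_multi(w, threshold=0.5)
# (array([0]), array([1]))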


@@ -0,0 +1,617 @@
'''
An encoder that converts ground truth annotations to SSD-compatible training targets.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
from bounding_box_utils.bounding_box_utils import iou, convert_coordinates
from ssd_encoder_decoder.matching_utils import match_bipartite_greedy, match_multi
class SSDInputEncoder:
'''
Transforms ground truth labels for object detection in images
(2D bounding box coordinates and class labels) to the format required for
training an SSD model.
In the process of encoding the ground truth labels, a template of anchor boxes
is built; these anchor boxes are subsequently matched to the ground truth boxes
via an intersection-over-union threshold criterion.
'''
def __init__(self,
img_height,
img_width,
n_classes,
predictor_sizes,
min_scale=0.1,
max_scale=0.9,
scales=None,
aspect_ratios_global=[0.5, 1.0, 2.0],
aspect_ratios_per_layer=None,
two_boxes_for_ar1=True,
steps=None,
offsets=None,
clip_boxes=False,
variances=[0.1, 0.1, 0.2, 0.2],
matching_type='multi',
pos_iou_threshold=0.5,
neg_iou_limit=0.3,
border_pixels='half',
coords='centroids',
normalize_coords=True,
background_id=0):
'''
Arguments:
img_height (int): The height of the input images.
img_width (int): The width of the input images.
n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
predictor_sizes (list): A list of int-tuples of the format `(height, width)`
containing the output heights and widths of the convolutional predictor layers.
min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
of the shorter side of the input images. Note that you should set the scaling factors
such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
to detect. Must be >0.
max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
of the shorter side of the input images. All scaling factors between the smallest and the
largest will be linearly interpolated. Note that the second to last of the linearly interpolated
scaling factors will actually be the scaling factor for the last predictor layer, while the last
scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
if `two_boxes_for_ar1` is `True`. Note that you should set the scaling factors
such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
to detect. Must be greater than or equal to `min_scale`.
scales (list, optional): A list of floats >0 containing scaling factors per convolutional predictor layer.
This list must be one element longer than the number of predictor layers. The first `k` elements are the
scaling factors for the `k` predictor layers, while the last element is used for the second box
for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
last scaling factor must be passed either way, even if it is not being used. If a list is passed,
this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
Note that you should set the scaling factors such that the resulting anchor box sizes correspond to
the sizes of the objects you are trying to detect.
aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
generated. This list is valid for all prediction layers. Note that you should set the aspect ratios such
that the resulting anchor box shapes roughly correspond to the shapes of the objects you are trying to detect.
aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer.
If a list is passed, it overrides `aspect_ratios_global`. Note that you should set the aspect ratios such
that the resulting anchor box shapes very roughly correspond to the shapes of the objects you are trying to detect.
two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratios lists that contain 1. Will be ignored otherwise.
If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
using the scaling factor for the respective layer, the second one will be generated using the
geometric mean of said scaling factor and the next larger scaling factor.
steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
If no steps are provided, then they will be computed such that the anchor box center points will form an
equidistant grid within the image dimensions.
offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
either floats or tuples of two floats. These numbers represent for each predictor layer how many
pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
of the step size specified in the `steps` argument. If the list contains floats, then that value will
be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
`(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size.
clip_boxes (bool, optional): If `True`, limits the anchor box coordinates to stay within image boundaries.
variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
its respective variance value.
matching_type (str, optional): Can be either 'multi' or 'bipartite'. In 'bipartite' mode, each ground truth box will
be matched only to the one anchor box with the highest IoU overlap. In 'multi' mode, in addition to the aforementioned
bipartite matching, all anchor boxes with an IoU overlap greater than or equal to the `pos_iou_threshold` will be
matched to a given ground truth box.
pos_iou_threshold (float, optional): The intersection-over-union similarity threshold that must be
met in order to match a given ground truth box to a given anchor box.
neg_iou_limit (float, optional): The maximum allowed intersection-over-union similarity of an
anchor box with any ground truth box to be labeled a negative (i.e. background) box. If an
anchor box is neither a positive, nor a negative box, it will be ignored during training.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxes, but not the other.
coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): If `True`, the encoder uses relative instead of absolute coordinates.
This means instead of using absolute target coordinates, the encoder will scale all coordinates to be within [0,1].
This way learning becomes independent of the input image size.
background_id (int, optional): Determines which class ID is for the background class.
'''
predictor_sizes = np.array(predictor_sizes)
if predictor_sizes.ndim == 1:
predictor_sizes = np.expand_dims(predictor_sizes, axis=0)
##################################################################################
# Handle exceptions.
##################################################################################
if (min_scale is None or max_scale is None) and scales is None:
raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
if scales:
if (len(scales) != predictor_sizes.shape[0] + 1): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&`
raise ValueError("It must be either scales is None or len(scales) == len(predictor_sizes)+1, but len(scales) == {} and len(predictor_sizes)+1 == {}".format(len(scales), len(predictor_sizes)+1))
scales = np.array(scales)
if np.any(scales <= 0):
raise ValueError("All values in `scales` must be greater than 0, but the passed list of scales is {}".format(scales))
else: # If no list of scales was passed, we need to make sure that `min_scale` and `max_scale` are valid values.
if not 0 < min_scale <= max_scale:
raise ValueError("It must be 0 < min_scale <= max_scale, but it is min_scale = {} and max_scale = {}".format(min_scale, max_scale))
if not (aspect_ratios_per_layer is None):
if (len(aspect_ratios_per_layer) != predictor_sizes.shape[0]): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&`
raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == len(predictor_sizes), but len(aspect_ratios_per_layer) == {} and len(predictor_sizes) == {}".format(len(aspect_ratios_per_layer), len(predictor_sizes)))
for aspect_ratios in aspect_ratios_per_layer:
if np.any(np.array(aspect_ratios) <= 0):
raise ValueError("All aspect ratios must be greater than zero.")
else:
if (aspect_ratios_global is None):
raise ValueError("At least one of `aspect_ratios_global` and `aspect_ratios_per_layer` must not be `None`.")
if np.any(np.array(aspect_ratios_global) <= 0):
raise ValueError("All aspect ratios must be greater than zero.")
if len(variances) != 4:
raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
variances = np.array(variances)
if np.any(variances <= 0):
raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
if not (coords == 'minmax' or coords == 'centroids' or coords == 'corners'):
raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")
if (not (steps is None)) and (len(steps) != predictor_sizes.shape[0]):
raise ValueError("You must provide at least one step value per predictor layer.")
if (not (offsets is None)) and (len(offsets) != predictor_sizes.shape[0]):
raise ValueError("You must provide at least one offset value per predictor layer.")
##################################################################################
# Set or compute members.
##################################################################################
self.img_height = img_height
self.img_width = img_width
self.n_classes = n_classes + 1 # + 1 for the background class
self.predictor_sizes = predictor_sizes
self.min_scale = min_scale
self.max_scale = max_scale
# If `scales` is None, compute the scaling factors by linearly interpolating between
# `min_scale` and `max_scale`. If an explicit list of `scales` is given, however,
# then it takes precedence over `min_scale` and `max_scale`.
if (scales is None):
self.scales = np.linspace(self.min_scale, self.max_scale, len(self.predictor_sizes)+1)
else:
# If a list of scales is given explicitly, we'll use that instead of computing it from `min_scale` and `max_scale`.
self.scales = scales
# If `aspect_ratios_per_layer` is None, then we use the same list of aspect ratios
# `aspect_ratios_global` for all predictor layers. If `aspect_ratios_per_layer` is given,
# however, then it takes precedence over `aspect_ratios_global`.
if (aspect_ratios_per_layer is None):
self.aspect_ratios = [aspect_ratios_global] * predictor_sizes.shape[0]
else:
# If aspect ratios are given per layer, we'll use those.
self.aspect_ratios = aspect_ratios_per_layer
self.two_boxes_for_ar1 = two_boxes_for_ar1
if not (steps is None):
self.steps = steps
else:
self.steps = [None] * predictor_sizes.shape[0]
if not (offsets is None):
self.offsets = offsets
else:
self.offsets = [None] * predictor_sizes.shape[0]
self.clip_boxes = clip_boxes
self.variances = variances
self.matching_type = matching_type
self.pos_iou_threshold = pos_iou_threshold
self.neg_iou_limit = neg_iou_limit
self.border_pixels = border_pixels
self.coords = coords
self.normalize_coords = normalize_coords
self.background_id = background_id
# Compute the number of boxes per spatial location for each predictor layer.
# For example, if a predictor layer has three different aspect ratios, [1.0, 0.5, 2.0], and is
# supposed to predict two boxes of slightly different size for aspect ratio 1.0, then that predictor
# layer predicts a total of four boxes at every spatial location across the feature map.
if not (aspect_ratios_per_layer is None):
self.n_boxes = []
for aspect_ratios in aspect_ratios_per_layer:
if (1 in aspect_ratios) & two_boxes_for_ar1:
self.n_boxes.append(len(aspect_ratios) + 1)
else:
self.n_boxes.append(len(aspect_ratios))
else:
if (1 in aspect_ratios_global) & two_boxes_for_ar1:
self.n_boxes = len(aspect_ratios_global) + 1
else:
self.n_boxes = len(aspect_ratios_global)
##################################################################################
# Compute the anchor boxes for each predictor layer.
##################################################################################
# Compute the anchor boxes for each predictor layer. We only have to do this once
# since the anchor boxes depend only on the model configuration, not on the input data.
# For each predictor layer (i.e. for each scaling factor) the tensors for that layer's
# anchor boxes will have the shape `(feature_map_height, feature_map_width, n_boxes, 4)`.
self.boxes_list = [] # This will store the anchor boxes for each predictor layer.
# The following lists just store diagnostic information. Sometimes it's handy to have the
# boxes' center points, heights, widths, etc. in a list.
self.wh_list_diag = [] # Box widths and heights for each predictor layer
self.steps_diag = [] # Horizontal and vertical distances between adjacent anchor box center points for each predictor layer
self.offsets_diag = [] # Offsets for each predictor layer
self.centers_diag = [] # Anchor box center points as `(cy, cx)` for each predictor layer
# Iterate over all predictor layers and compute the anchor boxes for each one.
for i in range(len(self.predictor_sizes)):
boxes, center, wh, step, offset = self.generate_anchor_boxes_for_layer(feature_map_size=self.predictor_sizes[i],
aspect_ratios=self.aspect_ratios[i],
this_scale=self.scales[i],
next_scale=self.scales[i+1],
this_steps=self.steps[i],
this_offsets=self.offsets[i],
diagnostics=True)
self.boxes_list.append(boxes)
self.wh_list_diag.append(wh)
self.steps_diag.append(step)
self.offsets_diag.append(offset)
self.centers_diag.append(center)
def __call__(self, ground_truth_labels, diagnostics=False):
'''
Converts ground truth bounding box data into a suitable format to train an SSD model.
Arguments:
ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
to the respective image, and the data for each ground truth bounding box has the format
`(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
boxes.
Returns:
`y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
the last axis are the box coordinates, the next four elements after that are just dummy elements, and
the last four elements are the variances.
'''
# Mapping to define which indices represent which coordinates in the ground truth.
class_id = 0
xmin = 1
ymin = 2
xmax = 3
ymax = 4
batch_size = len(ground_truth_labels)
##################################################################################
# Generate the template for y_encoded.
##################################################################################
y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)
##################################################################################
# Match ground truth boxes to anchor boxes.
##################################################################################
# Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
# a ground truth match and for which the maximal IoU overlap with any ground truth box is less
# than or equal to `neg_iou_limit` will be a negative (background) box.
y_encoded[:, :, self.background_id] = 1 # All boxes are background boxes by default.
n_boxes = y_encoded.shape[1] # The total number of boxes that the model predicts per batch item
class_vectors = np.eye(self.n_classes) # An identity matrix that we'll use as one-hot class vectors
for i in range(batch_size): # For each batch item...
if ground_truth_labels[i].size == 0: continue # If there is no ground truth for this batch item, there is nothing to match.
labels = ground_truth_labels[i].astype(float) # The labels for this batch item
# Check for degenerate ground truth bounding boxes before attempting any computations.
if np.any(labels[:,[xmax]] - labels[:,[xmin]] <= 0) or np.any(labels[:,[ymax]] - labels[:,[ymin]] <= 0):
raise DegenerateBoxError("SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, labels) +
"i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " +
"bounding boxes will lead to NaN errors during the training.")
# Maybe normalize the box coordinates.
if self.normalize_coords:
labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height
labels[:,[xmin,xmax]] /= self.img_width # Normalize xmin and xmax relative to the image width
# Maybe convert the box coordinate format.
if self.coords == 'centroids':
labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids', border_pixels=self.border_pixels)
elif self.coords == 'minmax':
labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')
classes_one_hot = class_vectors[labels[:, class_id].astype(int)] # The one-hot class IDs for the ground truth boxes of this batch item
labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin,ymin,xmax,ymax]]], axis=-1) # The one-hot version of the labels for this batch item
# Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
# This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
similarities = iou(labels[:,[xmin,ymin,xmax,ymax]], y_encoded[i,:,-12:-8], coords=self.coords, mode='outer_product', border_pixels=self.border_pixels)
# First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
# This ensures that each ground truth box will have at least one good match.
# For each ground truth box, get the anchor box to match with it.
bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)
# Write the ground truth data to the matched anchor boxes.
y_encoded[i, bipartite_matches, :-8] = labels_one_hot
# Set the columns of the matched anchor boxes to zero to indicate that they were matched.
similarities[:, bipartite_matches] = 0
# Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
# ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
# such ground truth box.
if self.matching_type == 'multi':
# Get all matches that satisfy the IoU threshold.
matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)
# Write the ground truth data to the matched anchor boxes.
y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]
# Set the columns of the matched anchor boxes to zero to indicate that they were matched.
similarities[:, matches[1]] = 0
# Third: Now after the matching is done, all negative (background) anchor boxes that have
# an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
# i.e. they will no longer be background boxes. These anchors are "too close" to a
# ground truth box to be valid background boxes.
max_background_similarities = np.amax(similarities, axis=0)
neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
y_encoded[i, neutral_boxes, self.background_id] = 0
##################################################################################
# Convert box coordinates to anchor box offsets.
##################################################################################
if self.coords == 'centroids':
y_encoded[:,:,[-12,-11]] -= y_encoded[:,:,[-8,-7]] # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
y_encoded[:,:,[-12,-11]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]] # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
y_encoded[:,:,[-10,-9]] /= y_encoded[:,:,[-6,-5]] # w(gt) / w(anchor), h(gt) / h(anchor)
y_encoded[:,:,[-10,-9]] = np.log(y_encoded[:,:,[-10,-9]]) / y_encoded[:,:,[-2,-1]] # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
elif self.coords == 'corners':
y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
y_encoded[:,:,[-12,-10]] /= np.expand_dims(y_encoded[:,:,-6] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
y_encoded[:,:,[-11,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-7], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
elif self.coords == 'minmax':
y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
y_encoded[:,:,[-12,-11]] /= np.expand_dims(y_encoded[:,:,-7] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
y_encoded[:,:,[-10,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-6], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
if diagnostics:
# Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
y_matched_anchors = np.copy(y_encoded)
y_matched_anchors[:,:,-12:-8] = 0 # Keeping the anchor box coordinates means setting the offsets to zero.
return y_encoded, y_matched_anchors
else:
return y_encoded
def generate_anchor_boxes_for_layer(self,
feature_map_size,
aspect_ratios,
this_scale,
next_scale,
this_steps=None,
this_offsets=None,
diagnostics=False):
'''
Computes an array of the spatial positions and sizes of the anchor boxes for one predictor layer
of size `feature_map_size == [feature_map_height, feature_map_width]`.
Arguments:
feature_map_size (tuple): A list or tuple `[feature_map_height, feature_map_width]` with the spatial
dimensions of the feature map for which to generate the anchor boxes.
aspect_ratios (list): A list of floats, the aspect ratios for which anchor boxes are to be generated.
All list elements must be unique.
this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes
as a fraction of the shorter side of the input image.
next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if
`self.two_boxes_for_ar1 == True`.
diagnostics (bool, optional): If `True`, the following additional outputs will be returned:
1) A list of the center point `x` and `y` coordinates for each spatial location.
2) A list containing `(width, height)` for each box aspect ratio.
3) A tuple containing `(step_height, step_width)`
4) A tuple containing `(offset_height, offset_width)`
This information can be useful to understand in just a few numbers what the generated grid of
anchor boxes actually looks like, i.e. how large the different boxes are and how dense
their spatial distribution is, in order to determine whether the box grid covers the input images
appropriately and whether the box sizes are appropriate to fit the sizes of the objects
to be detected.
Returns:
A 4D Numpy tensor of shape `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)` where the
last dimension contains the four coordinates of each anchor box in the coordinate format set by `self.coords`.
'''
# Compute box width and height for each aspect ratio.
# The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
size = min(self.img_height, self.img_width)
# Compute the box widths and heights for all aspect ratios
wh_list = []
for ar in aspect_ratios:
if (ar == 1):
# Compute the regular anchor box for aspect ratio 1.
box_height = box_width = this_scale * size
wh_list.append((box_width, box_height))
if self.two_boxes_for_ar1:
# Compute one slightly larger version using the geometric mean of this scale value and the next.
box_height = box_width = np.sqrt(this_scale * next_scale) * size
wh_list.append((box_width, box_height))
else:
box_width = this_scale * size * np.sqrt(ar)
box_height = this_scale * size / np.sqrt(ar)
wh_list.append((box_width, box_height))
wh_list = np.array(wh_list)
n_boxes = len(wh_list)
# Compute the grid of box center points. They are identical for all aspect ratios.
# Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
if (this_steps is None):
step_height = self.img_height / feature_map_size[0]
step_width = self.img_width / feature_map_size[1]
else:
if isinstance(this_steps, (list, tuple)) and (len(this_steps) == 2):
step_height = this_steps[0]
step_width = this_steps[1]
elif isinstance(this_steps, (int, float)):
step_height = this_steps
step_width = this_steps
# Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
if (this_offsets is None):
offset_height = 0.5
offset_width = 0.5
else:
if isinstance(this_offsets, (list, tuple)) and (len(this_offsets) == 2):
offset_height = this_offsets[0]
offset_width = this_offsets[1]
elif isinstance(this_offsets, (int, float)):
offset_height = this_offsets
offset_width = this_offsets
# Now that we have the offsets and step sizes, compute the grid of anchor box center points.
cy = np.linspace(offset_height * step_height, (offset_height + feature_map_size[0] - 1) * step_height, feature_map_size[0])
cx = np.linspace(offset_width * step_width, (offset_width + feature_map_size[1] - 1) * step_width, feature_map_size[1])
cx_grid, cy_grid = np.meshgrid(cx, cy)
cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down
# Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
# where the last dimension will contain `(cx, cy, w, h)`
boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4))
boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes)) # Set cx
boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes)) # Set cy
boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h
# Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')
# If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
if self.clip_boxes:
x_coords = boxes_tensor[:,:,:,[0, 2]]
x_coords[x_coords >= self.img_width] = self.img_width - 1
x_coords[x_coords < 0] = 0
boxes_tensor[:,:,:,[0, 2]] = x_coords
y_coords = boxes_tensor[:,:,:,[1, 3]]
y_coords[y_coords >= self.img_height] = self.img_height - 1
y_coords[y_coords < 0] = 0
boxes_tensor[:,:,:,[1, 3]] = y_coords
# If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
if self.normalize_coords:
boxes_tensor[:, :, :, [0, 2]] /= self.img_width
boxes_tensor[:, :, :, [1, 3]] /= self.img_height
# TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
if self.coords == 'centroids':
# Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
elif self.coords == 'minmax':
# Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')
if diagnostics:
return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width)
else:
return boxes_tensor
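# Worked example of the box size computation above (editorial addition with
# illustrative numbers, not from any particular model): for a 300x300 input
# (size = 300), this_scale = 0.2 and aspect ratio ar = 2 give
# box_width = 0.2 * 300 * sqrt(2) ~= 84.85 and
# box_height = 0.2 * 300 / sqrt(2) ~= 42.43. The box area stays that of the
# 60x60 square for ar = 1 (0.2 * 300 = 60); only the width/height ratio changes.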
def generate_encoding_template(self, batch_size, diagnostics=False):
'''
Produces an encoding template for the ground truth label tensor for a given batch.
Note that all tensor creation, reshaping and concatenation operations performed in this function
and the sub-functions it calls are identical to those performed inside the SSD model. This, of course,
must be the case in order to preserve the spatial meaning of each box prediction, but it's useful to make
yourself aware of this fact and why it is necessary.
In other words, the boxes in `y_encoded` must have a specific order in order to correspond to the right spatial
positions and scales of the boxes predicted by the model. The sequence of operations here ensures that `y_encoded`
has this specific form.
Arguments:
batch_size (int): The batch size.
diagnostics (bool, optional): See the documentation for `generate_anchor_boxes_for_layer()`. The diagnostic output
here is similar, just for all predictor conv layers.
Returns:
A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, the template into which to encode
the ground truth labels for training. The last axis has length `#classes + 12` because the model
output contains not only the 4 predicted box coordinate offsets, but also the 4 coordinates for
the anchor boxes and the 4 variance values.
'''
# Tile the anchor boxes for each predictor layer across all batch items.
boxes_batch = []
for boxes in self.boxes_list:
# Prepend one dimension to `boxes` to account for the batch size and tile along that dimension.
# The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 4)`
boxes = np.expand_dims(boxes, axis=0)
boxes = np.tile(boxes, (batch_size, 1, 1, 1, 1))
# Now reshape the 5D tensor above into a 3D tensor of shape
# `(batch, feature_map_height * feature_map_width * n_boxes, 4)`. The resulting
# order of the tensor content will be identical to the order obtained from the reshaping operation
# in our Keras model (we're using the Tensorflow backend, and tf.reshape() and np.reshape()
# use the same default index order, which is C-like index ordering)
boxes = np.reshape(boxes, (batch_size, -1, 4))
boxes_batch.append(boxes)
# Concatenate the anchor tensors from the individual layers to one.
boxes_tensor = np.concatenate(boxes_batch, axis=1)
# 3: Create a template tensor to hold the one-hot class encodings of shape `(batch, #boxes, #classes)`
# It will contain all zeros for now, the classes will be set in the matching process that follows
classes_tensor = np.zeros((batch_size, boxes_tensor.shape[1], self.n_classes))
# 4: Create a tensor to contain the variances. This tensor has the same shape as `boxes_tensor` and simply
# contains the same 4 variance values for every position in the last axis.
variances_tensor = np.zeros_like(boxes_tensor)
variances_tensor += self.variances # Long live broadcasting
# 5: Concatenate the classes, boxes and variances tensors to get our final template for y_encoded. We also need
# another tensor of the shape of `boxes_tensor` as a space filler so that `y_encoding_template` has the same
shape as the SSD model output tensor. The content of this tensor is irrelevant; we'll just use
# `boxes_tensor` a second time.
y_encoding_template = np.concatenate((classes_tensor, boxes_tensor, boxes_tensor, variances_tensor), axis=2)
if diagnostics:
return y_encoding_template, self.centers_diag, self.wh_list_diag, self.steps_diag, self.offsets_diag
else:
return y_encoding_template
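# Note on the `#boxes` dimension (editorial, with illustrative numbers): the
# template's second axis is the sum over all predictor layers of
# feature_map_height * feature_map_width * n_boxes. For example, two predictor
# layers of sizes (4, 4) and (2, 2) with 4 boxes per spatial location yield
# 4*4*4 + 2*2*4 = 80 rows per batch item.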
class DegenerateBoxError(Exception):
'''
An exception class to be raised if degenerate boxes are being detected.
'''
pass
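# Minimal usage sketch for the encoder above (editorial addition; the image
# size, class count, predictor sizes and scales are made-up values, not taken
# from any particular model):
#
# >>> encoder = SSDInputEncoder(img_height=300,
# ...                           img_width=300,
# ...                           n_classes=20,
# ...                           predictor_sizes=[(4, 4), (2, 2)],
# ...                           scales=[0.1, 0.5, 0.9])
# >>> labels = [np.array([[12, 30, 50, 120, 180]])] # one image with one box: (class_id, xmin, ymin, xmax, ymax)
# >>> y_encoded = encoder(labels)
# >>> y_encoded.shape # 4*4*4 + 2*2*4 = 80 boxes; 20 classes + 1 background + 12
# (1, 80, 33)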


@@ -0,0 +1,530 @@
'''
Includes:
* Functions to decode and filter raw SSD model output. These are only needed if the
SSD model does not have a `DecodeDetections` layer.
* Functions to perform greedy non-maximum suppression
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
from bounding_box_utils.bounding_box_utils import iou, convert_coordinates
def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
Perform greedy non-maximum suppression on the input boxes.
Greedy NMS works by selecting the box with the highest score and
removing all boxes around it that are too close to it, as measured by IoU similarity.
Out of the boxes that are left over, once again the one with the highest
score is selected and so on, until no boxes with too much overlap are left.
Arguments:
y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this
is a list of length `n` where each list element is a 2D Numpy array.
For a batch item with `k` predicted boxes this 2D Numpy array has
shape `(k, 6)`, where each row contains the coordinates of the respective
box in the format `[class_id, score, xmin, ymin, xmax, ymax]`.
Technically, the number of columns doesn't have to be 6, it can be
arbitrary as long as the second element of each row is the score
assigned to the prediction and the elements from index 2 onward are
the box coordinates. Note that this function is agnostic to the
scale of the score or what it represents.
iou_threshold (float, optional): All boxes with a Jaccard similarity of
greater than `iou_threshold` with a locally maximal box will be removed
from the set of predictions, where 'maximal' refers to the box score.
coords (str, optional): The coordinate format of `y_pred_decoded`.
Can be one of the formats supported by `iou()`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxes, but not the other.
Returns:
The predictions after removing non-maxima. The format is the same as the input format.
'''
y_pred_decoded_nms = []
for batch_item in y_pred_decoded: # For the labels of each batch item...
boxes_left = np.copy(batch_item)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
y_pred_decoded_nms.append(np.array(maxima))
return y_pred_decoded_nms
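# Illustrative usage (editorial addition): three detections in the
# `[class_id, score, xmin, ymin, xmax, ymax]` corners format. Boxes 0 and 1
# overlap heavily (IoU ~= 0.85 > 0.45), so the lower-scoring box 1 is
# suppressed; box 2 is disjoint from both and survives.
#
# >>> dets = [np.array([[1, 0.9,  10,  10,  60,  60],
# ...                   [1, 0.8,  12,  12,  62,  62],
# ...                   [1, 0.7, 200, 200, 250, 250]])]
# >>> greedy_nms(dets, iou_threshold=0.45)[0][:, :2]
# array([[1. , 0.9],
#        [1. , 0.7]])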
def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
function for per-class NMS in `decode_detections()`.
'''
boxes_left = np.copy(predictions)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,0]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,1:], maximum_box[1:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
return np.array(maxima)
def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
function in `decode_detections_fast()`.
'''
boxes_left = np.copy(predictions)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
return np.array(maxima)
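# Editorial note: the three NMS variants above differ only in which columns
# hold the score and the coordinates. `greedy_nms` and `_greedy_nms2` expect
# rows of the form `[class_id, score, coords...]` (score in column 1,
# coordinates from column 2 onward), while `_greedy_nms` operates on per-class
# arrays of the form `[score, coords...]` (score in column 0, coordinates from
# column 1 onward).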
def decode_detections(y_pred,
confidence_thresh=0.01,
iou_threshold=0.45,
top_k=200,
input_coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
border_pixels='half'):
'''
Convert model prediction output back to a format that contains only the positive box predictions
(i.e. the same format that `SSDInputEncoder` takes as input).
After the decoding, two stages of prediction filtering are performed for each class individually:
First confidence thresholding, then greedy non-maximum suppression. The filtering results for all
classes are concatenated and the `top_k` overall highest confidence results constitute the final
predictions for a given batch item. This procedure follows the original Caffe implementation.
For a slightly different and more efficient alternative to decode raw model output that performs
non-maximum suppression globally instead of per class, see `decode_detections_fast()` below.
Arguments:
y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
boxes predicted by the model per image and the last axis contains
`[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
positive class in order to be considered for the non-maximum suppression stage for the respective class.
A lower value will result in a larger part of the selection process being done by the non-maximum suppression
stage, while a larger value will result in a larger part of the selection process happening in the confidence
thresholding stage.
iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
to the box score.
top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
non-maximum suppression stage.
input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxes, but not the other.
Returns:
A python list of length `batch_size` where each list element represents the predicted boxes
for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
'''
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
# 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
if input_coords == 'centroids':
y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
y_pred_decoded_raw[:,:,[-2,-1]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
y_pred_decoded_raw[:,:,[-4,-3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
y_pred_decoded_raw[:,:,[-4,-3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
elif input_coords == 'minmax':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
elif input_coords == 'corners':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
else:
raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
# 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
if normalize_coords:
y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
# 3: Apply confidence thresholding and non-maximum suppression per class
n_classes = y_pred_decoded_raw.shape[-1] - 4 # The number of classes is the length of the last axis minus the four box coordinates
y_pred_decoded = [] # Store the final predictions in this list
for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
pred = [] # Store the final predictions for this batch item here
for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
single_class = batch_item[:,[class_id, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
threshold_met = single_class[single_class[:,0] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
if threshold_met.shape[0] > 0: # If any boxes made the threshold...
maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 6]`
maxima_output[:,0] = class_id # Write the class ID to the first column...
maxima_output[:,1:] = maxima # ...and write the maxima to the other columns...
pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
# Once we're through with all classes, keep only the `top_k` maxima with the highest scores
if pred: # If there are any predictions left after confidence-thresholding...
pred = np.concatenate(pred, axis=0)
if top_k != 'all' and pred.shape[0] > top_k: # If we have more than `top_k` results left at this point (otherwise there is nothing to filter)...
top_k_indices = np.argpartition(pred[:,1], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
else:
pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
return y_pred_decoded
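# Minimal usage sketch (editorial addition; assumes `y_pred` came from an SSD
# model trained with the default 'centroids' format and normalized coordinates,
# and that the 300x300 image size matches the model input):
#
# >>> # y_pred has shape (batch_size, #boxes, n_classes + 12)
# >>> decoded = decode_detections(y_pred,
# ...                             confidence_thresh=0.5,
# ...                             iou_threshold=0.45,
# ...                             top_k=200,
# ...                             input_coords='centroids',
# ...                             normalize_coords=True,
# ...                             img_height=300,
# ...                             img_width=300)
# >>> # decoded[i]: array of shape (k_i, 6), rows [class_id, confidence, xmin, ymin, xmax, ymax]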
def decode_detections_fast(y_pred,
confidence_thresh=0.5,
iou_threshold=0.45,
top_k='all',
input_coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
border_pixels='half'):
'''
Convert model prediction output back to a format that contains only the positive box predictions
(i.e. the same format that `SSDInputEncoder` takes as input).
Optionally performs confidence thresholding and greedy non-maximum suppression after the decoding stage.
Note that the decoding procedure used here is not the same as the procedure used in the original Caffe implementation.
For each box, the procedure used here assigns the box's highest confidence as its predicted class. Then it removes
all boxes for which the highest confidence is the background class. This results in less work for the subsequent
non-maximum suppression, because the vast majority of the predictions will be filtered out just by the fact that
their highest confidence is for the background class. It is much more efficient than the procedure of the original
implementation, but the results may also differ.
Arguments:
y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
boxes predicted by the model per image and the last axis contains
`[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in any positive
class required for a given box to be considered a positive prediction. A lower value will result
            in better recall, while a higher value will result in better precision. Do not use this parameter to
            combat the inevitably many duplicates that an SSD will produce; the subsequent non-maximum suppression
            stage will take care of those.
iou_threshold (float, optional): `None` or a float in [0,1]. If `None`, no non-maximum suppression will be
performed. If not `None`, greedy NMS will be performed after the confidence thresholding stage, meaning
all boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
from the set of predictions, where 'maximal' refers to the box score.
        top_k (int, optional): 'all' or an integer with the number of highest-scoring predictions to be kept for each batch item
after the non-maximum suppression stage. If 'all', all predictions left after the NMS stage will be kept.
input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
            to the boxes, but not the other.
Returns:
A python list of length `batch_size` where each list element represents the predicted boxes
for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
        a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
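    Usage sketch (hypothetical values: a trained SSD `model` fed 300x300 images):
        y_pred = model.predict(batch_images) # Shape `(batch_size, #boxes, #classes + 12)`.
        y_pred_decoded = decode_detections_fast(y_pred,
                                                confidence_thresh=0.5,
                                                iou_threshold=0.45,
                                                top_k=200,
                                                input_coords='centroids',
                                                normalize_coords=True,
                                                img_height=300,
                                                img_width=300)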
'''
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
# 1: Convert the classes from one-hot encoding to their class ID
    y_pred_converted = np.copy(y_pred[:,:,-14:-8]) # Slice out the four offset predictions plus two elements into which we'll write the class IDs and confidences in the next step
    y_pred_converted[:,:,0] = np.argmax(y_pred[:,:,:-12], axis=-1) # The indices of the highest confidence values in the one-hot class vectors are the class IDs
    y_pred_converted[:,:,1] = np.amax(y_pred[:,:,:-12], axis=-1) # Store the confidence values themselves, too
# 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
if input_coords == 'centroids':
y_pred_converted[:,:,[4,5]] = np.exp(y_pred_converted[:,:,[4,5]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
y_pred_converted[:,:,[4,5]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
y_pred_converted[:,:,[2,3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
y_pred_converted[:,:,[2,3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2corners')
elif input_coords == 'minmax':
y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_converted[:,:,[2,3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_converted[:,:,[4,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='minmax2corners')
elif input_coords == 'corners':
y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_converted[:,:,[2,4]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_converted[:,:,[3,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
else:
raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")
# 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
if normalize_coords:
y_pred_converted[:,:,[2,4]] *= img_width # Convert xmin, xmax back to absolute coordinates
y_pred_converted[:,:,[3,5]] *= img_height # Convert ymin, ymax back to absolute coordinates
# 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
y_pred_decoded = []
for batch_item in y_pred_converted: # For each image in the batch...
boxes = batch_item[np.nonzero(batch_item[:,0])] # ...get all boxes that don't belong to the background class,...
boxes = boxes[boxes[:,1] >= confidence_thresh] # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
if iou_threshold: # ...if an IoU threshold is set...
boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on the remaining boxes.
if top_k != 'all' and boxes.shape[0] > top_k: # If we have more than `top_k` results left at this point...
top_k_indices = np.argpartition(boxes[:,1], kth=boxes.shape[0]-top_k, axis=0)[boxes.shape[0]-top_k:] # ...get the indices of the `top_k` highest-scoring boxes...
boxes = boxes[top_k_indices] # ...and keep only those boxes...
y_pred_decoded.append(boxes) # ...and now that we're done, append the array of final predictions for this batch item to the output list
    return y_pred_decoded

################################################################################################
# Debugging tools, not relevant for normal use
################################################################################################

# The functions below are for debugging, so you won't normally need them. That is,
# unless you need to debug your model, of course.

def decode_detections_debug(y_pred,
confidence_thresh=0.01,
iou_threshold=0.45,
top_k=200,
input_coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
variance_encoded_in_target=False,
border_pixels='half'):
'''
This decoder performs the same processing as `decode_detections()`, but the output format for each left-over
predicted box is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
That is, in addition to the usual data, each predicted box has the internal index of that box within
the model (`box_id`) prepended to it. This allows you to know exactly which part of the model made a given
box prediction; in particular, it allows you to know which predictor layer made a given prediction.
This can be useful for debugging.
Arguments:
y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
boxes predicted by the model per image and the last axis contains
`[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
positive class in order to be considered for the non-maximum suppression stage for the respective class.
A lower value will result in a larger part of the selection process being done by the non-maximum suppression
stage, while a larger value will result in a larger part of the selection process happening in the confidence
thresholding stage.
iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
to the box score.
top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
non-maximum suppression stage.
input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
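        variance_encoded_in_target (bool, optional): Set to `True` if the model output contains the box
            coordinate offsets with the variances already encoded in them, in which case the offsets will
            not be multiplied by the variances again during decoding. Only relevant if `input_coords` is
            'centroids'.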
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
            to the boxes, but not the other.
Returns:
A python list of length `batch_size` where each list element represents the predicted boxes
for one image and contains a Numpy array of shape `(boxes, 7)` where each row is a box prediction for
a non-background class for the respective image in the format `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
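    Usage sketch (hypothetical values: `predictor_sizes`, `aspect_ratios`, and `two_boxes_for_ar1` are
    assumed to match the model's anchor box configuration; see the helper functions further below):
        y_pred_decoded = decode_detections_debug(y_pred, img_height=300, img_width=300)
        num_boxes_per_pred_layer = get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1)
        pred_layers = get_pred_layers(y_pred_decoded, num_boxes_per_pred_layer) # Which predictor layer made each prediction?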
'''
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
# 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
if input_coords == 'centroids':
if variance_encoded_in_target:
# Decode the predicted box center x and y coordinates.
y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] + y_pred[:,:,[-8,-7]]
            # Decode the predicted box width and height.
y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
else:
# Decode the predicted box center x and y coordinates.
y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] * y_pred[:,:,[-4,-3]] + y_pred[:,:,[-8,-7]]
            # Decode the predicted box width and height.
y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
elif input_coords == 'minmax':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
elif input_coords == 'corners':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
else:
raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
# 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
if normalize_coords:
y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
# 3: For each batch item, prepend each box's internal index to its coordinates.
y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1], y_pred_decoded_raw.shape[2] + 1)) # Expand the last axis by one.
y_pred_decoded_raw2[:,:,1:] = y_pred_decoded_raw
y_pred_decoded_raw2[:,:,0] = np.arange(y_pred_decoded_raw.shape[1]) # Put the box indices as the first element for each box via broadcasting.
y_pred_decoded_raw = y_pred_decoded_raw2
# 4: Apply confidence thresholding and non-maximum suppression per class
n_classes = y_pred_decoded_raw.shape[-1] - 5 # The number of classes is the length of the last axis minus the four box coordinates and minus the index
y_pred_decoded = [] # Store the final predictions in this list
    for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, 1 box index + n_classes + 4 coords]`
pred = [] # Store the final predictions for this batch item here
for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:,[0, class_id + 1, -4, -3, -2, -1]] # ...keep only the box index, the confidences for that class, and the box coordinates, making this an array of shape `[n_boxes, 6]` and...
threshold_met = single_class[single_class[:,1] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
if threshold_met.shape[0] > 0: # If any boxes made the threshold...
maxima = _greedy_nms_debug(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`
maxima_output[:,0] = maxima[:,0] # Write the box index to the first column...
maxima_output[:,1] = class_id # ...and write the class ID to the second column...
maxima_output[:,2:] = maxima[:,1:] # ...and write the rest of the maxima data to the other columns...
pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
# Once we're through with all classes, keep only the `top_k` maxima with the highest scores
        if pred: # If there are any predictions left after confidence-thresholding...
            pred = np.concatenate(pred, axis=0)
            if pred.shape[0] > top_k: # If we have more than `top_k` results left at this point (otherwise there is nothing to filter)...
                top_k_indices = np.argpartition(pred[:,2], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
                pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
        else:
            pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
        y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
    return y_pred_decoded

def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of all
left-over boxes for each batch item, which allows you to know which predictor layer predicted a given output
box and is thus useful for debugging.
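    A minimal sketch with made-up values, where each row is `[box_id, confidence, xmin, ymin, xmax, ymax]`:
        predictions = np.array([[  0, 0.9,  10,  10,  50,  50],
                                [  1, 0.8,  12,  12,  52,  52],   # Overlaps box 0 heavily -> suppressed.
                                [  2, 0.7, 100, 100, 150, 150]])  # No overlap with box 0 -> kept.
        _greedy_nms_debug(predictions, iou_threshold=0.45) # Returns the rows for box IDs 0 and 2.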
'''
boxes_left = np.copy(predictions)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)

def get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1):
'''
Returns a list of the number of boxes that each predictor layer predicts.
`aspect_ratios` must be a nested list, containing a list of aspect ratios
for each predictor layer.
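    Example with made-up values: for `predictor_sizes = [(37, 37), (18, 18)]`,
    `aspect_ratios = [[1.0, 2.0, 0.5], [1.0, 2.0, 0.5, 3.0, 1.0/3.0]]`, and
    `two_boxes_for_ar1 == True`, the result is `[37*37*4, 18*18*6] == [5476, 1944]`.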
'''
num_boxes_per_pred_layer = []
for i in range(len(predictor_sizes)):
if two_boxes_for_ar1:
num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * (len(aspect_ratios[i]) + 1))
else:
num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * len(aspect_ratios[i]))
    return num_boxes_per_pred_layer

def get_pred_layers(y_pred_decoded, num_boxes_per_pred_layer):
'''
For a given prediction tensor decoded with `decode_detections_debug()`, returns a list
    with the indices of the predictor layers that made each prediction.
That is, this function lets you know which predictor layer is responsible
for a given prediction.
Arguments:
y_pred_decoded (array): The decoded model output tensor. Must have been
decoded with `decode_detections_debug()` so that it contains the internal box index
for each predicted box.
num_boxes_per_pred_layer (list): A list that contains the total number
of boxes that each predictor layer predicts.
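    For example (made-up values): with `num_boxes_per_pred_layer = [5476, 1944]`, the cumulative sums
    are `[5476, 7420]`, so a box index of 6000 maps to predictor layer 1 (the second layer), because
    `5476 <= 6000 < 7420`.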
'''
pred_layers_all = []
cum_boxes_per_pred_layer = np.cumsum(num_boxes_per_pred_layer)
for batch_item in y_pred_decoded:
pred_layers = []
for prediction in batch_item:
if (prediction[0] < 0) or (prediction[0] >= cum_boxes_per_pred_layer[-1]):
raise ValueError("Box index is out of bounds of the possible indices as given by the values in `num_boxes_per_pred_layer`.")
for i in range(len(cum_boxes_per_pred_layer)):
if prediction[0] < cum_boxes_per_pred_layer[i]:
pred_layers.append(i)
break
pred_layers_all.append(pred_layers)
return pred_layers_all