This commit is contained in:
dl-desktop
2020-02-06 16:47:03 -03:00
parent 6328265287
commit b586f22bf0
318 changed files with 25111 additions and 664 deletions

View File


@@ -0,0 +1,906 @@
'''
An evaluator to compute the Pascal VOC-style mean average precision (both the pre-2010
and post-2010 algorithm versions) of a given Keras SSD model on a given dataset.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
from math import ceil
from tqdm import trange
import sys
import warnings
from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
from ssd_encoder_decoder.ssd_output_decoder import decode_detections
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
from bounding_box_utils.bounding_box_utils import iou
class Evaluator:
'''
Computes the mean average precision of the given Keras SSD model on the given dataset.
Can compute the Pascal-VOC-style average precision in both the pre-2010 (k-point sampling)
and post-2010 (integration) algorithm versions.
Optionally also returns the average precisions, precisions, and recalls.
The algorithm is identical to the official Pascal VOC pre-2010 detection evaluation algorithm
in its default settings, but can be customized in a number of ways.
'''
def __init__(self,
model,
n_classes,
data_generator,
model_mode='inference',
pred_format={'class_id': 0, 'conf': 1, 'xmin': 2, 'ymin': 3, 'xmax': 4, 'ymax': 5},
gt_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
'''
Arguments:
model (Keras model): A Keras SSD model object.
n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset.
model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'.
This is needed in order to know whether the model output is already decoded or still needs to be decoded. Refer to
the model documentation for the meaning of the individual modes.
pred_format (dict, optional): A dictionary that defines which index in the last axis of the model's decoded predictions
contains which bounding box coordinate. The dictionary must map the keywords 'class_id', 'conf' (for the confidence),
'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis.
gt_format (dict, optional): A dictionary that defines which index of a ground truth bounding box contains which of the five
items class ID, xmin, ymin, xmax, ymax. The expected strings are 'xmin', 'ymin', 'xmax', 'ymax', 'class_id'.
'''
if not isinstance(data_generator, DataGenerator):
warnings.warn("`data_generator` is not a `DataGenerator` object, which will cause undefined behavior.")
self.model = model
self.data_generator = data_generator
self.n_classes = n_classes
self.model_mode = model_mode
self.pred_format = pred_format
self.gt_format = gt_format
# The following lists all contain per-class data, i.e. all lists have the length `n_classes + 1`,
# where one element is for the background class, i.e. that element is just a dummy entry.
self.prediction_results = None
self.num_gt_per_class = None
self.true_positives = None
self.false_positives = None
self.cumulative_true_positives = None
self.cumulative_false_positives = None
self.cumulative_precisions = None # "Cumulative" means that the i-th element in each list represents the precision for the first i highest confidence predictions for that class.
self.cumulative_recalls = None # "Cumulative" means that the i-th element in each list represents the recall for the first i highest confidence predictions for that class.
self.average_precisions = None
self.mean_average_precision = None
def __call__(self,
img_height,
img_width,
batch_size,
data_generator_mode='resize',
round_confidences=False,
matching_iou_threshold=0.5,
border_pixels='include',
sorting_algorithm='quicksort',
average_precision_mode='sample',
num_recall_points=11,
ignore_neutral_boxes=True,
return_precisions=False,
return_recalls=False,
return_average_precisions=False,
verbose=True,
decoding_confidence_thresh=0.01,
decoding_iou_threshold=0.45,
decoding_top_k=200,
decoding_pred_coords='centroids',
decoding_normalize_coords=True):
'''
Computes the mean average precision of the given Keras SSD model on the given dataset.
Optionally also returns the average precisions, precisions, and recalls.
All the individual steps of the overall evaluation algorithm can also be called separately
(check out the other methods of this class), but this runs the overall algorithm all at once.
Arguments:
img_height (int): The input image height for the model.
img_width (int): The input image width for the model.
batch_size (int): The batch size for the evaluation.
data_generator_mode (str, optional): Either of 'resize' and 'pad'. If 'resize', the input images will
be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
round_confidences (int, optional): `False` or an integer that is the number of decimals that the prediction
confidences will be rounded to. If `False`, the confidences will not be rounded.
matching_iou_threshold (float, optional): A prediction will be considered a true positive if it has a Jaccard overlap
of at least `matching_iou_threshold` with any ground truth bounding box of the same class.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belongs
to the boxes, but not the other.
sorting_algorithm (str, optional): Which sorting algorithm the matching algorithm should use. This argument accepts
any valid sorting algorithm for Numpy's `argsort()` function. You will usually want to choose between 'quicksort'
(fastest and most memory efficient, but not stable) and 'mergesort' (slightly slower and less memory efficient, but stable).
The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm is only guaranteed
to behave identically if you choose 'mergesort' as the sorting algorithm, but it will almost always behave identically
even if you choose 'quicksort' (but no guarantees).
average_precision_mode (str, optional): Can be either 'sample' or 'integrate'. In the case of 'sample', the average precision
will be computed according to the Pascal VOC formula that was used up until VOC 2009, where the precision will be sampled
for `num_recall_points` recall values. In the case of 'integrate', the average precision will be computed according to the
Pascal VOC formula that was used from VOC 2010 onward, where the average precision will be computed by numerically integrating
over the whole precision-recall curve instead of sampling individual points from it. 'integrate' mode is basically just
the limit case of 'sample' mode as the number of sample points increases.
num_recall_points (int, optional): The number of points to sample from the precision-recall-curve to compute the average
precisions. In other words, this is the number of equidistant recall values for which the resulting precision will be
computed. 11 points is the value used in the official Pascal VOC 2007 detection evaluation algorithm.
ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth
bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these
annotations. If `False`, even boxes that are annotated as neutral will be counted into the evaluation. If `True`,
neutral boxes will be ignored for the evaluation. Examples of evaluation-neutral boxes are the ground truth boxes
annotated as "difficult" in the Pascal VOC datasets, which are usually treated as neutral for the evaluation.
return_precisions (bool, optional): If `True`, returns a nested list containing the cumulative precisions for each class.
return_recalls (bool, optional): If `True`, returns a nested list containing the cumulative recalls for each class.
return_average_precisions (bool, optional): If `True`, returns a list containing the average precision for each class.
verbose (bool, optional): If `True`, will print out the progress during runtime.
decoding_confidence_thresh (float, optional): Only relevant if the model is in 'training' mode.
A float in [0,1), the minimum classification confidence in a specific positive class in order to be considered
for the non-maximum suppression stage for the respective class. A lower value will result in a larger part of the
selection process being done by the non-maximum suppression stage, while a larger value will result in a larger
part of the selection process happening in the confidence thresholding stage.
decoding_iou_threshold (float, optional): Only relevant if the model is in 'training' mode. A float in [0,1].
All boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
from the set of predictions for a given class, where 'maximal' refers to the box score.
decoding_top_k (int, optional): Only relevant if the model is in 'training' mode. The number of highest scoring
predictions to be kept for each batch item after the non-maximum suppression stage.
decoding_pred_coords (str, optional): Only relevant if the model is in 'training' mode. The box coordinate format
that the model outputs. Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
decoding_normalize_coords (bool, optional): Only relevant if the model is in 'training' mode. Set to `True` if the model
outputs relative coordinates. Do not set this to `True` if the model already outputs absolute coordinates,
as that would result in incorrect coordinates.
Returns:
A float, the mean average precision, plus any optional returns specified in the arguments.
'''
#############################################################################################
# Predict on the entire dataset.
#############################################################################################
self.predict_on_dataset(img_height=img_height,
img_width=img_width,
batch_size=batch_size,
data_generator_mode=data_generator_mode,
decoding_confidence_thresh=decoding_confidence_thresh,
decoding_iou_threshold=decoding_iou_threshold,
decoding_top_k=decoding_top_k,
decoding_pred_coords=decoding_pred_coords,
decoding_normalize_coords=decoding_normalize_coords,
decoding_border_pixels=border_pixels,
round_confidences=round_confidences,
verbose=verbose,
ret=False)
#############################################################################################
# Get the total number of ground truth boxes for each class.
#############################################################################################
self.get_num_gt_per_class(ignore_neutral_boxes=ignore_neutral_boxes,
verbose=False,
ret=False)
#############################################################################################
# Match predictions to ground truth boxes for all classes.
#############################################################################################
self.match_predictions(ignore_neutral_boxes=ignore_neutral_boxes,
matching_iou_threshold=matching_iou_threshold,
border_pixels=border_pixels,
sorting_algorithm=sorting_algorithm,
verbose=verbose,
ret=False)
#############################################################################################
# Compute the cumulative precision and recall for all classes.
#############################################################################################
self.compute_precision_recall(verbose=verbose, ret=False)
#############################################################################################
# Compute the average precisions for all classes.
#############################################################################################
self.compute_average_precisions(mode=average_precision_mode,
num_recall_points=num_recall_points,
verbose=verbose,
ret=False)
#############################################################################################
# Compute the mean average precision.
#############################################################################################
mean_average_precision = self.compute_mean_average_precision(ret=True)
#############################################################################################
# Compile the returns.
#############################################################################################
if return_precisions or return_recalls or return_average_precisions:
ret = [mean_average_precision]
if return_average_precisions:
ret.append(self.average_precisions)
if return_precisions:
ret.append(self.cumulative_precisions)
if return_recalls:
ret.append(self.cumulative_recalls)
return ret
else:
return mean_average_precision
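A minimal usage sketch of the all-at-once path above. `ssd_model` and `val_dataset` are hypothetical placeholders for an already-built Keras SSD model (created in 'inference' mode) and a `DataGenerator` loaded with the evaluation images and labels:

# Hypothetical objects: `ssd_model` and `val_dataset` must have been created beforehand.
evaluator = Evaluator(model=ssd_model,
                      n_classes=20,
                      data_generator=val_dataset,
                      model_mode='inference')
mean_ap, average_precisions = evaluator(img_height=300,
                                        img_width=300,
                                        batch_size=8,
                                        matching_iou_threshold=0.5,
                                        average_precision_mode='sample',
                                        num_recall_points=11,
                                        return_average_precisions=True)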
def predict_on_dataset(self,
img_height,
img_width,
batch_size,
data_generator_mode='resize',
decoding_confidence_thresh=0.01,
decoding_iou_threshold=0.45,
decoding_top_k=200,
decoding_pred_coords='centroids',
decoding_normalize_coords=True,
decoding_border_pixels='include',
round_confidences=False,
verbose=True,
ret=False):
'''
Runs predictions for the given model over the entire dataset given by `data_generator`.
Arguments:
img_height (int): The input image height for the model.
img_width (int): The input image width for the model.
batch_size (int): The batch size for the evaluation.
data_generator_mode (str, optional): Either of 'resize' and 'pad'. If 'resize', the input images will
be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
decoding_confidence_thresh (float, optional): Only relevant if the model is in 'training' mode.
A float in [0,1), the minimum classification confidence in a specific positive class in order to be considered
for the non-maximum suppression stage for the respective class. A lower value will result in a larger part of the
selection process being done by the non-maximum suppression stage, while a larger value will result in a larger
part of the selection process happening in the confidence thresholding stage.
decoding_iou_threshold (float, optional): Only relevant if the model is in 'training' mode. A float in [0,1].
All boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
from the set of predictions for a given class, where 'maximal' refers to the box score.
decoding_top_k (int, optional): Only relevant if the model is in 'training' mode. The number of highest scoring
predictions to be kept for each batch item after the non-maximum suppression stage.
decoding_pred_coords (str, optional): Only relevant if the model is in 'training' mode. The box coordinate format
that the model outputs. Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
decoding_normalize_coords (bool, optional): Only relevant if the model is in 'training' mode. Set to `True` if the model
outputs relative coordinates. Do not set this to `True` if the model already outputs absolute coordinates,
as that would result in incorrect coordinates.
round_confidences (int, optional): `False` or an integer that is the number of decimals that the prediction
confidences will be rounded to. If `False`, the confidences will not be rounded.
verbose (bool, optional): If `True`, will print out the progress during runtime.
ret (bool, optional): If `True`, returns the predictions.
Returns:
None by default. Optionally, a nested list containing the predictions for each class.
'''
class_id_pred = self.pred_format['class_id']
conf_pred = self.pred_format['conf']
xmin_pred = self.pred_format['xmin']
ymin_pred = self.pred_format['ymin']
xmax_pred = self.pred_format['xmax']
ymax_pred = self.pred_format['ymax']
#############################################################################################
# Configure the data generator for the evaluation.
#############################################################################################
convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width, labels_format=self.gt_format)
if data_generator_mode == 'resize':
transformations = [convert_to_3_channels,
resize]
elif data_generator_mode == 'pad':
random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, labels_format=self.gt_format)
transformations = [convert_to_3_channels,
random_pad,
resize]
else:
raise ValueError("`data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode))
# Set the generator parameters.
generator = self.data_generator.generate(batch_size=batch_size,
shuffle=False,
transformations=transformations,
label_encoder=None,
returns={'processed_images',
'image_ids',
'evaluation-neutral',
'inverse_transform',
'original_labels'},
keep_images_without_gt=True,
degenerate_box_handling='remove')
# If we don't have any real image IDs, generate pseudo-image IDs.
# This is just to make the evaluator compatible both with datasets that do and don't
# have image IDs.
if self.data_generator.image_ids is None:
self.data_generator.image_ids = list(range(self.data_generator.get_dataset_size()))
#############################################################################################
# Predict over all batches of the dataset and store the predictions.
#############################################################################################
# We have to generate a separate results list for each class.
results = [list() for _ in range(self.n_classes + 1)]
# Create a dictionary that maps image IDs to ground truth annotations.
# We'll need it below.
image_ids_to_labels = {}
# Compute the number of batches to iterate over the entire dataset.
n_images = self.data_generator.get_dataset_size()
n_batches = int(ceil(n_images / batch_size))
if verbose:
print("Number of images in the evaluation dataset: {}".format(n_images))
print()
tr = trange(n_batches, file=sys.stdout)
tr.set_description('Producing predictions batch-wise')
else:
tr = range(n_batches)
# Loop over all batches.
for j in tr:
# Generate batch.
batch_X, batch_image_ids, batch_eval_neutral, batch_inverse_transforms, batch_orig_labels = next(generator)
# Predict.
y_pred = self.model.predict(batch_X)
# If the model was created in 'training' mode, the raw predictions need to
# be decoded and filtered, otherwise that's already taken care of.
if self.model_mode == 'training':
# Decode.
y_pred = decode_detections(y_pred,
confidence_thresh=decoding_confidence_thresh,
iou_threshold=decoding_iou_threshold,
top_k=decoding_top_k,
input_coords=decoding_pred_coords,
normalize_coords=decoding_normalize_coords,
img_height=img_height,
img_width=img_width,
border_pixels=decoding_border_pixels)
else:
# Filter out the all-zeros dummy elements of `y_pred`.
y_pred_filtered = []
for i in range(len(y_pred)):
y_pred_filtered.append(y_pred[i][y_pred[i,:,0] != 0])
y_pred = y_pred_filtered
# Convert the predicted box coordinates back to the coordinate system of the original images.
y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms)
# Iterate over all batch items.
for k, batch_item in enumerate(y_pred):
image_id = batch_image_ids[k]
for box in batch_item:
class_id = int(box[class_id_pred])
# Optionally round the confidence and round the box coordinates to reduce the required memory.
if round_confidences:
confidence = round(box[conf_pred], round_confidences)
else:
confidence = box[conf_pred]
xmin = round(box[xmin_pred], 1)
ymin = round(box[ymin_pred], 1)
xmax = round(box[xmax_pred], 1)
ymax = round(box[ymax_pred], 1)
prediction = (image_id, confidence, xmin, ymin, xmax, ymax)
# Append the predicted box to the results list for its class.
results[class_id].append(prediction)
self.prediction_results = results
if ret:
return results
def write_predictions_to_txt(self,
classes=None,
out_file_prefix='comp3_det_test_',
verbose=True):
'''
Writes the predictions for all classes to separate text files according to the Pascal VOC results format.
Arguments:
classes (list, optional): `None` or a list of strings containing the class names of all classes in the dataset,
including some arbitrary name for the background class. This list will be used to name the output text files.
The ordering of the names in the list represents the ordering of the classes as they are predicted by the model,
i.e. the element with index 3 in this list should correspond to the class with class ID 3 in the model's predictions.
If `None`, the output text files will be named by their class IDs.
out_file_prefix (str, optional): A prefix for the output text file names. The suffix to each output text file name will
be the respective class name followed by the `.txt` file extension. This string is also how you specify the directory
in which the results are to be saved.
verbose (bool, optional): If `True`, will print out the progress during runtime.
Returns:
None.
'''
if self.prediction_results is None:
raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.")
# We generate a separate results file for each class.
for class_id in range(1, self.n_classes + 1):
if verbose:
print("Writing results file for class {}/{}.".format(class_id, self.n_classes))
if classes is None:
class_suffix = '{:04d}'.format(class_id)
else:
class_suffix = classes[class_id]
results_file = open('{}{}.txt'.format(out_file_prefix, class_suffix), 'w')
for prediction in self.prediction_results[class_id]:
prediction_list = list(prediction)
prediction_list[0] = '{:06d}'.format(int(prediction_list[0]))
prediction_list[1] = round(prediction_list[1], 4)
prediction_txt = ' '.join(map(str, prediction_list)) + '\n'
results_file.write(prediction_txt)
results_file.close()
if verbose:
print("All results files saved.")
def get_num_gt_per_class(self,
ignore_neutral_boxes=True,
verbose=True,
ret=False):
'''
Counts the number of ground truth boxes for each class across the dataset.
Arguments:
ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth
bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these
annotations. If `True`, only non-neutral ground truth boxes will be counted, otherwise all ground truth boxes will
be counted.
verbose (bool, optional): If `True`, will print out the progress during runtime.
ret (bool, optional): If `True`, returns the list of counts.
Returns:
None by default. Optionally, a list containing a count of the number of ground truth boxes for each class across the
entire dataset.
'''
if self.data_generator.labels is None:
raise ValueError("Computing the number of ground truth boxes per class not possible, no ground truth given.")
num_gt_per_class = np.zeros(shape=(self.n_classes+1), dtype=int)
class_id_index = self.gt_format['class_id']
ground_truth = self.data_generator.labels
if verbose:
print('Computing the number of positive ground truth boxes per class.')
tr = trange(len(ground_truth), file=sys.stdout)
else:
tr = range(len(ground_truth))
# Iterate over the ground truth for all images in the dataset.
for i in tr:
boxes = np.asarray(ground_truth[i])
# Iterate over all ground truth boxes for the current image.
for j in range(boxes.shape[0]):
if ignore_neutral_boxes and not (self.data_generator.eval_neutral is None):
if not self.data_generator.eval_neutral[i][j]:
# If this box is not supposed to be evaluation-neutral,
# increment the counter for the respective class ID.
class_id = boxes[j, class_id_index]
num_gt_per_class[class_id] += 1
else:
# If there is no such thing as evaluation-neutral boxes for
# our dataset, always increment the counter for the respective
# class ID.
class_id = boxes[j, class_id_index]
num_gt_per_class[class_id] += 1
self.num_gt_per_class = num_gt_per_class
if ret:
return num_gt_per_class
def match_predictions(self,
ignore_neutral_boxes=True,
matching_iou_threshold=0.5,
border_pixels='include',
sorting_algorithm='quicksort',
verbose=True,
ret=False):
'''
Matches predictions to ground truth boxes.
Note that `predict_on_dataset()` must be called before calling this method.
Arguments:
ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth
bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these
annotations. If `False`, even boxes that are annotated as neutral will be counted into the evaluation. If `True`,
neutral boxes will be ignored for the evaluation. Examples of evaluation-neutral boxes are the ground truth boxes
annotated as "difficult" in the Pascal VOC datasets, which are usually treated as neutral for the evaluation.
matching_iou_threshold (float, optional): A prediction will be considered a true positive if it has a Jaccard overlap
of at least `matching_iou_threshold` with any ground truth bounding box of the same class.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belongs
to the boxes, but not the other.
sorting_algorithm (str, optional): Which sorting algorithm the matching algorithm should use. This argument accepts
any valid sorting algorithm for Numpy's `argsort()` function. You will usually want to choose between 'quicksort'
(fastest and most memory efficient, but not stable) and 'mergesort' (slightly slower and less memory efficient, but stable).
The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm is only guaranteed
to behave identically if you choose 'mergesort' as the sorting algorithm, but it will almost always behave identically
even if you choose 'quicksort' (but no guarantees).
verbose (bool, optional): If `True`, will print out the progress during runtime.
ret (bool, optional): If `True`, returns the true and false positives.
Returns:
None by default. Optionally, four nested lists containing the true positives, false positives, cumulative true positives,
and cumulative false positives for each class.
'''
if self.data_generator.labels is None:
raise ValueError("Matching predictions to ground truth boxes not possible, no ground truth given.")
if self.prediction_results is None:
raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.")
class_id_gt = self.gt_format['class_id']
xmin_gt = self.gt_format['xmin']
ymin_gt = self.gt_format['ymin']
xmax_gt = self.gt_format['xmax']
ymax_gt = self.gt_format['ymax']
# Convert the ground truth to a more efficient format for what we need
# to do, which is access ground truth by image ID repeatedly.
ground_truth = {}
eval_neutral_available = not (self.data_generator.eval_neutral is None) # Whether or not we have annotations to decide whether ground truth boxes should be neutral or not.
for i in range(len(self.data_generator.image_ids)):
image_id = str(self.data_generator.image_ids[i])
labels = self.data_generator.labels[i]
if ignore_neutral_boxes and eval_neutral_available:
ground_truth[image_id] = (np.asarray(labels), np.asarray(self.data_generator.eval_neutral[i]))
else:
ground_truth[image_id] = np.asarray(labels)
true_positives = [[]] # The true positives for each class, sorted by descending confidence.
false_positives = [[]] # The false positives for each class, sorted by descending confidence.
cumulative_true_positives = [[]]
cumulative_false_positives = [[]]
# Iterate over all classes.
for class_id in range(1, self.n_classes + 1):
predictions = self.prediction_results[class_id]
# Store the matching results in these lists:
true_pos = np.zeros(len(predictions), dtype=int) # 1 for every prediction that is a true positive, 0 otherwise
false_pos = np.zeros(len(predictions), dtype=int) # 1 for every prediction that is a false positive, 0 otherwise
# In case there are no predictions at all for this class, we're done here.
if len(predictions) == 0:
print("No predictions for class {}/{}".format(class_id, self.n_classes))
true_positives.append(true_pos)
false_positives.append(false_pos)
continue
# Convert the predictions list for this class into a structured array so that we can sort it by confidence.
# Get the number of characters needed to store the image ID strings in the structured array.
num_chars_per_image_id = len(str(predictions[0][0])) + 6 # Keep a few characters buffer in case some image IDs are longer than others.
# Create the data type for the structured array.
preds_data_type = np.dtype([('image_id', 'U{}'.format(num_chars_per_image_id)),
('confidence', 'f4'),
('xmin', 'f4'),
('ymin', 'f4'),
('xmax', 'f4'),
('ymax', 'f4')])
# Create the structured array
predictions = np.array(predictions, dtype=preds_data_type)
# Sort the detections by decreasing confidence.
descending_indices = np.argsort(-predictions['confidence'], kind=sorting_algorithm)
predictions_sorted = predictions[descending_indices]
if verbose:
tr = trange(len(predictions), file=sys.stdout)
tr.set_description("Matching predictions to ground truth, class {}/{}.".format(class_id, self.n_classes))
else:
tr = range(len(predictions))
# Keep track of which ground truth boxes were already matched to a detection.
gt_matched = {}
# Iterate over all predictions.
for i in tr:
prediction = predictions_sorted[i]
image_id = prediction['image_id']
pred_box = np.asarray(list(prediction[['xmin', 'ymin', 'xmax', 'ymax']])) # Convert the structured array element to a regular array.
# Get the relevant ground truth boxes for this prediction,
# i.e. all ground truth boxes that match the prediction's
# image ID and class ID.
# The ground truth could either be a tuple with `(ground_truth_boxes, eval_neutral_boxes)`
# or only `ground_truth_boxes`.
if ignore_neutral_boxes and eval_neutral_available:
gt, eval_neutral = ground_truth[image_id]
else:
gt = ground_truth[image_id]
gt = np.asarray(gt)
class_mask = gt[:,class_id_gt] == class_id
gt = gt[class_mask]
if ignore_neutral_boxes and eval_neutral_available:
eval_neutral = eval_neutral[class_mask]
if gt.size == 0:
# If the image doesn't contain any objects of this class,
# the prediction becomes a false positive.
false_pos[i] = 1
continue
# Compute the IoU of this prediction with all ground truth boxes of the same class.
overlaps = iou(boxes1=gt[:,[xmin_gt, ymin_gt, xmax_gt, ymax_gt]],
boxes2=pred_box,
coords='corners',
mode='element-wise',
border_pixels=border_pixels)
# For each detection, match the ground truth box with the highest overlap.
# It's possible that the same ground truth box will be matched to multiple
# detections.
gt_match_index = np.argmax(overlaps)
gt_match_overlap = overlaps[gt_match_index]
if gt_match_overlap < matching_iou_threshold:
# False positive, IoU threshold violated:
# Those predictions whose matched overlap is below the threshold become
# false positives.
false_pos[i] = 1
else:
if not (ignore_neutral_boxes and eval_neutral_available) or (eval_neutral[gt_match_index] == False):
# If this is not a ground truth that is supposed to be evaluation-neutral
# (i.e. should be skipped for the evaluation) or if we don't even have the
# concept of neutral boxes.
if not (image_id in gt_matched):
# True positive:
# If the matched ground truth box for this prediction hasn't been matched to a
# different prediction already, we have a true positive.
true_pos[i] = 1
gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=bool)
gt_matched[image_id][gt_match_index] = True
elif not gt_matched[image_id][gt_match_index]:
# True positive:
# If the matched ground truth box for this prediction hasn't been matched to a
# different prediction already, we have a true positive.
true_pos[i] = 1
gt_matched[image_id][gt_match_index] = True
else:
# False positive, duplicate detection:
# If the matched ground truth box for this prediction has already been matched
# to a different prediction previously, it is a duplicate detection for an
# already detected object, which counts as a false positive.
false_pos[i] = 1
true_positives.append(true_pos)
false_positives.append(false_pos)
cumulative_true_pos = np.cumsum(true_pos) # Cumulative sums of the true positives
cumulative_false_pos = np.cumsum(false_pos) # Cumulative sums of the false positives
cumulative_true_positives.append(cumulative_true_pos)
cumulative_false_positives.append(cumulative_false_pos)
self.true_positives = true_positives
self.false_positives = false_positives
self.cumulative_true_positives = cumulative_true_positives
self.cumulative_false_positives = cumulative_false_positives
if ret:
return true_positives, false_positives, cumulative_true_positives, cumulative_false_positives
def compute_precision_recall(self, verbose=True, ret=False):
'''
Computes the precisions and recalls for all classes.
Note that `match_predictions()` must be called before calling this method.
Arguments:
verbose (bool, optional): If `True`, will print out the progress during runtime.
ret (bool, optional): If `True`, returns the precisions and recalls.
Returns:
None by default. Optionally, two nested lists containing the cumulative precisions and recalls for each class.
'''
if (self.cumulative_true_positives is None) or (self.cumulative_false_positives is None):
raise ValueError("True and false positives not available. You must run `match_predictions()` before you call this method.")
if (self.num_gt_per_class is None):
raise ValueError("Number of ground truth boxes per class not available. You must run `get_num_gt_per_class()` before you call this method.")
cumulative_precisions = [[]]
cumulative_recalls = [[]]
# Iterate over all classes.
for class_id in range(1, self.n_classes + 1):
if verbose:
print("Computing precisions and recalls, class {}/{}".format(class_id, self.n_classes))
tp = self.cumulative_true_positives[class_id]
fp = self.cumulative_false_positives[class_id]
cumulative_precision = np.where(tp + fp > 0, tp / (tp + fp), 0) # 1D array with shape `(num_predictions,)`
cumulative_recall = tp / self.num_gt_per_class[class_id] # 1D array with shape `(num_predictions,)`
cumulative_precisions.append(cumulative_precision)
cumulative_recalls.append(cumulative_recall)
self.cumulative_precisions = cumulative_precisions
self.cumulative_recalls = cumulative_recalls
if ret:
return cumulative_precisions, cumulative_recalls
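As a toy illustration of the arithmetic above (made-up match results, assuming three ground truth boxes for the class):

# Toy numbers, not from a real evaluation: five predictions sorted by descending
# confidence, matched as TP, TP, FP, TP, FP, against 3 ground truth boxes.
tp = np.cumsum([1, 1, 0, 1, 0])  # cumulative true positives:  [1, 2, 2, 3, 3]
fp = np.cumsum([0, 0, 1, 0, 1])  # cumulative false positives: [0, 0, 1, 1, 2]
precision = np.where(tp + fp > 0, tp / (tp + fp), 0)  # [1.0, 1.0, 0.667, 0.75, 0.6]
recall = tp / 3                                       # [0.333, 0.667, 0.667, 1.0, 1.0]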
def compute_average_precisions(self, mode='sample', num_recall_points=11, verbose=True, ret=False):
'''
Computes the average precision for each class.
Can compute the Pascal-VOC-style average precision in both the pre-2010 (k-point sampling)
and post-2010 (integration) algorithm versions.
Note that `compute_precision_recall()` must be called before calling this method.
Arguments:
mode (str, optional): Can be either 'sample' or 'integrate'. In the case of 'sample', the average precision will be computed
according to the Pascal VOC formula that was used up until VOC 2009, where the precision will be sampled for `num_recall_points`
recall values. In the case of 'integrate', the average precision will be computed according to the Pascal VOC formula that
was used from VOC 2010 onward, where the average precision will be computed by numerically integrating over the whole
preciscion-recall curve instead of sampling individual points from it. 'integrate' mode is basically just the limit case
of 'sample' mode as the number of sample points increases. For details, see the references below.
num_recall_points (int, optional): Only relevant if mode is 'sample'. The number of points to sample from the precision-recall-curve
to compute the average precisions. In other words, this is the number of equidistant recall values for which the resulting
precision will be computed. 11 points is the value used in the official Pascal VOC pre-2010 detection evaluation algorithm.
verbose (bool, optional): If `True`, will print out the progress during runtime.
ret (bool, optional): If `True`, returns the average precisions.
Returns:
None by default. Optionally, a list containing average precision for each class.
References:
http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/devkit_doc.html#sec:ap
'''
if (self.cumulative_precisions is None) or (self.cumulative_recalls is None):
raise ValueError("Precisions and recalls not available. You must run `compute_precision_recall()` before you call this method.")
if not (mode in {'sample', 'integrate'}):
raise ValueError("`mode` can be either 'sample' or 'integrate', but received '{}'".format(mode))
average_precisions = [0.0]
# Iterate over all classes.
for class_id in range(1, self.n_classes + 1):
if verbose:
print("Computing average precision, class {}/{}".format(class_id, self.n_classes))
cumulative_precision = self.cumulative_precisions[class_id]
cumulative_recall = self.cumulative_recalls[class_id]
average_precision = 0.0
if mode == 'sample':
for t in np.linspace(start=0, stop=1, num=num_recall_points, endpoint=True):
cum_prec_recall_greater_t = cumulative_precision[cumulative_recall >= t]
if cum_prec_recall_greater_t.size == 0:
precision = 0.0
else:
precision = np.amax(cum_prec_recall_greater_t)
average_precision += precision
average_precision /= num_recall_points
elif mode == 'integrate':
# We will compute the precision at all unique recall values.
unique_recalls, unique_recall_indices, unique_recall_counts = np.unique(cumulative_recall, return_index=True, return_counts=True)
# Store the maximal precision for each recall value and the absolute difference
# between any two unique recall values in the lists below. The products of these
# two numbers constitute the rectangular areas whose sum will be our numerical
# integral.
maximal_precisions = np.zeros_like(unique_recalls)
recall_deltas = np.zeros_like(unique_recalls)
# Iterate over all unique recall values in reverse order. This saves a lot of computation:
# For each unique recall value `r`, we want to get the maximal precision value obtained
# for any recall value `r* >= r`. Once we know the maximal precision for the last `k` recall
# values after a given iteration, then in the next iteration, in order to compute the maximal
# precisions for the last `l > k` recall values, we only need to compute the maximal precision
# for `l - k` recall values and then take the maximum between that and the previously computed
# maximum instead of computing the maximum over all `l` values.
# We skip the very last recall value, since the precision between the last recall value
# and recall 1.0 is defined to be zero.
for i in range(len(unique_recalls)-2, -1, -1):
begin = unique_recall_indices[i]
end = unique_recall_indices[i + 1]
# When computing the maximal precisions, use the maximum of the previous iteration to
# avoid unnecessary repeated computation over the same precision values.
# The maximal precisions are the heights of the rectangle areas of our integral under
# the precision-recall curve.
maximal_precisions[i] = np.maximum(np.amax(cumulative_precision[begin:end]), maximal_precisions[i + 1])
# The differences between two adjacent recall values are the widths of our rectangle areas.
recall_deltas[i] = unique_recalls[i + 1] - unique_recalls[i]
average_precision = np.sum(maximal_precisions * recall_deltas)
average_precisions.append(average_precision)
self.average_precisions = average_precisions
if ret:
return average_precisions
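As a sanity check of the 'sample' mode, the 11-point rule can be applied by hand to the toy `precision` and `recall` arrays from the previous sketch (toy numbers, not from a real model):

# 11-point sampled AP: for each of 11 equidistant recall thresholds, take the
# highest precision achieved at a recall >= t, then average over the thresholds.
ap_11_point = 0.0
for t in np.linspace(0, 1, num=11):
    candidates = precision[recall >= t]
    ap_11_point += np.amax(candidates) if candidates.size > 0 else 0.0
ap_11_point /= 11  # roughly 0.909 for the toy arrays above

'integrate' mode instead sums rectangles whose heights are the maximal precisions and whose widths are the gaps between consecutive unique recall values, as implemented above.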
def compute_mean_average_precision(self, ret=True):
'''
Computes the mean average precision over all classes.
Note that `compute_average_precisions()` must be called before calling this method.
Arguments:
ret (bool, optional): If `True`, returns the mean average precision.
Returns:
By default, a float, the mean average precision. If `ret` is `False`, returns `None`.
'''
if self.average_precisions is None:
raise ValueError("Average precisions not available. You must run `compute_average_precisions()` before you call this method.")
mean_average_precision = np.average(self.average_precisions[1:]) # The first element is for the background class, so skip it.
self.mean_average_precision = mean_average_precision
if ret:
return mean_average_precision
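The same evaluation can also be driven step by step, which is useful for inspecting intermediate results such as the per-class true positives or precision-recall curves. As in the earlier sketch, `ssd_model` and `val_dataset` are hypothetical placeholders:

# Hypothetical objects: `ssd_model` and `val_dataset` must have been created beforehand.
evaluator = Evaluator(model=ssd_model, n_classes=20,
                      data_generator=val_dataset, model_mode='inference')
evaluator.predict_on_dataset(img_height=300, img_width=300, batch_size=8)
evaluator.get_num_gt_per_class()
evaluator.match_predictions(matching_iou_threshold=0.5)
evaluator.compute_precision_recall()
evaluator.compute_average_precisions(mode='integrate')
mean_ap = evaluator.compute_mean_average_precision()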

View File

@@ -0,0 +1,200 @@
'''
A few utilities that are useful when working with the MS COCO datasets.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import json
from tqdm import trange
from math import ceil
import sys
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
from ssd_encoder_decoder.ssd_output_decoder import decode_detections
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
def get_coco_category_maps(annotations_file):
'''
Builds dictionaries that map between MS COCO category IDs, transformed category IDs, and category names.
The original MS COCO category IDs are not consecutive unfortunately: The 80 category IDs are spread
across the integers 1 through 90 with some integers skipped. Since we usually use a one-hot
class representation in neural networks, we need to map these non-consecutive original COCO category
IDs (let's call them 'cats') to consecutive category IDs (let's call them 'classes').
Arguments:
annotations_file (str): The filepath to any MS COCO annotations JSON file.
Returns:
1) cats_to_classes: A dictionary that maps between the original (keys) and the transformed category IDs (values).
2) classes_to_cats: A dictionary that maps between the transformed (keys) and the original category IDs (values).
3) cats_to_names: A dictionary that maps between original category IDs (keys) and the respective category names (values).
4) classes_to_names: A list of the category names (values) with their indices representing the transformed IDs.
'''
with open(annotations_file, 'r') as f:
annotations = json.load(f)
cats_to_classes = {}
classes_to_cats = {}
cats_to_names = {}
classes_to_names = []
classes_to_names.append('background') # Need to add the background class first so that the indexing is right.
for i, cat in enumerate(annotations['categories']):
cats_to_classes[cat['id']] = i + 1
classes_to_cats[i + 1] = cat['id']
cats_to_names[cat['id']] = cat['name']
classes_to_names.append(cat['name'])
return cats_to_classes, classes_to_cats, cats_to_names, classes_to_names
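A short usage sketch; the annotations path below is only an illustrative placeholder for wherever the MS COCO JSON files are stored:

# 'annotations/instances_val2017.json' is an illustrative path, not a fixed location.
cats_to_classes, classes_to_cats, cats_to_names, classes_to_names = get_coco_category_maps('annotations/instances_val2017.json')
# classes_to_names[1] is then the name of the category that the model predicts as class ID 1.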
def predict_all_to_json(out_file,
model,
img_height,
img_width,
classes_to_cats,
data_generator,
batch_size,
data_generator_mode='resize',
model_mode='training',
confidence_thresh=0.01,
iou_threshold=0.45,
top_k=200,
pred_coords='centroids',
normalize_coords=True):
'''
Runs detection predictions over the whole dataset given a model and saves them in a JSON file
in the MS COCO detection results format.
Arguments:
out_file (str): The file name (full path) under which to save the results JSON file.
model (Keras model): A Keras SSD model object.
img_height (int): The input image height for the model.
img_width (int): The input image width for the model.
classes_to_cats (dict): A dictionary that maps the consecutive class IDs predicted by the model
to the non-consecutive original MS COCO category IDs.
data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset.
batch_size (int): The batch size for the evaluation.
data_generator_mode (str, optional): Either of 'resize' or 'pad'. If 'resize', the input images will
be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'.
This is needed in order to know whether the model output is already decoded or still needs to be decoded. Refer to
the model documentation for the meaning of the individual modes.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
positive class in order to be considered for the non-maximum suppression stage for the respective class.
A lower value will result in a larger part of the selection process being done by the non-maximum suppression
stage, while a larger value will result in a larger part of the selection process happening in the confidence
thresholding stage.
iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
to the box score.
top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
non-maximum suppression stage. Defaults to 200, following the paper.
pred_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
Returns:
None.
'''
convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width)
if data_generator_mode == 'resize':
transformations = [convert_to_3_channels,
resize]
elif data_generator_mode == 'pad':
random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, clip_boxes=False)
transformations = [convert_to_3_channels,
random_pad,
resize]
else:
raise ValueError("Unexpected argument value: `data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode))
# Set the generator parameters.
generator = data_generator.generate(batch_size=batch_size,
shuffle=False,
transformations=transformations,
label_encoder=None,
returns={'processed_images',
'image_ids',
'inverse_transform'},
keep_images_without_gt=True)
# Put the results in this list.
results = []
# Compute the number of batches to iterate over the entire dataset.
n_images = data_generator.get_dataset_size()
print("Number of images in the evaluation dataset: {}".format(n_images))
n_batches = int(ceil(n_images / batch_size))
# Loop over all batches.
tr = trange(n_batches, file=sys.stdout)
tr.set_description('Producing results file')
for i in tr:
# Generate batch.
batch_X, batch_image_ids, batch_inverse_transforms = next(generator)
# Predict.
y_pred = model.predict(batch_X)
# If the model was created in 'training' mode, the raw predictions need to
# be decoded and filtered, otherwise that's already taken care of.
if model_mode == 'training':
# Decode.
y_pred = decode_detections(y_pred,
confidence_thresh=confidence_thresh,
iou_threshold=iou_threshold,
top_k=top_k,
input_coords=pred_coords,
normalize_coords=normalize_coords,
img_height=img_height,
img_width=img_width)
else:
# Filter out the all-zeros dummy elements of `y_pred`.
y_pred_filtered = []
for j in range(len(y_pred)):
y_pred_filtered.append(y_pred[j][y_pred[j,:,0] != 0])
y_pred = y_pred_filtered
# Convert the predicted box coordinates back to the coordinate system of the original images.
y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms)
# Convert each predicted box into the results format.
for k, batch_item in enumerate(y_pred):
for box in batch_item:
class_id = box[0]
# Transform the consecutive class IDs back to the original COCO category IDs.
cat_id = classes_to_cats[class_id]
# Round the box coordinates to reduce the JSON file size.
xmin = float(round(box[2], 1))
ymin = float(round(box[3], 1))
xmax = float(round(box[4], 1))
ymax = float(round(box[5], 1))
width = xmax - xmin
height = ymax - ymin
bbox = [xmin, ymin, width, height]
result = {}
result['image_id'] = batch_image_ids[k]
result['category_id'] = cat_id
result['score'] = float(round(box[1], 3))
result['bbox'] = bbox
results.append(result)
with open(out_file, 'w') as f:
json.dump(results, f)
print("Prediction results saved in '{}'".format(out_file))