Files
dl-desktop b586f22bf0 Summary
2020-02-06 16:47:03 -03:00

201 lines
11 KiB
Python

'''
A few utilities that are useful when working with the MS COCO datasets.
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import json
from tqdm import trange
from math import ceil
import sys
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
from ssd_encoder_decoder.ssd_output_decoder import decode_detections
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
def get_coco_category_maps(annotations_file):
    '''
    Derives the lookup tables needed to translate between MS COCO category IDs,
    consecutive class IDs, and category names.

    The category IDs in an MS COCO annotations file are not necessarily consecutive,
    while a one-hot class representation needs consecutive integers. This function
    therefore numbers the categories in the order in which they are listed in the
    file, starting at 1. Class ID 0 is reserved for the background class.

    Arguments:
        annotations_file (str): The filepath to an MS COCO annotations JSON file. The file
            must contain a top-level 'categories' list whose entries have 'id' and 'name' keys.

    Returns:
        1) cats_to_classes: A dictionary that maps original category IDs (keys) to consecutive class IDs (values).
        2) classes_to_cats: A dictionary that maps consecutive class IDs (keys) to original category IDs (values).
        3) cats_to_names: A dictionary that maps original category IDs (keys) to category names (values).
        4) classes_to_names: A list of category names whose indices are the consecutive class IDs;
            index 0 holds 'background'.
    '''
    with open(annotations_file, 'r') as json_file:
        categories = json.load(json_file)['categories']

    # Pair every category with its consecutive class ID; 0 is kept free for the background.
    numbered = list(enumerate(categories, start=1))

    cats_to_classes = {cat['id']: class_id for class_id, cat in numbered}
    classes_to_cats = {class_id: cat['id'] for class_id, cat in numbered}
    cats_to_names = {cat['id']: cat['name'] for cat in categories}
    # The background entry goes first so that list indices line up with the class IDs.
    classes_to_names = ['background'] + [cat['name'] for cat in categories]

    return cats_to_classes, classes_to_cats, cats_to_names, classes_to_names
def predict_all_to_json(out_file,
                        model,
                        img_height,
                        img_width,
                        classes_to_cats,
                        data_generator,
                        batch_size,
                        data_generator_mode='resize',
                        model_mode='training',
                        confidence_thresh=0.01,
                        iou_threshold=0.45,
                        top_k=200,
                        pred_coords='centroids',
                        normalize_coords=True):
    '''
    Runs detection predictions over the whole dataset given a model and saves them in a JSON file
    in the MS COCO detection results format.

    Arguments:
        out_file (str): The file name (full path) under which to save the results JSON file.
        model (Keras model): A Keras SSD model object.
        img_height (int): The input image height for the model.
        img_width (int): The input image width for the model.
        classes_to_cats (dict): A dictionary that maps the consecutive class IDs predicted by the model
            to the non-consecutive original MS COCO category IDs.
        data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset.
        batch_size (int): The batch size for the evaluation.
        data_generator_mode (str, optional): Either of 'resize' or 'pad'. If 'resize', the input images will
            be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
            If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
            and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
        model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'.
            This is needed in order to know whether the model output is already decoded or still needs to be decoded.
            Only 'training' triggers decoding here; any other value is treated as already decoded output from which
            merely the all-zeros dummy rows are removed. Refer to the model documentation for the meaning of the
            individual modes.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class in order to be considered for the non-maximum suppression stage for the respective class.
            A lower value will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in the confidence
            thresholding stage. Only used if `model_mode` is 'training'.
        iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box score. Only used if `model_mode` is 'training'.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage. Defaults to 200, following the paper. Only used if `model_mode` is 'training'.
        pred_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
            for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. It is passed to the
            decoder as its `input_coords` argument and is only used if `model_mode` is 'training'.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
            and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
            relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
            Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`. Only used if `model_mode` is 'training'.

    Returns:
        None.

    Raises:
        ValueError: If `data_generator_mode` is neither 'resize' nor 'pad'.
    '''

    # Validate the mode first so that a bad value fails before any transform object is built.
    if data_generator_mode not in ('resize', 'pad'):
        raise ValueError("Unexpected argument value: `data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode))

    convert_to_3_channels = ConvertTo3Channels()
    resize = Resize(height=img_height, width=img_width)
    if data_generator_mode == 'resize':
        transformations = [convert_to_3_channels,
                           resize]
    else:
        # Pad to the target aspect ratio before resizing so that the images are not warped.
        random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, clip_boxes=False)
        transformations = [convert_to_3_channels,
                           random_pad,
                           resize]

    # Set the generator parameters.
    # NOTE(review): `returns` is a set, so the order of the tuple unpacked in the loop below is
    # decided by the generator, not by this literal - confirm it against `DataGenerator.generate()`.
    generator = data_generator.generate(batch_size=batch_size,
                                        shuffle=False,
                                        transformations=transformations,
                                        label_encoder=None,
                                        returns={'processed_images',
                                                 'image_ids',
                                                 'inverse_transform'},
                                        keep_images_without_gt=True)

    # Put the results in this list.
    results = []

    # Compute the number of batches to iterate over the entire dataset.
    n_images = data_generator.get_dataset_size()
    print("Number of images in the evaluation dataset: {}".format(n_images))
    n_batches = int(ceil(n_images / batch_size))

    # Loop over all batches.
    tr = trange(n_batches, file=sys.stdout)
    tr.set_description('Producing results file')
    for _ in tr:
        # Generate batch.
        batch_X, batch_image_ids, batch_inverse_transforms = next(generator)
        # Predict.
        y_pred = model.predict(batch_X)
        # If the model was created in 'training' mode, the raw predictions need to
        # be decoded and filtered, otherwise that's already taken care of.
        if model_mode == 'training':
            # Decode.
            y_pred = decode_detections(y_pred,
                                       confidence_thresh=confidence_thresh,
                                       iou_threshold=iou_threshold,
                                       top_k=top_k,
                                       input_coords=pred_coords,
                                       normalize_coords=normalize_coords,
                                       img_height=img_height,
                                       img_width=img_width)
        else:
            # Filter out the all-zeros dummy elements of `y_pred`, i.e. the rows
            # whose first entry (the class ID) is zero.
            y_pred = [item_pred[item_pred[:, 0] != 0] for item_pred in y_pred]
        # Convert the predicted box coordinates for the original images.
        y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms)

        # Convert each predicted box into the results format.
        # Each box is read as `(class_id, score, xmin, ymin, xmax, ymax)`.
        for k, batch_item in enumerate(y_pred):
            for box in batch_item:
                class_id = box[0]
                # Transform the consecutive class IDs back to the original COCO category IDs.
                cat_id = classes_to_cats[class_id]
                # Round the box coordinates to reduce the JSON file size.
                xmin = float(round(box[2], 1))
                ymin = float(round(box[3], 1))
                xmax = float(round(box[4], 1))
                ymax = float(round(box[5], 1))
                # The results store boxes as `[xmin, ymin, width, height]`.
                width = xmax - xmin
                height = ymax - ymin
                results.append({'image_id': batch_image_ids[k],
                                'category_id': cat_id,
                                'score': float(round(box[1], 3)),
                                'bbox': [xmin, ymin, width, height]})

    with open(out_file, 'w') as f:
        json.dump(results, f)

    print("Prediction results saved in '{}'".format(out_file))