DeepLearningExamples/TensorFlow/Classification/RN50v1.5/utils/image_processing.py
Przemek Strzelczyk 11f6038cd8 Updating RN50/TF
2019-10-21 19:49:00 +02:00

173 lines
6.1 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ==============================================================================
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
_RESIZE_MIN = 256
_DEFAULT_IMAGE_SIZE = 224
__all__ = ['preprocess_image_record', 'preprocess_image_file']
def _deserialize_image_record(record):
feature_map = {
'image/encoded': tf.FixedLenFeature([], tf.string, ''),
'image/class/label': tf.FixedLenFeature([], tf.int64, -1),
'image/class/text': tf.FixedLenFeature([], tf.string, ''),
'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32)
}
with tf.name_scope('deserialize_image_record'):
obj = tf.parse_single_example(record, feature_map)
imgdata = obj['image/encoded']
label = tf.cast(obj['image/class/label'], tf.int32)
bbox = tf.stack([obj['image/object/bbox/%s' % x].values for x in ['ymin', 'xmin', 'ymax', 'xmax']])
bbox = tf.transpose(tf.expand_dims(bbox, 0), [0, 2, 1])
text = obj['image/class/text']
return imgdata, label, bbox, text
def _decode_jpeg(imgdata, channels=3):
return tf.image.decode_jpeg(imgdata, channels=channels, fancy_upscaling=False, dct_method='INTEGER_FAST')
def _crop_and_filp(image, bbox, num_channels):
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.shape(image),
bounding_boxes=bbox,
min_object_covered=0.1,
aspect_ratio_range=[0.75, 1.33],
area_range=[0.05, 1.0],
max_attempts=100,
use_image_if_no_bounding_boxes=True
)
bbox_begin, bbox_size, _ = sample_distorted_bounding_box
offset_y, offset_x, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
cropped = tf.image.crop_to_bounding_box(image, offset_y, offset_x, target_height, target_width)
cropped = tf.image.random_flip_left_right(cropped)
return cropped
def _central_crop(image, crop_height, crop_width):
shape = tf.shape(image)
height, width = shape[0], shape[1]
amount_to_be_cropped_h = (height - crop_height)
crop_top = amount_to_be_cropped_h // 2
amount_to_be_cropped_w = (width - crop_width)
crop_left = amount_to_be_cropped_w // 2
return tf.slice(image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
def _smallest_size_at_least(height, width, resize_min):
resize_min = tf.cast(resize_min, tf.float32)
# Convert to floats to make subsequent calculations go smoothly.
height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
smaller_dim = tf.minimum(height, width)
scale_ratio = resize_min / smaller_dim
# Convert back to ints to make heights and widths that TF ops will accept.
new_height = tf.cast(height * scale_ratio, tf.int32)
new_width = tf.cast(width * scale_ratio, tf.int32)
return new_height, new_width
def _aspect_preserving_resize(image, resize_min):
"""Resize images preserving the original aspect ratio.
Args:
image: A 3-D image `Tensor`.
resize_min: A python integer or scalar `Tensor` indicating the size of
the smallest side after resize.
Returns:
resized_image: A 3-D tensor containing the resized image.
"""
shape = tf.shape(image)
height, width = shape[0], shape[1]
new_height, new_width = _smallest_size_at_least(height, width, resize_min)
return _resize_image(image, new_height, new_width)
def _resize_image(image, height, width):
"""Simple wrapper around tf.resize_images.
This is primarily to make sure we use the same `ResizeMethod` and other
details each time.
Args:
image: A 3-D image `Tensor`.
height: The target height for the resized image.
width: The target width for the resized image.
Returns:
resized_image: A 3-D tensor containing the resized image. The first two
dimensions have the shape [height, width].
"""
return tf.image.resize_images(image, [height, width], method=tf.image.ResizeMethod.BILINEAR, align_corners=False)
def preprocess_image_record(record, height, width, num_channels, is_training=False):
imgdata, label, bbox, text = _deserialize_image_record(record)
label -= 1
try:
image = _decode_jpeg(imgdata, channels=3)
except:
image = tf.image.decode_image(imgdata, channels=3)
if is_training:
# For training, we want to randomize some of the distortions.
image = _crop_and_filp(image, bbox, num_channels)
image = _resize_image(image, height, width)
else:
image = _aspect_preserving_resize(image, _RESIZE_MIN)
image = _central_crop(image, height, width)
return image, label
def preprocess_image_file(filename, height, width, num_channels, is_training=False):
imgdata = tf.read_file(filename)
try:
image = _decode_jpeg(imgdata, channels=3)
except:
image = tf.image.decode_image(imgdata, channels=3)
if is_training:
# For training, we want to randomize some of the distortions.
image = _crop_and_filp(image, bbox, num_channels)
image = _resize_image(image, height, width)
else:
image = _aspect_preserving_resize(image, _RESIZE_MIN)
image = _central_crop(image, height, width)
return image, filename