[ConvNets/TF1] Added Triton for ResNet

kkudrynski 2021-04-20 13:50:41 +02:00
parent 7bdfc81d25
commit 169b081827
118 changed files with 100312 additions and 998 deletions

View file

@@ -32,7 +32,7 @@ allow_multiline_lambdas = True
# # <------ this blank line
# def method():
# pass
blank_line_before_nested_class_or_def = True
blank_line_before_nested_class_or_def = False
# Insert a blank line before a module docstring.
blank_line_before_module_docstring = True
@@ -83,7 +83,7 @@ continuation_indent_width = 4
# start_ts=now()-timedelta(days=3),
# end_ts=now(),
# ) # <--- this bracket is dedented and on a separate line
dedent_closing_brackets = True
dedent_closing_brackets = False
# Disable the heuristic which places each list element on a separate line if the list is comma-terminated.
disable_ending_comma_heuristic = false

View file

@@ -1,8 +1,30 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.12-tf1-py3
ARG TRITON_CLIENT_IMAGE_NAME=nvcr.io/nvidia/tritonserver:20.12-py3-sdk
FROM ${TRITON_CLIENT_IMAGE_NAME} as triton-client
FROM ${FROM_IMAGE_NAME}
ADD requirements.txt .
RUN pip install -r requirements.txt
# Install libraries required by perf_client
RUN apt-get update && \
apt-get install -y libb64-dev libb64-0d && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
ADD . /workspace/rn50v15_tf
# Install Triton Client Python API and copy Perf Client
COPY --from=triton-client /workspace/install/ /workspace/install/
ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
RUN find /workspace/install/python/ -iname triton*manylinux*.whl -exec pip install {}[all] \;
# Set up environment variables to access Triton Client lib and bin
ENV PATH /workspace/install/bin:${PATH}
ENV PYTHONPATH /workspace/rn50v15_tf
WORKDIR /workspace/rn50v15_tf
RUN pip uninstall -y typing
ADD requirements.txt .
ADD triton/requirements.txt triton/requirements.txt
RUN pip install -r requirements.txt
RUN pip install -r triton/requirements.txt
ADD . .
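
The Dockerfile above copies the Triton client libraries and Perf Client out of the tritonserver SDK image and installs the client wheel into the TF1 container. As orientation only, here is a minimal sketch of using that client from Python, assuming the wheel exposes the usual tritonclient.http API and that a Triton server is already running at localhost:8000 with a model named "resnet50" loaded (the URL and model name are illustrative assumptions, not part of this commit):

# Readiness check against a running Triton server, using the client
# installed by the Dockerfile above. Server URL and model name are assumed.
import tritonclient.http as triton_http

client = triton_http.InferenceServerClient(url="localhost:8000")
if client.is_server_live() and client.is_server_ready():
    print("Triton server is up")
    print("resnet50 ready:", client.is_model_ready("resnet50"))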

View file

@@ -51,7 +51,7 @@ were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training accuracy results of the
The following table shows the training performance results of the
three classification models side-by-side.
@@ -71,7 +71,7 @@ were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training accuracy results of the
The following table shows the training performance results of the
three classification models side-by-side.

View file

@@ -0,0 +1,436 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Converts image data to TFRecords file format with Example protos.
The image data set is expected to reside in JPEG files located in the
following directory structure.
data_dir/label_0/image0.jpeg
data_dir/label_0/image1.jpg
...
data_dir/label_1/weird-image.jpeg
data_dir/label_1/my-image.jpeg
...
where the sub-directory is the unique label associated with these images.
This TensorFlow script converts the training and evaluation data into
a sharded data set consisting of TFRecord files
train_directory/train-00000-of-01024
train_directory/train-00001-of-01024
...
train_directory/train-01023-of-01024
and
validation_directory/validation-00000-of-00128
validation_directory/validation-00001-of-00128
...
validation_directory/validation-00127-of-00128
where we have selected 1024 and 128 shards for each data set. Each record
within the TFRecord file is a serialized Example proto. The Example proto
contains the following fields:
image/encoded: string containing JPEG encoded image in RGB colorspace
image/height: integer, image height in pixels
image/width: integer, image width in pixels
image/colorspace: string, specifying the colorspace, always 'RGB'
image/channels: integer, specifying the number of channels, always 3
image/format: string, specifying the format, always 'JPEG'
image/filename: string containing the basename of the image file
e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
image/class/label: integer specifying the index in a classification layer.
The label ranges from [0, num_labels] where 0 is unused and left as
the background class.
image/class/text: string specifying the human-readable version of the label
e.g. 'dog'
If your data set involves bounding boxes, please look at build_imagenet_data.py.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os
import random
import sys
import threading
import numpy as np
import tensorflow as tf
tf.app.flags.DEFINE_string('train_directory', '/tmp/',
'Training data directory')
tf.app.flags.DEFINE_string('validation_directory', '/tmp/',
'Validation data directory')
tf.app.flags.DEFINE_string('output_directory', '/tmp/',
'Output data directory')
tf.app.flags.DEFINE_integer('train_shards', 2,
'Number of shards in training TFRecord files.')
tf.app.flags.DEFINE_integer('validation_shards', 2,
'Number of shards in validation TFRecord files.')
tf.app.flags.DEFINE_integer('num_threads', 2,
'Number of threads to preprocess the images.')
# The labels file contains the list of valid labels.
# Assumes that the file contains entries as such:
# dog
# cat
# flower
# where each line corresponds to a label. We map each label contained in
# the file to an integer corresponding to the line number starting from 0.
tf.app.flags.DEFINE_string('labels_file', '', 'Labels file')
FLAGS = tf.app.flags.FLAGS
def _int64_feature(value):
"""Wrapper for inserting int64 features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _bytes_feature(value):
"""Wrapper for inserting bytes features into Example proto."""
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _convert_to_example(filename, image_buffer, label, text, height, width):
"""Build an Example proto for an example.
Args:
filename: string, path to an image file, e.g., '/path/to/example.JPG'
image_buffer: string, JPEG encoding of RGB image
label: integer, identifier for the ground truth for the network
text: string, unique human-readable, e.g. 'dog'
height: integer, image height in pixels
width: integer, image width in pixels
Returns:
Example proto
"""
colorspace = 'RGB'
channels = 3
image_format = 'JPEG'
example = tf.train.Example(features=tf.train.Features(feature={
'image/height': _int64_feature(height),
'image/width': _int64_feature(width),
'image/colorspace': _bytes_feature(tf.compat.as_bytes(colorspace)),
'image/channels': _int64_feature(channels),
'image/class/label': _int64_feature(label),
'image/class/text': _bytes_feature(tf.compat.as_bytes(text)),
'image/format': _bytes_feature(tf.compat.as_bytes(image_format)),
'image/filename': _bytes_feature(tf.compat.as_bytes(os.path.basename(filename))),
'image/encoded': _bytes_feature(tf.compat.as_bytes(image_buffer))}))
return example
class ImageCoder(object):
"""Helper class that provides TensorFlow image coding utilities."""
def __init__(self):
# Create a single Session to run all image coding calls.
self._sess = tf.Session()
# Initializes function that converts PNG to JPEG data.
self._png_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_png(self._png_data, channels=3)
self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that decodes RGB JPEG data.
self._decode_jpeg_data = tf.placeholder(dtype=tf.string)
self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3)
def png_to_jpeg(self, image_data):
return self._sess.run(self._png_to_jpeg,
feed_dict={self._png_data: image_data})
def decode_jpeg(self, image_data):
image = self._sess.run(self._decode_jpeg,
feed_dict={self._decode_jpeg_data: image_data})
assert len(image.shape) == 3
assert image.shape[2] == 3
return image
def _is_png(filename):
"""Determine if a file contains a PNG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a PNG.
"""
return filename.endswith('.png')
def _process_image(filename, coder):
"""Process a single image file.
Args:
filename: string, path to an image file e.g., '/path/to/example.JPG'.
coder: instance of ImageCoder to provide TensorFlow image coding utils.
Returns:
image_buffer: string, JPEG encoding of RGB image.
height: integer, image height in pixels.
width: integer, image width in pixels.
"""
# Read the image file.
with tf.gfile.FastGFile(filename, 'rb') as f:
image_data = f.read()
# Convert any PNG to JPEG for consistency.
if _is_png(filename):
print('Converting PNG to JPEG for %s' % filename)
image_data = coder.png_to_jpeg(image_data)
# Decode the RGB JPEG.
image = coder.decode_jpeg(image_data)
# Check that image converted to RGB
assert len(image.shape) == 3
height = image.shape[0]
width = image.shape[1]
assert image.shape[2] == 3
return image_data, height, width
def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
texts, labels, num_shards):
"""Processes and saves list of images as TFRecord in 1 thread.
Args:
coder: instance of ImageCoder to provide TensorFlow image coding utils.
thread_index: integer, unique index of the batch to run, within [0, len(ranges)).
ranges: list of pairs of integers specifying the range of images each
batch analyzes in parallel.
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
texts: list of strings; each string is human readable, e.g. 'dog'
labels: list of integer; each integer identifies the ground truth
num_shards: integer number of shards for this data set.
"""
# Each thread produces N shards where N = int(num_shards / num_threads).
# For instance, if num_shards = 128, and the num_threads = 2, then the first
# thread would produce shards [0, 64).
num_threads = len(ranges)
assert not num_shards % num_threads
num_shards_per_batch = int(num_shards / num_threads)
shard_ranges = np.linspace(ranges[thread_index][0],
ranges[thread_index][1],
num_shards_per_batch + 1).astype(int)
num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
counter = 0
for s in range(num_shards_per_batch):
# Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
shard = thread_index * num_shards_per_batch + s
output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
output_file = os.path.join(FLAGS.output_directory, output_filename)
writer = tf.python_io.TFRecordWriter(output_file)
shard_counter = 0
files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
for i in files_in_shard:
filename = filenames[i]
label = labels[i]
text = texts[i]
try:
image_buffer, height, width = _process_image(filename, coder)
except Exception as e:
print(e)
print('SKIPPED: Unexpected error while decoding %s.' % filename)
continue
example = _convert_to_example(filename, image_buffer, label,
text, height, width)
writer.write(example.SerializeToString())
shard_counter += 1
counter += 1
if not counter % 1000:
print('%s [thread %d]: Processed %d of %d images in thread batch.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
writer.close()
print('%s [thread %d]: Wrote %d images to %s' %
(datetime.now(), thread_index, shard_counter, output_file))
sys.stdout.flush()
shard_counter = 0
print('%s [thread %d]: Wrote %d images to %d shards.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
def _process_image_files(name, filenames, texts, labels, num_shards):
"""Process and save list of images as TFRecord of Example protos.
Args:
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
texts: list of strings; each string is human readable, e.g. 'dog'
labels: list of integer; each integer identifies the ground truth
num_shards: integer number of shards for this data set.
"""
assert len(filenames) == len(texts)
assert len(filenames) == len(labels)
# Break all images into batches with a [ranges[i][0], ranges[i][1]].
spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int)
ranges = []
for i in range(len(spacing) - 1):
ranges.append([spacing[i], spacing[i + 1]])
# Launch a thread for each batch.
print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
sys.stdout.flush()
# Create a mechanism for monitoring when all threads are finished.
coord = tf.train.Coordinator()
# Create a generic TensorFlow-based utility for converting all image codings.
coder = ImageCoder()
threads = []
for thread_index in range(len(ranges)):
args = (coder, thread_index, ranges, name, filenames,
texts, labels, num_shards)
t = threading.Thread(target=_process_image_files_batch, args=args)
t.start()
threads.append(t)
# Wait for all the threads to terminate.
coord.join(threads)
print('%s: Finished writing all %d images in data set.' %
(datetime.now(), len(filenames)))
sys.stdout.flush()
def _find_image_files(data_dir, labels_file):
"""Build a list of all images files and labels in the data set.
Args:
data_dir: string, path to the root directory of images.
Assumes that the image data set resides in JPEG files located in
the following directory structure.
data_dir/dog/another-image.JPEG
data_dir/dog/my-image.jpg
where 'dog' is the label associated with these images.
labels_file: string, path to the labels file.
The list of valid labels are held in this file. Assumes that the file
contains entries as such:
dog
cat
flower
where each line corresponds to a label. We map each label contained in
the file to an integer starting with the integer 0 corresponding to the
label contained in the first line.
Returns:
filenames: list of strings; each string is a path to an image file.
texts: list of strings; each string is the class, e.g. 'dog'
labels: list of integer; each integer identifies the ground truth.
"""
print('Determining list of input files and labels from %s.' % data_dir)
unique_labels = [l.strip() for l in tf.gfile.FastGFile(
labels_file, 'r').readlines()]
labels = []
filenames = []
texts = []
# Leave label index 0 empty as a background class.
label_index = 1
# Construct the list of JPEG files and labels.
for text in unique_labels:
jpeg_file_path = '%s/%s/*' % (data_dir, text)
matching_files = tf.gfile.Glob(jpeg_file_path)
labels.extend([label_index] * len(matching_files))
texts.extend([text] * len(matching_files))
filenames.extend(matching_files)
if not label_index % 100:
print('Finished finding files in %d of %d classes.' % (
label_index, len(labels)))
label_index += 1
# Shuffle the ordering of all image files in order to guarantee
# random ordering of the images with respect to label in the
# saved TFRecord files. Make the randomization repeatable.
shuffled_index = list(range(len(filenames)))
random.seed(12345)
random.shuffle(shuffled_index)
filenames = [filenames[i] for i in shuffled_index]
texts = [texts[i] for i in shuffled_index]
labels = [labels[i] for i in shuffled_index]
print('Found %d JPEG files across %d labels inside %s.' %
(len(filenames), len(unique_labels), data_dir))
return filenames, texts, labels
def _process_dataset(name, directory, num_shards, labels_file):
"""Process a complete data set and save it as a TFRecord.
Args:
name: string, unique identifier specifying the data set.
directory: string, root path to the data set.
num_shards: integer number of shards for this data set.
labels_file: string, path to the labels file.
"""
filenames, texts, labels = _find_image_files(directory, labels_file)
_process_image_files(name, filenames, texts, labels, num_shards)
def main(unused_argv):
assert not FLAGS.train_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
assert not FLAGS.validation_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with '
'FLAGS.validation_shards')
print('Saving results to %s' % FLAGS.output_directory)
# Run it!
_process_dataset('validation', FLAGS.validation_directory,
FLAGS.validation_shards, FLAGS.labels_file)
_process_dataset('train', FLAGS.train_directory,
FLAGS.train_shards, FLAGS.labels_file)
if __name__ == '__main__':
tf.app.run()
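
To sanity-check the shards produced by the converter script above, a short sketch that reads one record back and prints the fields documented in the module docstring. It uses the same TF1 APIs as the rest of this commit; the shard path is an assumption based on the default output_directory and train_shards flags:

# Read back one TFRecord shard written by the converter above and inspect the
# Example fields listed in its docstring. The path below is assumed from the
# default flags (output_directory=/tmp, train_shards=2).
import tensorflow as tf

path = '/tmp/train-00000-of-00002'
for record in tf.python_io.tf_record_iterator(path):
    example = tf.train.Example.FromString(record)
    feature = example.features.feature
    label = feature['image/class/label'].int64_list.value[0]
    text = feature['image/class/text'].bytes_list.value[0].decode('utf-8')
    filename = feature['image/filename'].bytes_list.value[0].decode('utf-8')
    jpeg_bytes = feature['image/encoded'].bytes_list.value[0]
    print(filename, label, text, len(jpeg_bytes))
    break  # only the first record is needed for a sanity check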

View file

@@ -0,0 +1,707 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Converts ImageNet data to TFRecords file format with Example protos.
The raw ImageNet data set is expected to reside in JPEG files located in the
following directory structure.
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
...
where 'n01440764' is the unique synset label associated with
these images.
The training data set consists of 1000 sub-directories (i.e. labels)
each containing 1200 JPEG images for a total of 1.2M JPEG images.
The evaluation data set consists of 1000 sub-directories (i.e. labels)
each containing 50 JPEG images for a total of 50K JPEG images.
This TensorFlow script converts the training and evaluation data into
a sharded data set consisting of 1024 and 128 TFRecord files, respectively.
train_directory/train-00000-of-01024
train_directory/train-00001-of-01024
...
train_directory/train-01023-of-01024
and
validation_directory/validation-00000-of-00128
validation_directory/validation-00001-of-00128
...
validation_directory/validation-00127-of-00128
Each validation TFRecord file contains ~390 records. Each training TFRecord
file contains ~1250 records. Each record within the TFRecord file is a
serialized Example proto. The Example proto contains the following fields:
image/encoded: string containing JPEG encoded image in RGB colorspace
image/height: integer, image height in pixels
image/width: integer, image width in pixels
image/colorspace: string, specifying the colorspace, always 'RGB'
image/channels: integer, specifying the number of channels, always 3
image/format: string, specifying the format, always 'JPEG'
image/filename: string containing the basename of the image file
e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
image/class/label: integer specifying the index in a classification layer.
The label ranges from [1, 1000] where 0 is not used.
image/class/synset: string specifying the unique ID of the label,
e.g. 'n01440764'
image/class/text: string specifying the human-readable version of the label
e.g. 'red fox, Vulpes vulpes'
image/object/bbox/xmin: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/xmax: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymin: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymax: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/label: integer specifying the index in a classification
layer. The label ranges from [1, 1000] where 0 is not used. Note this is
always identical to the image label.
Note that the length of xmin is identical to the length of xmax, ymin and ymax
for each example.
Running this script using 16 threads may take around 2.5 hours on an HP Z420.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os
import random
import sys
import threading
import numpy as np
import six
import tensorflow as tf
tf.app.flags.DEFINE_string('train_directory', '/tmp/',
'Training data directory')
tf.app.flags.DEFINE_string('validation_directory', '/tmp/',
'Validation data directory')
tf.app.flags.DEFINE_string('output_directory', '/tmp/',
'Output data directory')
tf.app.flags.DEFINE_integer('train_shards', 1024,
'Number of shards in training TFRecord files.')
tf.app.flags.DEFINE_integer('validation_shards', 128,
'Number of shards in validation TFRecord files.')
tf.app.flags.DEFINE_integer('num_threads', 8,
'Number of threads to preprocess the images.')
# The labels file contains the list of valid labels.
# Assumes that the file contains entries as such:
# n01440764
# n01443537
# n01484850
# where each line corresponds to a label expressed as a synset. We map
# each synset contained in the file to an integer (based on the alphabetical
# ordering). See below for details.
tf.app.flags.DEFINE_string('labels_file',
'imagenet_lsvrc_2015_synsets.txt',
'Labels file')
# This file contains the mapping from synset to human-readable label.
# Assumes each line of the file looks like:
#
# n02119247 black fox
# n02119359 silver fox
# n02119477 red fox, Vulpes fulva
#
# where each line corresponds to a unique mapping. Note that each line is
# formatted as <synset>\t<human readable label>.
tf.app.flags.DEFINE_string('imagenet_metadata_file',
'imagenet_metadata.txt',
'ImageNet metadata file')
# This file is the output of process_bounding_boxes.py
# Assumes each line of the file looks like:
#
# n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
#
# where each line corresponds to one bounding box annotation associated
# with an image. Each line can be parsed as:
#
# <JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
#
# Note that there might exist multiple bounding box annotations associated
# with an image file.
tf.app.flags.DEFINE_string('bounding_box_file',
'./imagenet_2012_bounding_boxes.csv',
'Bounding box file')
FLAGS = tf.app.flags.FLAGS
def _int64_feature(value):
"""Wrapper for inserting int64 features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _float_feature(value):
"""Wrapper for inserting float features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def _bytes_feature(value):
"""Wrapper for inserting bytes features into Example proto."""
if six.PY3 and isinstance(value, six.text_type):
value = six.binary_type(value, encoding='utf-8')
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _convert_to_example(filename, image_buffer, label, synset, human, bbox,
height, width):
"""Build an Example proto for an example.
Args:
filename: string, path to an image file, e.g., '/path/to/example.JPG'
image_buffer: string, JPEG encoding of RGB image
label: integer, identifier for the ground truth for the network
synset: string, unique WordNet ID specifying the label, e.g., 'n02323233'
human: string, human-readable label, e.g., 'red fox, Vulpes vulpes'
bbox: list of bounding boxes; each box is a list of integers
specifying [xmin, ymin, xmax, ymax]. All boxes are assumed to belong to
the same label as the image label.
height: integer, image height in pixels
width: integer, image width in pixels
Returns:
Example proto
"""
xmin = []
ymin = []
xmax = []
ymax = []
for b in bbox:
assert len(b) == 4
# pylint: disable=expression-not-assigned
[l.append(point) for l, point in zip([xmin, ymin, xmax, ymax], b)]
# pylint: enable=expression-not-assigned
colorspace = 'RGB'
channels = 3
image_format = 'JPEG'
example = tf.train.Example(features=tf.train.Features(feature={
'image/height': _int64_feature(height),
'image/width': _int64_feature(width),
'image/colorspace': _bytes_feature(colorspace),
'image/channels': _int64_feature(channels),
'image/class/label': _int64_feature(label),
'image/class/synset': _bytes_feature(synset),
'image/class/text': _bytes_feature(human),
'image/object/bbox/xmin': _float_feature(xmin),
'image/object/bbox/xmax': _float_feature(xmax),
'image/object/bbox/ymin': _float_feature(ymin),
'image/object/bbox/ymax': _float_feature(ymax),
'image/object/bbox/label': _int64_feature([label] * len(xmin)),
'image/format': _bytes_feature(image_format),
'image/filename': _bytes_feature(os.path.basename(filename)),
'image/encoded': _bytes_feature(image_buffer)}))
return example
class ImageCoder(object):
"""Helper class that provides TensorFlow image coding utilities."""
def __init__(self):
# Create a single Session to run all image coding calls.
self._sess = tf.Session()
# Initializes function that converts PNG to JPEG data.
self._png_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_png(self._png_data, channels=3)
self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that converts CMYK JPEG data to RGB JPEG data.
self._cmyk_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_jpeg(self._cmyk_data, channels=0)
self._cmyk_to_rgb = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that decodes RGB JPEG data.
self._decode_jpeg_data = tf.placeholder(dtype=tf.string)
self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3)
def png_to_jpeg(self, image_data):
return self._sess.run(self._png_to_jpeg,
feed_dict={self._png_data: image_data})
def cmyk_to_rgb(self, image_data):
return self._sess.run(self._cmyk_to_rgb,
feed_dict={self._cmyk_data: image_data})
def decode_jpeg(self, image_data):
image = self._sess.run(self._decode_jpeg,
feed_dict={self._decode_jpeg_data: image_data})
assert len(image.shape) == 3
assert image.shape[2] == 3
return image
def _is_png(filename):
"""Determine if a file contains a PNG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a PNG.
"""
# File list from:
# https://groups.google.com/forum/embed/?place=forum/torch7#!topic/torch7/fOSTXHIESSU
return 'n02105855_2933.JPEG' in filename
def _is_cmyk(filename):
"""Determine if file contains a CMYK JPEG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a JPEG encoded with CMYK color space.
"""
# File list from:
# https://github.com/cytsai/ilsvrc-cmyk-image-list
blacklist = ['n01739381_1309.JPEG', 'n02077923_14822.JPEG',
'n02447366_23489.JPEG', 'n02492035_15739.JPEG',
'n02747177_10752.JPEG', 'n03018349_4028.JPEG',
'n03062245_4620.JPEG', 'n03347037_9675.JPEG',
'n03467068_12171.JPEG', 'n03529860_11437.JPEG',
'n03544143_17228.JPEG', 'n03633091_5218.JPEG',
'n03710637_5125.JPEG', 'n03961711_5286.JPEG',
'n04033995_2932.JPEG', 'n04258138_17003.JPEG',
'n04264628_27969.JPEG', 'n04336792_7448.JPEG',
'n04371774_5854.JPEG', 'n04596742_4225.JPEG',
'n07583066_647.JPEG', 'n13037406_4650.JPEG']
return filename.split('/')[-1] in blacklist
def _process_image(filename, coder):
"""Process a single image file.
Args:
filename: string, path to an image file e.g., '/path/to/example.JPG'.
coder: instance of ImageCoder to provide TensorFlow image coding utils.
Returns:
image_buffer: string, JPEG encoding of RGB image.
height: integer, image height in pixels.
width: integer, image width in pixels.
"""
# Read the image file.
with tf.gfile.FastGFile(filename, 'rb') as f:
image_data = f.read()
# Clean the dirty data.
if _is_png(filename):
# 1 image is a PNG.
print('Converting PNG to JPEG for %s' % filename)
image_data = coder.png_to_jpeg(image_data)
elif _is_cmyk(filename):
# 22 JPEG images are in CMYK colorspace.
print('Converting CMYK to RGB for %s' % filename)
image_data = coder.cmyk_to_rgb(image_data)
# Decode the RGB JPEG.
image = coder.decode_jpeg(image_data)
# Check that image converted to RGB
assert len(image.shape) == 3
height = image.shape[0]
width = image.shape[1]
assert image.shape[2] == 3
return image_data, height, width
def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
synsets, labels, humans, bboxes, num_shards):
"""Processes and saves list of images as TFRecord in 1 thread.
Args:
coder: instance of ImageCoder to provide TensorFlow image coding utils.
thread_index: integer, unique index of the batch to run, within [0, len(ranges)).
ranges: list of pairs of integers specifying the range of images each
batch analyzes in parallel.
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
synsets: list of strings; each string is a unique WordNet ID
labels: list of integer; each integer identifies the ground truth
humans: list of strings; each string is a human-readable label
bboxes: list of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
num_shards: integer number of shards for this data set.
"""
# Each thread produces N shards where N = int(num_shards / num_threads).
# For instance, if num_shards = 128, and the num_threads = 2, then the first
# thread would produce shards [0, 64).
num_threads = len(ranges)
assert not num_shards % num_threads
num_shards_per_batch = int(num_shards / num_threads)
shard_ranges = np.linspace(ranges[thread_index][0],
ranges[thread_index][1],
num_shards_per_batch + 1).astype(int)
num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
counter = 0
for s in range(num_shards_per_batch):
# Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
shard = thread_index * num_shards_per_batch + s
output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
output_file = os.path.join(FLAGS.output_directory, output_filename)
writer = tf.python_io.TFRecordWriter(output_file)
shard_counter = 0
files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
for i in files_in_shard:
filename = filenames[i]
label = labels[i]
synset = synsets[i]
human = humans[i]
bbox = bboxes[i]
image_buffer, height, width = _process_image(filename, coder)
example = _convert_to_example(filename, image_buffer, label,
synset, human, bbox,
height, width)
writer.write(example.SerializeToString())
shard_counter += 1
counter += 1
if not counter % 1000:
print('%s [thread %d]: Processed %d of %d images in thread batch.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
writer.close()
print('%s [thread %d]: Wrote %d images to %s' %
(datetime.now(), thread_index, shard_counter, output_file))
sys.stdout.flush()
shard_counter = 0
print('%s [thread %d]: Wrote %d images to %d shards.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
def _process_image_files(name, filenames, synsets, labels, humans,
bboxes, num_shards):
"""Process and save list of images as TFRecord of Example protos.
Args:
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
synsets: list of strings; each string is a unique WordNet ID
labels: list of integer; each integer identifies the ground truth
humans: list of strings; each string is a human-readable label
bboxes: list of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
num_shards: integer number of shards for this data set.
"""
assert len(filenames) == len(synsets)
assert len(filenames) == len(labels)
assert len(filenames) == len(humans)
assert len(filenames) == len(bboxes)
# Break all images into batches with a [ranges[i][0], ranges[i][1]].
spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int)
ranges = []
threads = []
for i in range(len(spacing) - 1):
ranges.append([spacing[i], spacing[i + 1]])
# Launch a thread for each batch.
print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
sys.stdout.flush()
# Create a mechanism for monitoring when all threads are finished.
coord = tf.train.Coordinator()
# Create a generic TensorFlow-based utility for converting all image codings.
coder = ImageCoder()
threads = []
for thread_index in range(len(ranges)):
args = (coder, thread_index, ranges, name, filenames,
synsets, labels, humans, bboxes, num_shards)
t = threading.Thread(target=_process_image_files_batch, args=args)
t.start()
threads.append(t)
# Wait for all the threads to terminate.
coord.join(threads)
print('%s: Finished writing all %d images in data set.' %
(datetime.now(), len(filenames)))
sys.stdout.flush()
def _find_image_files(data_dir, labels_file):
"""Build a list of all images files and labels in the data set.
Args:
data_dir: string, path to the root directory of images.
Assumes that the ImageNet data set resides in JPEG files located in
the following directory structure.
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
where 'n01440764' is the unique synset label associated with these images.
labels_file: string, path to the labels file.
The list of valid labels are held in this file. Assumes that the file
contains entries as such:
n01440764
n01443537
n01484850
where each line corresponds to a label expressed as a synset. We map
each synset contained in the file to an integer (based on the alphabetical
ordering) starting with the integer 1 corresponding to the synset
contained in the first line.
The reason we start the integer labels at 1 is to reserve label 0 as an
unused background class.
Returns:
filenames: list of strings; each string is a path to an image file.
synsets: list of strings; each string is a unique WordNet ID.
labels: list of integer; each integer identifies the ground truth.
"""
print('Determining list of input files and labels from %s.' % data_dir)
challenge_synsets = [l.strip() for l in
tf.gfile.FastGFile(labels_file, 'r').readlines()]
labels = []
filenames = []
synsets = []
# Leave label index 0 empty as a background class.
label_index = 1
# Construct the list of JPEG files and labels.
for synset in challenge_synsets:
jpeg_file_path = '%s/%s/*.JPEG' % (data_dir, synset)
matching_files = tf.gfile.Glob(jpeg_file_path)
labels.extend([label_index] * len(matching_files))
synsets.extend([synset] * len(matching_files))
filenames.extend(matching_files)
if not label_index % 100:
print('Finished finding files in %d of %d classes.' % (
label_index, len(challenge_synsets)))
label_index += 1
# Shuffle the ordering of all image files in order to guarantee
# random ordering of the images with respect to label in the
# saved TFRecord files. Make the randomization repeatable.
shuffled_index = list(range(len(filenames)))
random.seed(12345)
random.shuffle(shuffled_index)
filenames = [filenames[i] for i in shuffled_index]
synsets = [synsets[i] for i in shuffled_index]
labels = [labels[i] for i in shuffled_index]
print('Found %d JPEG files across %d labels inside %s.' %
(len(filenames), len(challenge_synsets), data_dir))
return filenames, synsets, labels
def _find_human_readable_labels(synsets, synset_to_human):
"""Build a list of human-readable labels.
Args:
synsets: list of strings; each string is a unique WordNet ID.
synset_to_human: dict of synset to human labels, e.g.,
'n02119022' --> 'red fox, Vulpes vulpes'
Returns:
List of human-readable strings corresponding to each synset.
"""
humans = []
for s in synsets:
assert s in synset_to_human, ('Failed to find: %s' % s)
humans.append(synset_to_human[s])
return humans
def _find_image_bounding_boxes(filenames, image_to_bboxes):
"""Find the bounding boxes for a given image file.
Args:
filenames: list of strings; each string is a path to an image file.
image_to_bboxes: dictionary mapping image file names to a list of
bounding boxes. This list contains 0+ bounding boxes.
Returns:
List of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
"""
num_image_bbox = 0
bboxes = []
for f in filenames:
basename = os.path.basename(f)
if basename in image_to_bboxes:
bboxes.append(image_to_bboxes[basename])
num_image_bbox += 1
else:
bboxes.append([])
print('Found %d images with bboxes out of %d images' % (
num_image_bbox, len(filenames)))
return bboxes
def _process_dataset(name, directory, num_shards, synset_to_human,
image_to_bboxes):
"""Process a complete data set and save it as a TFRecord.
Args:
name: string, unique identifier specifying the data set.
directory: string, root path to the data set.
num_shards: integer number of shards for this data set.
synset_to_human: dict of synset to human labels, e.g.,
'n02119022' --> 'red fox, Vulpes vulpes'
image_to_bboxes: dictionary mapping image file names to a list of
bounding boxes. This list contains 0+ bounding boxes.
"""
filenames, synsets, labels = _find_image_files(directory, FLAGS.labels_file)
humans = _find_human_readable_labels(synsets, synset_to_human)
bboxes = _find_image_bounding_boxes(filenames, image_to_bboxes)
_process_image_files(name, filenames, synsets, labels,
humans, bboxes, num_shards)
def _build_synset_lookup(imagenet_metadata_file):
"""Build lookup for synset to human-readable label.
Args:
imagenet_metadata_file: string, path to file containing mapping from
synset to human-readable label.
Assumes each line of the file looks like:
n02119247 black fox
n02119359 silver fox
n02119477 red fox, Vulpes fulva
where each line corresponds to a unique mapping. Note that each line is
formatted as <synset>\t<human readable label>.
Returns:
Dictionary of synset to human labels, such as:
'n02119022' --> 'red fox, Vulpes vulpes'
"""
lines = tf.gfile.FastGFile(imagenet_metadata_file, 'r').readlines()
synset_to_human = {}
for l in lines:
if l:
parts = l.strip().split('\t')
assert len(parts) == 2
synset = parts[0]
human = parts[1]
synset_to_human[synset] = human
return synset_to_human
def _build_bounding_box_lookup(bounding_box_file):
"""Build a lookup from image file to bounding boxes.
Args:
bounding_box_file: string, path to file with bounding boxes annotations.
Assumes each line of the file looks like:
n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
where each line corresponds to one bounding box annotation associated
with an image. Each line can be parsed as:
<JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
Note that there might exist multiple bounding box annotations associated
with an image file. This file is the output of process_bounding_boxes.py.
Returns:
Dictionary mapping image file names to a list of bounding boxes. This list
contains 0+ bounding boxes.
"""
lines = tf.gfile.FastGFile(bounding_box_file, 'r').readlines()
images_to_bboxes = {}
num_bbox = 0
num_image = 0
for l in lines:
if l:
parts = l.split(',')
assert len(parts) == 5, ('Failed to parse: %s' % l)
filename = parts[0]
xmin = float(parts[1])
ymin = float(parts[2])
xmax = float(parts[3])
ymax = float(parts[4])
box = [xmin, ymin, xmax, ymax]
if filename not in images_to_bboxes:
images_to_bboxes[filename] = []
num_image += 1
images_to_bboxes[filename].append(box)
num_bbox += 1
print('Successfully read %d bounding boxes '
'across %d images.' % (num_bbox, num_image))
return images_to_bboxes
def main(unused_argv):
assert not FLAGS.train_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
assert not FLAGS.validation_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with '
'FLAGS.validation_shards')
print('Saving results to %s' % FLAGS.output_directory)
# Build a map from synset to human-readable label.
synset_to_human = _build_synset_lookup(FLAGS.imagenet_metadata_file)
image_to_bboxes = _build_bounding_box_lookup(FLAGS.bounding_box_file)
# Run it!
_process_dataset('validation', FLAGS.validation_directory,
FLAGS.validation_shards, synset_to_human, image_to_bboxes)
_process_dataset('train', FLAGS.train_directory, FLAGS.train_shards,
synset_to_human, image_to_bboxes)
if __name__ == '__main__':
tf.app.run()
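
The bounding-box fields this ImageNet converter writes can be read back with the TF1 parsing ops. A hedged sketch follows, with a feature spec mirroring the fields from the module docstring; the shard path is an assumption based on the default flags rather than anything in this diff:

# Parse one validation shard written by the ImageNet converter above.
# VarLenFeature is used for the bbox lists because each image may carry
# zero or more annotations. The shard path is assumed, not taken from the diff.
import tensorflow as tf

feature_spec = {
    'image/encoded': tf.FixedLenFeature([], tf.string),
    'image/class/label': tf.FixedLenFeature([], tf.int64),
    'image/class/synset': tf.FixedLenFeature([], tf.string),
    'image/object/bbox/xmin': tf.VarLenFeature(tf.float32),
    'image/object/bbox/ymin': tf.VarLenFeature(tf.float32),
    'image/object/bbox/xmax': tf.VarLenFeature(tf.float32),
    'image/object/bbox/ymax': tf.VarLenFeature(tf.float32),
}

dataset = tf.data.TFRecordDataset('/tmp/validation-00000-of-00128')
parsed = dataset.map(lambda rec: tf.parse_single_example(rec, feature_spec))
iterator = parsed.make_one_shot_iterator()
with tf.Session() as sess:
    first = sess.run(iterator.get_next())
    print(first['image/class/synset'], first['image/class/label'])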

View file

@@ -0,0 +1,618 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Converts ImageNet data to TFRecords file format with Example protos.
The raw ImageNet data set is expected to reside in JPEG files located in the
following directory structure.
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
...
where 'n01440764' is the unique synset label associated with
these images.
The training data set consists of 1000 sub-directories (i.e. labels)
each containing 1200 JPEG images for a total of 1.2M JPEG images.
The evaluation data set consists of 1000 sub-directories (i.e. labels)
each containing 50 JPEG images for a total of 50K JPEG images.
This TensorFlow script converts the training and evaluation data into
a sharded data set consisting of 1024 and 128 TFRecord files, respectively.
train_directory/train-00000-of-01024
train_directory/train-00001-of-01024
...
train_directory/train-01023-of-01024
and
validation_directory/validation-00000-of-00128
validation_directory/validation-00001-of-00128
...
validation_directory/validation-00127-of-00128
Each validation TFRecord file contains ~390 records. Each training TFRecord
file contains ~1250 records. Each record within the TFRecord file is a
serialized Example proto. The Example proto contains the following fields:
image/encoded: string containing JPEG encoded image in RGB colorspace
image/height: integer, image height in pixels
image/width: integer, image width in pixels
image/colorspace: string, specifying the colorspace, always 'RGB'
image/channels: integer, specifying the number of channels, always 3
image/format: string, specifying the format, always 'JPEG'
image/filename: string containing the basename of the image file
e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
image/class/label: integer specifying the index in a classification layer.
The label ranges from [1, 1000] where 0 is not used.
image/class/synset: string specifying the unique ID of the label,
e.g. 'n01440764'
image/class/text: string specifying the human-readable version of the label
e.g. 'red fox, Vulpes vulpes'
image/object/bbox/xmin: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/xmax: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymin: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymax: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/label: integer specifying the index in a classification
layer. The label ranges from [1, 1000] where 0 is not used. Note this is
always identical to the image label.
Note that the length of xmin is identical to the length of xmax, ymin and ymax
for each example.
Running this script using 16 threads may take around 2.5 hours on an HP Z420.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os
import random
import sys
import threading
import numpy as np
import six
import tensorflow as tf
tf.app.flags.DEFINE_string('train_directory', '/tmp/',
'Training data directory')
tf.app.flags.DEFINE_string('validation_directory', '/tmp/',
'Validation data directory')
tf.app.flags.DEFINE_string('output_directory', '/tmp/',
'Output data directory')
tf.app.flags.DEFINE_integer('train_shards', 1024,
'Number of shards in training TFRecord files.')
tf.app.flags.DEFINE_integer('validation_shards', 128,
'Number of shards in validation TFRecord files.')
tf.app.flags.DEFINE_integer('num_threads', 8,
'Number of threads to preprocess the images.')
# The labels file contains the list of valid labels.
# Assumes that the file contains entries as such:
# n01440764
# n01443537
# n01484850
# where each line corresponds to a label expressed as a synset. We map
# each synset contained in the file to an integer (based on the alphabetical
# ordering). See below for details.
tf.app.flags.DEFINE_string('labels_file',
'imagenet_lsvrc_2015_synsets.txt',
'Labels file')
# This file contains the mapping from synset to human-readable label.
# Assumes each line of the file looks like:
#
# n02119247 black fox
# n02119359 silver fox
# n02119477 red fox, Vulpes fulva
#
# where each line corresponds to a unique mapping. Note that each line is
# formatted as <synset>\t<human readable label>.
tf.app.flags.DEFINE_string('imagenet_metadata_file',
'imagenet_metadata.txt',
'ImageNet metadata file')
FLAGS = tf.app.flags.FLAGS
def _int64_feature(value):
"""Wrapper for inserting int64 features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _float_feature(value):
"""Wrapper for inserting float features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def _bytes_feature(value):
"""Wrapper for inserting bytes features into Example proto."""
if six.PY3 and isinstance(value, six.text_type):
value = six.binary_type(value, encoding='utf-8')
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _convert_to_example(filename, image_buffer, label, synset, human, bbox,
height, width):
"""Build an Example proto for an example.
Args:
filename: string, path to an image file, e.g., '/path/to/example.JPG'
image_buffer: string, JPEG encoding of RGB image
label: integer, identifier for the ground truth for the network
synset: string, unique WordNet ID specifying the label, e.g., 'n02323233'
human: string, human-readable label, e.g., 'red fox, Vulpes vulpes'
bbox: list of bounding boxes; each box is a list of integers
specifying [xmin, ymin, xmax, ymax]. All boxes are assumed to belong to
the same label as the image label.
height: integer, image height in pixels
width: integer, image width in pixels
Returns:
Example proto
"""
xmin = []
ymin = []
xmax = []
ymax = []
for b in bbox:
assert len(b) == 4
# pylint: disable=expression-not-assigned
[l.append(point) for l, point in zip([xmin, ymin, xmax, ymax], b)]
# pylint: enable=expression-not-assigned
colorspace = 'RGB'
channels = 3
image_format = 'JPEG'
example = tf.train.Example(features=tf.train.Features(feature={
'image/height': _int64_feature(height),
'image/width': _int64_feature(width),
'image/colorspace': _bytes_feature(colorspace),
'image/channels': _int64_feature(channels),
'image/class/label': _int64_feature(label),
'image/class/synset': _bytes_feature(synset),
'image/class/text': _bytes_feature(human),
'image/object/bbox/xmin': _float_feature(xmin),
'image/object/bbox/xmax': _float_feature(xmax),
'image/object/bbox/ymin': _float_feature(ymin),
'image/object/bbox/ymax': _float_feature(ymax),
'image/object/bbox/label': _int64_feature([label] * len(xmin)),
'image/format': _bytes_feature(image_format),
'image/filename': _bytes_feature(os.path.basename(filename)),
'image/encoded': _bytes_feature(image_buffer)}))
return example
class ImageCoder(object):
"""Helper class that provides TensorFlow image coding utilities."""
def __init__(self):
# Create a single Session to run all image coding calls.
self._sess = tf.Session()
# Initializes function that converts PNG to JPEG data.
self._png_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_png(self._png_data, channels=3)
self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that converts CMYK JPEG data to RGB JPEG data.
self._cmyk_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_jpeg(self._cmyk_data, channels=0)
self._cmyk_to_rgb = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that decodes RGB JPEG data.
self._decode_jpeg_data = tf.placeholder(dtype=tf.string)
self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3)
def png_to_jpeg(self, image_data):
return self._sess.run(self._png_to_jpeg,
feed_dict={self._png_data: image_data})
def cmyk_to_rgb(self, image_data):
return self._sess.run(self._cmyk_to_rgb,
feed_dict={self._cmyk_data: image_data})
def decode_jpeg(self, image_data):
image = self._sess.run(self._decode_jpeg,
feed_dict={self._decode_jpeg_data: image_data})
assert len(image.shape) == 3
assert image.shape[2] == 3
return image
def _is_png(filename):
"""Determine if a file contains a PNG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a PNG.
"""
# File list from:
# https://groups.google.com/forum/embed/?place=forum/torch7#!topic/torch7/fOSTXHIESSU
return 'n02105855_2933.JPEG' in filename
def _is_cmyk(filename):
"""Determine if file contains a CMYK JPEG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a JPEG encoded with CMYK color space.
"""
# File list from:
# https://github.com/cytsai/ilsvrc-cmyk-image-list
blacklist = ['n01739381_1309.JPEG', 'n02077923_14822.JPEG',
'n02447366_23489.JPEG', 'n02492035_15739.JPEG',
'n02747177_10752.JPEG', 'n03018349_4028.JPEG',
'n03062245_4620.JPEG', 'n03347037_9675.JPEG',
'n03467068_12171.JPEG', 'n03529860_11437.JPEG',
'n03544143_17228.JPEG', 'n03633091_5218.JPEG',
'n03710637_5125.JPEG', 'n03961711_5286.JPEG',
'n04033995_2932.JPEG', 'n04258138_17003.JPEG',
'n04264628_27969.JPEG', 'n04336792_7448.JPEG',
'n04371774_5854.JPEG', 'n04596742_4225.JPEG',
'n07583066_647.JPEG', 'n13037406_4650.JPEG']
return filename.split('/')[-1] in blacklist
def _process_image(filename, coder):
"""Process a single image file.
Args:
filename: string, path to an image file e.g., '/path/to/example.JPG'.
coder: instance of ImageCoder to provide TensorFlow image coding utils.
Returns:
image_buffer: string, JPEG encoding of RGB image.
height: integer, image height in pixels.
width: integer, image width in pixels.
"""
# Read the image file.
with tf.gfile.FastGFile(filename, 'rb') as f:
image_data = f.read()
# Clean the dirty data.
if _is_png(filename):
# 1 image is a PNG.
print('Converting PNG to JPEG for %s' % filename)
image_data = coder.png_to_jpeg(image_data)
elif _is_cmyk(filename):
# 22 JPEG images are in CMYK colorspace.
print('Converting CMYK to RGB for %s' % filename)
image_data = coder.cmyk_to_rgb(image_data)
# Decode the RGB JPEG.
image = coder.decode_jpeg(image_data)
# Check that image converted to RGB
assert len(image.shape) == 3
height = image.shape[0]
width = image.shape[1]
assert image.shape[2] == 3
return image_data, height, width
def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
synsets, labels, humans, bboxes, num_shards):
"""Processes and saves list of images as TFRecord in 1 thread.
Args:
coder: instance of ImageCoder to provide TensorFlow image coding utils.
thread_index: integer, unique index of the batch to run, within [0, len(ranges)).
ranges: list of pairs of integers specifying the range of images each
batch analyzes in parallel.
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
synsets: list of strings; each string is a unique WordNet ID
labels: list of integer; each integer identifies the ground truth
humans: list of strings; each string is a human-readable label
bboxes: list of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
num_shards: integer number of shards for this data set.
"""
# Each thread produces N shards where N = int(num_shards / num_threads).
# For instance, if num_shards = 128, and the num_threads = 2, then the first
# thread would produce shards [0, 64).
num_threads = len(ranges)
assert not num_shards % num_threads
num_shards_per_batch = int(num_shards / num_threads)
shard_ranges = np.linspace(ranges[thread_index][0],
ranges[thread_index][1],
num_shards_per_batch + 1).astype(int)
num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
counter = 0
for s in range(num_shards_per_batch):
# Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
shard = thread_index * num_shards_per_batch + s
output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
output_file = os.path.join(FLAGS.output_directory, output_filename)
writer = tf.python_io.TFRecordWriter(output_file)
shard_counter = 0
files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
for i in files_in_shard:
filename = filenames[i]
label = labels[i]
synset = synsets[i]
human = humans[i]
#bbox = bboxes[i]
image_buffer, height, width = _process_image(filename, coder)
example = _convert_to_example(filename, image_buffer, label,
synset, human, [[0, 0, 1, 1]],
height, width)
writer.write(example.SerializeToString())
shard_counter += 1
counter += 1
if not counter % 1000:
print('%s [thread %d]: Processed %d of %d images in thread batch.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
writer.close()
print('%s [thread %d]: Wrote %d images to %s' %
(datetime.now(), thread_index, shard_counter, output_file))
sys.stdout.flush()
shard_counter = 0
print('%s [thread %d]: Wrote %d images to %d shards.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
def _process_image_files(name, filenames, synsets, labels, humans,
bboxes, num_shards):
"""Process and save list of images as TFRecord of Example protos.
Args:
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
synsets: list of strings; each string is a unique WordNet ID
labels: list of integer; each integer identifies the ground truth
humans: list of strings; each string is a human-readable label
bboxes: list of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
num_shards: integer number of shards for this data set.
"""
assert len(filenames) == len(synsets)
assert len(filenames) == len(labels)
assert len(filenames) == len(humans)
#assert len(filenames) == len(bboxes)
# Break all images into batches with a [ranges[i][0], ranges[i][1]].
spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int)
ranges = []
threads = []
for i in range(len(spacing) - 1):
ranges.append([spacing[i], spacing[i + 1]])
# Launch a thread for each batch.
print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
sys.stdout.flush()
# Create a mechanism for monitoring when all threads are finished.
coord = tf.train.Coordinator()
# Create a generic TensorFlow-based utility for converting all image codings.
coder = ImageCoder()
threads = []
for thread_index in range(len(ranges)):
args = (coder, thread_index, ranges, name, filenames,
synsets, labels, humans, bboxes, num_shards)
t = threading.Thread(target=_process_image_files_batch, args=args)
t.start()
threads.append(t)
# Wait for all the threads to terminate.
coord.join(threads)
print('%s: Finished writing all %d images in data set.' %
(datetime.now(), len(filenames)))
sys.stdout.flush()
def _find_image_files(data_dir, labels_file):
"""Build a list of all images files and labels in the data set.
Args:
data_dir: string, path to the root directory of images.
Assumes that the ImageNet data set resides in JPEG files located in
the following directory structure.
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
where 'n01440764' is the unique synset label associated with these images.
labels_file: string, path to the labels file.
The list of valid labels is held in this file. Assumes that the file
contains entries such as:
n01440764
n01443537
n01484850
where each line corresponds to a label expressed as a synset. We map
each synset contained in the file to an integer (based on the alphabetical
ordering) starting with the integer 1 corresponding to the synset
contained in the first line.
The reason we start the integer labels at 1 is to reserve label 0 as an
unused background class.
Returns:
filenames: list of strings; each string is a path to an image file.
synsets: list of strings; each string is a unique WordNet ID.
labels: list of integer; each integer identifies the ground truth.
"""
print('Determining list of input files and labels from %s.' % data_dir)
challenge_synsets = [l.strip() for l in
tf.gfile.FastGFile(labels_file, 'r').readlines()]
labels = []
filenames = []
synsets = []
# Leave label index 0 empty as a background class.
label_index = 1
# Construct the list of JPEG files and labels.
for synset in challenge_synsets:
jpeg_file_path = '%s/%s/*.JPEG' % (data_dir, synset)
matching_files = tf.gfile.Glob(jpeg_file_path)
labels.extend([label_index] * len(matching_files))
synsets.extend([synset] * len(matching_files))
filenames.extend(matching_files)
if not label_index % 100:
print('Finished finding files in %d of %d classes.' % (
label_index, len(challenge_synsets)))
label_index += 1
# Shuffle the ordering of all image files in order to guarantee
# random ordering of the images with respect to label in the
# saved TFRecord files. Make the randomization repeatable.
shuffled_index = list(range(len(filenames)))
random.seed(12345)
random.shuffle(shuffled_index)
filenames = [filenames[i] for i in shuffled_index]
synsets = [synsets[i] for i in shuffled_index]
labels = [labels[i] for i in shuffled_index]
print('Found %d JPEG files across %d labels inside %s.' %
(len(filenames), len(challenge_synsets), data_dir))
return filenames, synsets, labels
def _find_human_readable_labels(synsets, synset_to_human):
"""Build a list of human-readable labels.
Args:
synsets: list of strings; each string is a unique WordNet ID.
synset_to_human: dict of synset to human labels, e.g.,
'n02119022' --> 'red fox, Vulpes vulpes'
Returns:
List of human-readable strings corresponding to each synset.
"""
humans = []
for s in synsets:
assert s in synset_to_human, ('Failed to find: %s' % s)
humans.append(synset_to_human[s])
return humans
def _process_dataset(name, directory, num_shards, synset_to_human,
image_to_bboxes):
"""Process a complete data set and save it as a TFRecord.
Args:
name: string, unique identifier specifying the data set.
directory: string, root path to the data set.
num_shards: integer number of shards for this data set.
synset_to_human: dict of synset to human labels, e.g.,
'n02119022' --> 'red fox, Vulpes vulpes'
image_to_bboxes: dictionary mapping image file names to a list of
bounding boxes. This list contains 0+ bounding boxes.
"""
filenames, synsets, labels = _find_image_files(directory, FLAGS.labels_file)
humans = _find_human_readable_labels(synsets, synset_to_human)
#bboxes = _find_image_bounding_boxes(filenames, image_to_bboxes)
bboxes = []
_process_image_files(name, filenames, synsets, labels,
humans, bboxes, num_shards)
def _build_synset_lookup(imagenet_metadata_file):
"""Build lookup for synset to human-readable label.
Args:
imagenet_metadata_file: string, path to file containing mapping from
synset to human-readable label.
Assumes each line of the file looks like:
n02119247 black fox
n02119359 silver fox
n02119477 red fox, Vulpes fulva
where each line corresponds to a unique mapping. Note that each line is
formatted as <synset>\t<human readable label>.
Returns:
Dictionary of synset to human labels, such as:
'n02119022' --> 'red fox, Vulpes vulpes'
"""
lines = tf.gfile.FastGFile(imagenet_metadata_file, 'r').readlines()
synset_to_human = {}
for l in lines:
if l:
parts = l.strip().split('\t')
assert len(parts) == 2
synset = parts[0]
human = parts[1]
synset_to_human[synset] = human
return synset_to_human
def main(unused_argv):
assert not FLAGS.train_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
assert not FLAGS.validation_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with '
'FLAGS.validation_shards')
print('Saving results to %s' % FLAGS.output_directory)
# Build a map from synset to human-readable label.
synset_to_human = _build_synset_lookup(FLAGS.imagenet_metadata_file)
# Run it!
_process_dataset('validation', FLAGS.validation_directory,
FLAGS.validation_shards, synset_to_human, None)
_process_dataset('train', FLAGS.train_directory, FLAGS.train_shards,
synset_to_human, None)
if __name__ == '__main__':
tf.app.run()

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,10 @@
n02086240
n02087394
n02088364
n02089973
n02093754
n02096294
n02099601
n02105641
n02111889
n02115641

View file

@ -0,0 +1,82 @@
#!/bin/bash
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Script to download and preprocess ImageNet Challenge 2012
# training and validation data set.
#
# The final output of this script are sharded TFRecord files containing
# serialized Example protocol buffers. See build_imagenet_data.py for
# details of how the Example protocol buffers contain the ImageNet data.
#
# The final output of this script appears as such:
#
# data_dir/train-00000-of-01024
# data_dir/train-00001-of-01024
# ...
# data_dir/train-01023-of-01024
#
# and
#
# data_dir/validation-00000-of-00128
# data_dir/validation-00001-of-00128
# ...
# data_dir/validation-00127-of-00128
#
# Note that this script may take several hours to run to completion. The
# conversion of the ImageNet data to TFRecords alone takes 2-3 hours depending
# on the speed of your machine. Please be patient.
#
# **IMPORTANT**
# To download the raw images, the user must create an account with image-net.org
# and generate a username and access_key. The latter two are required for
# downloading the raw images.
#
# usage:
# ./preprocess_imagenet.sh [data-dir]
set -e
if [ -z "$1" ]; then
echo "Usage: preprocess_imagenet.sh [data dir]"
exit
fi
DATA_DIR="${1%/}"
SCRATCH_DIR="${DATA_DIR}/raw-data/"
mkdir -p ${SCRATCH_DIR}
# Convert the XML files for bounding box annotations into a single CSV.
echo "Extracting bounding box information from XML."
BOUNDING_BOX_SCRIPT="./dataprep/process_bounding_boxes.py"
BOUNDING_BOX_FILE="${DATA_DIR}/imagenet_2012_bounding_boxes.csv"
BOUNDING_BOX_DIR="${DATA_DIR}/bounding_boxes/"
LABELS_FILE="./dataprep/imagenet_lsvrc_2015_synsets.txt"
"${BOUNDING_BOX_SCRIPT}" "${BOUNDING_BOX_DIR}" "${LABELS_FILE}" \
| sort > "${BOUNDING_BOX_FILE}"
echo "preprocessing the ImageNet data."
# Build the TFRecords version of the ImageNet data.
OUTPUT_DIRECTORY="${DATA_DIR}"
IMAGENET_METADATA_FILE="./dataprep/imagenet_metadata.txt"
python ./dataprep/build_imagenet_data.py \
--train_directory="${DATA_DIR}/train" \
--validation_directory="${DATA_DIR}/val" \
--output_directory="${DATA_DIR}/result" \
--imagenet_metadata_file="${IMAGENET_METADATA_FILE}" \
--labels_file="${LABELS_FILE}" \
--bounding_box_file="${BOUNDING_BOX_FILE}"

View file

@ -0,0 +1,89 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Process the ImageNet Challenge bounding boxes for TensorFlow model training.
Associate the ImageNet 2012 Challenge validation data set with labels.
The raw ImageNet validation data set is expected to reside in JPEG files
located in the following directory structure.
data_dir/ILSVRC2012_val_00000001.JPEG
data_dir/ILSVRC2012_val_00000002.JPEG
...
data_dir/ILSVRC2012_val_00050000.JPEG
This script moves the files into a directory structure like such:
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
...
where 'n01440764' is the unique synset label associated with
these images.
This directory reorganization requires a mapping from validation image
number (i.e. suffix of the original file) to the associated label. This
is provided in the ImageNet development kit via a Matlab file.
In order to make life easier and divorce ourselves from Matlab, we instead
supply a custom text file that provides this mapping for us.
Sample usage:
./preprocess_imagenet_validation_data.py ILSVRC2012_img_val \
imagenet_2012_validation_synset_labels.txt
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import errno
import os.path
import sys
if __name__ == '__main__':
if len(sys.argv) < 3:
print('Invalid usage\n'
'usage: preprocess_imagenet_validation_data.py '
'<validation data dir> <validation labels file>')
sys.exit(-1)
data_dir = sys.argv[1]
validation_labels_file = sys.argv[2]
# Read in the 50000 synsets associated with the validation data set.
labels = [l.strip() for l in open(validation_labels_file).readlines()]
unique_labels = set(labels)
# Make all sub-directories in the validation data dir.
for label in unique_labels:
labeled_data_dir = os.path.join(data_dir, label)
# Catch error if sub-directory exists
try:
os.makedirs(labeled_data_dir)
except OSError as e:
# Raise all errors but 'EEXIST'
if e.errno != errno.EEXIST:
raise
# Move all of the image to the appropriate sub-directory.
for i in range(len(labels)):
basename = 'ILSVRC2012_val_000%.5d.JPEG' % (i + 1)
original_filename = os.path.join(data_dir, basename)
if not os.path.exists(original_filename):
print('Failed to find: %s' % original_filename)
sys.exit(-1)
new_filename = os.path.join(data_dir, labels[i], basename)
os.rename(original_filename, new_filename)

View file

@ -0,0 +1,254 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Process the ImageNet Challenge bounding boxes for TensorFlow model training.
This script is called as
process_bounding_boxes.py <dir> [synsets-file]
Where <dir> is a directory containing the downloaded and unpacked bounding box
data. If [synsets-file] is supplied, then only the bounding boxes whose
synsets are contained within this file are returned. Note that the
[synsets-file] file contains synset ids, one per line.
The script dumps out a CSV text file in which each line contains an entry.
n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
The entry can be read as:
<JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
The bounding box for <JPEG file name> contains two points (xmin, ymin) and
(xmax, ymax) specifying the lower-left corner and upper-right corner of a
bounding box in *relative* coordinates.
The user supplies a directory where the XML files reside. The directory
structure in the directory <dir> is assumed to look like this:
<dir>/nXXXXXXXX/nXXXXXXXX_YYYY.xml
Each XML file contains a bounding box annotation. The script:
(1) Parses the XML file and extracts the filename, label and bounding box info.
(2) The bounding box is specified in the XML files as integer (xmin, ymin) and
(xmax, ymax) *relative* to image size displayed to the human annotator. The
size of the image displayed to the human annotator is stored in the XML file
as integer (height, width).
Note that the displayed size will differ from the actual size of the image
downloaded from image-net.org. To make the bounding box annotation usable,
we convert bounding box to floating point numbers relative to displayed
height and width of the image.
Note that each XML file might contain N bounding box annotations.
Note that the points are all clamped at a range of [0.0, 1.0] because some
human annotations extend outside the range of the supplied image.
See details here: http://image-net.org/download-bboxes
(3) By default, the script outputs all valid bounding boxes. If a
[synsets-file] is supplied, only the subset of bounding boxes associated
with those synsets are outputted. Importantly, one can supply a list of
synsets in the ImageNet Challenge and output the list of bounding boxes
associated with the training images of the ILSVRC.
We use these bounding boxes to inform the random distortion of images
supplied to the network.
If you run this script successfully, you will see the following output
to stderr:
> Finished processing 544546 XML files.
> Skipped 0 XML files not in ImageNet Challenge.
> Skipped 0 bounding boxes not in ImageNet Challenge.
> Wrote 615299 bounding boxes from 544546 annotated images.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import glob
import os.path
import sys
import xml.etree.ElementTree as ET
class BoundingBox(object):
pass
def GetItem(name, root, index=0):
count = 0
for item in root.iter(name):
if count == index:
return item.text
count += 1
# Failed to find "index" occurrence of item.
return -1
def GetInt(name, root, index=0):
# In some XML annotation files, the point values are not integers, but floats.
# So we add a float function to avoid ValueError.
return int(float(GetItem(name, root, index)))
def FindNumberBoundingBoxes(root):
index = 0
while True:
if GetInt('xmin', root, index) == -1:
break
index += 1
return index
def ProcessXMLAnnotation(xml_file):
"""Process a single XML file containing a bounding box."""
# pylint: disable=broad-except
try:
tree = ET.parse(xml_file)
except Exception:
print('Failed to parse: ' + xml_file, file=sys.stderr)
return None
# pylint: enable=broad-except
root = tree.getroot()
num_boxes = FindNumberBoundingBoxes(root)
boxes = []
for index in range(num_boxes):
box = BoundingBox()
# Grab the 'index' annotation.
box.xmin = GetInt('xmin', root, index)
box.ymin = GetInt('ymin', root, index)
box.xmax = GetInt('xmax', root, index)
box.ymax = GetInt('ymax', root, index)
box.width = GetInt('width', root)
box.height = GetInt('height', root)
box.filename = GetItem('filename', root) + '.JPEG'
box.label = GetItem('name', root)
xmin = float(box.xmin) / float(box.width)
xmax = float(box.xmax) / float(box.width)
ymin = float(box.ymin) / float(box.height)
ymax = float(box.ymax) / float(box.height)
# Some images contain bounding box annotations that
# extend outside of the supplied image. See, e.g.
# n03127925/n03127925_147.xml
# Additionally, for some bounding boxes, the min > max
# or the box is entirely outside of the image.
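# Illustrative example: xmin=-10 with width=500 yields min_x=-0.02, which the
# clamping below maps to xmin_scaled=0.0.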
min_x = min(xmin, xmax)
max_x = max(xmin, xmax)
box.xmin_scaled = min(max(min_x, 0.0), 1.0)
box.xmax_scaled = min(max(max_x, 0.0), 1.0)
min_y = min(ymin, ymax)
max_y = max(ymin, ymax)
box.ymin_scaled = min(max(min_y, 0.0), 1.0)
box.ymax_scaled = min(max(max_y, 0.0), 1.0)
boxes.append(box)
return boxes
if __name__ == '__main__':
if len(sys.argv) < 2 or len(sys.argv) > 3:
print('Invalid usage\n'
'usage: process_bounding_boxes.py <dir> [synsets-file]',
file=sys.stderr)
sys.exit(-1)
xml_files = glob.glob(sys.argv[1] + '/*/*.xml')
print('Identified %d XML files in %s' % (len(xml_files), sys.argv[1]),
file=sys.stderr)
if len(sys.argv) == 3:
labels = set([l.strip() for l in open(sys.argv[2]).readlines()])
print('Identified %d synset IDs in %s' % (len(labels), sys.argv[2]),
file=sys.stderr)
else:
labels = None
skipped_boxes = 0
skipped_files = 0
saved_boxes = 0
saved_files = 0
for file_index, one_file in enumerate(xml_files):
# Example: <...>/n06470073/n00141669_6790.xml
label = os.path.basename(os.path.dirname(one_file))
# Determine if the annotation is from an ImageNet Challenge label.
if labels is not None and label not in labels:
skipped_files += 1
continue
bboxes = ProcessXMLAnnotation(one_file)
assert bboxes is not None, 'No bounding boxes found in ' + one_file
found_box = False
for bbox in bboxes:
if labels is not None:
if bbox.label != label:
# Note: There is a slight bug in the bounding box annotation data.
# Many of the dog labels have the human label 'Scottish_deerhound'
# instead of the synset ID 'n02092002' in the bbox.label field. As a
# simple hack to overcome this issue, we only exclude bbox labels
# *which are synset ID's* that do not match original synset label for
# the XML file.
if bbox.label in labels:
skipped_boxes += 1
continue
# Guard against improperly specified boxes.
if (bbox.xmin_scaled >= bbox.xmax_scaled or
bbox.ymin_scaled >= bbox.ymax_scaled):
skipped_boxes += 1
continue
# Note bbox.filename occasionally contains '%s' in the name. This is
# data set noise that is fixed by just using the basename of the XML file.
image_filename = os.path.splitext(os.path.basename(one_file))[0]
print('%s.JPEG,%.4f,%.4f,%.4f,%.4f' %
(image_filename,
bbox.xmin_scaled, bbox.ymin_scaled,
bbox.xmax_scaled, bbox.ymax_scaled))
saved_boxes += 1
found_box = True
if found_box:
saved_files += 1
else:
skipped_files += 1
if not file_index % 5000:
print('--> processed %d of %d XML files.' %
(file_index + 1, len(xml_files)),
file=sys.stderr)
print('--> skipped %d boxes and %d XML files.' %
(skipped_boxes, skipped_files), file=sys.stderr)
print('Finished processing %d XML files.' % len(xml_files), file=sys.stderr)
print('Skipped %d XML files not in ImageNet Challenge.' % skipped_files,
file=sys.stderr)
print('Skipped %d bounding boxes not in ImageNet Challenge.' % skipped_boxes,
file=sys.stderr)
print('Wrote %d bounding boxes from %d annotated images.' %
(saved_boxes, saved_files),
file=sys.stderr)
print('Finished.', file=sys.stderr)

View file

@ -42,12 +42,10 @@ if __name__ == "__main__":
log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
os.makedirs(FLAGS.results_dir, exist_ok=True)
dllogger.init(
backends=[
dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=log_path),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
]
)
dllogger.init(backends=[
dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=log_path),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
])
else:
dllogger.init(backends=[])
dllogger.log(data=vars(FLAGS), step='PARAMETER')
@ -58,49 +56,46 @@ if __name__ == "__main__":
architecture=FLAGS.arch,
input_format='NHWC',
compute_format=FLAGS.data_format,
dtype=tf.float32 if FLAGS.precision == 'fp32' else tf.float16,
dtype=tf.float32,
n_channels=3,
height=224,
width=224,
height=224 if FLAGS.data_dir else FLAGS.synthetic_data_size,
width=224 if FLAGS.data_dir else FLAGS.synthetic_data_size,
distort_colors=False,
log_dir=FLAGS.results_dir,
model_dir=FLAGS.model_dir if FLAGS.model_dir is not None else FLAGS.results_dir,
data_dir=FLAGS.data_dir,
data_idx_dir=FLAGS.data_idx_dir,
weight_init=FLAGS.weight_init,
use_xla=FLAGS.use_xla,
use_tf_amp=FLAGS.use_tf_amp,
use_dali=FLAGS.use_dali,
use_xla=FLAGS.xla,
use_tf_amp=FLAGS.amp,
use_dali=FLAGS.dali,
gpu_memory_fraction=FLAGS.gpu_memory_fraction,
gpu_id=FLAGS.gpu_id,
seed=FLAGS.seed
)
seed=FLAGS.seed)
if FLAGS.mode in ["train", "train_and_evaluate", "training_benchmark"]:
runner.train(
iter_unit=FLAGS.iter_unit,
num_iter=FLAGS.num_iter,
run_iter=FLAGS.run_iter,
batch_size=FLAGS.batch_size,
warmup_steps=FLAGS.warmup_steps,
log_every_n_steps=FLAGS.display_every,
weight_decay=FLAGS.weight_decay,
lr_init=FLAGS.lr_init,
lr_warmup_epochs=FLAGS.lr_warmup_epochs,
momentum=FLAGS.momentum,
loss_scale=FLAGS.loss_scale,
label_smoothing=FLAGS.label_smoothing,
mixup=FLAGS.mixup,
use_static_loss_scaling=FLAGS.use_static_loss_scaling,
use_cosine_lr=FLAGS.use_cosine_lr,
is_benchmark=FLAGS.mode == 'training_benchmark',
use_final_conv=FLAGS.use_final_conv,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
quant_delay = FLAGS.quant_delay,
use_qdq = FLAGS.use_qdq,
finetune_checkpoint=FLAGS.finetune_checkpoint,
)
runner.train(iter_unit=FLAGS.iter_unit,
num_iter=FLAGS.num_iter,
run_iter=FLAGS.run_iter,
batch_size=FLAGS.batch_size,
warmup_steps=FLAGS.warmup_steps,
log_every_n_steps=FLAGS.display_every,
weight_decay=FLAGS.weight_decay,
lr_init=FLAGS.lr_init,
lr_warmup_epochs=FLAGS.lr_warmup_epochs,
momentum=FLAGS.momentum,
loss_scale=FLAGS.static_loss_scale,
label_smoothing=FLAGS.label_smoothing,
mixup=FLAGS.mixup,
use_static_loss_scaling=(FLAGS.static_loss_scale != -1),
use_cosine_lr=FLAGS.cosine_lr,
is_benchmark=FLAGS.mode == 'training_benchmark',
use_final_conv=FLAGS.use_final_conv,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
quant_delay=FLAGS.quant_delay,
use_qdq=FLAGS.use_qdq,
finetune_checkpoint=FLAGS.finetune_checkpoint)
if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:
@ -109,19 +104,17 @@ if __name__ == "__main__":
elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
runner.evaluate(
iter_unit=FLAGS.iter_unit if FLAGS.mode != "train_and_evaluate" else "epoch",
num_iter=FLAGS.num_iter if FLAGS.mode != "train_and_evaluate" else 1,
warmup_steps=FLAGS.warmup_steps,
batch_size=FLAGS.batch_size,
log_every_n_steps=FLAGS.display_every,
is_benchmark=FLAGS.mode == 'inference_benchmark',
export_dir=FLAGS.export_dir,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
use_final_conv=FLAGS.use_final_conv,
use_qdq=FLAGS.use_qdq
)
runner.evaluate(iter_unit=FLAGS.iter_unit if FLAGS.mode != "train_and_evaluate" else "epoch",
num_iter=FLAGS.num_iter if FLAGS.mode != "train_and_evaluate" else 1,
warmup_steps=FLAGS.warmup_steps,
batch_size=FLAGS.batch_size,
log_every_n_steps=FLAGS.display_every,
is_benchmark=FLAGS.mode == 'inference_benchmark',
export_dir=FLAGS.export_dir,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
use_final_conv=FLAGS.use_final_conv,
use_qdq=FLAGS.use_qdq)
if FLAGS.mode == 'predict':
if FLAGS.to_predict is None:
@ -134,4 +127,8 @@ if __name__ == "__main__":
raise NotImplementedError("Only single GPU inference is implemented.")
elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
runner.predict(FLAGS.to_predict, quantize=FLAGS.quantize, symmetric=FLAGS.symmetric, use_qdq=FLAGS.use_qdq, use_final_conv=FLAGS.use_final_conv)
runner.predict(FLAGS.to_predict,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
use_qdq=FLAGS.use_qdq,
use_final_conv=FLAGS.use_final_conv)

View file

@ -29,7 +29,7 @@ def conv2d(
data_format='NHWC',
dilation_rate=(1, 1),
use_bias=True,
kernel_initializer=tf.variance_scaling_initializer(),
kernel_initializer=tf.compat.v1.variance_scaling_initializer(),
bias_initializer=tf.zeros_initializer(),
trainable=True,
name=None
@ -56,6 +56,5 @@ def conv2d(
activation=None,
name=name
)
return net
return net

View file

@ -22,7 +22,7 @@ def dense(
units,
use_bias=True,
trainable=True,
kernel_initializer=tf.variance_scaling_initializer(),
kernel_initializer=tf.compat.v1.variance_scaling_initializer(),
bias_initializer=tf.zeros_initializer()
):

View file

@ -29,7 +29,7 @@ def squeeze_excitation_layer(
ratio,
training=True,
data_format='NCHW',
kernel_initializer=tf.variance_scaling_initializer(),
kernel_initializer=tf.compat.v1.variance_scaling_initializer(),
bias_initializer=tf.zeros_initializer(),
name="squeeze_excitation_layer"
):

View file

@ -15,7 +15,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import tensorflow as tf
@ -34,7 +33,6 @@ from utils.data_utils import normalized_inputs
from utils.learning_rate import learning_rate_scheduler
from utils.optimizers import FixedLossScalerOptimizer
__all__ = [
'ResnetModel',
]
@ -89,14 +87,14 @@ class ResnetModel(object):
)
self.conv2d_hparams = tf.contrib.training.HParams(
kernel_initializer=tf.variance_scaling_initializer(
kernel_initializer=tf.compat.v1.variance_scaling_initializer(
scale=2.0, distribution='truncated_normal', mode=weight_init
),
bias_initializer=tf.constant_initializer(0.0)
)
self.dense_hparams = tf.contrib.training.HParams(
kernel_initializer=tf.variance_scaling_initializer(
kernel_initializer=tf.compat.v1.variance_scaling_initializer(
scale=2.0, distribution='truncated_normal', mode=weight_init
),
bias_initializer=tf.constant_initializer(0.0)
@ -109,12 +107,13 @@ class ResnetModel(object):
print("Input_format", input_format)
print("dtype", str(dtype))
def __call__(self, features, labels, mode, params):
if mode == tf.estimator.ModeKeys.TRAIN:
mandatory_params = ["batch_size", "lr_init", "num_gpus", "steps_per_epoch",
"momentum", "weight_decay", "loss_scale", "label_smoothing"]
mandatory_params = [
"batch_size", "lr_init", "num_gpus", "steps_per_epoch", "momentum", "weight_decay", "loss_scale",
"label_smoothing"
]
for p in mandatory_params:
if p not in params:
raise RuntimeError("Parameter {} is missing.".format(p))
@ -141,43 +140,46 @@ class ResnetModel(object):
mixup = 0
eta = 0
if mode == tf.estimator.ModeKeys.TRAIN:
if mode == tf.estimator.ModeKeys.TRAIN:
eta = params['label_smoothing']
mixup = params['mixup']
if mode != tf.estimator.ModeKeys.PREDICT:
one_hot_smoothed_labels = tf.one_hot(labels, 1001,
on_value = 1 - eta + eta/1001,
off_value = eta/1001)
if mode != tf.estimator.ModeKeys.PREDICT:
n_cls = self.model_hparams.n_classes
one_hot_smoothed_labels = tf.one_hot(labels, n_cls,
on_value=1 - eta + eta / n_cls, off_value=eta / n_cls)
if mixup != 0:
print("Using mixup training with beta=", params['mixup'])
beta_distribution = tf.distributions.Beta(params['mixup'], params['mixup'])
feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])
feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])
reversed_feature_coefficients = tf.subtract(tf.ones(shape=feature_coefficients.shape), feature_coefficients)
reversed_feature_coefficients = tf.subtract(
tf.ones(shape=feature_coefficients.shape), feature_coefficients
)
rotated_features = tf.reverse(features, axis=[0])
rotated_features = tf.reverse(features, axis=[0])
features = feature_coefficients * features + reversed_feature_coefficients * rotated_features
label_coefficients = tf.squeeze(feature_coefficients, axis=[2, 3])
rotated_labels = tf.reverse(one_hot_smoothed_labels, axis=[0])
rotated_labels = tf.reverse(one_hot_smoothed_labels, axis=[0])
reversed_label_coefficients = tf.subtract(tf.ones(shape=label_coefficients.shape), label_coefficients)
reversed_label_coefficients = tf.subtract(
tf.ones(shape=label_coefficients.shape), label_coefficients
)
one_hot_smoothed_labels = label_coefficients * one_hot_smoothed_labels + reversed_label_coefficients * rotated_labels
# Update Global Step
global_step = tf.train.get_or_create_global_step()
tf.identity(global_step, name="global_step_ref")
tf.identity(features, name="features_ref")
if mode == tf.estimator.ModeKeys.TRAIN:
tf.identity(labels, name="labels_ref")
@ -202,16 +204,31 @@ class ResnetModel(object):
tf.identity(probs, name="probs_ref")
tf.identity(y_preds, name="y_preds_ref")
#if mode == tf.estimator.ModeKeys.TRAIN:
#
# assert (len(tf.trainable_variables()) == 161)
#
#else:
#
# assert (len(tf.trainable_variables()) == 0)
if mode == tf.estimator.ModeKeys.TRAIN and params['quantize']:
dllogger.log(data={"QUANTIZATION AWARE TRAINING ENABLED": True}, step=tuple())
if params['symmetric']:
dllogger.log(data={"MODE":"USING SYMMETRIC MODE"}, step=tuple())
tf.contrib.quantize.experimental_create_training_graph(tf.get_default_graph(), symmetric=True, use_qdq=params['use_qdq'] ,quant_delay=params['quant_delay'])
dllogger.log(data={"MODE": "USING SYMMETRIC MODE"}, step=tuple())
tf.contrib.quantize.experimental_create_training_graph(
tf.get_default_graph(),
symmetric=True,
use_qdq=params['use_qdq'],
quant_delay=params['quant_delay']
)
else:
dllogger.log(data={"MODE":"USING ASSYMETRIC MODE"}, step=tuple())
tf.contrib.quantize.create_training_graph(tf.get_default_graph(), quant_delay=params['quant_delay'], use_qdq=params['use_qdq'])
# Fix for restoring variables during fine-tuning of Resnet-50
dllogger.log(data={"MODE": "USING ASSYMETRIC MODE"}, step=tuple())
tf.contrib.quantize.create_training_graph(
tf.get_default_graph(), quant_delay=params['quant_delay'], use_qdq=params['use_qdq']
)
# Fix for restoring variables during fine-tuning of Resnet
if 'finetune_checkpoint' in params.keys():
train_vars = tf.trainable_variables()
train_var_dict = {}
@ -220,6 +237,13 @@ class ResnetModel(object):
dllogger.log(data={"Restoring variables from checkpoint": params['finetune_checkpoint']}, step=tuple())
tf.train.init_from_checkpoint(params['finetune_checkpoint'], train_var_dict)
with tf.device("/cpu:0"):
if hvd_utils.is_using_hvd():
sync_var = tf.Variable(initial_value=[0], dtype=tf.int32, name="signal_handler_var")
sync_var_assing = sync_var.assign([1], name="signal_handler_var_set")
sync_var_reset = sync_var.assign([0], name="signal_handler_var_reset")
sync_op = hvd.allreduce(sync_var, op=hvd.Sum, name="signal_handler_all_reduce")
if mode == tf.estimator.ModeKeys.PREDICT:
predictions = {'classes': y_preds, 'probabilities': probs}
@ -239,8 +263,12 @@ class ResnetModel(object):
acc_top5 = tf.nn.in_top_k(predictions=logits, targets=labels, k=5)
else:
acc_top1, acc_top1_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=1))
acc_top5, acc_top5_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=5))
acc_top1, acc_top1_update_op = tf.metrics.mean(
tf.nn.in_top_k(predictions=logits, targets=labels, k=1)
)
acc_top5, acc_top5_update_op = tf.metrics.mean(
tf.nn.in_top_k(predictions=logits, targets=labels, k=5)
)
tf.identity(acc_top1, name="acc_top1_ref")
tf.identity(acc_top5, name="acc_top5_ref")
@ -251,20 +279,21 @@ class ResnetModel(object):
'accuracy_top1': acc_top1,
'accuracy_top5': acc_top5
}
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=one_hot_smoothed_labels)
cross_entropy = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=one_hot_smoothed_labels)
assert (cross_entropy.dtype == tf.float32)
tf.identity(cross_entropy, name='cross_entropy_loss_ref')
def loss_filter_fn(name):
"""we don't need to compute L2 loss for BN and bias (eq. to add a cste)"""
return all([
tensor_name not in name.lower()
# for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"]
for tensor_name in ["batchnorm", "batch_norm", "batch_normalization"]
])
return all(
[
tensor_name not in name.lower()
# for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"]
for tensor_name in ["batchnorm", "batch_norm", "batch_normalization"]
]
)
filtered_params = [tf.cast(v, tf.float32) for v in tf.trainable_variables() if loss_filter_fn(v.name)]
@ -287,7 +316,7 @@ class ResnetModel(object):
tf.summary.scalar('cross_entropy', cross_entropy)
tf.summary.scalar('l2_loss', l2_loss)
tf.summary.scalar('total_loss', total_loss)
if mode == tf.estimator.ModeKeys.TRAIN:
with tf.device("/cpu:0"):
@ -317,17 +346,18 @@ class ResnetModel(object):
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if mode != tf.estimator.ModeKeys.TRAIN:
update_ops += [acc_top1_update_op, acc_top5_update_op]
deterministic = True
gate_gradients = (tf.train.Optimizer.GATE_OP if deterministic else tf.train.Optimizer.GATE_NONE)
gate_gradients = (tf.compat.v1.train.Optimizer.GATE_OP if deterministic else tf.compat.v1.train.Optimizer.GATE_NONE)
backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)
if self.model_hparams.use_dali:
train_ops = tf.group(backprop_op, update_ops, name='train_ops')
else:
train_ops = tf.group(backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops')
train_ops = tf.group(
backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops'
)
return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_ops)
@ -338,23 +368,18 @@ class ResnetModel(object):
}
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=total_loss,
eval_metric_ops=eval_metrics
mode=mode, predictions=predictions, loss=total_loss, eval_metric_ops=eval_metrics
)
else:
raise NotImplementedError('Unknown mode {}'.format(mode))
@staticmethod
def _stage(tensors):
"""Stages the given tensors in a StagingArea for asynchronous put/get.
"""
stage_area = tf.contrib.staging.StagingArea(
dtypes=[tensor.dtype for tensor in tensors],
shapes=[tensor.get_shape() for tensor in tensors]
dtypes=[tensor.dtype for tensor in tensors], shapes=[tensor.get_shape() for tensor in tensors]
)
put_op = stage_area.put(tensors)
@ -364,14 +389,11 @@ class ResnetModel(object):
return put_op, get_tensors
def build_model(self, inputs, training=True, reuse=False, use_final_conv=False):
with var_storage.model_variable_scope(
self.model_hparams.model_name,
reuse=reuse,
dtype=self.model_hparams.dtype):
self.model_hparams.model_name, reuse=reuse, dtype=self.model_hparams.dtype
):
with tf.variable_scope("input_reshape"):
if self.model_hparams.input_format == 'NHWC' and self.model_hparams.compute_format == 'NCHW':
@ -426,27 +448,29 @@ class ResnetModel(object):
batch_norm_hparams=self.batch_norm_hparams,
block_name="btlnck_block_%d_%d" % (block_id, layer_id),
use_se=self.model_hparams.use_se,
ratio=self.model_hparams.se_ratio)
ratio=self.model_hparams.se_ratio
)
with tf.variable_scope("output"):
net = layers.reduce_mean(
net, keepdims=use_final_conv, data_format=self.model_hparams.compute_format, name='spatial_mean')
net, keepdims=False, data_format=self.model_hparams.compute_format, name='spatial_mean'
)
if use_final_conv:
logits = layers.conv2d(
net,
n_channels=self.model_hparams.n_classes,
kernel_size=(1, 1),
strides=(1, 1),
padding='SAME',
data_format=self.model_hparams.compute_format,
dilation_rate=(1, 1),
use_bias=True,
kernel_initializer=self.dense_hparams.kernel_initializer,
bias_initializer=self.dense_hparams.bias_initializer,
trainable=training,
name='dense'
)
net,
n_channels=self.model_hparams.n_classes,
kernel_size=(1, 1),
strides=(1, 1),
padding='SAME',
data_format=self.model_hparams.compute_format,
dilation_rate=(1, 1),
use_bias=True,
kernel_initializer=self.dense_hparams.kernel_initializer,
bias_initializer=self.dense_hparams.bias_initializer,
trainable=training,
name='dense'
)
else:
logits = layers.dense(
inputs=net,
@ -454,7 +478,8 @@ class ResnetModel(object):
use_bias=True,
trainable=training,
kernel_initializer=self.dense_hparams.kernel_initializer,
bias_initializer=self.dense_hparams.bias_initializer)
bias_initializer=self.dense_hparams.bias_initializer
)
if logits.dtype != tf.float32:
logits = tf.cast(logits, tf.float32)
@ -464,27 +489,25 @@ class ResnetModel(object):
return probs, logits
model_architectures = {
'resnet50': {
'layers': [3, 4, 6, 3],
'widths': [64, 128, 256, 512],
'expansions': 4,
},
'resnext101-32x4d': {
'layers': [3, 4, 23, 3],
'widths': [128, 256, 512, 1024],
'expansions': 2,
'cardinality': 32,
},
'se-resnext101-32x4d' : {
'cardinality' : 32,
'layers' : [3, 4, 23, 3],
'widths' : [128, 256, 512, 1024],
'expansions' : 2,
'se-resnext101-32x4d': {
'cardinality': 32,
'layers': [3, 4, 23, 3],
'widths': [128, 256, 512, 1024],
'expansions': 2,
'use_se': True,
'se_ratio': 16,
},
}

View file

@ -71,4 +71,4 @@ if __name__=='__main__':
file.write("model_checkpoint_path: "+ "\"" + new_ckpt + "\"")
# Process the input checkpoint, apply transforms and generate a new checkpoint.
process_checkpoint(input_ckpt, new_ckpt_path, args.dense_layer)
process_checkpoint(input_ckpt, new_ckpt_path, args.dense_layer)

View file

@ -244,16 +244,16 @@ For example, to train on DGX-1 for 90 epochs using AMP, run:
Additionally, features like DALI data preprocessing or TensorFlow XLA can be enabled with
following arguments when running those scripts:
`bash ./resnet50v1.5/training/DGX1_RN50_AMP_90E.sh /path/to/result /data --use_xla --use_dali`
`bash ./resnet50v1.5/training/DGX1_RN50_AMP_90E.sh /path/to/result /data --xla --dali`
7. Start validation/evaluation.
To evaluate the validation dataset located in `/data/tfrecords`, run `main.py` with
`--mode=evaluate`. For example:
`python main.py --mode=evaluate --data_dir=/data/tfrecords --batch_size <batch size> --model_dir
<model location> --results_dir <output location> [--use_xla] [--use_tf_amp]`
<model location> --results_dir <output location> [--xla] [--amp]`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during evaluation.
The optional `--xla` and `--amp` flags control XLA and AMP during evaluation.
## Advanced
@ -292,99 +292,116 @@ The `runtime/` directory contains the following module that define the mechanics
The script for training and evaluating the ResNet-50 v1.5 model has a variety of parameters that control these processes.
```
usage: main.py [-h]
[--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
usage: main.py [-h] [--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
[--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}]
[--data_dir DATA_DIR] [--data_idx_dir DATA_IDX_DIR]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
[--batch_size BATCH_SIZE] [--num_iter NUM_ITER]
[--iter_unit {epoch,batch}] [--warmup_steps WARMUP_STEPS]
[--model_dir MODEL_DIR] [--results_dir RESULTS_DIR]
[--log_filename LOG_FILENAME] [--display_every DISPLAY_EVERY]
[--lr_init LR_INIT] [--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--loss_scale LOSS_SCALE]
[--label_smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--use_static_loss_scaling | --nouse_static_loss_scaling]
[--use_xla | --nouse_xla] [--use_dali | --nouse_dali]
[--use_tf_amp | --nouse_tf_amp]
[--use_cosine_lr | --nouse_cosine_lr] [--seed SEED]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
--batch_size BATCH_SIZE [--num_iter NUM_ITER]
[--run_iter RUN_ITER] [--iter_unit {epoch,batch}]
[--warmup_steps WARMUP_STEPS] [--model_dir MODEL_DIR]
[--results_dir RESULTS_DIR] [--log_filename LOG_FILENAME]
[--display_every DISPLAY_EVERY] [--seed SEED]
[--gpu_memory_fraction GPU_MEMORY_FRACTION] [--gpu_id GPU_ID]
JoC-RN50v1.5-TF
optional arguments:
-h, --help Show this help message and exit
[--finetune_checkpoint FINETUNE_CHECKPOINT] [--use_final_conv]
[--quant_delay QUANT_DELAY] [--quantize] [--use_qdq]
[--symmetric] [--data_dir DATA_DIR]
[--data_idx_dir DATA_IDX_DIR] [--dali]
[--synthetic_data_size SYNTHETIC_DATA_SIZE] [--lr_init LR_INIT]
[--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--label_smoothing LABEL_SMOOTHING]
[--mixup MIXUP] [--cosine_lr] [--xla]
[--data_format {NHWC,NCHW}] [--amp]
[--static_loss_scale STATIC_LOSS_SCALE]
JoC-RN50v1.5-TF
optional arguments:
-h, --help show this help message and exit.
--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}
Architecture of model to run (default is resnet50)
Architecture of model to run.
--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}
The execution mode of the script.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--run_iter RUN_ITER Number of training iterations to run on single run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write model. If undefined,
results dir will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by training script for DALI.
--gpu_id GPU_ID Specify ID of the target GPU on multi-device platform.
Effective only for single-GPU mode.
--finetune_checkpoint FINETUNE_CHECKPOINT
Path to pre-trained checkpoint which will be used for
fine-tuning.
--use_final_conv Use convolution operator instead of MLP as last layer.
--quant_delay QUANT_DELAY
Number of steps to be run before quantization starts
to happen.
--quantize Quantize weights and activations during training.
(Defaults to Asymmetric quantization)
--use_qdq Use QDQV3 op instead of FakeQuantWithMinMaxVars op for
quantization. QDQv3 does only scaling.
--symmetric Quantize weights and activations during training using
symmetric quantization.
Dataset arguments:
--data_dir DATA_DIR Path to dataset in TFRecord format. Files should be
named 'train-*' and 'validation-*'.
--data_idx_dir DATA_IDX_DIR
Path to index files for DALI. Files should be named
'train-*' and 'validation-*'.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write the model. If undefined,
results directory will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which the training log will be written.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--dali Enable DALI data input.
--synthetic_data_size SYNTHETIC_DATA_SIZE
Dimension of image for synthetic dataset.
Training arguments:
--lr_init LR_INIT Initial value for the learning rate.
--lr_warmup_epochs LR_WARMUP_EPOCHS
Number of warmup epochs for the learning rate schedule.
Number of warmup epochs for learning rate schedule.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--momentum MOMENTUM SGD momentum value for the momentum optimizer.
--loss_scale LOSS_SCALE
Loss scale for FP16 training and fast math FP32.
--momentum MOMENTUM SGD momentum value for the Momentum optimizer.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--mixup MIXUP The alpha parameter for mixup (if 0 then mixup is not
applied).
--use_static_loss_scaling
Use static loss scaling in FP16 or FP32 AMP.
--nouse_static_loss_scaling
--use_xla Enable XLA (Accelerated Linear Algebra) computation
--cosine_lr Use cosine learning rate schedule.
Generic optimization arguments:
--xla Enable XLA (Accelerated Linear Algebra) computation
for improved performance.
--nouse_xla
--use_dali Enable DALI data input.
--nouse_dali
--use_tf_amp Enable AMP to speedup FP32
computation using Tensor Cores.
--nouse_tf_amp
--use_cosine_lr Use cosine learning rate schedule.
--nouse_cosine_lr
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by the training script for DALI
--gpu_id GPU_ID Specify the ID of the target GPU on a multi-device platform.
Effective only for single-GPU mode.
--quantize Used to add quantization nodes in the graph (Default: Asymmetric quantization)
--symmetric If --quantize mode is used, this option enables symmetric quantization
--use_qdq Use quantize_and_dequantize (QDQ) op instead of FakeQuantWithMinMaxVars op for quantization. QDQ does only scaling.
--finetune_checkpoint Path to pre-trained checkpoint which can be used for fine-tuning
--quant_delay Number of steps to be run before quantization starts to happen
--data_format {NHWC,NCHW}
Data format used to do calculations.
--amp Enable Automatic Mixed Precision to speedup
computation using tensor cores.
Automatic Mixed Precision arguments:
--static_loss_scale STATIC_LOSS_SCALE
Use static loss scaling in FP32 AMP.
```
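For reference, a typical single-GPU mixed-precision training invocation with the renamed flags might look like the following (paths and hyperparameter values are illustrative only):

`python main.py --mode=train_and_evaluate --arch=resnet50 --iter_unit=epoch --num_iter=90 --batch_size=256 --amp --xla --cosine_lr --label_smoothing 0.1 --data_dir=/data/tfrecords --results_dir=/results`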
### Quantization Aware Training
@ -424,12 +441,13 @@ Arguments:
* `--input_format` : Data format of input tensor (Default: NCHW). Use NCHW format to optimize the graph with TensorRT.
* `--compute_format` : Data format of the operations in the network (Default: NCHW). Use NCHW format to optimize the graph with TensorRT.
### Inference process
To run inference on a single example with a checkpoint and a model script, use:
`python main.py --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during inference.
The optional `--xla` and `--amp` flags control XLA and AMP during inference.
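For instance, inference with both features enabled might be invoked as (paths are placeholders):

`python main.py --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results> --amp --xla`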
## Performance
@ -448,7 +466,7 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`python ./main.py --mode=training_benchmark --use_tf_amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --mode=training_benchmark --amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* For multiple GPUs
* FP32 / TF32
@ -457,16 +475,18 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --mode=training_benchmark --use_tf_amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --mode=training_benchmark --amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
with `--xla` and `--dali` flags. For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
Suggested batch sizes for training are 256 for mixed precision training and 128 for single precision training per single V100 16 GB.
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. The resolution of synthetic images used can be controlled with `--synthetic_data_size` flag.
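For example, a synthetic-data training benchmark with AMP and XLA could be launched as follows (batch size and image resolution are illustrative):

`python ./main.py --mode=training_benchmark --amp --xla --warmup_steps 200 --num_iter 500 --iter_unit batch --batch_size 256 --synthetic_data_size 224 --results_dir=<path to results directory>`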
#### Inference performance benchmark
To benchmark the inference performance on a specific batch size, run:
@ -477,11 +497,10 @@ To benchmark the inference performance on a specific batch size, run:
* AMP
`python ./main.py --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --mode=inference_benchmark --amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
For proper throughput and latency reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnet50v1.5`, by simply running:
@ -490,6 +509,9 @@ The benchmark can be automated with the `inference_benchmark.sh` script provided
The `<data dir>` parameter refers to the input data directory (by default `/data/tfrecords` inside the container).
By default, the benchmark tests the following configurations: **FP32**, **AMP**, **AMP + XLA** with different batch sizes.
When the optional directory with the DALI index files `<data idx dir>` is specified, the benchmark executes an additional **DALI + AMP + XLA** configuration.
For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
For a performance benchmark of the raw model, a synthetic dataset can be used. To use it, pass the `--synthetic_data_size` flag instead of `--data_dir` to specify the input image size.
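As an illustration, an inference benchmark on synthetic data could be run as (values are examples only):

`python ./main.py --mode=inference_benchmark --amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --synthetic_data_size 224 --results_dir=<path to results directory>`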
### Results
@ -568,17 +590,6 @@ on NVIDIA DGX A100 with (8x A100 40G) GPUs.
| 8 | ~2h | ~5h |
##### Training time: NVIDIA DGX A100 (8x A100 40GB)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-a100-8x-a100-40g)
on NVIDIA DGX A100 with (8x A100 40G) GPUs.
| GPUs | Time to train - mixed precision + XLA | Time to train - mixed precision | Time to train - TF32 + XLA | Time to train - TF32 |
|---|--------|---------|---------|-------|
| 1 | ~18h | ~19.5h | ~40h | ~47h |
| 8 | ~2h | ~2.5h | ~5h | ~6h |
##### Training time: NVIDIA DGX-1 (8x V100 16G)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-1-8x-v100-16g)
@ -821,22 +832,25 @@ on NVIDIA T4 with (1x T4 16G) GPU.
* Added benchmark results for DGX-2 and XLA-enabled DGX-1 and DGX-2.
3. July, 2019
* Added Cosine learning rate schedule
3. August, 2019
4. August, 2019
* Added mixup regularization
* Added T4 benchmarks
* Improved inference capabilities
* Added SavedModel export
4. January, 2020
5. January, 2020
* Removed manual checks for dataset paths to facilitate cloud storage solutions
* Move to a new logging solution
* Bump base docker image version
5. March, 2020
6. March, 2020
* Code cleanup and refactor
* Improved training process
6. June, 2020
7. June, 2020
* Added benchmark results for DGX-A100
* Updated benchmark results for DGX-1, DGX-2 and T4
* Updated base docker image version
8. August, 2020
* Updated command line argument names
* Added support for synthetic dataset with different image size
### Known issues
Performance without XLA enabled is low. We recommend using XLA.
Performance without XLA enabled is low due to a BN + ReLU fusion bug.

View file

@ -22,12 +22,12 @@ function test_configuration() {
}
test_configuration "FP32 nodali noxla"
test_configuration "FP32 nodali xla" "--use_xla"
test_configuration "FP16 nodali noxla" "--use_tf_amp"
test_configuration "FP16 nodali xla" "--use_tf_amp --use_xla"
test_configuration "FP32 nodali xla" "--xla"
test_configuration "FP16 nodali noxla" "--amp"
test_configuration "FP16 nodali xla" "--amp --xla"
if [ ! -z $DALI_DIR ]; then
test_configuration "FP16 dali xla" "--use_tf_amp --use_xla --use_dali --data_idx_dir ${DALI_DIR}"
test_configuration "FP16 dali xla" "--amp --xla --dali --data_idx_dir ${DALI_DIR}"
fi
cat $INFERENCE_BENCHMARK

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -1,20 +0,0 @@
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script does Quantization aware training of Resnet-50 by finetuning on the pre-trained model using 1 GPU and a batch size of 32.
# Usage ./GPU1_RN50_QAT.sh <path to the pre-trained model> <path to dataset> <path to results directory>
python main.py --mode=train_and_evaluate --batch_size=32 --lr_warmup_epochs=1 --quantize --symmetric --use_qdq --label_smoothing 0.1 --lr_init=0.00005 --momentum=0.875 --weight_decay=3.0517578125e-05 --finetune_checkpoint=$1 --data_dir=$2 --results_dir=$3 --num_iter 10 --data_format NHWC
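The quantization flags used by this removed helper (`--quantize`, `--symmetric`, `--use_qdq`, `--quant_delay`) remain available in `main.py`. For orientation only, a sketch of the standard TF1 graph rewrite these flags typically correspond to, under the assumption that `tf.contrib.quantize` is the underlying mechanism (the repository's actual hook lives in its model/runner code):
```
import tensorflow as tf

def add_quantization_nodes(quant_delay=0, symmetric=True):
    # Rewrites the default graph in place, inserting fake-quantization ops that
    # become active after `quant_delay` steps of ordinary float training.
    tf.contrib.quantize.experimental_create_training_graph(
        input_graph=tf.get_default_graph(),
        symmetric=symmetric,
        quant_delay=quant_delay)
```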

View file

@ -26,13 +26,13 @@ function run_benchmark() {
MODE_SIZE=$2
if [[ $4 -eq "1" ]]; then
XLA="--use_xla"
XLA="--xla"
else
XLA=""
fi
case $2 in
"amp") MODE_FLAGS="--use_tf_amp --use_static_loss_scaling --loss_scale=128";;
"amp") MODE_FLAGS="--amp --static_loss_scale 128";;
"fp32"|"tf32") MODE_FLAGS="";;
*) echo "Unsupported configuration, use amp, tf32 or fp32";;
esac

View file

@ -251,16 +251,16 @@ For example, to train on DGX-1 for 90 epochs using AMP, run:
Additionally, features like DALI data preprocessing or TensorFlow XLA can be enabled with
following arguments when running those scripts:
`bash ./resnext101-32x4d/training/DGX1_RNxt101-32x4d_AMP_90E.sh /path/to/result /data --use_xla --use_dali`
`bash ./resnext101-32x4d/training/DGX1_RNxt101-32x4d_AMP_90E.sh /path/to/result /data --xla --dali`
7. Start validation/evaluation.
To evaluate the validation dataset located in `/data/tfrecords`, run `main.py` with
`--mode=evaluate`. For example:
`python main.py --arch=resnext101-32x4d --mode=evaluate --data_dir=/data/tfrecords --batch_size <batch size> --model_dir
<model location> --results_dir <output location> [--use_xla] [--use_tf_amp]`
<model location> --results_dir <output location> [--xla] [--amp]`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during evaluation.
The optional `--xla` and `--amp` flags control XLA and AMP during evaluation.
## Advanced
@ -299,95 +299,116 @@ The `runtime/` directory contains the following module that define the mechanics
The script for training and evaluating the ResNext101-32x4d model has a variety of parameters that control these processes.
```
usage: main.py [-h]
[--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
usage: main.py [-h] [--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
[--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}]
[--data_dir DATA_DIR] [--data_idx_dir DATA_IDX_DIR]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
[--batch_size BATCH_SIZE] [--num_iter NUM_ITER]
[--iter_unit {epoch,batch}] [--warmup_steps WARMUP_STEPS]
[--model_dir MODEL_DIR] [--results_dir RESULTS_DIR]
[--log_filename LOG_FILENAME] [--display_every DISPLAY_EVERY]
[--lr_init LR_INIT] [--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--loss_scale LOSS_SCALE]
[--label_smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--use_static_loss_scaling | --nouse_static_loss_scaling]
[--use_xla | --nouse_xla] [--use_dali | --nouse_dali]
[--use_tf_amp | --nouse_tf_amp]
[--use_cosine_lr | --nouse_cosine_lr] [--seed SEED]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
--batch_size BATCH_SIZE [--num_iter NUM_ITER]
[--run_iter RUN_ITER] [--iter_unit {epoch,batch}]
[--warmup_steps WARMUP_STEPS] [--model_dir MODEL_DIR]
[--results_dir RESULTS_DIR] [--log_filename LOG_FILENAME]
[--display_every DISPLAY_EVERY] [--seed SEED]
[--gpu_memory_fraction GPU_MEMORY_FRACTION] [--gpu_id GPU_ID]
JoC-RN50v1.5-TF
optional arguments:
-h, --help Show this help message and exit
[--finetune_checkpoint FINETUNE_CHECKPOINT] [--use_final_conv]
[--quant_delay QUANT_DELAY] [--quantize] [--use_qdq]
[--symmetric] [--data_dir DATA_DIR]
[--data_idx_dir DATA_IDX_DIR] [--dali]
[--synthetic_data_size SYNTHETIC_DATA_SIZE] [--lr_init LR_INIT]
[--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--label_smoothing LABEL_SMOOTHING]
[--mixup MIXUP] [--cosine_lr] [--xla]
[--data_format {NHWC,NCHW}] [--amp]
[--static_loss_scale STATIC_LOSS_SCALE]
JoC-RN50v1.5-TF
optional arguments:
-h, --help show this help message and exit.
--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}
Architecture of model to run (to run Resnext-32x4d set
--arch=resnext101-32x4d)
Architecture of model to run.
--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}
The execution mode of the script.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--run_iter RUN_ITER Number of training iterations to run on single run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write model. If undefined,
results dir will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by training script for DALI.
--gpu_id GPU_ID Specify ID of the target GPU on multi-device platform.
Effective only for single-GPU mode.
--finetune_checkpoint FINETUNE_CHECKPOINT
Path to pre-trained checkpoint which will be used for
fine-tuning.
--use_final_conv Use convolution operator instead of MLP as last layer.
--quant_delay QUANT_DELAY
Number of steps to be run before quantization starts
to happen.
--quantize Quantize weights and activations during training.
(Defaults to asymmetric quantization)
--use_qdq Use QDQV3 op instead of FakeQuantWithMinMaxVars op for
quantization. QDQv3 does only scaling.
--symmetric Quantize weights and activations during training using
symmetric quantization.
Dataset arguments:
--data_dir DATA_DIR Path to dataset in TFRecord format. Files should be
named 'train-*' and 'validation-*'.
--data_idx_dir DATA_IDX_DIR
Path to index files for DALI. Files should be named
'train-*' and 'validation-*'.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write the model. If undefined,
results directory will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--dali Enable DALI data input.
--synthetic_data_size SYNTHETIC_DATA_SIZE
Dimension of image for synthetic dataset.
Training arguments:
--lr_init LR_INIT Initial value for the learning rate.
--lr_warmup_epochs LR_WARMUP_EPOCHS
Number of warmup epochs for the learning rate schedule.
Number of warmup epochs for learning rate schedule.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--momentum MOMENTUM SGD momentum value for the momentum optimizer.
--loss_scale LOSS_SCALE
Loss scale for FP16 training and fast math FP32.
--momentum MOMENTUM SGD momentum value for the Momentum optimizer.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--mixup MIXUP The alpha parameter for mixup (if 0 then mixup is not
applied).
--use_static_loss_scaling
Use static loss scaling in FP16 or FP32 AMP.
--nouse_static_loss_scaling
--use_xla Enable XLA (Accelerated Linear Algebra) computation
--cosine_lr Use cosine learning rate schedule.
Generic optimization arguments:
--xla Enable XLA (Accelerated Linear Algebra) computation
for improved performance.
--nouse_xla
--use_dali Enable DALI data input.
--nouse_dali
--use_tf_amp Enable AMP to speedup FP32
computation using Tensor Cores.
--nouse_tf_amp
--use_cosine_lr Use cosine learning rate schedule.
--nouse_cosine_lr
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by the training script for DALI
--gpu_id GPU_ID Specify the ID of the target GPU on a multi-device platform.
Effective only for single-GPU mode.
--data_format {NHWC,NCHW}
Data format used to do calculations.
--amp Enable Automatic Mixed Precision to speedup
computation using tensor cores.
Automatic Mixed Precision arguments:
--static_loss_scale STATIC_LOSS_SCALE
Use static loss scaling in FP32 AMP.
```
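The grouped help output above suggests the flags are registered in named argparse argument groups. A minimal, illustrative sketch of that layout (names, defaults and the selection of flags are assumptions, not copied from `main.py`):
```
import argparse

parser = argparse.ArgumentParser(description="JoC-RN50v1.5-TF")
parser.add_argument("--arch", choices=["resnet50", "resnext101-32x4d", "se-resnext101-32x4d"],
                    default="resnet50", help="Architecture of model to run.")
parser.add_argument("--batch_size", type=int, required=True,
                    help="Size of each minibatch per GPU.")

data_args = parser.add_argument_group("Dataset arguments")
data_args.add_argument("--data_dir", help="Path to dataset in TFRecord format.")
data_args.add_argument("--dali", action="store_true", help="Enable DALI data input.")
data_args.add_argument("--synthetic_data_size", type=int, default=224,
                       help="Dimension of image for synthetic dataset.")

train_args = parser.add_argument_group("Training arguments")
train_args.add_argument("--cosine_lr", action="store_true",
                        help="Use cosine learning rate schedule.")

opt_args = parser.add_argument_group("Generic optimization arguments")
opt_args.add_argument("--xla", action="store_true", help="Enable XLA computation.")
opt_args.add_argument("--amp", action="store_true",
                      help="Enable Automatic Mixed Precision.")

amp_args = parser.add_argument_group("Automatic Mixed Precision arguments")
amp_args.add_argument("--static_loss_scale", type=float, default=1.0,
                      help="Use static loss scaling in FP32 AMP.")

args = parser.parse_args(["--batch_size", "128", "--amp", "--xla"])
```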
### Inference process
@ -395,7 +416,7 @@ To run inference on a single example with a checkpoint and a model script, use:
`python main.py --arch=resnext101-32x4d --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during inference.
The optional `--xla` and `--amp` flags control XLA and AMP during inference.
## Performance
@ -414,7 +435,7 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --use_tf_amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* For multiple GPUs
* FP32 / TF32
@ -423,16 +444,16 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --use_tf_amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
with the `--xla` and `--dali` flags. For proper throughput reporting, the value of `--num_iter` must be greater than the `--warmup_steps` value.
Suggested batch sizes for training are 128 for mixed precision training and 64 for single precision training per single V100 16 GB.
If no `--data_dir=<path to imagenet>` flag is specified, the benchmarks will use a synthetic dataset. The resolution of the synthetic images can be controlled with the `--synthetic_data_size` flag.
#### Inference performance benchmark
@ -444,11 +465,10 @@ To benchmark the inference performance on a specific batch size, run:
* AMP
`python ./main.py --arch=resnext101-32x4d --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --arch=resnext101-32x4d --mode=inference_benchmark --amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
For proper throughput and latency reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnext101-32x4d`, by simply running:
@ -457,6 +477,9 @@ The benchmark can be automated with the `inference_benchmark.sh` script provided
The `<data dir>` parameter refers to the input data directory (by default `/data/tfrecords` inside the container).
By default, the benchmark tests the following configurations: **FP32**, **AMP**, **AMP + XLA** with different batch sizes.
When the optional directory with the DALI index files `<data idx dir>` is specified, the benchmark executes an additional **DALI + AMP + XLA** configuration.
For proper throughput reporting, the value of `--num_iter` must be greater than the `--warmup_steps` value.
For a performance benchmark of the raw model, a synthetic dataset can be used. To use it, pass the `--synthetic_data_size` flag instead of `--data_dir` to specify the input image size.
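As an illustration (paths, batch size and image size below are placeholders), the same synthetic-data benchmark can be launched from Python:
```
# Hypothetical wrapper around the documented benchmark command; no --data_dir
# is passed, so the benchmark falls back to synthetic 224x224 images.
import subprocess

cmd = [
    "python", "./main.py",
    "--arch=resnext101-32x4d",
    "--mode=inference_benchmark",
    "--amp", "--xla",
    "--warmup_steps", "20",
    "--num_iter", "100", "--iter_unit", "batch",
    "--batch_size", "128",
    "--synthetic_data_size", "224",
    "--results_dir", "/tmp/results",
]
subprocess.run(cmd, check=True)
```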
### Results
@ -769,6 +792,9 @@ on NVIDIA T4 with (1x T4 16G) GPU.
June 2020
- Initial release
August 2020
- Updated command line argument names
- Added support for a synthetic dataset with a configurable image size
### Known issues
Performance without XLA enabled is low. We recommend using XLA.
Performance without XLA enabled is low due to a BN + ReLU fusion bug.
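This corresponds to the grappler remapping workaround that the commit drops from `runtime/runner.py` (visible in the session-config hunk further down in this diff). Should the workaround need to be restored locally, it looked roughly like this:
```
# Sketch of the removed workaround: turn off grappler's remapping pass so the
# problematic BN + ReLU fusion is never generated, trading some performance.
import tensorflow as tf
from tensorflow.core.protobuf import rewriter_config_pb2

config = tf.compat.v1.ConfigProto()
config.graph_options.rewrite_options.remapping = rewriter_config_pb2.RewriterConfig.OFF
sess = tf.compat.v1.Session(config=config)
```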

View file

@ -22,12 +22,12 @@ function test_configuration() {
}
test_configuration "FP32 nodali noxla"
test_configuration "FP32 nodali xla" "--use_xla"
test_configuration "FP16 nodali noxla" "--use_tf_amp"
test_configuration "FP16 nodali xla" "--use_tf_amp --use_xla"
test_configuration "FP32 nodali xla" "--xla"
test_configuration "FP16 nodali noxla" "--amp"
test_configuration "FP16 nodali xla" "--amp --xla"
if [ ! -z $DALI_DIR ]; then
test_configuration "FP16 dali xla" "--use_tf_amp --use_xla --use_dali --data_idx_dir ${DALI_DIR}"
test_configuration "FP16 dali xla" "--amp --xla --dali --data_idx_dir ${DALI_DIR}"
fi
cat $INFERENCE_BENCHMARK

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -26,13 +26,13 @@ function run_benchmark() {
MODE_SIZE=$2
if [[ $4 -eq "1" ]]; then
XLA="--use_xla"
XLA="--xla"
else
XLA=""
fi
case $2 in
"amp") MODE_FLAGS="--use_tf_amp --use_static_loss_scaling --loss_scale=128";;
"amp") MODE_FLAGS="--amp --static_loss_scale 128";;
"fp32"|"tf32") MODE_FLAGS="";;
*) echo "Unsupported configuration, use amp, tf32 or fp32";;
esac

View file

@ -39,36 +39,34 @@ __all__ = [
class Runner(object):
def __init__(
self,
# ========= Model HParams ========= #
n_classes=1001,
architecture='resnet50',
input_format='NHWC', # NCHW or NHWC
compute_format='NCHW', # NCHW or NHWC
dtype=tf.float32, # tf.float32 or tf.float16
n_channels=3,
height=224,
width=224,
distort_colors=False,
model_dir=None,
log_dir=None,
data_dir=None,
data_idx_dir=None,
weight_init="fan_out",
self,
# ========= Model HParams ========= #
n_classes=1001,
architecture='resnet50',
input_format='NHWC', # NCHW or NHWC
compute_format='NCHW', # NCHW or NHWC
dtype=tf.float32, # tf.float32 or tf.float16
n_channels=3,
height=224,
width=224,
distort_colors=False,
model_dir=None,
log_dir=None,
data_dir=None,
data_idx_dir=None,
weight_init="fan_out",
# ======= Optimization HParams ======== #
use_xla=False,
use_tf_amp=False,
use_dali=False,
gpu_memory_fraction=1.0,
gpu_id=0,
# ======= Optimization HParams ======== #
use_xla=False,
use_tf_amp=False,
use_dali=False,
gpu_memory_fraction=1.0,
gpu_id=0,
# ======== Debug Flags ======== #
debug_verbosity=0,
seed=None
):
# ======== Debug Flags ======== #
debug_verbosity=0,
seed=None):
if dtype not in [tf.float32, tf.float16]:
raise ValueError("Unknown dtype received: %s (allowed: `tf.float32` and `tf.float16`)" % dtype)
@ -123,56 +121,49 @@ class Runner(object):
# =================================================
model_hparams = tf.contrib.training.HParams(
width=height,
height=width,
n_channels=n_channels,
n_classes=n_classes,
dtype=dtype,
input_format=input_format,
compute_format=compute_format,
distort_colors=distort_colors,
seed=tf_seed
)
model_hparams = tf.contrib.training.HParams(width=height,
height=width,
n_channels=n_channels,
n_classes=n_classes,
dtype=dtype,
input_format=input_format,
compute_format=compute_format,
distort_colors=distort_colors,
seed=tf_seed)
num_preprocessing_threads = 10 if not use_dali else 4
run_config_performance = tf.contrib.training.HParams(
num_preprocessing_threads=num_preprocessing_threads,
use_tf_amp=use_tf_amp,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id
)
run_config_performance = tf.contrib.training.HParams(num_preprocessing_threads=num_preprocessing_threads,
use_tf_amp=use_tf_amp,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id)
run_config_additional = tf.contrib.training.HParams(
model_dir=model_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
model_dir=model_dir, #if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
log_dir=log_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
data_dir=data_dir,
data_idx_dir=data_idx_dir,
num_preprocessing_threads=num_preprocessing_threads
)
num_preprocessing_threads=num_preprocessing_threads)
self.run_hparams = Runner._build_hparams(model_hparams, run_config_additional, run_config_performance)
model_name = architecture
architecture = resnet.model_architectures[architecture]
self._model = resnet.ResnetModel(
model_name=model_name,
n_classes=model_hparams.n_classes,
layers_count=architecture["layers"],
layers_depth=architecture["widths"],
expansions=architecture["expansions"],
input_format=model_hparams.input_format,
compute_format=model_hparams.compute_format,
dtype=model_hparams.dtype,
weight_init=weight_init,
use_dali=use_dali,
cardinality=architecture['cardinality'] if 'cardinality' in architecture else 1,
use_se=architecture['use_se'] if 'use_se' in architecture else False,
se_ratio=architecture['se_ratio'] if 'se_ratio' in architecture else 1
)
self._model = resnet.ResnetModel(model_name=model_name,
n_classes=model_hparams.n_classes,
layers_count=architecture["layers"],
layers_depth=architecture["widths"],
expansions=architecture["expansions"],
input_format=model_hparams.input_format,
compute_format=model_hparams.compute_format,
dtype=model_hparams.dtype,
weight_init=weight_init,
use_dali=use_dali,
cardinality=architecture['cardinality'] if 'cardinality' in architecture else 1,
use_se=architecture['use_se'] if 'use_se' in architecture else False,
se_ratio=architecture['se_ratio'] if 'se_ratio' in architecture else 1)
if self.run_hparams.seed is not None:
tf.set_random_seed(self.run_hparams.seed)
@ -196,9 +187,7 @@ class Runner(object):
except ValueError:
warnings.warn(
"the parameter `{}` already exists - existing value: {} and duplicated value: {}".format(
key, hparams.get(key), val
)
)
key, hparams.get(key), val))
return hparams
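The duplicate-key warning above belongs to the hparams-merging helper. A simplified, standalone sketch of that pattern (the repository's `_build_hparams` is the authoritative version):
```
import warnings
import tensorflow as tf

def build_hparams(*hparams_objects):
    # Merge several tf.contrib.training.HParams objects, warning on duplicate
    # keys instead of silently overwriting them.
    merged = tf.contrib.training.HParams()
    for hparams in hparams_objects:
        for key, val in hparams.values().items():
            try:
                merged.add_hparam(key, val)
            except ValueError:
                warnings.warn("the parameter `{}` already exists - existing value: {} "
                              "and duplicated value: {}".format(key, merged.get(key), val))
    return merged
```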
@ -214,9 +203,8 @@ class Runner(object):
def _get_session_config(mode, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
if mode not in ["train", 'validation', 'benchmark', 'inference']:
raise ValueError(
"Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode
)
raise ValueError("Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" %
mode)
# Limit available GPU memory (tune the size)
if use_dali:
@ -240,10 +228,6 @@ class Runner(object):
config.gpu_options.force_gpu_compatible = True # Force pinned memory
# Bug - disable bn+relu fusion
from tensorflow.core.protobuf import rewriter_config_pb2
config.graph_options.rewrite_options.remapping = (rewriter_config_pb2.RewriterConfig.OFF)
if mode == 'train':
config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // max(hvd.size(), 8) - 2))
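Condensed, the session-config logic above amounts to the following sketch (simplified: `hvd.size()` is replaced by a plain `num_workers` argument, and the non-DALI memory setting is an assumption):
```
import multiprocessing
import tensorflow as tf

def make_session_config(mode, use_dali, gpu_memory_fraction, num_workers=1):
    config = tf.compat.v1.ConfigProto()
    if use_dali:
        # Leave the rest of the GPU memory to the DALI pipeline.
        config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
    else:
        config.gpu_options.allow_growth = True  # assumption: grow on demand
    config.gpu_options.force_gpu_compatible = True  # force pinned memory
    if mode == 'train':
        config.intra_op_parallelism_threads = 1  # avoid pool of Eigen threads
        config.inter_op_parallelism_threads = max(
            2, multiprocessing.cpu_count() // max(num_workers, 8) - 2)
    return config
```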
@ -254,9 +238,8 @@ class Runner(object):
def _get_run_config(mode, model_dir, use_xla, use_dali, gpu_memory_fraction, gpu_id=0, seed=None):
if mode not in ["train", 'validation', 'benchmark', 'inference']:
raise ValueError(
"Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode
)
raise ValueError("Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" %
mode)
if seed is not None:
if hvd_utils.is_using_hvd():
@ -272,9 +255,11 @@ class Runner(object):
save_summary_steps=100 if mode in ['train', 'validation'] else 1e9, # disabled in benchmark mode
save_checkpoints_steps=None,
save_checkpoints_secs=None,
session_config=Runner._get_session_config(
mode=mode, use_xla=use_xla, use_dali=use_dali, gpu_memory_fraction=gpu_memory_fraction, gpu_id=gpu_id
),
session_config=Runner._get_session_config(mode=mode,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id),
keep_checkpoint_max=5,
keep_checkpoint_every_n_hours=1e6, # disabled
log_step_count_steps=1e9,
@ -282,14 +267,12 @@ class Runner(object):
device_fn=None,
protocol=None,
eval_distribute=None,
experimental_distribute=None
)
experimental_distribute=None)
if mode == 'train':
if hvd_utils.is_using_hvd():
config = config.replace(
save_checkpoints_steps=1000 if hvd.rank() == 0 else None, keep_checkpoint_every_n_hours=3
)
config = config.replace(save_checkpoints_steps=1000 if hvd.rank() == 0 else None,
keep_checkpoint_every_n_hours=3)
else:
config = config.replace(save_checkpoints_steps=1000, keep_checkpoint_every_n_hours=3)
@ -298,49 +281,45 @@ class Runner(object):
def _get_estimator(self, mode, run_params, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
if mode not in ["train", 'validation', 'benchmark', 'inference']:
raise ValueError(
"Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode
)
raise ValueError("Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" %
mode)
run_config = Runner._get_run_config(
mode=mode,
model_dir=self.run_hparams.model_dir,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id,
seed=self.run_hparams.seed
)
run_config = Runner._get_run_config(mode=mode,
model_dir=self.run_hparams.model_dir,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id,
seed=self.run_hparams.seed)
return tf.estimator.Estimator(
model_fn=self._model, model_dir=self.run_hparams.model_dir, config=run_config, params=run_params
)
return tf.estimator.Estimator(model_fn=self._model,
model_dir=self.run_hparams.model_dir,
config=run_config,
params=run_params)
def train(
self,
iter_unit,
num_iter,
run_iter,
batch_size,
warmup_steps=50,
weight_decay=1e-4,
lr_init=0.1,
lr_warmup_epochs=5,
momentum=0.9,
log_every_n_steps=1,
loss_scale=256,
label_smoothing=0.0,
mixup=0.0,
use_cosine_lr=False,
use_static_loss_scaling=False,
is_benchmark=False,
quantize=False,
symmetric=False,
quant_delay=0,
finetune_checkpoint=None,
use_final_conv=False,
use_qdq=False
):
def train(self,
iter_unit,
num_iter,
run_iter,
batch_size,
warmup_steps=50,
weight_decay=1e-4,
lr_init=0.1,
lr_warmup_epochs=5,
momentum=0.9,
log_every_n_steps=1,
loss_scale=256,
label_smoothing=0.0,
mixup=0.0,
use_cosine_lr=False,
use_static_loss_scaling=False,
is_benchmark=False,
quantize=False,
symmetric=False,
quant_delay=0,
finetune_checkpoint=None,
use_final_conv=False,
use_qdq=False):
if iter_unit not in ["epoch", "batch"]:
raise ValueError('`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)
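Reading the constructor parameters and the `train()` signature above together, a hypothetical end-to-end call looks as follows (values are placeholders; in the repository this wiring is done by `main.py` from the parsed flags):
```
import tensorflow as tf
from runtime import Runner  # assumption: the import path used by main.py

runner = Runner(architecture='resnet50',
                input_format='NHWC',
                compute_format='NCHW',
                dtype=tf.float32,
                model_dir='/tmp/model',
                data_dir='/data/tfrecords',
                data_idx_dir='/data/dali_idx',
                use_xla=True,
                use_tf_amp=True,
                use_dali=True,
                gpu_memory_fraction=0.7,
                seed=1)

runner.train(iter_unit='epoch',
             num_iter=90,
             run_iter=-1,  # assumption: negative value meaning "run to completion"
             batch_size=256,
             warmup_steps=100,
             lr_init=0.256,
             lr_warmup_epochs=8,
             momentum=0.875,
             weight_decay=3.0517578125e-05,
             label_smoothing=0.1,
             use_cosine_lr=True,
             use_static_loss_scaling=True,
             loss_scale=128)
```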
@ -383,9 +362,8 @@ class Runner(object):
run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter
if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
idx_filenames = runner_utils.parse_dali_idx_dataset(
data_idx_dir=self.run_hparams.data_idx_dir, mode="train"
)
idx_filenames = runner_utils.parse_dali_idx_dataset(data_idx_dir=self.run_hparams.data_idx_dir,
mode="train")
training_hooks = []
@ -447,14 +425,12 @@ class Runner(object):
if finetune_checkpoint:
estimator_params['finetune_checkpoint'] = finetune_checkpoint
image_classifier = self._get_estimator(
mode='train',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_id=self.run_hparams.gpu_id
)
image_classifier = self._get_estimator(mode='train',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_id=self.run_hparams.gpu_id)
def training_data_fn():
@ -462,30 +438,26 @@ class Runner(object):
if hvd.rank() == 0:
print("Using DALI input... ")
return data_utils.get_dali_input_fn(
filenames=filenames,
idx_filenames=idx_filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=True,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
return data_utils.get_dali_input_fn(filenames=filenames,
idx_filenames=idx_filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=True,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True)
elif self.run_hparams.data_dir is not None:
return data_utils.get_tfrecords_input_fn(
filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=True,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
return data_utils.get_tfrecords_input_fn(filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=True,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True)
else:
if hvd.rank() == 0:
@ -555,14 +527,12 @@ class Runner(object):
'use_qdq': use_qdq,
'use_final_conv': use_final_conv}
image_classifier = self._get_estimator(
mode='validation',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_id=self.run_hparams.gpu_id
)
image_classifier = self._get_estimator(mode='validation',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_id=self.run_hparams.gpu_id)
if self.run_hparams.data_dir is not None:
filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
@ -579,9 +549,8 @@ class Runner(object):
num_steps = num_iter
if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
idx_filenames = runner_utils.parse_dali_idx_dataset(
data_idx_dir=self.run_hparams.data_idx_dir, mode="validation"
)
idx_filenames = runner_utils.parse_dali_idx_dataset(data_idx_dir=self.run_hparams.data_idx_dir,
mode="validation")
eval_hooks = []
@ -603,29 +572,25 @@ class Runner(object):
if hvd.rank() == 0:
print("Using DALI input... ")
return data_utils.get_dali_input_fn(
filenames=filenames,
idx_filenames=idx_filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
return data_utils.get_dali_input_fn(filenames=filenames,
idx_filenames=idx_filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True)
elif self.run_hparams.data_dir is not None:
return data_utils.get_tfrecords_input_fn(
filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
return data_utils.get_tfrecords_input_fn(filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True)
else:
print("Using Synthetic Data ...\n")
@ -651,29 +616,25 @@ class Runner(object):
eval_latencies_q = np.quantile(eval_latencies, q=[0.9, 0.95, 0.99])
eval_latencies_mean = np.mean(eval_latencies)
dllogger.log(
data={
'top1_accuracy': float(eval_results['top1_accuracy']),
'top5_accuracy': float(eval_results['top5_accuracy']),
'eval_throughput': eval_throughput,
'eval_latency_avg': eval_latencies_mean,
'eval_latency_p90': eval_latencies_q[0],
'eval_latency_p95': eval_latencies_q[1],
'eval_latency_p99': eval_latencies_q[2],
},
step=tuple()
)
dllogger.log(data={
'top1_accuracy': float(eval_results['top1_accuracy']),
'top5_accuracy': float(eval_results['top5_accuracy']),
'eval_throughput': eval_throughput,
'eval_latency_avg': eval_latencies_mean,
'eval_latency_p90': eval_latencies_q[0],
'eval_latency_p95': eval_latencies_q[1],
'eval_latency_p99': eval_latencies_q[2],
},
step=tuple())
if export_dir is not None:
dllogger.log(data={'export_dir': export_dir}, step=tuple())
input_receiver_fn = data_utils.get_serving_input_receiver_fn(
batch_size=None,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_channels=self.run_hparams.n_channels,
data_format=self.run_hparams.input_format,
dtype=self.run_hparams.dtype
)
input_receiver_fn = data_utils.get_serving_input_receiver_fn(batch_size=None,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_channels=self.run_hparams.n_channels,
data_format=self.run_hparams.input_format,
dtype=self.run_hparams.dtype)
image_classifier.export_savedmodel(export_dir, input_receiver_fn)
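The evaluation path above can export a SavedModel through the serving input receiver. A hedged smoke test for such an export (the `serve` tag and `serving_default` signature key are the TF1 Estimator defaults, assumed here; the path and input shape are placeholders):
```
import numpy as np
import tensorflow as tf

export_dir = "/tmp/export/1600000000"  # placeholder: timestamped export directory

with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.compat.v1.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    signature = meta_graph.signature_def["serving_default"]
    input_name = list(signature.inputs.values())[0].name
    output_names = [info.name for info in signature.outputs.values()]
    batch = np.random.rand(1, 224, 224, 3).astype(np.float32)
    print(sess.run(output_names, feed_dict={input_name: batch}))
```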
@ -684,33 +645,35 @@ class Runner(object):
def predict(self, to_predict, quantize=False, symmetric=False, use_qdq=False, use_final_conv=False):
estimator_params = {'quantize': quantize, 'symmetric': symmetric, 'use_qdq': use_qdq, 'use_final_conv': use_final_conv}
estimator_params = {
'quantize': quantize,
'symmetric': symmetric,
'use_qdq': use_qdq,
'use_final_conv': use_final_conv
}
if to_predict is not None:
filenames = runner_utils.parse_inference_input(to_predict)
image_classifier = self._get_estimator(
mode='inference',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction
)
image_classifier = self._get_estimator(mode='inference',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction)
inference_hooks = []
def inference_data_fn():
return data_utils.get_inference_input_fn(
filenames=filenames,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_threads=self.run_hparams.num_preprocessing_threads
)
return data_utils.get_inference_input_fn(filenames=filenames,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_threads=self.run_hparams.num_preprocessing_threads)
try:
inference_results = image_classifier.predict(
input_fn=inference_data_fn, predict_keys=None, hooks=inference_hooks, yield_single_examples=True
)
inference_results = image_classifier.predict(input_fn=inference_data_fn,
predict_keys=None,
hooks=inference_hooks,
yield_single_examples=True)
for result in inference_results:
print(result['classes'], str(result['probabilities'][result['classes']]))

View file

@ -48,13 +48,13 @@ def list_filenames_in_dataset(data_dir, mode, count=True):
filename_pattern = os.path.join(data_dir, '%s-*' % mode)
file_list = sorted(tf.gfile.Glob(filename_pattern))
file_list = sorted(tf.compat.v1.gfile.Glob(filename_pattern))
num_samples = 0
if count:
def count_records(tf_record_filename):
count = 0
for _ in tf.python_io.tf_record_iterator(tf_record_filename):
for _ in tf.compat.v1.io.tf_record_iterator(tf_record_filename):
count += 1
return count
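For context, the same glob-and-count logic as a tiny standalone helper (simplified; the repository's function also returns the file list and supports skipping the count):
```
import os
import tensorflow as tf

def count_dataset_samples(data_dir, mode="train"):
    # Mirrors the pattern above: glob 'train-*' / 'validation-*' TFRecord shards
    # and count the records in each.
    pattern = os.path.join(data_dir, "%s-*" % mode)
    files = sorted(tf.compat.v1.gfile.Glob(pattern))
    return sum(sum(1 for _ in tf.compat.v1.io.tf_record_iterator(f)) for f in files)
```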

View file

@ -246,16 +246,16 @@ For example, to train on DGX-1 for 90 epochs using AMP, run:
Additionally, features like DALI data preprocessing or TensorFlow XLA can be enabled with
following arguments when running those scripts:
`bash ./se-resnext101-32x4d/training/DGX1_SE-RNxt101-32x4d_AMP_90E.sh /path/to/result /data/ --use_xla --use_dali`
`bash ./se-resnext101-32x4d/training/DGX1_SE-RNxt101-32x4d_AMP_90E.sh /path/to/result /data/ --xla --dali`
7. Start validation/evaluation.
To evaluate the validation dataset located in `/data/tfrecords`, run `main.py` with
`--mode=evaluate`. For example:
`python main.py --arch=se-resnext101-32x4d --mode=evaluate --data_dir=/data/tfrecords --batch_size <batch size> --model_dir
<model location> --results_dir <output location> [--use_xla] [--use_tf_amp]`
<model location> --results_dir <output location> [--xla] [--amp]`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during evaluation.
The optional `--xla` and `--amp` flags control XLA and AMP during evaluation.
## Advanced
@ -294,95 +294,116 @@ The `runtime/` directory contains the following module that define the mechanics
The script for training and evaluating the ResNext101-32x4d model has a variety of parameters that control these processes.
```
usage: main.py [-h]
[--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
usage: main.py [-h] [--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
[--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}]
[--data_dir DATA_DIR] [--data_idx_dir DATA_IDX_DIR]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
[--batch_size BATCH_SIZE] [--num_iter NUM_ITER]
[--iter_unit {epoch,batch}] [--warmup_steps WARMUP_STEPS]
[--model_dir MODEL_DIR] [--results_dir RESULTS_DIR]
[--log_filename LOG_FILENAME] [--display_every DISPLAY_EVERY]
[--lr_init LR_INIT] [--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--loss_scale LOSS_SCALE]
[--label_smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--use_static_loss_scaling | --nouse_static_loss_scaling]
[--use_xla | --nouse_xla] [--use_dali | --nouse_dali]
[--use_tf_amp | --nouse_tf_amp]
[--use_cosine_lr | --nouse_cosine_lr] [--seed SEED]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
--batch_size BATCH_SIZE [--num_iter NUM_ITER]
[--run_iter RUN_ITER] [--iter_unit {epoch,batch}]
[--warmup_steps WARMUP_STEPS] [--model_dir MODEL_DIR]
[--results_dir RESULTS_DIR] [--log_filename LOG_FILENAME]
[--display_every DISPLAY_EVERY] [--seed SEED]
[--gpu_memory_fraction GPU_MEMORY_FRACTION] [--gpu_id GPU_ID]
JoC-RN50v1.5-TF
optional arguments:
-h, --help Show this help message and exit
[--finetune_checkpoint FINETUNE_CHECKPOINT] [--use_final_conv]
[--quant_delay QUANT_DELAY] [--quantize] [--use_qdq]
[--symmetric] [--data_dir DATA_DIR]
[--data_idx_dir DATA_IDX_DIR] [--dali]
[--synthetic_data_size SYNTHETIC_DATA_SIZE] [--lr_init LR_INIT]
[--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--label_smoothing LABEL_SMOOTHING]
[--mixup MIXUP] [--cosine_lr] [--xla]
[--data_format {NHWC,NCHW}] [--amp]
[--static_loss_scale STATIC_LOSS_SCALE]
JoC-RN50v1.5-TF
optional arguments:
-h, --help show this help message and exit.
--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}
Architecture of model to run (to run se-resnext-32x4d set
--arch=se-resnext101-32x4d)
Architecture of model to run.
--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}
The execution mode of the script.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--run_iter RUN_ITER Number of training iterations to run on single run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write model. If undefined,
results dir will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by training script for DALI.
--gpu_id GPU_ID Specify ID of the target GPU on multi-device platform.
Effective only for single-GPU mode.
--finetune_checkpoint FINETUNE_CHECKPOINT
Path to pre-trained checkpoint which will be used for
fine-tuning.
--use_final_conv Use convolution operator instead of MLP as last layer.
--quant_delay QUANT_DELAY
Number of steps to be run before quantization starts
to happen.
--quantize Quantize weights and activations during training.
(Defaults to asymmetric quantization)
--use_qdq Use QDQV3 op instead of FakeQuantWithMinMaxVars op for
quantization. QDQv3 does only scaling.
--symmetric Quantize weights and activations during training using
symmetric quantization.
Dataset arguments:
--data_dir DATA_DIR Path to dataset in TFRecord format. Files should be
named 'train-*' and 'validation-*'.
--data_idx_dir DATA_IDX_DIR
Path to index files for DALI. Files should be named
'train-*' and 'validation-*'.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write the model. If undefined,
results directory will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--dali Enable DALI data input.
--synthetic_data_size SYNTHETIC_DATA_SIZE
Dimension of image for synthetic dataset.
Training arguments:
--lr_init LR_INIT Initial value for the learning rate.
--lr_warmup_epochs LR_WARMUP_EPOCHS
Number of warmup epochs for the learning rate schedule.
Number of warmup epochs for learning rate schedule.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--momentum MOMENTUM SGD momentum value for the momentum optimizer.
--loss_scale LOSS_SCALE
Loss scale for FP16 training and fast math FP32.
--momentum MOMENTUM SGD momentum value for the Momentum optimizer.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--mixup MIXUP The alpha parameter for mixup (if 0 then mixup is not
applied).
--use_static_loss_scaling
Use static loss scaling in FP16 or FP32 AMP.
--nouse_static_loss_scaling
--use_xla Enable XLA (Accelerated Linear Algebra) computation
--cosine_lr Use cosine learning rate schedule.
Generic optimization arguments:
--xla Enable XLA (Accelerated Linear Algebra) computation
for improved performance.
--nouse_xla
--use_dali Enable DALI data input.
--nouse_dali
--use_tf_amp Enable AMP to speedup FP32
computation using Tensor Cores.
--nouse_tf_amp
--use_cosine_lr Use cosine learning rate schedule.
--nouse_cosine_lr
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by the training script for DALI
--gpu_id GPU_ID Specify the ID of the target GPU on a multi-device platform.
Effective only for single-GPU mode.
--data_format {NHWC,NCHW}
Data format used to do calculations.
--amp Enable Automatic Mixed Precision to speedup
computation using tensor cores.
Automatic Mixed Precision arguments:
--static_loss_scale STATIC_LOSS_SCALE
Use static loss scaling in FP32 AMP.
```
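`--amp` together with `--static_loss_scale` corresponds to TensorFlow's automatic mixed precision graph rewrite with a fixed loss scale. A generic TF 1.15 sketch of that pattern, for orientation only (not necessarily how this repository enables AMP internally):
```
import tensorflow as tf

# Generic TF1 AMP pattern: wrap the optimizer so Tensor Core-eligible ops run in
# float16, with a fixed (static) loss scale of 128.
optimizer = tf.compat.v1.train.MomentumOptimizer(learning_rate=0.256, momentum=0.875)
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
    optimizer, loss_scale=128)
```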
### Inference process
@ -390,7 +411,7 @@ To run inference on a single example with a checkpoint and a model script, use:
`python main.py --arch=se-resnext101-32x4d --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during inference.
The optional `--xla` and `--amp` flags control XLA and AMP during inference.
## Performance
@ -409,7 +430,7 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --use_tf_amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* For multiple GPUs
* FP32 / TF32
@ -418,16 +439,17 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --use_tf_amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
with the `--xla` and `--dali` flags. For proper throughput reporting, the value of `--num_iter` must be greater than the `--warmup_steps` value.
Suggested batch sizes for training are 96 for mixed precision training and 64 for single precision training per single V100 16 GB.
If no `--data_dir=<path to imagenet>` flag is specified, the benchmarks will use a synthetic dataset. The resolution of the synthetic images can be controlled with the `--synthetic_data_size` flag.
#### Inference performance benchmark
@ -439,11 +461,10 @@ To benchmark the inference performance on a specific batch size, run:
* AMP
`python ./main.py --arch=se-resnext101-32x4d --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --arch=se-resnext101-32x4d --mode=inference_benchmark --amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
For proper throughput and latency reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
The benchmark can be automated with the `inference_benchmark.sh` script provided in `se-resnext101-32x4d`, by simply running:
@ -452,6 +473,9 @@ The benchmark can be automated with the `inference_benchmark.sh` script provided
The `<data dir>` parameter refers to the input data directory (by default `/data/tfrecords` inside the container).
By default, the benchmark tests the following configurations: **FP32**, **AMP**, **AMP + XLA** with different batch sizes.
When the optional directory with the DALI index files `<data idx dir>` is specified, the benchmark executes an additional **DALI + AMP + XLA** configuration.
For proper throughput reporting, the value of `--num_iter` must be greater than the `--warmup_steps` value.
For a performance benchmark of the raw model, a synthetic dataset can be used. To use it, pass the `--synthetic_data_size` flag instead of `--data_dir` to specify the input image size.
### Results
@ -761,6 +785,9 @@ on NVIDIA T4 with (1x T4 16G) GPU.
April 2020
- Initial release
August 2020
- Updated command line argument names
- Added support for a synthetic dataset with a configurable image size
### Known issues
Performance without XLA enabled is low. We recommend using XLA.
Performance without XLA enabled is low due to a BN + ReLU fusion bug.

View file

@ -22,12 +22,12 @@ function test_configuration() {
}
test_configuration "FP32 nodali noxla"
test_configuration "FP32 nodali xla" "--use_xla"
test_configuration "FP16 nodali noxla" "--use_tf_amp"
test_configuration "FP16 nodali xla" "--use_tf_amp --use_xla"
test_configuration "FP32 nodali xla" "--xla"
test_configuration "FP16 nodali noxla" "--amp"
test_configuration "FP16 nodali xla" "--amp --xla"
if [ ! -z $DALI_DIR ]; then
test_configuration "FP16 dali xla" "--use_tf_amp --use_xla --use_dali --data_idx_dir ${DALI_DIR}"
test_configuration "FP16 dali xla" "--amp --xla --dali --data_idx_dir ${DALI_DIR}"
fi
cat $INFERENCE_BENCHMARK

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=96 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=96 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=96 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=96 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=96 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=96 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=96 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=96 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp \
--amp \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -26,13 +26,13 @@ function run_benchmark() {
MODE_SIZE=$2
if [[ $4 -eq "1" ]]; then
XLA="--use_xla"
XLA="--xla"
else
XLA=""
fi
case $2 in
"amp") MODE_FLAGS="--use_tf_amp --use_static_loss_scaling --loss_scale=128";;
"amp") MODE_FLAGS="--amp --static_loss_scale=128";;
"fp32"|"tf32") MODE_FLAGS="";;
*) echo "Unsupported configuration, use amp, tf32 or fp32";;
esac
View file
@ -0,0 +1,687 @@
# Deploying the ResNet-50 v1.5 model on Triton Inference Server
This folder contains instructions for deployment to run inference
on Triton Inference Server as well as a detailed performance analysis.
The purpose of this document is to help you with achieving
the best inference performance.
## Table of contents
- [Solution overview](#solution-overview)
- [Introduction](#introduction)
- [Deployment process](#deployment-process)
- [Setup](#setup)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
- [Prepare configuration](#prepare-configuration)
- [Latency explanation](#latency-explanation)
- [Performance](#performance)
- [Offline scenario](#offline-scenario)
- [Offline: NVIDIA A40, TF-TRT with FP16](#offline-nvidia-a40-tf-trt-with-fp16)
- [Offline: NVIDIA DGX A100 (1x A100 80GB), TF-TRT with FP16](#offline-nvidia-dgx-a100-1x-a100-80gb-tf-trt-with-fp16)
- [Offline: NVIDIA DGX-1 (1x V100 32GB), TF-TRT with FP16](#offline-nvidia-dgx-1-1x-v100-32gb-tf-trt-with-fp16)
- [Offline: NVIDIA T4, TF-TRT with FP16](#offline-nvidia-t4-tf-trt-with-fp16)
- [Online scenario](#online-scenario)
- [Online: NVIDIA A40, TF-TRT with FP16](#online-nvidia-a40-tf-trt-with-fp16)
- [Online: NVIDIA DGX A100 (1x A100 80GB), TF-TRT with FP16](#online-nvidia-dgx-a100-1x-a100-80gb-tf-trt-with-fp16)
- [Online: NVIDIA DGX-1 (1x V100 32GB), TF-TRT with FP16](#online-nvidia-dgx-1-1x-v100-32gb-tf-trt-with-fp16)
- [Online: NVIDIA T4, TF-TRT with FP16](#online-nvidia-t4-tf-trt-with-fp16)
- [Release Notes](#release-notes)
- [Changelog](#changelog)
- [Known issues](#known-issues)
## Solution overview
### Introduction
The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server)
provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs.
The server provides an inference service via an HTTP or gRPC endpoint,
allowing remote clients to request inferencing for any number of GPU
or CPU models being managed by the server.
This README provides step-by-step deployment instructions for models generated
during training (as described in the [model README](../README.md)).
Additionally, this README provides the corresponding deployment scripts that
ensure optimal GPU utilization during inferencing on Triton Inference Server.
### Deployment process
The deployment process consists of two steps:
1. Conversion. The purpose of conversion is to find the best performing model
format supported by Triton Inference Server.
Triton Inference Server uses a number of runtime backends such as
[TensorRT](https://developer.nvidia.com/tensorrt),
[TensorFlow](https://github.com/triton-inference-server/tensorflow_backend) and
[ONNX Runtime](https://github.com/triton-inference-server/onnxruntime_backend)
to support various model types. Refer to
[Triton documentation](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton)
for a list of available backends.
2. Configuration. Model configuration on Triton Inference Server, which generates
necessary [configuration files](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md).
To run benchmarks measuring the model performance in inference,
perform the following steps:
1. Start the Triton Inference Server.
The Triton Inference Server is started
in a separate (possibly remote) container, and its ports for the gRPC or REST API are exposed.
2. Run accuracy tests.
Produce results which are tested against given accuracy thresholds.
Refer to step 8 in the [Quick Start Guide](#quick-start-guide).
3. Run performance tests.
Produce latency and throughput results for offline (static batching)
and online (dynamic batching) scenarios.
Refer to step 11 in the [Quick Start Guide](#quick-start-guide).
## Setup
Ensure you have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [TensorFlow1 NGC container 20.12](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
* [Triton Inference Server NGC container 20.12](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver)
* [NVIDIA CUDA repository](https://docs.nvidia.com/cuda/archive/11.1.1/index.html)
* [NVIDIA Ampere](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/), [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
## Quick Start Guide
Running the following scripts will build and launch the container with all
required dependencies for native TensorFlow as well as Triton Inference Server.
This is necessary for running inference and can also be used for data download,
processing, and training of the model.
1. Clone the repository.
IMPORTANT: This step is executed on the host computer.
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/TensorFlow/Classification/ConvNets
```
2. Set up the environment on the host PC and start Triton Inference Server.
```
source triton/scripts/setup_environment.sh
bash triton/scripts/docker/triton_inference_server.sh
```
3. Build and run a container that extends the NGC TensorFlow container with
the Triton Inference Server client libraries and dependencies.
```
bash triton/scripts/docker/build.sh
bash triton/scripts/docker/interactive.sh
```
4. Prepare the deployment configuration and create folders in Docker.
IMPORTANT: These and the following commands must be executed in the TensorFlow NGC container.
```
source triton/scripts/setup_environment.sh
```
5. Download and pre-process the dataset.
```
bash triton/scripts/download_data.sh
bash triton/scripts/process_dataset.sh
```
6. Set up the parameters for deployment.
```
source triton/scripts/setup_parameters.sh
```
7. Convert the model from training to inference format (e.g. TensorRT).
```
python3 triton/convert_model.py \
--input-path triton/rn50_model.py \
--input-type tf-estimator \
--output-path ${SHARED_DIR}/model \
--output-type ${FORMAT} \
--onnx-opset 12 \
--onnx-optimized 1 \
--max-batch-size ${MAX_BATCH_SIZE} \
--max-workspace-size 4294967296 \
--ignore-unknown-parameters \
\
--model-dir ${CHECKPOINT_DIR} \
--precision ${PRECISION} \
--dataloader triton/dataloader.py \
--data-dir ${DATASETS_DIR}/imagenet
```
8. Run the model accuracy tests in framework.
```
python3 triton/run_inference_on_fw.py \
--input-path ${SHARED_DIR}/model \
--input-type ${FORMAT} \
--dataloader triton/dataloader.py \
--data-dir ${DATASETS_DIR}/imagenet \
--images-num 256 \
--batch-size ${MAX_BATCH_SIZE} \
--output-dir ${SHARED_DIR}/correctness_dump \
--dump-labels
python3 triton/calculate_metrics.py \
--dump-dir ${SHARED_DIR}/correctness_dump \
--metrics triton/metrics.py \
--output-used-for-metrics classes \
--csv ${SHARED_DIR}/correctness_metrics.csv
cat ${SHARED_DIR}/correctness_metrics.csv
```
9. Configure the model on Triton Inference Server.
Generate the configuration from your model repository.
```
python3 triton/config_model_on_triton.py \
--model-repository ${MODEL_REPOSITORY_PATH} \
--model-path ${SHARED_DIR}/model \
--model-format ${FORMAT} \
--model-name ${MODEL_NAME} \
--model-version 1 \
--max-batch-size ${MAX_BATCH_SIZE} \
--precision ${PRECISION} \
--number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
--max-queue-delay-us ${TRITON_MAX_QUEUE_DELAY} \
--preferred-batch-sizes ${TRITON_PREFERRED_BATCH_SIZES} \
--capture-cuda-graph 0 \
--backend-accelerator ${BACKEND_ACCELERATOR} \
--load-model ${TRITON_LOAD_MODEL_METHOD}
```
10. Run the Triton Inference Server accuracy tests.
```
python3 triton/run_inference_on_triton.py \
--server-url localhost:8001 \
--model-name ${MODEL_NAME} \
--model-version 1 \
--dataloader triton/dataloader.py \
--data-dir ${DATASETS_DIR}/imagenet \
--batch-size ${MAX_BATCH_SIZE} \
--output-dir ${SHARED_DIR}/accuracy_dump \
--dump-labels
python3 triton/calculate_metrics.py \
--dump-dir ${SHARED_DIR}/accuracy_dump \
--metrics triton/metrics.py \
--output-used-for-metrics classes \
--csv ${SHARED_DIR}/accuracy_metrics.csv
cat ${SHARED_DIR}/accuracy_metrics.csv
```
11. Run the Triton Inference Server performance online tests.
We want to maximize throughput within latency budget constraints.
Dynamic batching is a feature of Triton Inference Server that allows
inference requests to be combined by the server, so that a batch is
created dynamically, resulting in a reduced average latency.
You can set the Dynamic Batcher parameter `max_queue_delay_microseconds` to
indicate the maximum amount of time you are willing to wait and
`preferred_batch_size` to indicate the batch sizes that the dynamic batcher should attempt to create
in the Triton Inference Server model configuration. The measurements
presented below set the maximum latency to zero to achieve the best latency
possible with good performance.
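If you want to experiment with different dynamic batching settings, one option is to override the corresponding environment variables and then repeat the model configuration step (step 9). The values below are purely illustrative:
```
export TRITON_MAX_QUEUE_DELAY=100            # microseconds the dynamic batcher may wait
export TRITON_PREFERRED_BATCH_SIZES="32 64"  # batch sizes the batcher should try to build
```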
```
python triton/run_online_performance_test_on_triton.py \
 --server-url ${TRITON_SERVER_URL} \
 --model-name ${MODEL_NAME} \
 --input-data random \
 --batch-sizes ${BATCH_SIZE} \
 --triton-instances ${TRITON_INSTANCES} \
 --number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
 --result-path ${SHARED_DIR}/triton_performance_online.csv
```
12. Run the Triton Inference Server performance offline tests.
We want to maximize throughput. This assumes that your data is already available
for inference or that incoming data saturates the maximum batch size quickly.
Triton Inference Server supports offline scenarios with static batching.
Static batching allows inference requests to be served
as they are received. The largest improvements to throughput come
from increasing the batch size due to efficiency gains in the GPU with larger
batches.
```
python triton/run_offline_performance_test_on_triton.py \
 --server-url ${TRITON_SERVER_URL} \
 --model-name ${MODEL_NAME} \
 --input-data random \
 --batch-sizes ${BATCH_SIZE} \
 --triton-instances ${TRITON_INSTANCES} \
 --result-path ${SHARED_DIR}/triton_performance_offline.csv
```
## Advanced
### Prepare configuration
You can use the environment variables to set the parameters of your inference
configuration.
Triton deployment scripts support several inference runtimes listed in the table below:
| Inference runtime | Mnemonic used in scripts |
|--------------------|--------------------------|
| [TensorFlow SavedModel](https://www.tensorflow.org/guide/saved_model) | `tf-savedmodel` |
| [TensorFlow TensorRT](https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html) | `tf-trt` |
| [ONNX](https://onnx.ai) | `onnx` |
| [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) | `trt` |
The name of the inference runtime should be put into the `FORMAT` variable.
Example values of some key variables in one configuration:
```
PRECISION="fp16"
FORMAT="tf-trt"
BATCH_SIZE="1, 2, 4, 8, 16, 32, 64, 128"
BACKEND_ACCELERATOR="trt"
MAX_BATCH_SIZE="128"
NUMBER_OF_MODEL_INSTANCES="2"
TRITON_MAX_QUEUE_DELAY="1"
TRITON_PREFERRED_BATCH_SIZES="64 128"
```
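For example, to evaluate the ONNX Runtime path instead of TF-TRT, it may be enough to change the runtime mnemonic and repeat the conversion (step 7) and configuration (step 9) steps; this is only a sketch, and the remaining variables are kept as above:
```
FORMAT="onnx"
```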
### Latency explanation
A typical Triton Inference Server pipeline can be broken down into the following steps:
1. The client serializes the inference request into a message and sends it to
the server (Client Send).
2. The message travels over the network from the client to the server (Network).
3. The message arrives at the server and is deserialized (Server Receive).
4. The request is placed on the queue (Server Queue).
5. The request is removed from the queue and computed (Server Compute).
6. The completed request is serialized in a message and sent back to
the client (Server Send).
7. The completed message then travels over the network from the server
to the client (Network).
8. The completed message is deserialized by the client and processed as
a completed inference request (Client Receive).
Generally, for local clients, steps 1-4 and 6-8 will only occupy
a small fraction of time compared to step 5. Because backend deep learning
systems such as ResNet-50 are rarely exposed directly to end users and instead
only interface with local front-end servers, we can consider
all clients to be local for this analysis.
## Performance
### Offline scenario
This table lists the common variable parameters for all performance measurements:
| Parameter Name | Parameter Value |
|:-----------------------------|:------------------|
| Max Batch Size | 128 |
| Number of model instances | 2 |
| Triton Max Queue Delay | 1 |
| Triton Preferred Batch Sizes | 64 128 |
#### Offline: NVIDIA A40, TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA A40
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
|![](plots/graph_performance_offline_3l.svg)|![](plots/graph_performance_offline_3r.svg)|
|-----|-----|
<details>
<summary>
Full tabular data
</summary>
| Precision | Backend Accelerator | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
| FP16 | TensorRT | 1 | 329.5 | 3.23 | 3.43 | 3.973 | 3.031 |
| FP16 | TensorRT | 2 | 513.8 | 4.292 | 4.412 | 4.625 | 3.888 |
| FP16 | TensorRT | 4 | 720.8 | 6.122 | 6.264 | 6.5 | 5.543 |
| FP16 | TensorRT | 8 | 919.2 | 9.145 | 9.664 | 10.3 | 8.701 |
| FP16 | TensorRT | 16 | 1000 | 17.522 | 17.979 | 19.098 | 16.01 |
| FP16 | TensorRT | 32 | 889.6 | 37.49 | 38.481 | 40.316 | 35.946 |
| FP16 | TensorRT | 64 | 992 | 66.837 | 67.923 | 70.324 | 64.645 |
| FP16 | TensorRT | 128 | 896 | 148.461 | 149.854 | 150.05 | 143.684 |
</details>
#### Offline: NVIDIA DGX A100 (1x A100 80GB), TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA DGX A100 (1x A100 80GB)
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
|![](plots/graph_performance_offline_7l.svg)|![](plots/graph_performance_offline_7r.svg)|
|-----|-----|
<details>
<summary>
Full tabular data
</summary>
| Precision | Backend Accelerator | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
| FP16 | TensorRT | 1 | 387.9 | 2.626 | 2.784 | 2.875 | 2.574 |
| FP16 | TensorRT | 2 | 637.2 | 3.454 | 3.506 | 3.547 | 3.135 |
| FP16 | TensorRT | 4 | 982.4 | 4.328 | 4.454 | 4.627 | 4.07 |
| FP16 | TensorRT | 8 | 1181.6 | 7.012 | 7.074 | 7.133 | 6.765 |
| FP16 | TensorRT | 16 | 1446.4 | 11.162 | 11.431 | 11.941 | 11.061 |
| FP16 | TensorRT | 32 | 1353.6 | 24.392 | 24.914 | 25.178 | 23.603 |
| FP16 | TensorRT | 64 | 1478.4 | 45.539 | 46.096 | 47.546 | 43.401 |
| FP16 | TensorRT | 128 | 1331.2 | 97.504 | 100.611 | 101.896 | 96.198 |
</details>
#### Offline: NVIDIA DGX-1 (1x V100 32GB), TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA DGX-1 (1x V100 32GB)
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
|![](plots/graph_performance_offline_11l.svg)|![](plots/graph_performance_offline_11r.svg)|
|-----|-----|
<details>
<summary>
Full tabular data
</summary>
| Precision | Backend Accelerator | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
| FP16 | TensorRT | 1 | 255.6 | 4.032 | 4.061 | 4.141 | 3.909 |
| FP16 | TensorRT | 2 | 419.2 | 4.892 | 4.94 | 5.133 | 4.766 |
| FP16 | TensorRT | 4 | 633.6 | 6.603 | 6.912 | 7.18 | 6.306 |
| FP16 | TensorRT | 8 | 865.6 | 9.657 | 9.73 | 9.834 | 9.236 |
| FP16 | TensorRT | 16 | 950.4 | 18.396 | 20.748 | 23.873 | 16.824 |
| FP16 | TensorRT | 32 | 854.4 | 37.965 | 38.599 | 40.34 | 37.432 |
| FP16 | TensorRT | 64 | 825.6 | 80.118 | 80.758 | 87.374 | 77.596 |
| FP16 | TensorRT | 128 | 704 | 189.198 | 189.87 | 191.259 | 183.205 |
</details>
#### Offline: NVIDIA T4, TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA T4
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
|![](plots/graph_performance_offline_15l.svg)|![](plots/graph_performance_offline_15r.svg)|
|-----|-----|
<details>
<summary>
Full tabular data
</summary>
| Precision | Backend Accelerator | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
| FP16 | TensorRT | 1 | 211.7 | 4.89 | 4.926 | 4.965 | 4.717 |
| FP16 | TensorRT | 2 | 327.8 | 6.258 | 6.309 | 6.436 | 6.094 |
| FP16 | TensorRT | 4 | 468.4 | 8.996 | 9.085 | 9.239 | 8.531 |
| FP16 | TensorRT | 8 | 544.8 | 15.654 | 15.978 | 16.324 | 14.673 |
| FP16 | TensorRT | 16 | 544 | 30.626 | 30.788 | 31.311 | 29.477 |
| FP16 | TensorRT | 32 | 524.8 | 64.527 | 65.35 | 66.13 | 60.943 |
| FP16 | TensorRT | 64 | 556.8 | 115.455 | 115.717 | 116.02 | 113.802 |
| FP16 | TensorRT | 128 | 537.6 | 242.501 | 244.599 | 246.16 | 238.384 |
</details>
### Online scenario
This table lists the common variable parameters for all performance measurements:
| Parameter Name | Parameter Value |
|:-----------------------------|:------------------|
| Max Batch Size | 128 |
| Number of model instances | 2 |
| Triton Max Queue Delay | 1 |
| Triton Preferred Batch Sizes | 64 128 |
#### Online: NVIDIA A40, TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA A40
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
![](plots/graph_performance_online_6.svg)
<details>
<summary>
Full tabular data
</summary>
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 16 | 1421.3 | 0.109 | 4.875 | 1.126 | 0.895 | 4.188 | 0.053 | 0 | 11.046 | 17.34 | 17.851 | 19.013 | 11.246 |
| 32 | 1920 | 0.118 | 8.402 | 1.47 | 1.323 | 5.277 | 0.09 | 0 | 16.328 | 28.052 | 29.871 | 31.932 | 16.68 |
| 48 | 2270.4 | 0.12 | 11.505 | 1.856 | 1.582 | 5.953 | 0.113 | 0 | 22.172 | 31.87 | 35.395 | 41.256 | 21.129 |
| 64 | 2401.9 | 0.12 | 14.443 | 2.299 | 2.358 | 7.285 | 0.149 | 0 | 26.69 | 37.388 | 40.73 | 47.503 | 26.654 |
| 80 | 2823 | 0.126 | 14.917 | 2.71 | 2.406 | 7.977 | 0.174 | 0 | 29.113 | 39.932 | 43.789 | 51.24 | 28.31 |
| 96 | 2903.8 | 0.133 | 18.824 | 2.929 | 2.595 | 8.364 | 0.18 | 0 | 33.951 | 46.785 | 51.878 | 60.37 | 33.025 |
| 112 | 3096.6 | 0.135 | 20.018 | 3.362 | 2.97 | 9.434 | 0.209 | 0 | 37.927 | 50.587 | 55.169 | 63.141 | 36.128 |
| 128 | 3252 | 0.138 | 21.092 | 3.912 | 3.445 | 10.505 | 0.245 | 0 | 41.241 | 53.912 | 58.961 | 68.864 | 39.337 |
| 144 | 3352.4 | 0.137 | 21.407 | 4.527 | 4.237 | 12.363 | 0.293 | 0 | 44.211 | 59.876 | 65.971 | 79.335 | 42.964 |
| 160 | 3387.4 | 0.137 | 22.947 | 5.179 | 4.847 | 13.805 | 0.326 | 0 | 48.423 | 65.393 | 69.568 | 81.288 | 47.241 |
| 176 | 3409.1 | 0.142 | 24.989 | 5.623 | 5.539 | 14.956 | 0.357 | 0 | 52.714 | 71.332 | 78.478 | 99.086 | 51.606 |
| 192 | 3481.8 | 0.143 | 25.661 | 6.079 | 6.666 | 16.442 | 0.372 | 0 | 55.383 | 79.276 | 95.479 | 122.295 | 55.363 |
| 208 | 3523.8 | 0.147 | 27.042 | 6.376 | 7.526 | 17.413 | 0.4 | 0 | 58.823 | 86.375 | 104.134 | 123.278 | 58.904 |
| 224 | 3587.2 | 0.148 | 29.648 | 6.776 | 7.659 | 17.85 | 0.411 | 0 | 61.973 | 91.804 | 107.987 | 130.413 | 62.492 |
| 240 | 3507.4 | 0.153 | 31.079 | 7.987 | 9.246 | 19.342 | 0.426 | 0 | 65.697 | 106.035 | 121.914 | 137.572 | 68.233 |
| 256 | 3504.4 | 0.16 | 34.664 | 8.252 | 9.886 | 19.567 | 0.461 | 0 | 70.708 | 115.965 | 127.808 | 147.327 | 72.99 |
</details>
#### Online: NVIDIA DGX A100 (1x A100 80GB), TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA DGX A100 (1x A100 80GB)
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
![](plots/graph_performance_online_14.svg)
<details>
<summary>
Full tabular data
</summary>
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 16 | 1736.5 | 0.11 | 2.754 | 1.272 | 0.954 | 4.08 | 0.036 | 0 | 9.037 | 12.856 | 13.371 | 15.174 | 9.206 |
| 32 | 2418.9 | 0.114 | 5.15 | 1.494 | 1.361 | 5.031 | 0.072 | 0 | 13.234 | 20.638 | 21.717 | 23.352 | 13.222 |
| 48 | 2891.3 | 0.112 | 7.389 | 1.721 | 1.586 | 5.688 | 0.096 | 0 | 17.089 | 25.946 | 27.611 | 29.784 | 16.592 |
| 64 | 3432.6 | 0.11 | 7.866 | 2.11 | 2.126 | 6.301 | 0.131 | 0 | 19.322 | 25.971 | 28.845 | 34.024 | 18.644 |
| 80 | 3644.6 | 0.116 | 9.665 | 2.33 | 2.493 | 7.185 | 0.146 | 0 | 22.834 | 29.061 | 32.281 | 37.224 | 21.935 |
| 96 | 3902.2 | 0.116 | 11.138 | 2.676 | 2.828 | 7.684 | 0.166 | 0 | 25.589 | 32.572 | 35.307 | 40.123 | 24.608 |
| 112 | 3960.6 | 0.124 | 13.321 | 2.964 | 3.209 | 8.438 | 0.186 | 0 | 29.537 | 37.388 | 40.602 | 46.193 | 28.242 |
| 128 | 4137.7 | 0.124 | 14.325 | 3.372 | 3.646 | 9.244 | 0.219 | 0 | 31.587 | 41.968 | 44.993 | 51.38 | 30.93 |
| 144 | 4139.6 | 0.136 | 15.919 | 3.803 | 4.451 | 10.274 | 0.233 | 0 | 35.696 | 48.301 | 51.345 | 57.414 | 34.816 |
| 160 | 4300.5 | 0.134 | 16.453 | 4.341 | 4.934 | 10.979 | 0.274 | 0 | 38.495 | 50.566 | 53.943 | 61.406 | 37.115 |
| 176 | 4166.6 | 0.143 | 18.436 | 4.959 | 6.081 | 12.321 | 0.309 | 0 | 43.451 | 60.739 | 69.51 | 84.959 | 42.249 |
| 192 | 4281.3 | 0.138 | 19.585 | 5.201 | 6.571 | 13.042 | 0.313 | 0 | 46.175 | 62.718 | 69.46 | 83.032 | 44.85 |
| 208 | 4314.8 | 0.15 | 20.046 | 5.805 | 7.752 | 14.062 | 0.335 | 0 | 47.957 | 73.848 | 84.644 | 96.408 | 48.15 |
| 224 | 4388.2 | 0.141 | 21.393 | 6.105 | 8.236 | 14.85 | 0.343 | 0 | 50.449 | 77.534 | 88.553 | 100.727 | 51.068 |
| 240 | 4371.8 | 0.143 | 22.342 | 6.711 | 9.423 | 15.78 | 0.377 | 0 | 53.216 | 85.983 | 97.756 | 112.48 | 54.776 |
| 256 | 4617.3 | 0.144 | 23.392 | 6.595 | 9.466 | 15.568 | 0.367 | 0 | 54.703 | 86.054 | 93.95 | 105.917 | 55.532 |
</details>
#### Online: NVIDIA DGX-1 (1x V100 32GB), TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA DGX-1 (1x V100 32GB)
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
![](plots/graph_performance_online_22.svg)
<details>
<summary>
Full tabular data
</summary>
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 16 | 1259.7 | 0.121 | 3.735 | 1.999 | 0.803 | 5.998 | 0.034 | 0 | 13.623 | 17.271 | 17.506 | 18.938 | 12.69 |
| 32 | 1686.4 | 0.17 | 6.9 | 2.33 | 2.212 | 7.303 | 0.07 | 0 | 18.836 | 28.302 | 30.423 | 32.916 | 18.985 |
| 48 | 1888.3 | 0.183 | 9.068 | 3.372 | 3.65 | 9.058 | 0.108 | 0.001 | 26.571 | 36.583 | 40.84 | 50.402 | 25.44 |
| 64 | 2103.9 | 0.204 | 12.416 | 3.146 | 4.304 | 10.127 | 0.145 | 0.001 | 32.401 | 37.121 | 41.252 | 49.094 | 30.343 |
| 80 | 2255.2 | 0.211 | 13.753 | 4.074 | 5.455 | 11.776 | 0.192 | 0.001 | 38.298 | 47.082 | 54.476 | 65.412 | 35.462 |
| 96 | 2376.6 | 0.214 | 16.22 | 4.873 | 5.972 | 12.911 | 0.208 | 0.001 | 43.008 | 52.947 | 57.126 | 69.778 | 40.399 |
| 112 | 2445.6 | 0.243 | 18.495 | 5.461 | 7.012 | 14.365 | 0.248 | 0.001 | 48.081 | 62.414 | 68.274 | 85.766 | 45.825 |
| 128 | 2534.2 | 0.261 | 19.294 | 6.486 | 7.925 | 16.312 | 0.282 | 0.001 | 52.894 | 68.475 | 74.852 | 89.979 | 50.561 |
| 144 | 2483.9 | 0.27 | 20.771 | 7.744 | 9.993 | 18.865 | 0.414 | 0.001 | 64.866 | 70.434 | 80.279 | 99.177 | 58.058 |
| 160 | 2512.8 | 0.302 | 24.205 | 7.838 | 11.217 | 19.689 | 0.373 | 0.001 | 69.085 | 85.576 | 95.016 | 109.455 | 63.625 |
| 176 | 2541 | 0.311 | 26.206 | 8.556 | 12.439 | 21.393 | 0.418 | 0.001 | 76.666 | 92.266 | 106.889 | 127.055 | 69.324 |
| 192 | 2623.4 | 0.33 | 27.783 | 9.058 | 13.198 | 22.181 | 0.433 | 0.001 | 79.724 | 97.736 | 111.44 | 142.418 | 72.984 |
| 208 | 2616.2 | 0.353 | 29.667 | 9.759 | 15.693 | 23.567 | 0.444 | 0.001 | 80.571 | 125.202 | 140.527 | 175.331 | 79.484 |
| 224 | 2693.9 | 0.369 | 32.283 | 9.941 | 15.769 | 24.304 | 0.439 | 0.001 | 78.743 | 137.09 | 151.955 | 183.397 | 83.106 |
| 240 | 2700.4 | 0.447 | 32.287 | 11.128 | 18.204 | 26.578 | 0.456 | 0.001 | 82.561 | 155.011 | 177.925 | 191.51 | 89.101 |
| 256 | 2743.8 | 0.481 | 34.688 | 11.834 | 19.087 | 26.597 | 0.459 | 0.001 | 89.387 | 153.866 | 177.805 | 204.319 | 93.147 |
</details>
#### Online: NVIDIA T4, TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA T4
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
![](plots/graph_performance_online_30.svg)
<details>
<summary>
Full tabular data
</summary>
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 16 | 731.4 | 0.271 | 6.9 | 3.745 | 2.073 | 8.802 | 0.081 | 0.001 | 25.064 | 28.863 | 29.7 | 32.01 | 21.873 |
| 32 | 935 | 0.273 | 12.023 | 3.48 | 4.375 | 13.885 | 0.141 | 0.001 | 31.339 | 50.564 | 52.684 | 55.823 | 34.178 |
| 48 | 1253 | 0.298 | 12.331 | 5.313 | 4.623 | 15.634 | 0.178 | 0.001 | 38.099 | 60.665 | 64.537 | 72.38 | 38.378 |
| 64 | 1368.3 | 0.303 | 15.3 | 6.926 | 4.9 | 19.118 | 0.2 | 0.001 | 48.758 | 66.391 | 73.271 | 81.537 | 46.748 |
| 80 | 1410.7 | 0.296 | 15.525 | 11.06 | 6.934 | 22.476 | 0.286 | 0.001 | 60.346 | 65.664 | 76.055 | 84.643 | 56.578 |
| 96 | 1473.1 | 0.309 | 18.846 | 11.746 | 7.825 | 26.165 | 0.319 | 0.001 | 69.785 | 77.337 | 91.586 | 100.918 | 65.211 |
| 112 | 1475.5 | 0.316 | 23.275 | 12.412 | 8.954 | 30.724 | 0.338 | 0.001 | 79.904 | 106.324 | 111.382 | 126.559 | 76.02 |
| 128 | 1535.9 | 0.328 | 23.486 | 14.64 | 10.057 | 34.534 | 0.352 | 0.001 | 89.451 | 110.789 | 121.814 | 140.139 | 83.398 |
| 144 | 1512.3 | 0.336 | 25.79 | 18.7 | 12.205 | 37.909 | 0.435 | 0.001 | 103.388 | 108.917 | 114.44 | 136.469 | 95.376 |
| 160 | 1533.6 | 0.406 | 29.825 | 17.67 | 13.751 | 42.259 | 0.44 | 0.001 | 111.899 | 140.67 | 154.76 | 191.391 | 104.352 |
| 176 | 1515.1 | 0.438 | 34.286 | 17.867 | 16.42 | 46.792 | 0.461 | 0.001 | 120.503 | 187.317 | 205.71 | 223.391 | 116.265 |
| 192 | 1532.2 | 0.476 | 34.796 | 18.86 | 19.071 | 51.446 | 0.483 | 0.001 | 124.044 | 211.466 | 226.921 | 237.664 | 125.133 |
| 208 | 1616.7 | 0.697 | 32.363 | 21.465 | 18.315 | 55.539 | 0.516 | 0.001 | 127.891 | 200.478 | 221.404 | 250.348 | 128.896 |
| 224 | 1541.5 | 0.702 | 35.932 | 22.786 | 22.138 | 62.657 | 0.527 | 0.001 | 141.32 | 248.069 | 263.661 | 276.579 | 144.743 |
| 240 | 1631.7 | 0.79 | 37.581 | 22.791 | 21.651 | 64.278 | 0.549 | 0.001 | 141.393 | 250.354 | 272.17 | 289.926 | 147.641 |
| 256 | 1607.4 | 0.801 | 39.342 | 29.09 | 23.416 | 66.866 | 0.593 | 0.001 | 157.87 | 262.818 | 280.921 | 310.504 | 160.109 |
</details>
## Release Notes
We're constantly refining and improving our performance on AI
and HPC workloads, even on the same hardware, with frequent updates
to our software stack. For our latest performance data, refer
to these pages for
[AI](https://developer.nvidia.com/deep-learning-performance-training-inference)
and [HPC](https://developer.nvidia.com/hpc-application-performance) benchmarks.
### Changelog
July 2020
- Initial release
April 2021
- NVIDIA A100 results added
### Known issues
There are no known issues with this model.
View file
@ -0,0 +1,134 @@
#!/usr/bin/env python3
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
Using the `calculate_metrics.py` script, you can obtain model accuracy/error metrics using a user-defined `MetricsCalculator` class.
See the [documentation](https://gitlab-master.nvidia.com/dl/JoC/bermuda-api/-/blob/develop/bermuda_api_toolset/docs/metrics.md) on preparing this class.
Data provided to `MetricsCalculator` is obtained from [npz dump files](https://gitlab-master.nvidia.com/dl/JoC/bermuda-api/-/blob/develop/bermuda_api_toolset/docs/dump_files.md)
stored in the directory pointed to by the `--dump-dir` argument.
The above files are prepared by the `run_inference_on_fw.py` and `run_inference_on_triton.py` scripts.
Output data is stored in the csv file pointed to by the `--csv` argument.
Example call:
```shell script
python ./triton/calculate_metrics.py \
--dump-dir /results/dump_triton \
--csv /results/accuracy_results.csv \
--metrics metrics.py \
--metric-class-param1 value
```
"""
import argparse
import csv
import logging
import string
from pathlib import Path
import numpy as np
# method from PEP-366 to support relative import in executed modules
if __package__ is None:
__package__ = Path(__file__).parent.name
from .deployment_toolkit.args import ArgParserGenerator
from .deployment_toolkit.core import BaseMetricsCalculator, load_from_file
from .deployment_toolkit.dump import pad_except_batch_axis
LOGGER = logging.getLogger("calculate_metrics")
TOTAL_COLUMN_NAME = "_total_"
def get_data(dump_dir, prefix):
"""Loads and concatenates dump files for given prefix (ex. inputs, outputs, labels, ids)"""
dump_dir = Path(dump_dir)
npz_files = sorted(dump_dir.glob(f"{prefix}*.npz"))
data = None
if npz_files:
# assume that all npz files with given prefix contain same set of names
names = list(np.load(npz_files[0].as_posix()).keys())
# calculate target shape
target_shape = {
name: tuple(np.max([np.load(npz_file.as_posix())[name].shape for npz_file in npz_files], axis=0))
for name in names
}
# pad and concatenate data
data = {
name: np.concatenate(
[pad_except_batch_axis(np.load(npz_file.as_posix())[name], target_shape[name]) for npz_file in npz_files]
)
for name in names
}
return data
def main():
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description="Run models with given dataloader", allow_abbrev=False)
parser.add_argument("--metrics", help=f"Path to python module containing metrics calculator", required=True)
parser.add_argument("--csv", help="Path to csv file", required=True)
parser.add_argument("--dump-dir", help="Path to directory with dumped outputs (and labels)", required=True)
args, *_ = parser.parse_known_args()
MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator")
ArgParserGenerator(MetricsCalculator).update_argparser(parser)
args = parser.parse_args()
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator")
metrics_calculator: BaseMetricsCalculator = ArgParserGenerator(MetricsCalculator).from_args(args)
ids = get_data(args.dump_dir, "ids")["ids"]
x = get_data(args.dump_dir, "inputs")
y_true = get_data(args.dump_dir, "labels")
y_pred = get_data(args.dump_dir, "outputs")
common_keys = list({k for k in (y_true or [])} & {k for k in (y_pred or [])})
for key in common_keys:
if y_true[key].shape != y_pred[key].shape:
LOGGER.warning(
f"Model predictions and labels shall have equal shapes. "
f"y_pred[{key}].shape={y_pred[key].shape} != "
f"y_true[{key}].shape={y_true[key].shape}"
)
metrics = metrics_calculator.calc(ids=ids, x=x, y_pred=y_pred, y_real=y_true)
metrics = {TOTAL_COLUMN_NAME: len(ids), **metrics}
metric_names_with_space = [name for name in metrics if any([c in string.whitespace for c in name])]
if metric_names_with_space:
raise ValueError(f"Metric names shall have no spaces; Incorrect names: {', '.join(metric_names_with_space)}")
csv_path = Path(args.csv)
csv_path.parent.mkdir(parents=True, exist_ok=True)
with csv_path.open("w") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=list(metrics.keys()))
writer.writeheader()
writer.writerow(metrics)
if __name__ == "__main__":
main()
View file
@ -0,0 +1,193 @@
#!/usr/bin/env python3
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
To deploy a model on Triton, you can use the `deploy_model.py` script.
This will prepare the layout of the Model Repository, including the Model Configuration.
```shell script
python ./triton/deploy_model.py \
--model-repository /model_repository \
--model-path /models/exported/model.onnx \
--model-format onnx \
--model-name ResNet50 \
--model-version 1 \
--max-batch-size 32 \
--precision fp16 \
--backend-accelerator trt \
--load-model \
--timeout 120 \
--verbose
```
If the Triton server for which the model repository is prepared is running in **explicit model control mode**,
use the `--load-model` argument to send a load_model request to Triton Inference Server.
If the server is listening on a non-default address or port, use the `--server-url` argument to point to the server control endpoint.
If the HTTP protocol is required to communicate with the Triton server, use the `--http` argument.
To improve inference throughput you can use
[dynamic batching](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#dynamic-batcher)
for your model by providing `--preferred-batch-sizes` and `--max-queue-delay-us` parameters.
By default, Triton will [automatically obtain input and output definitions](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#auto-generated-model-configuration),
but for TorchScript models the script uses a file with I/O specs. This file is automatically generated
when the model is converted to a ScriptModule (either traced or scripted).
If a path other than the default I/O spec file needs to be passed, use the `--io-spec` CLI argument.
The I/O spec file is a YAML file with the structure below:
```yaml
- inputs:
- name: input
dtype: float32 # np.dtype name
shape: [None, 224, 224, 3]
- outputs:
- name: probabilities
dtype: float32
shape: [None, 1001]
- name: classes
dtype: int32
shape: [None, 1]
```
"""
import argparse
import logging
from service_maker import Accelerator, Format, Precision
from service_maker.args import str2bool
from service_maker.log import dump_arguments, set_logger
from service_maker.triton import ModelConfig, TritonClient, TritonModelStore
LOGGER = logging.getLogger("deploy_model")
def _available_enum_values(my_enum):
return [item.value for item in my_enum]
def main():
parser = argparse.ArgumentParser(
description="Create Triton model repository and model configuration", allow_abbrev=False
)
parser.add_argument("--model-repository", required=True, help="Path to Triton model repository.")
parser.add_argument("--model-path", required=True, help="Path to model to deploy")
# TODO: automation
parser.add_argument(
"--model-format",
required=True,
choices=_available_enum_values(Format),
help="Format of model to deploy",
)
parser.add_argument("--model-name", required=True, help="Model name")
parser.add_argument("--model-version", default="1", help="Version of model (default 1)")
parser.add_argument(
"--max-batch-size",
type=int,
default=32,
help="Maximum batch size allowed for inference. "
"A max_batch_size value of 0 indicates that batching is not allowed for the model",
)
# TODO: automation
parser.add_argument(
"--precision",
type=str,
default=Precision.FP16.value,
choices=_available_enum_values(Precision),
help="Model precision (parameter used only by Tensorflow backend with TensorRT optimization)",
)
# Triton Inference Server endpoint
parser.add_argument(
"--server-url",
type=str,
default="grpc://localhost:8001",
help="Inference server URL in format protocol://host[:port] (default grpc://localhost:8001)",
)
parser.add_argument(
"--load-model",
choices=["none", "poll", "explicit"],
help="Loading model while Triton Server is in given model control mode",
)
parser.add_argument(
"--timeout", default=120, help="Timeout in seconds to wait till model load (default=120)", type=int
)
# optimization related
parser.add_argument(
"--backend-accelerator",
type=str,
choices=_available_enum_values(Accelerator),
default=Accelerator.TRT.value,
help="Select Backend Accelerator used to serve model",
)
parser.add_argument("--number-of-model-instances", type=int, default=1, help="Number of model instances per GPU")
parser.add_argument(
"--preferred-batch-sizes",
type=int,
nargs="*",
help="Batch sizes that the dynamic batcher should attempt to create. "
"In case --max-queue-delay-us is set and this parameter is not, default value will be --max-batch-size",
)
parser.add_argument(
"--max-queue-delay-us",
type=int,
default=0,
help="Max delay time which dynamic batcher shall wait to form a batch (default 0)",
)
parser.add_argument(
"--capture-cuda-graph",
type=int,
default=0,
help="Use cuda capture graph (used only by TensorRT platform)",
)
parser.add_argument("-v", "--verbose", help="Provide verbose logs", type=str2bool, default=False)
args = parser.parse_args()
set_logger(verbose=args.verbose)
dump_arguments(args)
config = ModelConfig.create(
model_path=args.model_path,
# model definition
model_name=args.model_name,
model_version=args.model_version,
model_format=args.model_format,
precision=args.precision,
max_batch_size=args.max_batch_size,
# optimization
accelerator=args.backend_accelerator,
gpu_engine_count=args.number_of_model_instances,
preferred_batch_sizes=args.preferred_batch_sizes or [],
max_queue_delay_us=args.max_queue_delay_us,
capture_cuda_graph=args.capture_cuda_graph,
)
model_store = TritonModelStore(args.model_repository)
model_store.deploy_model(model_config=config, model_path=args.model_path)
if args.load_model != "none":
client = TritonClient(server_url=args.server_url, verbose=args.verbose)
if args.load_model == "explicit":
client.load_model(model_name=args.model_name)
client.wait_for_model(model_name=args.model_name, model_version=args.model_version, timeout_s=args.timeout)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,166 @@
#!/usr/bin/env python3
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
The `convert_model.py` script converts between model formats, with additional model optimizations
for faster inference.
It converts a model obtained from the [`get_model`](https://gitlab-master.nvidia.com/dl/JoC/bermuda-api/-/blob/develop/bermuda_api_toolset/docs/model.md) function.
Currently supported input and output formats are:
- inputs
- `tf-estimator` - `get_model` function returning Tensorflow Estimator
- `tf-keras` - `get_model` function returning Tensorflow Keras Model
- `tf-savedmodel` - Tensorflow SavedModel binary
- `pyt` - `get_model` function returning PyTorch Module
- output
- `tf-savedmodel` - Tensorflow saved model
- `tf-trt` - TF-TRT saved model
- `ts-trace` - PyTorch traced ScriptModule
- `ts-script` - PyTorch scripted ScriptModule
- `onnx` - ONNX
- `trt` - TensorRT plan file
For tf-keras input you can use:
- the `--large-model` flag - helps load a model which exceeds the maximum protobuf size of 2GB
- the `--tf-allow-growth` flag - controls the GPU memory growth limiting feature
(https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth). By default it is disabled.
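Example call (paths and values below are illustrative; see step 7 of the Quick Start Guide in triton/README.md for the full set of arguments used in this repository):
```shell script
python ./triton/convert_model.py \
    --input-path triton/rn50_model.py \
    --input-type tf-estimator \
    --output-path /results/model.savedmodel \
    --output-type tf-savedmodel \
    --precision fp16 \
    --model-dir /results/checkpoint \
    --dataloader triton/dataloader.py \
    --data-dir /data/imagenet
```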
"""
import argparse
import logging
import os
from pathlib import Path
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "1"
# method from PEP-366 to support relative import in executed modules
if __name__ == "__main__" and __package__ is None:
__package__ = Path(__file__).parent.name
from .deployment_toolkit.args import ArgParserGenerator
from .deployment_toolkit.core import (
DATALOADER_FN_NAME,
BaseConverter,
BaseLoader,
BaseSaver,
Format,
Precision,
load_from_file,
)
from .deployment_toolkit.extensions import converters, loaders, savers
LOGGER = logging.getLogger("convert_model")
INPUT_MODEL_TYPES = [Format.TF_ESTIMATOR, Format.TF_KERAS, Format.TF_SAVEDMODEL, Format.PYT]
OUTPUT_MODEL_TYPES = [Format.TF_SAVEDMODEL, Format.TF_TRT, Format.ONNX, Format.TRT, Format.TS_TRACE, Format.TS_SCRIPT]
def _get_args():
parser = argparse.ArgumentParser(description="Script for conversion between model formats.", allow_abbrev=False)
parser.add_argument("--input-path", help="Path to input model file (python module or binary file)", required=True)
parser.add_argument(
"--input-type", help="Input model type", choices=[f.value for f in INPUT_MODEL_TYPES], required=True
)
parser.add_argument("--output-path", help="Path to output model file", required=True)
parser.add_argument(
"--output-type", help="Output model type", choices=[f.value for f in OUTPUT_MODEL_TYPES], required=True
)
parser.add_argument("--dataloader", help="Path to python module containing data loader")
parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
parser.add_argument(
"--ignore-unknown-parameters",
help="Ignore unknown parameters (argument often used in CI where set of arguments is constant)",
action="store_true",
default=False,
)
args, unparsed_args = parser.parse_known_args()
Loader: BaseLoader = loaders.get(args.input_type)
ArgParserGenerator(Loader, module_path=args.input_path).update_argparser(parser)
converter_name = f"{args.input_type}--{args.output_type}"
Converter: BaseConverter = converters.get(converter_name)
if Converter is not None:
ArgParserGenerator(Converter).update_argparser(parser)
Saver: BaseSaver = savers.get(args.output_type)
ArgParserGenerator(Saver).update_argparser(parser)
if args.dataloader is not None:
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
ArgParserGenerator(get_dataloader_fn).update_argparser(parser)
if args.ignore_unknown_parameters:
args, unknown_args = parser.parse_known_args()
LOGGER.warning(f"Got additional args {unknown_args}")
else:
args = parser.parse_args()
return args
def main():
args = _get_args()
log_level = logging.INFO if not args.verbose else logging.DEBUG
log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
logging.basicConfig(level=log_level, format=log_format)
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
requested_model_precision = Precision(args.precision)
dataloader_fn = None
# if conversion is required, temporarily change the model load precision to that required by the converter
# this is for TensorRT converters which require fp32 models for all requested precisions
converter_name = f"{args.input_type}--{args.output_type}"
Converter: BaseConverter = converters.get(converter_name)
if Converter:
args.precision = Converter.required_source_model_precision(requested_model_precision).value
Loader: BaseLoader = loaders.get(args.input_type)
loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args)
model = loader.load(args.input_path)
LOGGER.info("inputs: %s", model.inputs)
LOGGER.info("outputs: %s", model.outputs)
if Converter: # if conversion is needed
# dataloader must match the source model precision - so not recovering it yet
if args.dataloader is not None:
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)
# recover precision to that requested by user
args.precision = requested_model_precision.value
if Converter:
converter = ArgParserGenerator(Converter).from_args(args)
model = converter.convert(model, dataloader_fn=dataloader_fn)
Saver: BaseSaver = savers.get(args.output_type)
saver = ArgParserGenerator(Saver).from_args(args)
saver.save(model, args.output_path)
return 0
if __name__ == "__main__":
main()
View file
@ -0,0 +1,45 @@
import logging
from pathlib import Path
import numpy as np
from PIL import Image
from rn50_model import HEIGHT, WIDTH
LOGGER = logging.getLogger(__name__)
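# get_dataloader_fn returns a generator factory: each yielded batch is a tuple (ids, x, y_real),
# where ids are the image paths, x is {"input": NHWC float32 images} and y_real is
# {"classes": class ids taken from the parent directory name}. Images that do not fill
# a complete final batch are dropped.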
def get_dataloader_fn(
*, data_dir: str, batch_size: int = 1, width: int = WIDTH, height: int = HEIGHT, images_num: int = None
):
image_extensions = [".gif", ".png", ".jpeg", ".jpg"]
image_paths = sorted([p for p in Path(data_dir).rglob("*") if p.suffix.lower() in image_extensions])
if images_num is not None:
image_paths = image_paths[:images_num]
LOGGER.info(
f"Creating PIL dataloader on data_dir={data_dir} #images={len(image_paths)} "
f"image_size=({width}, {height}) batch_size={batch_size}"
)
def _dataloader_fn():
batch = []
for image_path in image_paths:
img = Image.open(image_path.as_posix()).convert('RGB')
img = img.resize((width, height))
img = np.array(img).astype(np.float32)
true_class = np.array([int(image_path.parent.name)])
assert tuple(img.shape) == (height, width, 3)
img = img[np.newaxis, ...]
batch.append((img, image_path.as_posix(), true_class))
if len(batch) >= batch_size:
ids = [image_path for _, image_path, *_ in batch]
x = {
"input": np.concatenate([img for img, *_ in batch]),
}
y_real = {"classes": np.concatenate([class_ for *_, class_ in batch])}
batch = []
yield ids, x, y_real
return _dataloader_fn
View file
@ -0,0 +1 @@
0.4.6-46-g5bc739c
View file
@ -0,0 +1,110 @@
import argparse
import inspect
import logging
from typing import Callable, Dict, Optional, Union
from .core import GET_ARGPARSER_FN_NAME, load_from_file
LOGGER = logging.getLogger(__name__)
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
elif v.lower() in ("no", "false", "f", "n", "0"):
return False
else:
raise argparse.ArgumentTypeError("Boolean value expected.")
def filter_fn_args(args: Union[dict, argparse.Namespace], fn: Callable) -> dict:
signature = inspect.signature(fn)
parameters_names = list(signature.parameters)
if isinstance(args, argparse.Namespace):
args = vars(args)
args = {k: v for k, v in args.items() if k in parameters_names}
return args
def add_args_for_fn_signature(parser, fn) -> argparse.ArgumentParser:
parser.conflict_handler = "resolve"
signature = inspect.signature(fn)
for parameter in signature.parameters.values():
if parameter.name in ["self", "args", "kwargs"]:
continue
argument_kwargs = {}
if parameter.annotation != inspect.Parameter.empty:
if parameter.annotation == bool:
argument_kwargs["type"] = str2bool
argument_kwargs["choices"] = [0, 1]
elif type(parameter.annotation) == type(Union):
types = [type_ for type_ in parameter.annotation.__args__ if not isinstance(None, type_)]
if len(types) != 1:
raise RuntimeError(
f"Could not prepare argument parser for {parameter.name}: {parameter.annotation} in {fn}"
)
argument_kwargs["type"] = types[0]
else:
argument_kwargs["type"] = parameter.annotation
if parameter.default != inspect.Parameter.empty:
if parameter.annotation == bool:
argument_kwargs["default"] = str2bool(parameter.default)
else:
argument_kwargs["default"] = parameter.default
else:
argument_kwargs["required"] = True
name = parameter.name.replace("_", "-")
LOGGER.debug(f"Adding argument {name} with {argument_kwargs}")
parser.add_argument(f"--{name}", **argument_kwargs)
return parser
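# ArgParserGenerator inspects a function's (or class __init__'s) signature, adds matching
# CLI arguments to an argparse parser (update_argparser), and later instantiates or calls
# the target with the parsed values (from_args).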
class ArgParserGenerator:
def __init__(self, cls_or_fn, module_path: Optional[str] = None):
self._cls_or_fn = cls_or_fn
self._handle = cls_or_fn if inspect.isfunction(cls_or_fn) else getattr(cls_or_fn, "__init__")
input_is_python_file = module_path and module_path.endswith(".py")
self._input_path = module_path if input_is_python_file else None
self._required_fn_name_for_signature_parsing = getattr(
cls_or_fn, "required_fn_name_for_signature_parsing", None
)
def update_argparser(self, parser):
name = self._handle.__name__
group_parser = parser.add_argument_group(name)
add_args_for_fn_signature(group_parser, fn=self._handle)
self._update_argparser(group_parser)
def get_args(self, args: argparse.Namespace):
filtered_args = filter_fn_args(args, fn=self._handle)
tmp_parser = argparse.ArgumentParser(allow_abbrev=False)
self._update_argparser(tmp_parser)
custom_names = [
p.dest.replace("-", "_") for p in tmp_parser._actions if not isinstance(p, argparse._HelpAction)
]
custom_params = {n: getattr(args, n) for n in custom_names}
filtered_args = {**filtered_args, **custom_params}
return filtered_args
def from_args(self, args: Union[argparse.Namespace, Dict]):
args = self.get_args(args)
LOGGER.info(f"Initializing {self._cls_or_fn.__name__}({args})")
return self._cls_or_fn(**args)
def _update_argparser(self, parser):
label = "argparser_update"
if self._input_path:
update_argparser_handle = load_from_file(self._input_path, label=label, target=GET_ARGPARSER_FN_NAME)
if update_argparser_handle:
update_argparser_handle(parser)
elif self._required_fn_name_for_signature_parsing:
fn_handle = load_from_file(
self._input_path, label=label, target=self._required_fn_name_for_signature_parsing
)
if fn_handle:
add_args_for_fn_signature(parser, fn_handle)
View file
@ -0,0 +1,223 @@
import logging
from pathlib import Path
from typing import Dict, Optional, Union
import numpy as np
# pytype: disable=import-error
import onnx
import onnx.optimizer
import onnx.shape_inference
import onnxruntime
from google.protobuf import text_format
from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
# pytype: enable=import-error
from ..core import BaseLoader, BaseRunner, BaseRunnerSession, BaseSaver, Format, Model, Precision, TensorSpec
from ..extensions import loaders, runners, savers
from .utils import infer_precision
LOGGER = logging.getLogger(__name__)
def _value_info2tensor_spec(value_info: onnx.ValueInfoProto):
onnx_data_type_map = {"float": "float32", "double": "float64"}
elem_type_name = onnx.TensorProto.DataType.Name(value_info.type.tensor_type.elem_type).lower()
dtype = onnx_data_type_map.get(elem_type_name, elem_type_name)
def _get_dim(dim):
which = dim.WhichOneof("value")
if which is not None: # which is None when dim is None
dim = getattr(dim, which)
return None if isinstance(dim, (str, bytes)) else dim
shape = value_info.type.tensor_type.shape
shape = tuple([_get_dim(d) for d in shape.dim])
return TensorSpec(value_info.name, dtype=dtype, shape=shape)
def _infer_graph_precision(onnx_graph: onnx.GraphProto) -> Optional[Precision]:
import networkx as nx
# build directed graph
nx_graph = nx.DiGraph()
def _get_dtype(vi):
t = vi.type
if hasattr(t, "tensor_type"):
type_id = t.tensor_type.elem_type
else:
raise NotImplementedError("Not implemented yet")
return TENSOR_TYPE_TO_NP_TYPE[type_id]
node_output2type = {vi.name: _get_dtype(vi) for vi in onnx_graph.value_info}
node_outputs2node = {output_name: node for node in onnx_graph.node for output_name in node.output}
node_inputs2node = {input_name: node for node in onnx_graph.node for input_name in node.input}
for node in onnx_graph.node:
node_dtype = node_output2type.get("+".join(node.output), None)
nx_graph.add_node(
node.name,
op=node.op_type,
attr={a.name: a for a in node.attribute},
dtype=node_dtype,
)
for input_name in node.input:
prev_node = node_outputs2node.get(input_name, None)
if prev_node:
nx_graph.add_edge(prev_node.name, node.name)
for input_node in onnx_graph.input:
input_name = input_node.name
nx_graph.add_node(input_name, op="input", dtype=_get_dtype(input_node))
next_node = node_inputs2node.get(input_name, None)
if next_node:
nx_graph.add_edge(input_name, next_node.name)
for output in onnx_graph.output:
output_name = output.name
nx_graph.add_node(output_name, op="output", dtype=_get_dtype(output))
prev_node = node_outputs2node.get(output_name, None)
if prev_node:
nx_graph.add_edge(prev_node.name, output_name)
else:
LOGGER.warning(f"Could not find previous node for {output_name}")
input_names = [n.name for n in onnx_graph.input]
output_names = [n.name for n in onnx_graph.output]
most_common_dtype = infer_precision(nx_graph, input_names, output_names, lambda node: node.get("dtype", None))
if most_common_dtype is not None:
precision = {np.dtype("float32"): Precision.FP32, np.dtype("float16"): Precision.FP16}[most_common_dtype]
else:
precision = None
return precision
class OnnxLoader(BaseLoader):
def load(self, model_path: Union[str, Path], **_) -> Model:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
model = onnx.load(model_path)
onnx.checker.check_model(model)
onnx.helper.strip_doc_string(model)
model = onnx.shape_inference.infer_shapes(model)
# TODO: modifying the ONNX model's inputs/outputs probably causes an error during optimization
# from onnx.utils import polish_model
# model = polish_model(model) # run checker, docs strip, optimizer and shape inference
inputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.input}
outputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.output}
precision = _infer_graph_precision(model.graph)
return Model(model, precision, inputs, outputs)
class OnnxSaver(BaseSaver):
def __init__(self, as_text: bool = False):
self._as_text = as_text
def save(self, model: Model, model_path: Union[str, Path]) -> None:
model_path = Path(model_path)
LOGGER.debug(f"Saving ONNX model to {model_path.as_posix()}")
model_path.parent.mkdir(parents=True, exist_ok=True)
onnx_model: onnx.ModelProto = model.handle
if self._as_text:
with model_path.open("w") as f:
f.write(text_format.MessageToString(onnx_model))
else:
with model_path.open("wb") as f:
f.write(onnx_model.SerializeToString())
"""
ExecutionProviders on onnxruntime 1.4.0
['TensorrtExecutionProvider',
'CUDAExecutionProvider',
'MIGraphXExecutionProvider',
'NGRAPHExecutionProvider',
'OpenVINOExecutionProvider',
'DnnlExecutionProvider',
'NupharExecutionProvider',
'VitisAIExecutionProvider',
'ArmNNExecutionProvider',
'ACLExecutionProvider',
'CPUExecutionProvider']
"""
def _check_providers(providers):
providers = providers or []
if not isinstance(providers, (list, tuple)):
providers = [providers]
available_providers = onnxruntime.get_available_providers()
unavailable = set(providers) - set(available_providers)
if unavailable:
raise RuntimeError(f"Unavailable providers {unavailable}")
return providers
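# Illustrative check (provider names come from the onnxruntime list above):
#   _check_providers("CUDAExecutionProvider")   # -> ["CUDAExecutionProvider"] when available
#   _check_providers(["FooExecutionProvider"])  # -> raises RuntimeError if unavailable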
class OnnxRunner(BaseRunner):
def __init__(self, verbose_runtime_logs: bool = False):
self._providers = None
self._verbose_runtime_logs = verbose_runtime_logs
def init_inference(self, model: Model):
assert isinstance(model.handle, onnx.ModelProto)
return OnnxRunnerSession(
model=model, providers=self._providers, verbose_runtime_logs=self._verbose_runtime_logs
)
class OnnxRunnerSession(BaseRunnerSession):
def __init__(self, model: Model, providers, verbose_runtime_logs: bool = False):
super().__init__(model)
self._input_names = None
self._output_names = None
self._session = None
self._providers = providers
self._verbose_runtime_logs = verbose_runtime_logs
self._old_env_values = {}
def __enter__(self):
self._old_env_values = self._set_env_variables()
sess_options = onnxruntime.SessionOptions() # default session options
if self._verbose_runtime_logs:
sess_options.log_severity_level = 0
sess_options.log_verbosity_level = 1
LOGGER.info(
f"Starting inference session for onnx model providers={self._providers} sess_options={sess_options}"
)
self._input_names = list(self._model.inputs)
self._output_names = list(self._model.outputs)
model_payload = self._model.handle.SerializeToString()
self._session = onnxruntime.InferenceSession(
model_payload, providers=self._providers, sess_options=sess_options
)
return self
def __exit__(self, exc_type, exc_value, traceback):
self._input_names = None
self._output_names = None
self._session = None
self._recover_env_variables(self._old_env_values)
def __call__(self, x: Dict[str, object]):
feed_dict = {k: x[k] for k in self._input_names}
y_pred = self._session.run(self._output_names, feed_dict)
y_pred = dict(zip(self._output_names, y_pred))
return y_pred
loaders.register_extension(Format.ONNX.value, OnnxLoader)
runners.register_extension(Format.ONNX.value, OnnxRunner)
savers.register_extension(Format.ONNX.value, OnnxSaver)

View file

@ -0,0 +1,100 @@
import logging
from typing import Dict, Iterable, Optional
# pytype: disable=import-error
import onnx
import tensorrt as trt
from ..core import BaseConverter, Format, Model, Precision, ShapeSpec
from ..extensions import converters
from .utils import get_input_shapes
# pytype: enable=import-error
LOGGER = logging.getLogger(__name__)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
class Onnx2TRTConverter(BaseConverter):
def __init__(self, *, max_batch_size: int, max_workspace_size: int, precision: str):
self._max_batch_size = max_batch_size
self._max_workspace_size = max_workspace_size
self._precision = Precision(precision)
def convert(self, model: Model, dataloader_fn) -> Model:
input_shapes = get_input_shapes(dataloader_fn(), self._max_batch_size)
cuda_engine = onnx2trt(
model.handle,
shapes=input_shapes,
max_workspace_size=self._max_workspace_size,
max_batch_size=self._max_batch_size,
model_precision=self._precision.value,
)
return model._replace(handle=cuda_engine)
@staticmethod
def required_source_model_precision(requested_model_precision: Precision) -> Precision:
# TensorRT requires source models to be in FP32 precision
return Precision.FP32
def onnx2trt(
onnx_model: onnx.ModelProto,
*,
shapes: Dict[str, ShapeSpec],
max_workspace_size: int,
max_batch_size: int,
model_precision: str,
) -> "trt.ICudaEngine":
"""
Converts onnx model to TensorRT ICudaEngine
Args:
onnx_model: onnx.Model to convert
shapes: dictionary containing min shape, max shape, opt shape for each input name
max_workspace_size: The maximum GPU temporary memory which the CudaEngine can use at execution time.
max_batch_size: The maximum batch size which can be used at execution time,
and also the batch size for which the CudaEngine will be optimized.
model_precision: precision of kernels (possible values: fp16, fp32)
Returns: TensorRT ICudaEngine
"""
# Whether or not 16-bit kernels are permitted.
# During :class:`ICudaEngine` build fp16 kernels will also be tried when this mode is enabled.
fp16_mode = "16" in model_precision
builder = trt.Builder(TRT_LOGGER)
builder.fp16_mode = fp16_mode
builder.max_batch_size = max_batch_size
builder.max_workspace_size = max_workspace_size
# In TensorRT 7.0, the ONNX parser only supports full-dimensions mode,
# meaning that your network definition must be created with the explicitBatch flag set.
# For more information, see
# https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work_dynamic_shapes
flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(flags)
with trt.OnnxParser(network, TRT_LOGGER) as parser:
# onnx model parsing
if not parser.parse(onnx_model.SerializeToString()):
for i in range(parser.num_errors):
LOGGER.error(f"OnnxParser error {i}/{parser.num_errors}: {parser.get_error(i)}")
raise RuntimeError("Error during parsing ONNX model (see logs for details)")
# optimization
config = builder.create_builder_config()
config.flags |= bool(fp16_mode) << int(trt.BuilderFlag.FP16)
config.max_workspace_size = max_workspace_size
profile = builder.create_optimization_profile()
for name, spec in shapes.items():
profile.set_shape(name, **spec._asdict())
config.add_optimization_profile(profile)
engine = builder.build_engine(network, config=config)
return engine
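# Illustrative call (the input name and shapes below are assumptions, not taken from this repository):
#   shapes = {"input": ShapeSpec(min=(1, 224, 224, 3), opt=(32, 224, 224, 3), max=(32, 224, 224, 3))}
#   engine = onnx2trt(onnx_model, shapes=shapes, max_workspace_size=(4 << 30),
#                     max_batch_size=32, model_precision="fp16")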
converters.register_extension(f"{Format.ONNX.value}--{Format.TRT.value}", Onnx2TRTConverter)

View file

@ -0,0 +1,202 @@
import logging
import sys
from pathlib import Path
from typing import Dict, NamedTuple, Optional, Union
import numpy as np
# pytype: disable=import-error
try:
import pycuda.autoinit
import pycuda.driver as cuda
except (ImportError, Exception) as e:
logging.getLogger(__name__).debug(f"Problems with importing pycuda package; {e}")
# pytype: enable=import-error
import tensorrt as trt # pytype: disable=import-error
from ..core import BaseLoader, BaseRunner, BaseRunnerSession, BaseSaver, Format, Model, Precision, TensorSpec
from ..extensions import loaders, runners, savers
LOGGER = logging.getLogger(__name__)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
"""
documentation:
https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html
https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_samples_section
"""
class TensorRTLoader(BaseLoader):
def load(self, model_path: Union[str, Path], **_) -> Model:
model_path = Path(model_path)
LOGGER.debug(f"Loading TensorRT engine from {model_path}")
with model_path.open("rb") as fh, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(fh.read())
if engine is None:
raise RuntimeError(f"Could not load ICudaEngine from {model_path}")
inputs = {}
outputs = {}
for binding_idx in range(engine.num_bindings):
name = engine.get_binding_name(binding_idx)
is_input = engine.binding_is_input(binding_idx)
dtype = engine.get_binding_dtype(binding_idx)
shape = engine.get_binding_shape(binding_idx)
if is_input:
inputs[name] = TensorSpec(name, dtype, shape)
else:
outputs[name] = TensorSpec(name, dtype, shape)
return Model(engine, None, inputs, outputs)
class TensorRTSaver(BaseSaver):
def __init__(self):
pass
def save(self, model: Model, model_path: Union[str, Path]) -> None:
model_path = Path(model_path)
LOGGER.debug(f"Saving TensorRT engine to {model_path.as_posix()}")
model_path.parent.mkdir(parents=True, exist_ok=True)
engine: "trt.ICudaEngine" = model.handle
with model_path.open("wb") as fh:
fh.write(engine.serialize())
class TRTBuffers(NamedTuple):
x_host: Optional[Dict[str, object]]
x_dev: Dict[str, object]
y_pred_host: Dict[str, object]
y_pred_dev: Dict[str, object]
class TensorRTRunner(BaseRunner):
def __init__(self):
pass
def init_inference(self, model: Model):
return TensorRTRunnerSession(model=model)
class TensorRTRunnerSession(BaseRunnerSession):
def __init__(self, model: Model):
super().__init__(model)
assert isinstance(model.handle, trt.ICudaEngine)
self._model = model
self._has_dynamic_shapes = None
self._context = None
self._engine: trt.ICudaEngine = self._model.handle
self._cuda_context = pycuda.autoinit.context
self._input_names = None
self._output_names = None
self._buffers = None
def __enter__(self):
self._context = self._engine.create_execution_context()
self._context.__enter__()
self._input_names = [
self._engine[idx] for idx in range(self._engine.num_bindings) if self._engine.binding_is_input(idx)
]
self._output_names = [
self._engine[idx] for idx in range(self._engine.num_bindings) if not self._engine.binding_is_input(idx)
]
# all_binding_shapes_specified is True for models without dynamic shapes
# so initially this variable is False for models with dynamic shapes
self._has_dynamic_shapes = not self._context.all_binding_shapes_specified
return self
def __exit__(self, exc_type, exc_value, traceback):
self._context.__exit__(exc_type, exc_value, traceback)
self._input_names = None
self._output_names = None
# TODO: are CUDA buffers deallocated automatically?
self._buffers = None
def __call__(self, x):
buffers = self._prepare_buffers_if_needed(x)
bindings = self._update_bindings(buffers)
for name in self._input_names:
cuda.memcpy_htod(buffers.x_dev[name], buffers.x_host[name])
self._cuda_context.push()
self._context.execute_v2(bindings=bindings)
self._cuda_context.pop()
for name in self._output_names:
cuda.memcpy_dtoh(buffers.y_pred_host[name], buffers.y_pred_dev[name])
return buffers.y_pred_host
def _update_bindings(self, buffers: TRTBuffers):
bindings = [None] * self._engine.num_bindings
for name in buffers.y_pred_dev:
binding_idx: int = self._engine[name]
bindings[binding_idx] = buffers.y_pred_dev[name]
for name in buffers.x_dev:
binding_idx: int = self._engine[name]
bindings[binding_idx] = buffers.x_dev[name]
return bindings
def _set_dynamic_input_shapes(self, x_host):
def _is_shape_dynamic(input_shape):
return any([dim is None or dim == -1 for dim in input_shape])
for name in self._input_names:
bindings_idx = self._engine[name]
data_shape = x_host[name].shape # pytype: disable=attribute-error
if self._engine.is_shape_binding(bindings_idx):
input_shape = self._context.get_shape(bindings_idx)
if _is_shape_dynamic(input_shape):
self._context.set_shape_input(bindings_idx, data_shape)
else:
input_shape = self._engine.get_binding_shape(bindings_idx)
if _is_shape_dynamic(input_shape):
self._context.set_binding_shape(bindings_idx, data_shape)
assert self._context.all_binding_shapes_specified and self._context.all_shape_inputs_specified
def _prepare_buffers_if_needed(self, x_host: Dict[str, object]):
# pytype: disable=attribute-error
new_batch_size = list(x_host.values())[0].shape[0]
current_batch_size = list(self._buffers.y_pred_host.values())[0].shape[0] if self._buffers else 0
# pytype: enable=attribute-error
if self._has_dynamic_shapes or new_batch_size != current_batch_size:
# TODO: are CUDA buffers deallocated automatically?
self._set_dynamic_input_shapes(x_host)
y_pred_host = {}
for name in self._output_names:
shape = self._context.get_binding_shape(self._engine[name])
y_pred_host[name] = np.zeros(shape, dtype=trt.nptype(self._model.outputs[name].dtype))
y_pred_dev = {name: cuda.mem_alloc(data.nbytes) for name, data in y_pred_host.items()}
x_dev = {
name: cuda.mem_alloc(host_input.nbytes)
for name, host_input in x_host.items()
if name in self._input_names # pytype: disable=attribute-error
}
self._buffers = TRTBuffers(None, x_dev, y_pred_host, y_pred_dev)
return self._buffers._replace(x_host=x_host)
if "pycuda.driver" in sys.modules:
loaders.register_extension(Format.TRT.value, TensorRTLoader)
runners.register_extension(Format.TRT.value, TensorRTRunner)
savers.register_extension(Format.TRT.value, TensorRTSaver)
else:
LOGGER.debug("Do not register TensorRT extension due problems with importing pycuda.driver package.")

View file

@ -0,0 +1,535 @@
import logging
from pathlib import Path
from typing import Dict, Iterable, Optional, Tuple, Union
import numpy as np
# pytype: disable=import-error
import tensorflow as tf
from tensorflow.python.eager import wrap_function
from tf2onnx.shape_inference import infer_shape
from tf2onnx.tf_loader import (
freeze_session,
from_function,
inputs_without_resource,
is_function,
remove_redundant_inputs,
tf_optimize,
)
# pytype: enable=import-error
from ..args import filter_fn_args
from ..core import (
GET_MODEL_FN_NAME,
GET_SERVING_INPUT_RECEIVER_FN,
BaseConverter,
BaseLoader,
BaseRunner,
BaseRunnerSession,
BaseSaver,
Format,
Model,
Precision,
TensorSpec,
load_from_file,
)
from ..extensions import converters, loaders, runners, savers
from .utils import infer_precision
LOGGER = logging.getLogger(__name__)
def is_tf2():
return tf.__version__.startswith("2.")
def create_session_config(*, allow_growth=False, use_xla=False, gpu_memory_fraction=1.0):
gpu_options = tf.compat.v1.GPUOptions(
per_process_gpu_memory_fraction=gpu_memory_fraction, allow_growth=allow_growth
)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
if use_xla:
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
LOGGER.debug(
f"Using gpu memory fraction: allow_growth={allow_growth} "
f"gpu_memory_fraction={gpu_memory_fraction} "
f"use_xla={use_xla}"
)
return config
class TFTRTConverter(BaseConverter):
def __init__(
self,
*,
is_dynamic_op: bool = False,
minimum_segment_size: int = 3,
max_batch_size: int = 1,
max_workspace_size: int = (4 << 30) - 1000, # ~3.999GB
maximum_cached_engines: int = 1000,
precision: str,
):
self._is_dynamic_op = is_dynamic_op
self._minimum_segment_size = minimum_segment_size
self._max_batch_size = max_batch_size
self._max_workspace_size = max_workspace_size
self._maximum_cached_engines = maximum_cached_engines
self._precision = Precision(precision)
def convert(self, model: Model, dataloader_fn) -> Model:
# https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html
# converting graph_def is not supported in TF2
from tensorflow.python.compiler.tensorrt import trt_convert # pytype: disable=import-error
assert isinstance(model.handle, tf.compat.v1.GraphDef)
session_config = create_session_config(allow_growth=True)
output_node_names = [spec.name.split(":")[0] for spec in model.outputs.values()]
converter = trt_convert.TrtGraphConverter(
input_graph_def=model.handle,
session_config=session_config,
nodes_blacklist=output_node_names,
is_dynamic_op=self._is_dynamic_op,
precision_mode=self._precision.value,
max_workspace_size_bytes=self._max_workspace_size,
maximum_cached_engines=self._maximum_cached_engines,
max_batch_size=self._max_batch_size,
minimum_segment_size=self._minimum_segment_size,
)
graph_def = converter.convert()
return model._replace(handle=graph_def)
@staticmethod
def required_source_model_precision(requested_model_precision: Precision) -> Precision:
# TensorRT requires source models to be in FP32 precision
return Precision.FP32
def _from_saved_model_v1(sess, model_path, tag, signatures):
"""
Load tensorflow graph from saved_model.
NOTICE: Modified version from tf2onnx project
"""
wrn_no_tag = "'--tag' not specified for saved_model. Using --tag serve"
wrn_empty_tag = "'--tag' value is empty string. Using tag =[[]]"
if tag is None:
tag = [tf.saved_model.SERVING]
LOGGER.warning(wrn_no_tag)
if tag == "":
tag = [[]]
LOGGER.warning(wrn_empty_tag)
if not isinstance(tag, list):
tag = [tag]
imported = tf.compat.v1.saved_model.loader.load(sess, tag, model_path)
for k in imported.signature_def.keys():
if k.startswith("_"):
# consider signatures starting with '_' private
continue
signatures.append(k)
try:
from tensorflow.contrib.saved_model.python.saved_model import ( # pytype: disable=import-error
signature_def_utils,
)
# pylint: disable=unnecessary-lambda
get_signature_def = lambda meta_graph_def, k: signature_def_utils.get_signature_def_by_key(meta_graph_def, k)
except ImportError:
# TF1.12 changed the api
get_signature_def = lambda meta_graph_def, k: meta_graph_def.signature_def[k]
inputs = {}
outputs = {}
for k in signatures:
inputs_tensor_info = get_signature_def(imported, k).inputs
for name, input_tensor in inputs_tensor_info.items():
inputs[name] = input_tensor.name
outputs_tensor_info = get_signature_def(imported, k).outputs
for name, output_tensor in outputs_tensor_info.items():
outputs[name] = output_tensor.name
frozen_graph = freeze_session(sess, input_names=list(inputs.values()), output_names=list(outputs.values()))
return frozen_graph, inputs, outputs
def _infer_model_precision(
tf_graph: tf.compat.v1.GraphDef, inputs_dict: Dict[str, TensorSpec], outputs_dict: Dict[str, TensorSpec]
) -> Optional[Precision]:
import networkx as nx
def _get_dtype(node_def):
node_type = node_def.attr.get("T", None) or node_def.attr.get("dtype", None)
if node_type:
if node_type.list.type:
assert len(set(node_type.list.type)) == 1
node_type = tf.dtypes.DType(node_type.list.type[0])
else:
node_type = tf.dtypes.DType(node_type.type)
return np.dtype(node_type.as_numpy_dtype()) if node_type and node_type.is_numpy_compatible else node_type
# build directed graph
nx_graph = nx.DiGraph()
for node_def in tf_graph.node:
nx_graph.add_node(
node_def.name,
op=node_def.op,
**{key: value for key, value in node_def.attr.items() if key not in ["value", "dtype"]},
dtype=_get_dtype(node_def),
)
for input in node_def.input:
nx_graph.add_edge(input, node_def.name)
input_names = [spec.name.split(":")[0] for spec in inputs_dict.values()]
output_names = [spec.name.split(":")[0] for spec in outputs_dict.values()]
most_common_dtype = infer_precision(nx_graph, input_names, output_names, _get_dtype)
if most_common_dtype is not None:
precision = {np.dtype("float32"): Precision.FP32, np.dtype("float16"): Precision.FP16}[most_common_dtype]
else:
precision = None
return precision
class TFEstimatorLoader(BaseLoader):
required_fn_name_for_signature_parsing: Optional[str] = GET_MODEL_FN_NAME
def __init__(self, **kwargs):
self._model_args = kwargs
def load(self, model_path: Union[str, Path], **_) -> Model:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
get_model = load_from_file(model_path, "model", GET_MODEL_FN_NAME)
get_serving_input_receiver_fn = load_from_file(model_path, "model", GET_SERVING_INPUT_RECEIVER_FN)
if get_model is None:
raise RuntimeError(f"Could not find {GET_MODEL_FN_NAME} in {model_path}")
if get_serving_input_receiver_fn is None:
raise RuntimeError(f"Could not find {GET_SERVING_INPUT_RECEIVER_FN} in {model_path}")
model_args = filter_fn_args(self._model_args, fn=get_model)
serving_input_receiver_args = filter_fn_args(self._model_args, fn=get_serving_input_receiver_fn)
session_config = create_session_config(allow_growth=True)
tf.compat.v1.reset_default_graph()
with tf.compat.v1.Session(config=session_config) as sess:
estimator = get_model(**model_args)
serving_input_receiver_fn = get_serving_input_receiver_fn(**serving_input_receiver_args)
input_receiver = serving_input_receiver_fn()
estimator_spec = estimator.model_fn(
features=input_receiver.features,
labels=None,
mode=tf.estimator.ModeKeys.PREDICT,
config=estimator.config,
)
input_tensors_dict = input_receiver.receiver_tensors
output_tensors_dict = estimator_spec.predictions
inputs_dict = {k: tensor2tensor_spec(tensor) for k, tensor in input_tensors_dict.items()}
outputs_dict = {k: tensor2tensor_spec(tensor) for k, tensor in output_tensors_dict.items()}
input_tensor_names = [t.name for t in inputs_dict.values()]
output_tensor_names = [t.name for t in outputs_dict.values()]
graph_saver = estimator_spec.scaffold.saver or tf.compat.v1.train.Saver(sharded=True)
graph_saver.restore(sess, estimator.latest_checkpoint())
input_tensor_names = inputs_without_resource(sess, input_tensor_names)
frozen_graph = freeze_session(sess, input_names=input_tensor_names, output_names=output_tensor_names)
input_tensor_names = remove_redundant_inputs(frozen_graph, input_tensor_names)
tf.compat.v1.reset_default_graph()
with tf.compat.v1.Session(config=estimator.config.session_config):
frozen_graph = tf_optimize(input_tensor_names, output_tensor_names, frozen_graph)
tf.compat.v1.reset_default_graph()
precision = _infer_model_precision(frozen_graph, inputs_dict, outputs_dict)
return Model(frozen_graph, precision, inputs_dict, outputs_dict)
class TFKerasLoader(BaseLoader):
"""
Loads a Keras model from source code.
The large-model flag helps load models which exceed the maximum protobuf size of 2GB. It is disabled by default.
The tf-allow-growth flag controls the GPU memory growth limiting feature
(https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth). It is disabled by default.
"""
required_fn_name_for_signature_parsing: Optional[str] = GET_MODEL_FN_NAME
def __init__(self, large_model: bool = False, tf_allow_growth: bool = False, **kwargs):
self._large_model = large_model
self._allow_growth = tf_allow_growth
self._model_args = kwargs
def load(self, model_path: Union[str, Path], **_) -> Model:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
get_model = load_from_file(model_path, "model", GET_MODEL_FN_NAME)
if get_model is None:
raise RuntimeError(f"Could not find {GET_MODEL_FN_NAME} in {model_path}")
model_args = filter_fn_args(self._model_args, fn=get_model)
if self._allow_growth:
physical_devices = tf.config.experimental.list_physical_devices("GPU")
for device in physical_devices:
tf.config.experimental.set_memory_growth(device, True)
tf.keras.backend.clear_session()
tf.keras.backend.set_learning_phase(False)
eager_model, call_fn = get_model(**model_args)
inputs_dict: Dict[str, TensorSpec] = {
input_name: TensorSpec(t.name, t.dtype.name, tuple(t.shape.as_list()))
for input_name, t in zip(eager_model.input_names, eager_model.inputs)
}
concrete_func = call_fn.get_concrete_function(
*[tf.TensorSpec(shape=spec.shape, dtype=spec.dtype, name=name) for name, spec in inputs_dict.items()]
)
input_tensors_names = [tensor.name for tensor in concrete_func.inputs if tensor.dtype != tf.dtypes.resource]
output_tensors_names = [tensor.name for tensor in concrete_func.outputs]
graph_def = from_function(
concrete_func, input_tensors_names, output_tensors_names, large_model=self._large_model
)
# tensor names change after wrapping with call_fn, so we need to use those from concrete_func
outputs_dict: Dict[str, TensorSpec] = {
output_name: TensorSpec(output_tensor_name, t.dtype.name, tuple(t.shape.as_list()))
for output_name, output_tensor_name, t in zip(
eager_model.output_names, output_tensors_names, eager_model.outputs
)
}
precision = _infer_model_precision(graph_def, inputs_dict, outputs_dict)
tf.keras.backend.clear_session()
tf.keras.backend.set_learning_phase(False)
return Model(graph_def, precision, inputs_dict, outputs_dict)
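# Illustrative use of TFKerasLoader (the model path and kwargs are assumptions):
#   loader = TFKerasLoader(large_model=False, tf_allow_growth=True, precision="fp16")
#   model = loader.load("rn50_model.py")  # the script must define get_model(...)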
class TFSavedModelLoader(BaseLoader):
def load(self, model_path: Union[str, Path], **kwargs) -> Model:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
tf.compat.v1.reset_default_graph()
if is_tf2():
from tf2onnx.tf_loader import _from_saved_model_v2 # pytype: disable=import-error
graph_def, input_names, output_names, concrete_func, imported, initialized_tables = _from_saved_model_v2(
model_path=model_path,
input_names=None,
output_names=None,
tag=None,
signature_def=[],
concrete_function_index=None,
large_model=False,
)
# inspired by https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/saved_model_cli.py#L205
if concrete_func.structured_input_signature:
input_args, input_kwargs = concrete_func.structured_input_signature
input_names = list(input_kwargs)
assert (
not input_args
), f"Not supported args in concrete function signature args={input_args}, kwargs={input_kwargs}"
elif concrete_func._arg_keywords: # pylint: disable=protected-access
# For pure ConcreteFunctions we might have nothing better than _arg_keywords.
assert concrete_func._num_positional_args in [0, 1]
input_names = concrete_func._arg_keywords
input_tensors = [tensor for tensor in concrete_func.inputs if tensor.dtype != tf.dtypes.resource]
inputs = {name: tensor.name for name, tensor in zip(input_names, input_tensors)}
# they are already flattened
output_tensors = [tensor for tensor in concrete_func.outputs if tensor.dtype != tf.dtypes.resource]
output_names = sorted(concrete_func.structured_outputs) # because outputs are in flatten form
outputs = {name: tensor.name for name, tensor in zip(output_names, output_tensors)}
else:
session_config = create_session_config(allow_growth=True)
with tf.compat.v1.Session(config=session_config) as sess:
graph_def, inputs, outputs = _from_saved_model_v1(sess, model_path, tag=None, signatures=[])
inputs, outputs = handle_tensor_specs(graph_def, inputs, outputs)
precision = _infer_model_precision(graph_def, inputs, outputs)
return Model(graph_def, precision, inputs, outputs)
class TFRunner(BaseRunner):
def __init__(self):
pass
def init_inference(self, model: Model):
if is_tf2():
return TF2RunnerSession(model=model)
else:
return TF1RunnerSession(model=model)
class TF1RunnerSession(BaseRunnerSession):
def __init__(self, model: Model):
super().__init__(model)
assert isinstance(model.handle, tf.compat.v1.GraphDef)
self._inputs = None
self._outputs = None
self._session = None
self._old_env_values = {}
def __enter__(self):
self._old_env_values = self._set_env_variables()
tf.compat.v1.reset_default_graph()
session_config = create_session_config(allow_growth=True)
self._session = tf.compat.v1.Session(config=session_config)
self._session.__enter__()
tf.import_graph_def(self._model.handle, name="")
self._inputs = {
name: self._session.graph.get_tensor_by_name(spec.name) for name, spec in self._model.inputs.items()
}
self._outputs = {
name: self._session.graph.get_tensor_by_name(spec.name) for name, spec in self._model.outputs.items()
}
return self
def __exit__(self, exc_type, exc_value, traceback):
self._session.__exit__(exc_type, exc_value, traceback)
tf.compat.v1.reset_default_graph()
self._inputs = None
self._outputs = None
self._session = None
self._recover_env_variables(self._old_env_values)
def __call__(self, x: Dict[str, object]):
feed_dict = {placeholder: x[name] for name, placeholder in self._inputs.items()}
return self._session.run(self._outputs, feed_dict=feed_dict)
class TF2RunnerSession(BaseRunnerSession):
def __init__(self, model: Model):
super().__init__(model)
assert isinstance(model.handle, tf.compat.v1.GraphDef)
self._concrete_func = None
def __enter__(self):
tf.compat.v1.reset_default_graph()
input_tensor_names = [spec.name for spec in self._model.inputs.values()]
output_tensor_names = [spec.name for spec in self._model.outputs.values()]
self._concrete_func = wrap_function.function_from_graph_def(
self._model.handle, input_tensor_names, output_tensor_names
)
self._concrete_func._signature = [
tf.TensorSpec(shape=spec.shape, dtype=spec.dtype, name=name) for name, spec in self._model.inputs.items()
]
return self
def __exit__(self, exc_type, exc_value, traceback):
self._concrete_func = None
tf.compat.v1.reset_default_graph()
def __call__(self, x: Dict[str, object]):
x = tf.nest.map_structure(tf.convert_to_tensor, x)
y_pred = self._concrete_func(**x)
output_struct = {name: spec.name for name, spec in self._model.outputs.items()}
y_pred = tf.nest.map_structure(lambda t: t.numpy(), y_pred)
y_pred = tf.nest.pack_sequence_as(output_struct, y_pred)
return y_pred
class TFSavedModelSaver(BaseSaver):
def save(self, model: Model, model_path: Union[str, Path]) -> None:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
session_config = create_session_config(allow_growth=True)
with tf.compat.v1.Session(config=session_config) as sess:
tf.import_graph_def(model.handle, name="")
is_func = is_function(sess.graph)
if not is_func:
infer_shape(sess.graph, {})
inputs = {name: sess.graph.get_tensor_by_name(spec.name) for name, spec in model.inputs.items()}
outputs = {name: sess.graph.get_tensor_by_name(spec.name) for name, spec in model.outputs.items()}
def _ensure_shape(tensors_dict, tensors_specs):
for name, tensor in tensors_dict.items():
if tensor.shape.rank is None:
tensor.set_shape(tensors_specs[name].shape)
return tensors_dict
inputs = _ensure_shape(inputs, model.inputs)
outputs = _ensure_shape(outputs, model.outputs)
LOGGER.info(inputs)
LOGGER.info(outputs)
tf.compat.v1.saved_model.simple_save(sess, model_path, inputs, outputs, legacy_init_op=None)
def handle_tensor_specs(
graph_def, inputs: Dict[str, str], outputs: Dict[str, str]
) -> Tuple[Dict[str, TensorSpec], Dict[str, TensorSpec]]:
session_config = tf.compat.v1.ConfigProto(graph_options=tf.compat.v1.GraphOptions(infer_shapes=True))
tf.compat.v1.reset_default_graph()
with tf.compat.v1.Session(config=session_config) as sess:
tf.import_graph_def(graph_def, name="")
def _get_spec(tensors_dict):
tensors_dict = {name: sess.graph.get_tensor_by_name(tname) for name, tname in tensors_dict.items()}
return {name: tensor2tensor_spec(tensor) for name, tensor in tensors_dict.items()}
inputs = _get_spec(inputs)
outputs = _get_spec(outputs)
tf.compat.v1.reset_default_graph()
return inputs, outputs
def tensor2tensor_spec(tensor):
shape = tuple([s.value if hasattr(s, "value") else s for s in tensor.shape])
return TensorSpec(tensor.name, tensor.dtype.name, shape)
loaders.register_extension(Format.TF_ESTIMATOR.value, TFEstimatorLoader)
loaders.register_extension(Format.TF_KERAS.value, TFKerasLoader)
loaders.register_extension(Format.TF_SAVEDMODEL.value, TFSavedModelLoader)
loaders.register_extension(Format.TF_TRT.value, TFSavedModelLoader)
converters.register_extension(f"{Format.TF_ESTIMATOR.value}--{Format.TF_SAVEDMODEL.value}", None)
converters.register_extension(f"{Format.TF_KERAS.value}--{Format.TF_SAVEDMODEL.value}", None)
converters.register_extension(f"{Format.TF_SAVEDMODEL.value}--{Format.TF_SAVEDMODEL.value}", None)
converters.register_extension(f"{Format.TF_ESTIMATOR.value}--{Format.TF_TRT.value}", TFTRTConverter)
converters.register_extension(f"{Format.TF_KERAS.value}--{Format.TF_TRT.value}", TFTRTConverter)
converters.register_extension(f"{Format.TF_SAVEDMODEL.value}--{Format.TF_TRT.value}", TFTRTConverter)
savers.register_extension(Format.TF_SAVEDMODEL.value, TFSavedModelSaver)
savers.register_extension(Format.TF_TRT.value, TFSavedModelSaver)
runners.register_extension(Format.TF_ESTIMATOR.value, TFRunner)
runners.register_extension(Format.TF_KERAS.value, TFRunner)
runners.register_extension(Format.TF_SAVEDMODEL.value, TFRunner)
runners.register_extension(Format.TF_TRT.value, TFRunner)

View file

@ -0,0 +1,89 @@
from collections.abc import Iterable
# pytype: disable=import-error
import onnx
import onnx.shape_inference
import tensorflow as tf
from tf2onnx import optimizer, tfonnx
# pytype: enable=import-error
from ..core import BaseConverter, Format, Model
from ..extensions import converters
from .tf import create_session_config
def _replace_io_names(graph_proto, io_type, name2tensor):
tensor2name = {v: k for k, v in name2tensor.items()}
tensor_value_info_list = {"inputs": graph_proto.input, "outputs": graph_proto.output}[io_type]
for tensor_value_info in tensor_value_info_list:
old_name = tensor_value_info.name
new_name = tensor2name.get(old_name)
if new_name is not None and new_name != old_name:
tensor_value_info.name = new_name
# replace other graph nodes I/O
for node in graph_proto.node:
if old_name in node.input:
idx = list(node.input).index(old_name)
node.input[idx] = new_name
if old_name in node.output:
idx = list(node.output).index(old_name)
node.output[idx] = new_name
def tfgraph2onnx(graph_def, inputnames2tensornames, outputnames2tensornames, *, onnx_opset, onnx_optimized=True):
with tf.Graph().as_default() as tf_graph:
tf.import_graph_def(graph_def, name="")
session_config = create_session_config(allow_growth=True)
with tf.compat.v1.Session(graph=tf_graph, config=session_config):
input_tensor_names = list(inputnames2tensornames.values())
output_tensor_names = list(outputnames2tensornames.values())
onnx_graph = tfonnx.process_tf_graph(
tf_graph,
input_names=input_tensor_names,
output_names=output_tensor_names,
opset=onnx_opset,
)
if onnx_optimized:
onnx_graph = optimizer.optimize_graph(onnx_graph)
graph_doc: str = "triton export"
onnx_model = onnx_graph.make_model(graph_doc)
# to match tensorflow savedmodel signature
_replace_io_names(onnx_model.graph, "inputs", inputnames2tensornames)
_replace_io_names(onnx_model.graph, "outputs", outputnames2tensornames)
onnx.checker.check_model(onnx_model)
onnx.helper.strip_doc_string(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
return onnx_model
class TFGraphDef2ONNXConverter(BaseConverter):
def __init__(self, *, onnx_opset: int, onnx_optimized: bool = True):
self._onnx_opset = onnx_opset
self._onnx_optimized = onnx_optimized
def convert(self, model: Model, dataloader_fn) -> Model:
assert isinstance(model.handle, tf.compat.v1.GraphDef)
inputnames2tensorname = {name: spec.name for name, spec in model.inputs.items()}
outputnames2tensorname = {name: spec.name for name, spec in model.outputs.items()}
onnx_model = tfgraph2onnx(
model.handle,
inputnames2tensorname,
outputnames2tensorname,
onnx_opset=self._onnx_opset,
onnx_optimized=self._onnx_optimized,
)
from .onnx import _infer_graph_precision
precision = _infer_graph_precision(onnx_model.graph)
assert precision == model.precision # for testing precision inference function
return model._replace(handle=onnx_model)
converters.register_extension(f"{Format.TF_ESTIMATOR.value}--{Format.ONNX.value}", TFGraphDef2ONNXConverter)
converters.register_extension(f"{Format.TF_KERAS.value}--{Format.ONNX.value}", TFGraphDef2ONNXConverter)
converters.register_extension(f"{Format.TF_SAVEDMODEL.value}--{Format.ONNX.value}", TFGraphDef2ONNXConverter)

View file

@ -0,0 +1,60 @@
from typing import Iterable
from ..core import BaseConverter, Format, Model, Precision, ShapeSpec
from ..extensions import converters
from .onnx2trt_conv import onnx2trt
from .tf2onnx_conv import tfgraph2onnx
from .utils import get_input_shapes
class TFGraphDef2TRTConverter(BaseConverter):
def __init__(
self,
*,
max_batch_size: int,
max_workspace_size: int,
onnx_opset: int,
onnx_optimized: bool = True,
precision: str,
):
self._max_batch_size = max_batch_size
self._max_workspace_size = max_workspace_size
self._onnx_opset = onnx_opset
self._onnx_optimized = onnx_optimized
self._precision = Precision(precision)
def convert(self, model: Model, dataloader_fn) -> Model:
inputnames2tensorname = {name: spec.name for name, spec in model.inputs.items()}
outputnames2tensorname = {name: spec.name for name, spec in model.outputs.items()}
onnx_model = tfgraph2onnx(
model.handle,
inputnames2tensorname,
outputnames2tensorname,
onnx_opset=self._onnx_opset,
onnx_optimized=self._onnx_optimized,
)
from .onnx import _infer_graph_precision
precision = _infer_graph_precision(onnx_model.graph)
assert precision == model.precision # for testing precision inference function
input_shapes = get_input_shapes(dataloader_fn(), self._max_batch_size)
cuda_engine = onnx2trt(
onnx_model,
shapes=input_shapes,
max_workspace_size=self._max_workspace_size,
max_batch_size=self._max_batch_size,
model_precision=self._precision.value,
)
return model._replace(handle=cuda_engine)
@staticmethod
def required_source_model_precision(requested_model_precision: Precision) -> Precision:
# TensorRT requires source models to be in FP32 precision
return Precision.FP32
converters.register_extension(f"{Format.TF_ESTIMATOR.value}--{Format.TRT.value}", TFGraphDef2TRTConverter)
converters.register_extension(f"{Format.TF_KERAS.value}--{Format.TRT.value}", TFGraphDef2TRTConverter)
converters.register_extension(f"{Format.TF_SAVEDMODEL.value}--{Format.TRT.value}", TFGraphDef2TRTConverter)

View file

@ -0,0 +1,107 @@
from collections import Counter
from typing import Callable, Dict, List
import networkx as nx
from ..core import ShapeSpec
def infer_precision(
nx_graph: nx.Graph,
input_names: List[str],
output_names: List[str],
get_node_dtype_fn: Callable,
):
node_dtypes = [nx_graph.nodes[node_name].get("dtype", None) for node_name in nx_graph.nodes]
node_dtypes = [dt for dt in node_dtypes if dt is None or dt.kind not in ["i", "b"]]
dtypes_counter = Counter(node_dtypes)
return dtypes_counter.most_common()[0][0]
def get_shapes_with_dynamic_axes(dataloader, batch_size_dim=0):
def _set_dynamic_shapes(t, shapes):
for k, v in t.items():
shape = list(v.shape)
for dim, s in enumerate(shape):
if shapes[k][dim] != -1 and shapes[k][dim] != s:
shapes[k][dim] = -1
## get all shapes from input and output tensors
input_shapes = {}
output_shapes = {}
for batch in dataloader:
_, x, y = batch
for k, v in x.items():
input_shapes[k] = list(v.shape)
for k, v in y.items():
output_shapes[k] = list(v.shape)
break
# based on max <max_num_iters> iterations, check which
# dimensions differ to determine dynamic_axes
max_num_iters = 100
for idx, batch in enumerate(dataloader):
if idx >= max_num_iters:
break
_, x, y = batch
_set_dynamic_shapes(x, input_shapes)
_set_dynamic_shapes(y, output_shapes)
return input_shapes, output_shapes
def get_dynamic_axes(dataloader, batch_size_dim=0):
input_shapes, output_shapes = get_shapes_with_dynamic_axes(dataloader, batch_size_dim)
all_shapes = {**input_shapes, **output_shapes}
dynamic_axes = {}
for k, shape in all_shapes.items():
for idx, s in enumerate(shape):
if s == -1:
dynamic_axes[k] = {idx: k + "_" + str(idx)}
for k, v in all_shapes.items():
if k in dynamic_axes:
dynamic_axes[k].update({batch_size_dim: "batch_size_" + str(batch_size_dim)})
else:
dynamic_axes[k] = {batch_size_dim: "batch_size_" + str(batch_size_dim)}
return dynamic_axes
def get_input_shapes(dataloader, max_batch_size=1) -> Dict[str, ShapeSpec]:
def init_counters_and_shapes(x, counters, min_shapes, max_shapes):
for k, v in x.items():
counters[k] = Counter()
min_shapes[k] = [float("inf")] * v.ndim
max_shapes[k] = [float("-inf")] * v.ndim
counters = {}
min_shapes: Dict[str, tuple] = {}
max_shapes: Dict[str, tuple] = {}
for idx, batch in enumerate(dataloader):
ids, x, y = batch
if idx == 0:
init_counters_and_shapes(x, counters, min_shapes, max_shapes)
for k, v in x.items():
shape = v.shape
counters[k][shape] += 1
min_shapes[k] = tuple([min(a, b) for a, b in zip(min_shapes[k], shape)])
max_shapes[k] = tuple([max(a, b) for a, b in zip(max_shapes[k], shape)])
opt_shapes: Dict[str, tuple] = {}
for k, v in counters.items():
opt_shapes[k] = v.most_common(1)[0][0]
shapes = {}
for k in opt_shapes.keys(): # same keys in min_shapes and max_shapes
shapes[k] = ShapeSpec(
min=(1,) + min_shapes[k][1:],
max=(max_batch_size,) + max_shapes[k][1:],
opt=(max_batch_size,) + opt_shapes[k][1:],
)
return shapes
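# Illustrative result (shapes are assumptions): for a dataloader that always yields
# x = {"input": np.ndarray of shape (8, 224, 224, 3)} and max_batch_size=32, this returns
#   {"input": ShapeSpec(min=(1, 224, 224, 3), opt=(32, 224, 224, 3), max=(32, 224, 224, 3))}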

View file

@ -0,0 +1,169 @@
import abc
import importlib
import logging
import os
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union
import numpy as np
LOGGER = logging.getLogger(__name__)
DATALOADER_FN_NAME = "get_dataloader_fn"
GET_MODEL_FN_NAME = "get_model"
GET_SERVING_INPUT_RECEIVER_FN = "get_serving_input_receiver_fn"
GET_ARGPARSER_FN_NAME = "update_argparser"
class TensorSpec(NamedTuple):
name: str
dtype: str
shape: Tuple
class Parameter(Enum):
def __lt__(self, other: "Parameter") -> bool:
return self.value < other.value
class Accelerator(Parameter):
AMP = "amp"
CUDA = "cuda"
TRT = "trt"
class Precision(Parameter):
FP16 = "fp16"
FP32 = "fp32"
TF32 = "tf32" # Deprecated
class Format(Parameter):
TF_GRAPHDEF = "tf-graphdef"
TF_SAVEDMODEL = "tf-savedmodel"
TF_TRT = "tf-trt"
TF_ESTIMATOR = "tf-estimator"
TF_KERAS = "tf-keras"
ONNX = "onnx"
TRT = "trt"
TS_SCRIPT = "ts-script"
TS_TRACE = "ts-trace"
PYT = "pyt"
class Model(NamedTuple):
handle: object
precision: Optional[Precision]
inputs: Dict[str, TensorSpec]
outputs: Dict[str, TensorSpec]
def load_from_file(file_path, label, target):
spec = importlib.util.spec_from_file_location(name=label, location=file_path)
my_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(my_module) # pytype: disable=attribute-error
return getattr(my_module, target, None)
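# Illustrative use (the file name is an assumption): load the get_model function from a model
# script, returning None when the attribute is missing:
#   get_model = load_from_file("rn50_model.py", label="model", target=GET_MODEL_FN_NAME)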
class BaseLoader(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def load(self, model_path: Union[str, Path], **kwargs) -> Model:
"""
Loads and processes a model from a file based on the given set of args
"""
pass
class BaseSaver(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def save(self, model: Model, model_path: Union[str, Path]) -> None:
"""
Save model to file
"""
pass
class BaseRunner(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def init_inference(self, model: Model):
raise NotImplementedError
class BaseRunnerSession(abc.ABC):
def __init__(self, model: Model):
self._model = model
@abc.abstractmethod
def __enter__(self):
raise NotImplementedError()
@abc.abstractmethod
def __exit__(self, exc_type, exc_value, traceback):
raise NotImplementedError()
@abc.abstractmethod
def __call__(self, x: Dict[str, object]):
raise NotImplementedError()
def _set_env_variables(self) -> Dict[str, object]:
"""this method not remove values; fix it if needed"""
to_set = {}
old_values = {k: os.environ.pop(k, None) for k in to_set}
os.environ.update(to_set)
return old_values
def _recover_env_variables(self, old_envs: Dict[str, object]):
for name, value in old_envs.items():
if value is None:
del os.environ[name]
else:
os.environ[name] = str(value)
class BaseConverter(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def convert(self, model: Model, dataloader_fn) -> Model:
raise NotImplementedError()
@staticmethod
def required_source_model_precision(requested_model_precision: Precision) -> Precision:
return requested_model_precision
class BaseMetricsCalculator(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def calc(
self,
*,
ids: List[Any],
y_pred: Dict[str, np.ndarray],
x: Optional[Dict[str, np.ndarray]],
y_real: Optional[Dict[str, np.ndarray]],
) -> Dict[str, float]:
"""
Calculates error/accuracy metrics
Args:
ids: List of ids identifying each sample in the batch
y_pred: model output as dict where key is output name and value is output value
x: model input as dict where key is input name and value is input value
y_real: input ground truth as dict where key is output name and value is output value
Returns:
dictionary where key is metric name and value is its value
"""
pass
class ShapeSpec(NamedTuple):
min: Tuple
opt: Tuple
max: Tuple

View file

@ -0,0 +1,133 @@
from pathlib import Path
from typing import Dict, Iterable
import numpy as np
MB2B = 2 ** 20
B2MB = 1 / MB2B
FLUSH_THRESHOLD_B = 256 * MB2B
def pad_except_batch_axis(data: np.ndarray, target_shape_with_batch_axis: Iterable[int]):
assert all(
[current_size <= target_size for target_size, current_size in zip(target_shape_with_batch_axis, data.shape)]
), "target_shape should have equal or greater all dimensions comparing to data.shape"
padding = [(0, 0)] + [ # (0, 0) - do not pad on batch_axis (with index 0)
(0, target_size - current_size)
for target_size, current_size in zip(target_shape_with_batch_axis[1:], data.shape[1:])
]
return np.pad(data, padding, "constant", constant_values=np.nan)
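# Illustrative behaviour: pad_except_batch_axis(np.ones((2, 3)), (2, 5)) returns an array of
# shape (2, 5) whose two appended columns are filled with np.nan; axis 0 (batch) is never padded.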
class NpzWriter:
"""
Dumps dicts of numpy arrays into npz files
It can (and should) be used as a context manager:
```
with NpzWriter('mydir') as writer:
    writer.write(outputs={'classes': np.zeros(8), 'probs': np.zeros((8, 4))},
                 labels={'classes': np.zeros(8)},
                 inputs={'input': np.zeros((8, 240, 240, 3))})
```
## Variable size data
Only a dynamic last axis is handled. Data is padded with the np.nan value.
Also, each generated file may have a different size of the dynamic axis.
"""
def __init__(self, output_dir, compress=False):
self._output_dir = Path(output_dir)
self._items_cache: Dict[str, Dict[str, np.ndarray]] = {}
self._items_counters: Dict[str, int] = {}
self._flush_threshold_b = FLUSH_THRESHOLD_B
self._compress = compress
@property
def cache_size(self):
return {name: sum([a.nbytes for a in data.values()]) for name, data in self._items_cache.items()}
def _append_to_cache(self, prefix, data):
if data is None:
return
if not isinstance(data, dict):
raise ValueError(f"{prefix} data to store shall be dict")
cached_data = self._items_cache.get(prefix, {})
for name, value in data.items():
assert isinstance(
value, (list, np.ndarray)
), f"Values shall be lists or np.ndarrays; current type {type(value)}"
if not isinstance(value, np.ndarray):
value = np.array(value)
assert value.dtype.kind in ["S", "U"] or not np.any(
np.isnan(value)
), f"Values with np.nan is not supported; {name}={value}"
cached_value = cached_data.get(name, None)
if cached_value is not None:
target_shape = np.max([cached_value.shape, value.shape], axis=0)
cached_value = pad_except_batch_axis(cached_value, target_shape)
value = pad_except_batch_axis(value, target_shape)
value = np.concatenate((cached_value, value))
cached_data[name] = value
self._items_cache[prefix] = cached_data
def write(self, **kwargs):
"""
Writes named dictionaries of np.ndarrays.
The keyword names are later used as prefixes of the npz files in which those dictionaries are stored.
ex. writer.write(inputs={'input': np.zeros((2, 10))},
outputs={'classes': np.zeros((2,)), 'probabilities': np.zeros((2, 32))},
labels={'classes': np.zeros((2,))})
Args:
**kwargs: named list of dictionaries of np.ndarrays to store
"""
for prefix, data in kwargs.items():
self._append_to_cache(prefix, data)
biggest_item_size = max(self.cache_size.values())
if biggest_item_size > self._flush_threshold_b:
self.flush()
def flush(self):
for prefix, data in self._items_cache.items():
self._dump(prefix, data)
self._items_cache = {}
def _dump(self, prefix, data):
idx = self._items_counters.setdefault(prefix, 0)
filename = f"{prefix}-{idx:012d}.npz"
output_path = self._output_dir / filename
if self._compress:
np.savez_compressed(output_path, **data)
else:
np.savez(output_path, **data)
nitems = len(list(data.values())[0])
msg_for_labels = (
"If these are correct shapes - consider moving loading of them into metrics.py."
if prefix == "labels"
else ""
)
shapes = {name: value.shape if isinstance(value, np.ndarray) else (len(value),) for name, value in data.items()}
assert all(len(v) == nitems for v in data.values()), (
f'All items in "{prefix}" shall have the same size on axis 0, equal to the batch size. {msg_for_labels}'
f'{", ".join(f"{name}: {shape}" for name, shape in shapes.items())}'
)
self._items_counters[prefix] += nitems
def __enter__(self):
if self._output_dir.exists() and len(list(self._output_dir.iterdir())):
raise ValueError(f"{self._output_dir.as_posix()} is not empty")
self._output_dir.mkdir(parents=True, exist_ok=True)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.flush()

View file

@ -0,0 +1,69 @@
import importlib
import logging
import os
import re
from pathlib import Path
from typing import List
LOGGER = logging.getLogger(__name__)
class ExtensionManager:
def __init__(self, name: str):
self._name = name
self._registry = {}
def register_extension(self, extension: str, clazz):
already_registered_class = self._registry.get(extension, None)
if already_registered_class and already_registered_class.__module__ != clazz.__module__:
raise RuntimeError(
f"Conflicting extension {self._name}/{extension}; "
f"{already_registered_class.__module__}.{already_registered_class.__name} "
f"and "
f"{clazz.__module__}.{clazz.__name__}"
)
elif already_registered_class is None:
clazz_full_name = f"{clazz.__module__}.{clazz.__name__}" if clazz is not None else "None"
LOGGER.debug(f"Registering extension {self._name}/{extension}: {clazz_full_name}")
self._registry[extension] = clazz
def get(self, extension):
if extension not in self._registry:
raise RuntimeError(f"Missing extension {self._name}/{extension}")
return self._registry[extension]
@property
def supported_extensions(self):
return list(self._registry)
@staticmethod
def scan_for_extensions(extension_dirs: List[Path]):
register_pattern = r".*\.register_extension\(.*"
for extension_dir in extension_dirs:
for python_path in extension_dir.rglob("*.py"):
if not python_path.is_file():
continue
payload = python_path.read_text()
if re.findall(register_pattern, payload):
import_path = python_path.relative_to(toolkit_root_dir.parent)
package = import_path.parent.as_posix().replace(os.sep, ".")
package_with_module = f"{package}.{import_path.stem}"
spec = importlib.util.spec_from_file_location(name=package_with_module, location=python_path)
my_module = importlib.util.module_from_spec(spec)
my_module.__package__ = package
try:
spec.loader.exec_module(my_module) # pytype: disable=attribute-error
except ModuleNotFoundError as e:
LOGGER.error(
f"Could not load extensions from {import_path} due to missing python packages; {e}"
)
runners = ExtensionManager("runners")
loaders = ExtensionManager("loaders")
savers = ExtensionManager("savers")
converters = ExtensionManager("converters")
toolkit_root_dir = (Path(__file__).parent / "..").resolve()
ExtensionManager.scan_for_extensions([toolkit_root_dir])
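# Illustrative lookup (extension names are the Format values defined in core.py):
#   loader_cls = loaders.get("onnx")   # returns the registered OnnxLoader class
#   loaders.get("unknown")             # would raise RuntimeError("Missing extension loaders/unknown")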

View file

@ -0,0 +1,47 @@
import csv
import re
from typing import Dict, List
from natsort import natsorted
from tabulate import tabulate
def sort_results(results: List):
results = natsorted(results, key=lambda item: [item[key] for key in item.keys()])
return results
def save_results(filename: str, data: List, formatted: bool = False):
data = format_data(data=data) if formatted else data
with open(filename, "a") as csvfile:
fieldnames = data[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in data:
writer.writerow(row)
def format_data(data: List[Dict]) -> List[Dict]:
formatted_data = list()
for item in data:
formatted_item = format_keys(data=item)
formatted_data.append(formatted_item)
return formatted_data
def format_keys(data: Dict) -> Dict:
keys = {format_key(key=key): value for key, value in data.items()}
return keys
def format_key(key: str) -> str:
key = " ".join([k.capitalize() for k in re.split("_| ", key)])
return key
def show_results(results: List[Dict]):
headers = list(results[0].keys())
summary = map(lambda x: list(map(lambda item: item[1], x.items())), results)
print(tabulate(summary, headers=headers))
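# Illustrative formatting (values are assumptions):
#   format_key("batch_size")  # -> "Batch Size"
#   show_results([{"batch_size": 1, "throughput": 1536.0}])  # prints a two-column table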

View file

@ -0,0 +1,47 @@
import os
from typing import List, Optional
def warmup(
model_name: str,
batch_sizes: List[int],
triton_instances: int = 1,
profiling_data: str = "random",
input_shapes: Optional[List[str]] = None,
server_url: str = "localhost",
measurement_window: int = 10000,
):
print("\n")
print(f"==== Warmup start ====")
print("\n")
input_shapes = " ".join(map(lambda shape: f" --shape {shape}", input_shapes)) if input_shapes else ""
bs = set()
bs.add(min(batch_sizes))
bs.add(max(batch_sizes))
measurement_window = 6 * measurement_window
for batch_size in bs:
exec_args = f"""-max-threads {triton_instances} \
-m {model_name} \
-x 1 \
-c {triton_instances} \
-t {triton_instances} \
-p {measurement_window} \
-v \
-i http \
-u {server_url}:8000 \
-b {batch_size} \
--input-data {profiling_data} {input_shapes}
"""
result = os.system(f"perf_client {exec_args}")
if result != 0:
print(f"Failed running performance tests. Perf client failed with exit code {result}")
exit(1)
print("\n")
print(f"==== Warmup done ====")
print("\n")

View file

@ -0,0 +1,18 @@
from typing import Any, Dict, List, Optional
import numpy as np
from deployment_toolkit.core import BaseMetricsCalculator
class MetricsCalculator(BaseMetricsCalculator):
def __init__(self, output_used_for_metrics: str = "classes"):
self._output_used_for_metrics = output_used_for_metrics
def calc(self, *, y_pred: Dict[str, np.ndarray], y_real: Optional[Dict[str, np.ndarray]], **_) -> Dict[str, float]:
y_true = y_real[self._output_used_for_metrics]
y_pred = y_pred[self._output_used_for_metrics]
y_true = np.squeeze(y_true)
y_pred = np.squeeze(y_pred)
assert y_true.shape == y_pred.shape
return {"accuracy": (y_true == y_pred).mean()}

View file

@ -0,0 +1,992 @@
[figure: "Performance offline" (matplotlib SVG line chart): Inferences/second vs. Client Batch Size, batch sizes 1-128]

Rendered size: 32 KiB. Six further performance-plot SVGs (32, 33, 30, 32, 31, and 34 KiB) are added in this commit; their diffs are suppressed as too large to display.

View file

@ -0,0 +1,980 @@
[figure: "Performance offline" (matplotlib SVG bar chart): Avg Latency vs. Client Batch Size, batch sizes 1-128]

[4 additional SVG charts added in this commit; their diffs were suppressed as too large (92-94 KiB each)]

View file

@@ -0,0 +1,127 @@
#!/usr/bin/env python3
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tarfile
from pathlib import Path
from typing import Tuple, Dict, List
from PIL import Image
from tqdm import tqdm
DATASETS_DIR = os.environ.get("DATASETS_DIR", None)
IMAGENET_DIRNAME = "imagenet"
IMAGE_ARCHIVE_FILENAME = "ILSVRC2012_img_val.tar"
DEVKIT_ARCHIVE_FILENAME = "ILSVRC2012_devkit_t12.tar.gz"
LABELS_REL_PATH = "ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt"
META_REL_PATH = "ILSVRC2012_devkit_t12/data/meta.mat"
TARGET_SIZE = (224, 224) # (width, height)
_RESIZE_MIN = 256  # resize, preserving aspect ratio, so that the shorter side equals this size
def parse_meta_mat(metafile) -> Dict[int, str]:
import scipy.io
meta = scipy.io.loadmat(metafile, squeeze_me=True)["synsets"]
nums_children = list(zip(*meta))[4]
meta = [meta[idx] for idx, num_children in enumerate(nums_children) if num_children == 0]
idcs, wnids = list(zip(*meta))[:2]
idx_to_wnid = {idx: wnid for idx, wnid in zip(idcs, wnids)}
return idx_to_wnid
def _process_image(image_file, target_size):
image = Image.open(image_file)
original_size = image.size
# scale image, preserving aspect ratio, so that its shorter side equals _RESIZE_MIN
scale_factor = max(_RESIZE_MIN / original_size[0], _RESIZE_MIN / original_size[1])
resize_to = int(original_size[0] * scale_factor), int(original_size[1] * scale_factor)
resized_image = image.resize(resize_to)
# central crop of image to target_size
left, upper = (resize_to[0] - target_size[0]) // 2, (resize_to[1] - target_size[1]) // 2
cropped_image = resized_image.crop((left, upper, left + target_size[0], upper + target_size[1]))
return cropped_image
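# Illustrative worked example (not part of the original script): for a 500x375
# input and TARGET_SIZE=(224, 224), scale_factor = max(256/500, 256/375) ~= 0.683,
# so the image is resized to (341, 256); the central crop then starts at
# left = (341 - 224) // 2 = 58, upper = (256 - 224) // 2 = 16 and covers 224x224 pixels.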
def main():
import argparse
parser = argparse.ArgumentParser(description="Preprocess the ImageNet validation archive: resize, center-crop and sort images into per-class directories.")
parser.add_argument(
"--dataset-dir",
help="Path to dataset directory where imagenet archives are stored and processed files will be saved.",
required=False,
default=DATASETS_DIR,
)
parser.add_argument(
"--target-size",
help="Size of target image. Format it as <width>,<height>.",
required=False,
default=",".join(map(str, TARGET_SIZE)),
)
args = parser.parse_args()
if args.dataset_dir is None:
raise ValueError(
"Please set $DATASETS_DIR env variable to point dataset dir with original dataset archives "
"and where processed files should be stored. Alternatively provide --dataset-dir CLI argument"
)
datasets_dir = Path(args.dataset_dir)
target_size = tuple(map(int, args.target_size.split(",")))
image_archive_path = datasets_dir / IMAGE_ARCHIVE_FILENAME
if not image_archive_path.exists():
raise RuntimeError(
f"There should be {IMAGE_ARCHIVE_FILENAME} file in {datasets_dir}."
f"You need to download the dataset from http://www.image-net.org/download."
)
devkit_archive_path = datasets_dir / DEVKIT_ARCHIVE_FILENAME
if not devkit_archive_path.exists():
raise RuntimeError(
f"There should be {DEVKIT_ARCHIVE_FILENAME} file in {datasets_dir}."
f"You need to download the dataset from http://www.image-net.org/download."
)
with tarfile.open(devkit_archive_path, mode="r") as devkit_archive_file:
labels_file = devkit_archive_file.extractfile(LABELS_REL_PATH)
labels = list(map(int, labels_file.readlines()))
# map validation labels (indices from LABELS_REL_PATH) to WNIDs compatible with the training set
meta_file = devkit_archive_file.extractfile(META_REL_PATH)
idx_to_wnid = parse_meta_mat(meta_file)
labels_wnid = [idx_to_wnid[idx] for idx in labels]
# remap each WNID to its index in the sorted list of all WNIDs - this matches how the network indexes output classes
available_wnids = sorted(set(labels_wnid))
wnid_to_newidx = {wnid: new_cls for new_cls, wnid in enumerate(available_wnids)}
labels = [wnid_to_newidx[wnid] for wnid in labels_wnid]
output_dir = datasets_dir / IMAGENET_DIRNAME
with tarfile.open(image_archive_path, mode="r") as image_archive_file:
image_rel_paths = sorted(image_archive_file.getnames())
for cls, image_rel_path in tqdm(zip(labels, image_rel_paths), total=len(image_rel_paths)):
output_path = output_dir / str(cls) / image_rel_path
original_image_file = image_archive_file.extractfile(image_rel_path)
processed_image = _process_image(original_image_file, target_size)
output_path.parent.mkdir(parents=True, exist_ok=True)
processed_image.save(output_path.as_posix())
if __name__ == "__main__":
main()
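# Example invocation (illustrative, not part of the original script; the script
# file name and the archive directory are placeholders):
#   DATASETS_DIR=/data/imagenet-archives python3 process_imagenet_val.py --target-size 224,224
# Processed images end up under $DATASETS_DIR/imagenet/<class_index>/<original_image_name>.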

View file

@@ -0,0 +1,12 @@
networkx==2.5
numpy<1.20.0,>=1.16.0  # numpy 1.20+ requires py37+
onnx==1.8.0
onnxruntime==1.6.0
pycuda>=2019.1.2
PyYAML>=5.2
tqdm>=4.44.1
tf2onnx==1.8.3
tabulate>=0.8.7
natsort>=7.0.0
# use tags instead of branch names - a Docker cache hit could otherwise prevent fetching the most recent changes from a branch
service_maker @ git+https://access-token:usVyg8b11sn9gCacsVCf@gitlab-master.nvidia.com/dl/JoC/service_maker.git@1b83b96#egg=service_maker

View file

@@ -0,0 +1,86 @@
import logging
import tensorflow as tf
from utils import data_utils
LOGGER = logging.getLogger(__name__)
NCLASSES = 1001
WIDTH = 224
HEIGHT = 224
NCHANNELS = 3
INPUT_FORMAT = "NHWC"
COMPUTE_FORMAT = "NHWC"
def get_model(
*,
model_dir: str,
arch: str = "resnet50",
precision: str = "fp32",
use_xla: bool = True,
use_tf_amp: bool = False,
use_dali: bool = False,
gpu_memory_fraction=0.7,
):
import horovod.tensorflow as hvd
from runtime import Runner
hvd.init()
try:
dtype = {"fp16": tf.float16, "fp32": tf.float32}[precision.lower()]
except KeyError:
raise ValueError(f"Uknown precision {precision}. Allowed values: fp16|fp32")
LOGGER.info(
f"Creating model arch={arch} precision={precision} xla={use_xla}"
f"tf_amp={use_tf_amp}, dali={use_dali}, gpu_memory_frac={gpu_memory_fraction}"
)
runner = Runner(
n_classes=NCLASSES,
architecture=arch,
input_format=INPUT_FORMAT,
compute_format=COMPUTE_FORMAT,
dtype=dtype,
n_channels=NCHANNELS,
height=HEIGHT,
width=WIDTH,
use_xla=use_xla,
use_tf_amp=use_tf_amp,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=0,
model_dir=model_dir,
)
# removed params not used in inference
estimator_params = {"use_final_conv": False} # TODO: Why not moved to model constructor?
estimator = runner._get_estimator(
mode="inference",
run_params=estimator_params,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
)
return estimator
def get_serving_input_receiver_fn(
batch_size: int = None,
input_dtype: str = "fp32",
width: int = WIDTH,
height: int = HEIGHT,
nchannels: int = NCHANNELS,
):
input_dtype = tf.float16 if input_dtype and "16" in input_dtype else tf.float32
serving_input_receiver_fn = data_utils.get_serving_input_receiver_fn(
batch_size=batch_size,
height=height,
width=width,
num_channels=nchannels,
data_format=INPUT_FORMAT,
dtype=input_dtype,
)
return serving_input_receiver_fn
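# Minimal sketch (not part of the original file) of how these two helpers could be
# combined to export a SavedModel for Triton; the checkpoint and output paths are
# placeholders, and the actual export flow in this repo is driven by triton/convert_model.py:
#   estimator = get_model(model_dir="/checkpoints/rn50", arch="resnet50", precision="fp16")
#   receiver_fn = get_serving_input_receiver_fn(batch_size=None, input_dtype="fp16")
#   estimator.export_saved_model("/models/rn50_savedmodel", receiver_fn)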

View file

@@ -0,0 +1,220 @@
#!/usr/bin/env python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# method from PEP-366 to support relative import in executed modules
import argparse
import logging
from pathlib import Path
from typing import List
if __name__ == "__main__" and __package__ is None:
__package__ = Path(__file__).parent.name
from .benchmark.benchmark import Benchmark
from .benchmark.checkpoints import HttpCheckpoint
from .benchmark.core import LOGGER
from .benchmark.executor import DockerExecutor
from .deployment_toolkit.core import Accelerator, Format, Precision
AVAILABLE_MODEL_FORMATS = [f.value for f in Format]
AVAILABLE_MODEL_PRECISIONS = [p.value for p in Precision]
AVAILABLE_MODEL_ACCELERATORS = [a.value for a in Accelerator]
def run_benchmark(
devices: List[str],
model_name: str,
model_version: int,
model_format: str,
container_version: str,
checkpoint: str,
max_batch_size: int,
precision: str,
number_of_model_instances: int,
preferred_batch_sizes: List[int],
max_queue_delay_us: int,
backend_accelerator: str,
verbose: bool,
**kwargs
):
benchmark = Benchmark(
devices=devices,
model_name=model_name,
model_version=model_version,
framework="TensorFlow1",
container_version=container_version,
checkpoint=HttpCheckpoint(checkpoint),
verbose=verbose
)
benchmark.model_conversion(
cmds=(
r"""
python3 triton/convert_model.py \
--input-path triton/rn50_model.py \
--input-type tf-estimator \
--output-path ${SHARED_DIR}/model \
--output-type ${FORMAT} \
--onnx-opset 12 \
--onnx-optimized 1 \
--max-batch-size ${MAX_BATCH_SIZE} \
--max-workspace-size 4294967296 \
--ignore-unknown-parameters \
\
--model-dir ${CHECKPOINT_DIR} \
--precision ${PRECISION} \
--dataloader triton/dataloader.py \
--data-dir ${DATASETS_DIR}/imagenet
""",
)
)
benchmark.model_deploy(
cmds=(
r"""
python3 triton/deploy_model.py \
--model-repository ${MODEL_REPOSITORY_PATH} \
--model-path ${SHARED_DIR}/model \
--model-format ${FORMAT} \
--model-name ${MODEL_NAME} \
--model-version 1 \
--max-batch-size ${MAX_BATCH_SIZE} \
--precision ${PRECISION} \
--number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
--max-queue-delay-us ${TRITON_MAX_QUEUE_DELAY} \
--preferred-batch-sizes ${TRITON_PREFERRED_BATCH_SIZES} \
--capture-cuda-graph 0 \
--backend-accelerator ${BACKEND_ACCELERATOR} \
--load-model ${TRITON_LOAD_MODEL_METHOD}
""",
)
)
benchmark.triton_performance_offline_tests(
cmds=(
r"""
python triton/run_offline_performance_test_on_triton.py \
--server-url ${TRITON_SERVER_URL} \
--model-name ${MODEL_NAME} \
--number-of-warmup-iterations 5 \
--input-data random \
--batch-sizes ${BATCH_SIZE} \
--triton-instances ${TRITON_INSTANCES} \
--result-path ${SHARED_DIR}/triton_performance_offline.csv
""",
),
result_path="${SHARED_DIR}/triton_performance_offline.csv",
)
benchmark.triton_performance_online_tests(
cmds=(
r"""
python triton/run_online_performance_test_on_triton.py \
--server-url ${TRITON_SERVER_URL} \
--model-name ${MODEL_NAME} \
--number-of-warmup-iterations 5 \
--input-data random \
--batch-sizes ${BATCH_SIZE} \
--triton-instances ${TRITON_INSTANCES} \
--number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
--result-path ${SHARED_DIR}/triton_performance_online.csv
""",
),
result_path="${SHARED_DIR}/triton_performance_online.csv",
)
benchmark.configuration(
precision=precision,
max_batch_size=max_batch_size,
format=model_format,
accelerator=backend_accelerator,
triton_gpu_engine_count=number_of_model_instances,
triton_preferred_batch_sizes=preferred_batch_sizes,
triton_max_queue_delay_us=max_queue_delay_us,
**kwargs
)
executor = DockerExecutor()
executor.run(benchmark)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run benchmark for model.")
parser.add_argument("--devices", help="NVIDIA GPU device ID on which Triton Inference Server is ran. Accept multiple values", nargs="*", required=False)
parser.add_argument("--model-name", help="Model name. Default: ResNet50", default="ResNet50", required=False)
parser.add_argument("--model-version", default="1", help="Version of model. Default: 1", required=False)
parser.add_argument("--checkpoint", default="https://api.ngc.nvidia.com/v2/models/nvidia/rn50_tf_amp_ckpt/versions/20.06.0/zip", help="Checkpoint url. Default: https://api.ngc.nvidia.com/v2/models/nvidia/rn50_tf_amp_ckpt/versions/20.06.0/zip", required=False)
parser.add_argument("--container-version", help="Version of container for Triton Inference Server. Default: 20.12", default="20.12", required=False)
parser.add_argument(
"--model-format",
choices=AVAILABLE_MODEL_FORMATS,
help="Format of exported model. Default: tf-savedmodel",
default="tf-savedmodel",
required=False
)
parser.add_argument(
"--precision",
type=str,
default="fp16",
choices=AVAILABLE_MODEL_PRECISIONS,
help="Model precision (parameter used only by Tensorflow backend with TensorRT optimization). Default: fp16",
required=False
)
parser.add_argument(
"--max-batch-size",
type=int,
default=32,
help="Batch size used for benchmark. Maximal batch size which is used to convert model. Default: 32",
required=False
)
parser.add_argument(
"--number-of-model-instances",
type=int,
default=2,
help="Number of model instances per GPU (model instances). Default: 2",
required=False
)
parser.add_argument(
"--preferred-batch-sizes",
type=int,
nargs="*",
help="Batch sizes that the dynamic batching should attempt to create. "
"In case --max-queue-delay-us is set and this parameter is not, default value will be calculated based on --max-batch-size",
required=False
)
parser.add_argument(
"--max-queue-delay-us",
type=int,
default=100,
help="Max delay time which dynamic batch shall wait to form a batch. Default: 100",
required=False
)
parser.add_argument(
"--backend-accelerator",
choices=AVAILABLE_MODEL_ACCELERATORS,
type=str,
default="cuda",
help="Select backend accelerator used for model. Default: cuda",
required=False
)
parser.add_argument("--verbose", action="store_true", default=False, help="Provide verbose output")
args = parser.parse_args()
log_level = logging.INFO if not args.verbose else logging.DEBUG
LOGGER.setLevel(log_level)
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
run_benchmark(**vars(args))
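# Example invocation (illustrative; the script path and argument values are placeholders,
# parser defaults are used for any flags that are omitted):
#   python3 triton/run_benchmark.py \
#       --devices 0 \
#       --model-format tf-savedmodel \
#       --precision fp16 \
#       --max-batch-size 32 \
#       --backend-accelerator cuda \
#       --verbose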

View file

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
To infer the model on framework runtime, you can use `run_inference_on_fw.py` script.
It runs inference locally on data obtained from the pointed data loader and saves the received outputs into
[npz files](https://gitlab-master.nvidia.com/dl/JoC/bermuda-api/-/blob/develop/bermuda_api_toolset/docs/dump_files.md).
Those files are stored in the directory pointed to by the `--output-dir` argument.
Example call:
```shell script
python ./triton/run_inference_on_fw.py \
--input-path /models/exported/model.onnx \
--input-type onnx \
--dataloader triton/dataloader.py \
--data-dir /data/imagenet \
--batch-size 32 \
--output-dir /results/dump_local \
--dump-labels
```
"""
import argparse
import logging
import os
from pathlib import Path
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "0"
from tqdm import tqdm
# method from PEP-366 to support relative import in executed modules
if __package__ is None:
__package__ = Path(__file__).parent.name
from .deployment_toolkit.args import ArgParserGenerator
from .deployment_toolkit.core import DATALOADER_FN_NAME, BaseLoader, BaseRunner, Format, load_from_file
from .deployment_toolkit.dump import NpzWriter
from .deployment_toolkit.extensions import loaders, runners
LOGGER = logging.getLogger("run_inference_on_fw")
def _verify_and_format_dump(args, ids, x, y_pred, y_real):
data = {"outputs": y_pred, "ids": {"ids": ids}}
if args.dump_inputs:
data["inputs"] = x
if args.dump_labels:
if not y_real:
raise ValueError(
"Found empty label values. Please provide labels in dataloader_fn or do not use --dump-labels argument"
)
data["labels"] = y_real
return data
def _parse_and_validate_args():
supported_inputs = set(runners.supported_extensions) & set(loaders.supported_extensions)
parser = argparse.ArgumentParser(description="Dump local inference output of given model", allow_abbrev=False)
parser.add_argument("--input-path", help="Path to input model", required=True)
parser.add_argument("--input-type", help="Input model type", choices=supported_inputs, required=True)
parser.add_argument("--dataloader", help="Path to python file containing dataloader.", required=True)
parser.add_argument("--output-dir", help="Path to dir where output files will be stored", required=True)
parser.add_argument("--dump-labels", help="Dump labels to output dir", action="store_true", default=False)
parser.add_argument("--dump-inputs", help="Dump inputs to output dir", action="store_true", default=False)
parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
args, *_ = parser.parse_known_args()
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
ArgParserGenerator(get_dataloader_fn).update_argparser(parser)
Loader: BaseLoader = loaders.get(args.input_type)
ArgParserGenerator(Loader, module_path=args.input_path).update_argparser(parser)
Runner: BaseRunner = runners.get(args.input_type)
ArgParserGenerator(Runner).update_argparser(parser)
args = parser.parse_args()
types_requiring_io_params = []
if args.input_type in types_requiring_io_params and not all(p for p in [args.inputs, args.outputs]):
parser.error(f"For {args.input_type} input provide --inputs and --outputs parameters")
return args
def main():
args = _parse_and_validate_args()
log_level = logging.INFO if not args.verbose else logging.DEBUG
log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
logging.basicConfig(level=log_level, format=log_format)
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
Loader: BaseLoader = loaders.get(args.input_type)
Runner: BaseRunner = runners.get(args.input_type)
loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args)
runner = ArgParserGenerator(Runner).from_args(args)
LOGGER.info(f"Loading {args.input_path}")
model = loader.load(args.input_path)
with runner.init_inference(model=model) as runner_session, NpzWriter(args.output_dir) as writer:
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)
LOGGER.info(f"Data loader initialized; Running inference")
for ids, x, y_real in tqdm(dataloader_fn(), unit="batch", mininterval=10):
y_pred = runner_session(x)
data = _verify_and_format_dump(args, ids=ids, x=x, y_pred=y_pred, y_real=y_real)
writer.write(**data)
LOGGER.info(f"Inference finished")
if __name__ == "__main__":
main()

Some files were not shown because too many files have changed in this diff.