[ConvNets/TF1] Added Triton for ResNet

kkudrynski 2021-04-20 13:50:41 +02:00
parent 7bdfc81d25
commit 169b081827
118 changed files with 100312 additions and 998 deletions

View file

@@ -32,7 +32,7 @@ allow_multiline_lambdas = True
# # <------ this blank line
# def method():
# pass
blank_line_before_nested_class_or_def = True
blank_line_before_nested_class_or_def = False
# Insert a blank line before a module docstring.
blank_line_before_module_docstring = True
@@ -83,7 +83,7 @@ continuation_indent_width = 4
# start_ts=now()-timedelta(days=3),
# end_ts=now(),
# ) # <--- this bracket is dedented and on a separate line
dedent_closing_brackets = True
dedent_closing_brackets = False
# Disable the heuristic which places each list element on a separate line if the list is comma-terminated.
disable_ending_comma_heuristic = false

View file

@@ -1,8 +1,30 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.12-tf1-py3
ARG TRITON_CLIENT_IMAGE_NAME=nvcr.io/nvidia/tritonserver:20.12-py3-sdk
FROM ${TRITON_CLIENT_IMAGE_NAME} as triton-client
FROM ${FROM_IMAGE_NAME}
ADD requirements.txt .
RUN pip install -r requirements.txt
# Install libraries required by perf_client
RUN apt-get update && \
apt-get install -y libb64-dev libb64-0d && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
ADD . /workspace/rn50v15_tf
# Install Triton Client Python API and copy Perf Client
COPY --from=triton-client /workspace/install/ /workspace/install/
ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
RUN find /workspace/install/python/ -iname triton*manylinux*.whl -exec pip install {}[all] \;
# Set up environment variables to access Triton Client lib and bin
ENV PATH /workspace/install/bin:${PATH}
ENV PYTHONPATH /workspace/rn50v15_tf
WORKDIR /workspace/rn50v15_tf
RUN pip uninstall -y typing
ADD requirements.txt .
ADD triton/requirements.txt triton/requirements.txt
RUN pip install -r requirements.txt
RUN pip install -r triton/requirements.txt
ADD . .
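
The Dockerfile above copies the Triton client libraries and Perf Client out of the tritonserver SDK image and installs the client wheel into the TF1 container. As orientation only, here is a minimal sketch of using that client from Python, assuming the wheel exposes the usual tritonclient.http API and that a Triton server is already running at localhost:8000 with a model named "resnet50" loaded (the URL and model name are illustrative assumptions, not part of this commit):

# Readiness check against a running Triton server, using the client
# installed by the Dockerfile above. Server URL and model name are assumed.
import tritonclient.http as triton_http

client = triton_http.InferenceServerClient(url="localhost:8000")
if client.is_server_live() and client.is_server_ready():
    print("Triton server is up")
    print("resnet50 ready:", client.is_model_ready("resnet50"))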

View file

@@ -51,7 +51,7 @@ were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training accuracy results of the
The following table shows the training performance results of the
three classification models side-by-side.
@@ -71,7 +71,7 @@ were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training accuracy results of the
The following table shows the training performance results of the
three classification models side-by-side.

View file

@@ -0,0 +1,436 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Converts image data to TFRecords file format with Example protos.
The image data set is expected to reside in JPEG files located in the
following directory structure.
data_dir/label_0/image0.jpeg
data_dir/label_0/image1.jpg
...
data_dir/label_1/weird-image.jpeg
data_dir/label_1/my-image.jpeg
...
where the sub-directory is the unique label associated with these images.
This TensorFlow script converts the training and evaluation data into
a sharded data set consisting of TFRecord files
train_directory/train-00000-of-01024
train_directory/train-00001-of-01024
...
train_directory/train-01023-of-01024
and
validation_directory/validation-00000-of-00128
validation_directory/validation-00001-of-00128
...
validation_directory/validation-00127-of-00128
where we have selected 1024 and 128 shards for each data set. Each record
within the TFRecord file is a serialized Example proto. The Example proto
contains the following fields:
image/encoded: string containing JPEG encoded image in RGB colorspace
image/height: integer, image height in pixels
image/width: integer, image width in pixels
image/colorspace: string, specifying the colorspace, always 'RGB'
image/channels: integer, specifying the number of channels, always 3
image/format: string, specifying the format, always 'JPEG'
image/filename: string containing the basename of the image file
e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
image/class/label: integer specifying the index in a classification layer.
The label ranges from [0, num_labels] where 0 is unused and left as
the background class.
image/class/text: string specifying the human-readable version of the label
e.g. 'dog'
If your data set involves bounding boxes, please look at build_imagenet_data.py.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os
import random
import sys
import threading
import numpy as np
import tensorflow as tf
tf.app.flags.DEFINE_string('train_directory', '/tmp/',
'Training data directory')
tf.app.flags.DEFINE_string('validation_directory', '/tmp/',
'Validation data directory')
tf.app.flags.DEFINE_string('output_directory', '/tmp/',
'Output data directory')
tf.app.flags.DEFINE_integer('train_shards', 2,
'Number of shards in training TFRecord files.')
tf.app.flags.DEFINE_integer('validation_shards', 2,
'Number of shards in validation TFRecord files.')
tf.app.flags.DEFINE_integer('num_threads', 2,
'Number of threads to preprocess the images.')
# The labels file contains the list of valid labels.
# Assumes that the file contains entries as such:
# dog
# cat
# flower
# where each line corresponds to a label. We map each label contained in
# the file to an integer corresponding to the line number starting from 0.
tf.app.flags.DEFINE_string('labels_file', '', 'Labels file')
FLAGS = tf.app.flags.FLAGS
def _int64_feature(value):
"""Wrapper for inserting int64 features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _bytes_feature(value):
"""Wrapper for inserting bytes features into Example proto."""
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _convert_to_example(filename, image_buffer, label, text, height, width):
"""Build an Example proto for an example.
Args:
filename: string, path to an image file, e.g., '/path/to/example.JPG'
image_buffer: string, JPEG encoding of RGB image
label: integer, identifier for the ground truth for the network
text: string, unique human-readable, e.g. 'dog'
height: integer, image height in pixels
width: integer, image width in pixels
Returns:
Example proto
"""
colorspace = 'RGB'
channels = 3
image_format = 'JPEG'
example = tf.train.Example(features=tf.train.Features(feature={
'image/height': _int64_feature(height),
'image/width': _int64_feature(width),
'image/colorspace': _bytes_feature(tf.compat.as_bytes(colorspace)),
'image/channels': _int64_feature(channels),
'image/class/label': _int64_feature(label),
'image/class/text': _bytes_feature(tf.compat.as_bytes(text)),
'image/format': _bytes_feature(tf.compat.as_bytes(image_format)),
'image/filename': _bytes_feature(tf.compat.as_bytes(os.path.basename(filename))),
'image/encoded': _bytes_feature(tf.compat.as_bytes(image_buffer))}))
return example
class ImageCoder(object):
"""Helper class that provides TensorFlow image coding utilities."""
def __init__(self):
# Create a single Session to run all image coding calls.
self._sess = tf.Session()
# Initializes function that converts PNG to JPEG data.
self._png_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_png(self._png_data, channels=3)
self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that decodes RGB JPEG data.
self._decode_jpeg_data = tf.placeholder(dtype=tf.string)
self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3)
def png_to_jpeg(self, image_data):
return self._sess.run(self._png_to_jpeg,
feed_dict={self._png_data: image_data})
def decode_jpeg(self, image_data):
image = self._sess.run(self._decode_jpeg,
feed_dict={self._decode_jpeg_data: image_data})
assert len(image.shape) == 3
assert image.shape[2] == 3
return image
def _is_png(filename):
"""Determine if a file contains a PNG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a PNG.
"""
return filename.endswith('.png')
def _process_image(filename, coder):
"""Process a single image file.
Args:
filename: string, path to an image file e.g., '/path/to/example.JPG'.
coder: instance of ImageCoder to provide TensorFlow image coding utils.
Returns:
image_buffer: string, JPEG encoding of RGB image.
height: integer, image height in pixels.
width: integer, image width in pixels.
"""
# Read the image file.
with tf.gfile.FastGFile(filename, 'rb') as f:
image_data = f.read()
# Convert any PNG to JPEG for consistency.
if _is_png(filename):
print('Converting PNG to JPEG for %s' % filename)
image_data = coder.png_to_jpeg(image_data)
# Decode the RGB JPEG.
image = coder.decode_jpeg(image_data)
# Check that image converted to RGB
assert len(image.shape) == 3
height = image.shape[0]
width = image.shape[1]
assert image.shape[2] == 3
return image_data, height, width
def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
texts, labels, num_shards):
"""Processes and saves list of images as TFRecord in 1 thread.
Args:
coder: instance of ImageCoder to provide TensorFlow image coding utils.
thread_index: integer, unique index of the batch to run, within [0, len(ranges)).
ranges: list of pairs of integers specifying the range of images each
batch analyzes in parallel.
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
texts: list of strings; each string is human readable, e.g. 'dog'
labels: list of integer; each integer identifies the ground truth
num_shards: integer number of shards for this data set.
"""
# Each thread produces N shards where N = int(num_shards / num_threads).
# For instance, if num_shards = 128, and the num_threads = 2, then the first
# thread would produce shards [0, 64).
num_threads = len(ranges)
assert not num_shards % num_threads
num_shards_per_batch = int(num_shards / num_threads)
shard_ranges = np.linspace(ranges[thread_index][0],
ranges[thread_index][1],
num_shards_per_batch + 1).astype(int)
num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
counter = 0
for s in range(num_shards_per_batch):
# Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
shard = thread_index * num_shards_per_batch + s
output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
output_file = os.path.join(FLAGS.output_directory, output_filename)
writer = tf.python_io.TFRecordWriter(output_file)
shard_counter = 0
files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
for i in files_in_shard:
filename = filenames[i]
label = labels[i]
text = texts[i]
try:
image_buffer, height, width = _process_image(filename, coder)
except Exception as e:
print(e)
print('SKIPPED: Unexpected error while decoding %s.' % filename)
continue
example = _convert_to_example(filename, image_buffer, label,
text, height, width)
writer.write(example.SerializeToString())
shard_counter += 1
counter += 1
if not counter % 1000:
print('%s [thread %d]: Processed %d of %d images in thread batch.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
writer.close()
print('%s [thread %d]: Wrote %d images to %s' %
(datetime.now(), thread_index, shard_counter, output_file))
sys.stdout.flush()
shard_counter = 0
print('%s [thread %d]: Wrote %d images to %d shards.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
def _process_image_files(name, filenames, texts, labels, num_shards):
"""Process and save list of images as TFRecord of Example protos.
Args:
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
texts: list of strings; each string is human readable, e.g. 'dog'
labels: list of integer; each integer identifies the ground truth
num_shards: integer number of shards for this data set.
"""
assert len(filenames) == len(texts)
assert len(filenames) == len(labels)
# Break all images into batches with a [ranges[i][0], ranges[i][1]].
spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int)
ranges = []
for i in range(len(spacing) - 1):
ranges.append([spacing[i], spacing[i + 1]])
# Launch a thread for each batch.
print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
sys.stdout.flush()
# Create a mechanism for monitoring when all threads are finished.
coord = tf.train.Coordinator()
# Create a generic TensorFlow-based utility for converting all image codings.
coder = ImageCoder()
threads = []
for thread_index in range(len(ranges)):
args = (coder, thread_index, ranges, name, filenames,
texts, labels, num_shards)
t = threading.Thread(target=_process_image_files_batch, args=args)
t.start()
threads.append(t)
# Wait for all the threads to terminate.
coord.join(threads)
print('%s: Finished writing all %d images in data set.' %
(datetime.now(), len(filenames)))
sys.stdout.flush()
def _find_image_files(data_dir, labels_file):
"""Build a list of all images files and labels in the data set.
Args:
data_dir: string, path to the root directory of images.
Assumes that the image data set resides in JPEG files located in
the following directory structure.
data_dir/dog/another-image.JPEG
data_dir/dog/my-image.jpg
where 'dog' is the label associated with these images.
labels_file: string, path to the labels file.
The list of valid labels are held in this file. Assumes that the file
contains entries as such:
dog
cat
flower
where each line corresponds to a label. We map each label contained in
the file to an integer starting with the integer 0 corresponding to the
label contained in the first line.
Returns:
filenames: list of strings; each string is a path to an image file.
texts: list of strings; each string is the class, e.g. 'dog'
labels: list of integer; each integer identifies the ground truth.
"""
print('Determining list of input files and labels from %s.' % data_dir)
unique_labels = [l.strip() for l in tf.gfile.FastGFile(
labels_file, 'r').readlines()]
labels = []
filenames = []
texts = []
# Leave label index 0 empty as a background class.
label_index = 1
# Construct the list of JPEG files and labels.
for text in unique_labels:
jpeg_file_path = '%s/%s/*' % (data_dir, text)
matching_files = tf.gfile.Glob(jpeg_file_path)
labels.extend([label_index] * len(matching_files))
texts.extend([text] * len(matching_files))
filenames.extend(matching_files)
if not label_index % 100:
print('Finished finding files in %d of %d classes.' % (
label_index, len(labels)))
label_index += 1
# Shuffle the ordering of all image files in order to guarantee
# random ordering of the images with respect to label in the
# saved TFRecord files. Make the randomization repeatable.
shuffled_index = list(range(len(filenames)))
random.seed(12345)
random.shuffle(shuffled_index)
filenames = [filenames[i] for i in shuffled_index]
texts = [texts[i] for i in shuffled_index]
labels = [labels[i] for i in shuffled_index]
print('Found %d JPEG files across %d labels inside %s.' %
(len(filenames), len(unique_labels), data_dir))
return filenames, texts, labels
def _process_dataset(name, directory, num_shards, labels_file):
"""Process a complete data set and save it as a TFRecord.
Args:
name: string, unique identifier specifying the data set.
directory: string, root path to the data set.
num_shards: integer number of shards for this data set.
labels_file: string, path to the labels file.
"""
filenames, texts, labels = _find_image_files(directory, labels_file)
_process_image_files(name, filenames, texts, labels, num_shards)
def main(unused_argv):
assert not FLAGS.train_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
assert not FLAGS.validation_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with '
'FLAGS.validation_shards')
print('Saving results to %s' % FLAGS.output_directory)
# Run it!
_process_dataset('validation', FLAGS.validation_directory,
FLAGS.validation_shards, FLAGS.labels_file)
_process_dataset('train', FLAGS.train_directory,
FLAGS.train_shards, FLAGS.labels_file)
if __name__ == '__main__':
tf.app.run()
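
To sanity-check the shards produced by the converter script above, a short sketch that reads one record back and prints the fields documented in the module docstring. It uses the same TF1 APIs as the rest of this commit; the shard path is an assumption based on the default output_directory and train_shards flags:

# Read back one TFRecord shard written by the converter above and inspect the
# Example fields listed in its docstring. The path below is assumed from the
# default flags (output_directory=/tmp, train_shards=2).
import tensorflow as tf

path = '/tmp/train-00000-of-00002'
for record in tf.python_io.tf_record_iterator(path):
    example = tf.train.Example.FromString(record)
    feature = example.features.feature
    label = feature['image/class/label'].int64_list.value[0]
    text = feature['image/class/text'].bytes_list.value[0].decode('utf-8')
    filename = feature['image/filename'].bytes_list.value[0].decode('utf-8')
    jpeg_bytes = feature['image/encoded'].bytes_list.value[0]
    print(filename, label, text, len(jpeg_bytes))
    break  # only the first record is needed for a sanity check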

View file

@@ -0,0 +1,707 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Converts ImageNet data to TFRecords file format with Example protos.
The raw ImageNet data set is expected to reside in JPEG files located in the
following directory structure.
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
...
where 'n01440764' is the unique synset label associated with
these images.
The training data set consists of 1000 sub-directories (i.e. labels)
each containing 1200 JPEG images for a total of 1.2M JPEG images.
The evaluation data set consists of 1000 sub-directories (i.e. labels)
each containing 50 JPEG images for a total of 50K JPEG images.
This TensorFlow script converts the training and evaluation data into
a sharded data set consisting of 1024 and 128 TFRecord files, respectively.
train_directory/train-00000-of-01024
train_directory/train-00001-of-01024
...
train_directory/train-01023-of-01024
and
validation_directory/validation-00000-of-00128
validation_directory/validation-00001-of-00128
...
validation_directory/validation-00127-of-00128
Each validation TFRecord file contains ~390 records. Each training TFRecord
file contains ~1250 records. Each record within the TFRecord file is a
serialized Example proto. The Example proto contains the following fields:
image/encoded: string containing JPEG encoded image in RGB colorspace
image/height: integer, image height in pixels
image/width: integer, image width in pixels
image/colorspace: string, specifying the colorspace, always 'RGB'
image/channels: integer, specifying the number of channels, always 3
image/format: string, specifying the format, always 'JPEG'
image/filename: string containing the basename of the image file
e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
image/class/label: integer specifying the index in a classification layer.
The label ranges from [1, 1000] where 0 is not used.
image/class/synset: string specifying the unique ID of the label,
e.g. 'n01440764'
image/class/text: string specifying the human-readable version of the label
e.g. 'red fox, Vulpes vulpes'
image/object/bbox/xmin: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/xmax: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymin: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymax: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/label: integer specifying the index in a classification
layer. The label ranges from [1, 1000] where 0 is not used. Note this is
always identical to the image label.
Note that the length of xmin is identical to the length of xmax, ymin and ymax
for each example.
Running this script using 16 threads may take around 2.5 hours on an HP Z420.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os
import random
import sys
import threading
import numpy as np
import six
import tensorflow as tf
tf.app.flags.DEFINE_string('train_directory', '/tmp/',
'Training data directory')
tf.app.flags.DEFINE_string('validation_directory', '/tmp/',
'Validation data directory')
tf.app.flags.DEFINE_string('output_directory', '/tmp/',
'Output data directory')
tf.app.flags.DEFINE_integer('train_shards', 1024,
'Number of shards in training TFRecord files.')
tf.app.flags.DEFINE_integer('validation_shards', 128,
'Number of shards in validation TFRecord files.')
tf.app.flags.DEFINE_integer('num_threads', 8,
'Number of threads to preprocess the images.')
# The labels file contains the list of valid labels.
# Assumes that the file contains entries as such:
# n01440764
# n01443537
# n01484850
# where each line corresponds to a label expressed as a synset. We map
# each synset contained in the file to an integer (based on the alphabetical
# ordering). See below for details.
tf.app.flags.DEFINE_string('labels_file',
'imagenet_lsvrc_2015_synsets.txt',
'Labels file')
# This file contains the mapping from synset to human-readable label.
# Assumes each line of the file looks like:
#
# n02119247 black fox
# n02119359 silver fox
# n02119477 red fox, Vulpes fulva
#
# where each line corresponds to a unique mapping. Note that each line is
# formatted as <synset>\t<human readable label>.
tf.app.flags.DEFINE_string('imagenet_metadata_file',
'imagenet_metadata.txt',
'ImageNet metadata file')
# This file is the output of process_bounding_boxes.py
# Assumes each line of the file looks like:
#
# n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
#
# where each line corresponds to one bounding box annotation associated
# with an image. Each line can be parsed as:
#
# <JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
#
# Note that there might exist multiple bounding box annotations associated
# with an image file.
tf.app.flags.DEFINE_string('bounding_box_file',
'./imagenet_2012_bounding_boxes.csv',
'Bounding box file')
FLAGS = tf.app.flags.FLAGS
def _int64_feature(value):
"""Wrapper for inserting int64 features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _float_feature(value):
"""Wrapper for inserting float features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def _bytes_feature(value):
"""Wrapper for inserting bytes features into Example proto."""
if six.PY3 and isinstance(value, six.text_type):
value = six.binary_type(value, encoding='utf-8')
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _convert_to_example(filename, image_buffer, label, synset, human, bbox,
height, width):
"""Build an Example proto for an example.
Args:
filename: string, path to an image file, e.g., '/path/to/example.JPG'
image_buffer: string, JPEG encoding of RGB image
label: integer, identifier for the ground truth for the network
synset: string, unique WordNet ID specifying the label, e.g., 'n02323233'
human: string, human-readable label, e.g., 'red fox, Vulpes vulpes'
bbox: list of bounding boxes; each box is a list of integers
specifying [xmin, ymin, xmax, ymax]. All boxes are assumed to belong to
the same label as the image label.
height: integer, image height in pixels
width: integer, image width in pixels
Returns:
Example proto
"""
xmin = []
ymin = []
xmax = []
ymax = []
for b in bbox:
assert len(b) == 4
# pylint: disable=expression-not-assigned
[l.append(point) for l, point in zip([xmin, ymin, xmax, ymax], b)]
# pylint: enable=expression-not-assigned
colorspace = 'RGB'
channels = 3
image_format = 'JPEG'
example = tf.train.Example(features=tf.train.Features(feature={
'image/height': _int64_feature(height),
'image/width': _int64_feature(width),
'image/colorspace': _bytes_feature(colorspace),
'image/channels': _int64_feature(channels),
'image/class/label': _int64_feature(label),
'image/class/synset': _bytes_feature(synset),
'image/class/text': _bytes_feature(human),
'image/object/bbox/xmin': _float_feature(xmin),
'image/object/bbox/xmax': _float_feature(xmax),
'image/object/bbox/ymin': _float_feature(ymin),
'image/object/bbox/ymax': _float_feature(ymax),
'image/object/bbox/label': _int64_feature([label] * len(xmin)),
'image/format': _bytes_feature(image_format),
'image/filename': _bytes_feature(os.path.basename(filename)),
'image/encoded': _bytes_feature(image_buffer)}))
return example
class ImageCoder(object):
"""Helper class that provides TensorFlow image coding utilities."""
def __init__(self):
# Create a single Session to run all image coding calls.
self._sess = tf.Session()
# Initializes function that converts PNG to JPEG data.
self._png_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_png(self._png_data, channels=3)
self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that converts CMYK JPEG data to RGB JPEG data.
self._cmyk_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_jpeg(self._cmyk_data, channels=0)
self._cmyk_to_rgb = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that decodes RGB JPEG data.
self._decode_jpeg_data = tf.placeholder(dtype=tf.string)
self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3)
def png_to_jpeg(self, image_data):
return self._sess.run(self._png_to_jpeg,
feed_dict={self._png_data: image_data})
def cmyk_to_rgb(self, image_data):
return self._sess.run(self._cmyk_to_rgb,
feed_dict={self._cmyk_data: image_data})
def decode_jpeg(self, image_data):
image = self._sess.run(self._decode_jpeg,
feed_dict={self._decode_jpeg_data: image_data})
assert len(image.shape) == 3
assert image.shape[2] == 3
return image
def _is_png(filename):
"""Determine if a file contains a PNG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a PNG.
"""
# File list from:
# https://groups.google.com/forum/embed/?place=forum/torch7#!topic/torch7/fOSTXHIESSU
return 'n02105855_2933.JPEG' in filename
def _is_cmyk(filename):
"""Determine if file contains a CMYK JPEG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a JPEG encoded with CMYK color space.
"""
# File list from:
# https://github.com/cytsai/ilsvrc-cmyk-image-list
blacklist = ['n01739381_1309.JPEG', 'n02077923_14822.JPEG',
'n02447366_23489.JPEG', 'n02492035_15739.JPEG',
'n02747177_10752.JPEG', 'n03018349_4028.JPEG',
'n03062245_4620.JPEG', 'n03347037_9675.JPEG',
'n03467068_12171.JPEG', 'n03529860_11437.JPEG',
'n03544143_17228.JPEG', 'n03633091_5218.JPEG',
'n03710637_5125.JPEG', 'n03961711_5286.JPEG',
'n04033995_2932.JPEG', 'n04258138_17003.JPEG',
'n04264628_27969.JPEG', 'n04336792_7448.JPEG',
'n04371774_5854.JPEG', 'n04596742_4225.JPEG',
'n07583066_647.JPEG', 'n13037406_4650.JPEG']
return filename.split('/')[-1] in blacklist
def _process_image(filename, coder):
"""Process a single image file.
Args:
filename: string, path to an image file e.g., '/path/to/example.JPG'.
coder: instance of ImageCoder to provide TensorFlow image coding utils.
Returns:
image_buffer: string, JPEG encoding of RGB image.
height: integer, image height in pixels.
width: integer, image width in pixels.
"""
# Read the image file.
with tf.gfile.FastGFile(filename, 'rb') as f:
image_data = f.read()
# Clean the dirty data.
if _is_png(filename):
# 1 image is a PNG.
print('Converting PNG to JPEG for %s' % filename)
image_data = coder.png_to_jpeg(image_data)
elif _is_cmyk(filename):
# 22 JPEG images are in CMYK colorspace.
print('Converting CMYK to RGB for %s' % filename)
image_data = coder.cmyk_to_rgb(image_data)
# Decode the RGB JPEG.
image = coder.decode_jpeg(image_data)
# Check that image converted to RGB
assert len(image.shape) == 3
height = image.shape[0]
width = image.shape[1]
assert image.shape[2] == 3
return image_data, height, width
def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
synsets, labels, humans, bboxes, num_shards):
"""Processes and saves list of images as TFRecord in 1 thread.
Args:
coder: instance of ImageCoder to provide TensorFlow image coding utils.
thread_index: integer, unique index of the batch to run, within [0, len(ranges)).
ranges: list of pairs of integers specifying the range of images each
batch analyzes in parallel.
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
synsets: list of strings; each string is a unique WordNet ID
labels: list of integer; each integer identifies the ground truth
humans: list of strings; each string is a human-readable label
bboxes: list of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
num_shards: integer number of shards for this data set.
"""
# Each thread produces N shards where N = int(num_shards / num_threads).
# For instance, if num_shards = 128, and the num_threads = 2, then the first
# thread would produce shards [0, 64).
num_threads = len(ranges)
assert not num_shards % num_threads
num_shards_per_batch = int(num_shards / num_threads)
shard_ranges = np.linspace(ranges[thread_index][0],
ranges[thread_index][1],
num_shards_per_batch + 1).astype(int)
num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
counter = 0
for s in range(num_shards_per_batch):
# Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
shard = thread_index * num_shards_per_batch + s
output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
output_file = os.path.join(FLAGS.output_directory, output_filename)
writer = tf.python_io.TFRecordWriter(output_file)
shard_counter = 0
files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
for i in files_in_shard:
filename = filenames[i]
label = labels[i]
synset = synsets[i]
human = humans[i]
bbox = bboxes[i]
image_buffer, height, width = _process_image(filename, coder)
example = _convert_to_example(filename, image_buffer, label,
synset, human, bbox,
height, width)
writer.write(example.SerializeToString())
shard_counter += 1
counter += 1
if not counter % 1000:
print('%s [thread %d]: Processed %d of %d images in thread batch.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
writer.close()
print('%s [thread %d]: Wrote %d images to %s' %
(datetime.now(), thread_index, shard_counter, output_file))
sys.stdout.flush()
shard_counter = 0
print('%s [thread %d]: Wrote %d images to %d shards.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
def _process_image_files(name, filenames, synsets, labels, humans,
bboxes, num_shards):
"""Process and save list of images as TFRecord of Example protos.
Args:
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
synsets: list of strings; each string is a unique WordNet ID
labels: list of integer; each integer identifies the ground truth
humans: list of strings; each string is a human-readable label
bboxes: list of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
num_shards: integer number of shards for this data set.
"""
assert len(filenames) == len(synsets)
assert len(filenames) == len(labels)
assert len(filenames) == len(humans)
assert len(filenames) == len(bboxes)
# Break all images into batches with a [ranges[i][0], ranges[i][1]].
spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int)
ranges = []
threads = []
for i in range(len(spacing) - 1):
ranges.append([spacing[i], spacing[i + 1]])
# Launch a thread for each batch.
print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
sys.stdout.flush()
# Create a mechanism for monitoring when all threads are finished.
coord = tf.train.Coordinator()
# Create a generic TensorFlow-based utility for converting all image codings.
coder = ImageCoder()
threads = []
for thread_index in range(len(ranges)):
args = (coder, thread_index, ranges, name, filenames,
synsets, labels, humans, bboxes, num_shards)
t = threading.Thread(target=_process_image_files_batch, args=args)
t.start()
threads.append(t)
# Wait for all the threads to terminate.
coord.join(threads)
print('%s: Finished writing all %d images in data set.' %
(datetime.now(), len(filenames)))
sys.stdout.flush()
def _find_image_files(data_dir, labels_file):
"""Build a list of all images files and labels in the data set.
Args:
data_dir: string, path to the root directory of images.
Assumes that the ImageNet data set resides in JPEG files located in
the following directory structure.
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
where 'n01440764' is the unique synset label associated with these images.
labels_file: string, path to the labels file.
The list of valid labels are held in this file. Assumes that the file
contains entries as such:
n01440764
n01443537
n01484850
where each line corresponds to a label expressed as a synset. We map
each synset contained in the file to an integer (based on the alphabetical
ordering) starting with the integer 1 corresponding to the synset
contained in the first line.
The reason we start the integer labels at 1 is to reserve label 0 as an
unused background class.
Returns:
filenames: list of strings; each string is a path to an image file.
synsets: list of strings; each string is a unique WordNet ID.
labels: list of integer; each integer identifies the ground truth.
"""
print('Determining list of input files and labels from %s.' % data_dir)
challenge_synsets = [l.strip() for l in
tf.gfile.FastGFile(labels_file, 'r').readlines()]
labels = []
filenames = []
synsets = []
# Leave label index 0 empty as a background class.
label_index = 1
# Construct the list of JPEG files and labels.
for synset in challenge_synsets:
jpeg_file_path = '%s/%s/*.JPEG' % (data_dir, synset)
matching_files = tf.gfile.Glob(jpeg_file_path)
labels.extend([label_index] * len(matching_files))
synsets.extend([synset] * len(matching_files))
filenames.extend(matching_files)
if not label_index % 100:
print('Finished finding files in %d of %d classes.' % (
label_index, len(challenge_synsets)))
label_index += 1
# Shuffle the ordering of all image files in order to guarantee
# random ordering of the images with respect to label in the
# saved TFRecord files. Make the randomization repeatable.
shuffled_index = list(range(len(filenames)))
random.seed(12345)
random.shuffle(shuffled_index)
filenames = [filenames[i] for i in shuffled_index]
synsets = [synsets[i] for i in shuffled_index]
labels = [labels[i] for i in shuffled_index]
print('Found %d JPEG files across %d labels inside %s.' %
(len(filenames), len(challenge_synsets), data_dir))
return filenames, synsets, labels
def _find_human_readable_labels(synsets, synset_to_human):
"""Build a list of human-readable labels.
Args:
synsets: list of strings; each string is a unique WordNet ID.
synset_to_human: dict of synset to human labels, e.g.,
'n02119022' --> 'red fox, Vulpes vulpes'
Returns:
List of human-readable strings corresponding to each synset.
"""
humans = []
for s in synsets:
assert s in synset_to_human, ('Failed to find: %s' % s)
humans.append(synset_to_human[s])
return humans
def _find_image_bounding_boxes(filenames, image_to_bboxes):
"""Find the bounding boxes for a given image file.
Args:
filenames: list of strings; each string is a path to an image file.
image_to_bboxes: dictionary mapping image file names to a list of
bounding boxes. This list contains 0+ bounding boxes.
Returns:
List of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
"""
num_image_bbox = 0
bboxes = []
for f in filenames:
basename = os.path.basename(f)
if basename in image_to_bboxes:
bboxes.append(image_to_bboxes[basename])
num_image_bbox += 1
else:
bboxes.append([])
print('Found %d images with bboxes out of %d images' % (
num_image_bbox, len(filenames)))
return bboxes
def _process_dataset(name, directory, num_shards, synset_to_human,
image_to_bboxes):
"""Process a complete data set and save it as a TFRecord.
Args:
name: string, unique identifier specifying the data set.
directory: string, root path to the data set.
num_shards: integer number of shards for this data set.
synset_to_human: dict of synset to human labels, e.g.,
'n02119022' --> 'red fox, Vulpes vulpes'
image_to_bboxes: dictionary mapping image file names to a list of
bounding boxes. This list contains 0+ bounding boxes.
"""
filenames, synsets, labels = _find_image_files(directory, FLAGS.labels_file)
humans = _find_human_readable_labels(synsets, synset_to_human)
bboxes = _find_image_bounding_boxes(filenames, image_to_bboxes)
_process_image_files(name, filenames, synsets, labels,
humans, bboxes, num_shards)
def _build_synset_lookup(imagenet_metadata_file):
"""Build lookup for synset to human-readable label.
Args:
imagenet_metadata_file: string, path to file containing mapping from
synset to human-readable label.
Assumes each line of the file looks like:
n02119247 black fox
n02119359 silver fox
n02119477 red fox, Vulpes fulva
where each line corresponds to a unique mapping. Note that each line is
formatted as <synset>\t<human readable label>.
Returns:
Dictionary of synset to human labels, such as:
'n02119022' --> 'red fox, Vulpes vulpes'
"""
lines = tf.gfile.FastGFile(imagenet_metadata_file, 'r').readlines()
synset_to_human = {}
for l in lines:
if l:
parts = l.strip().split('\t')
assert len(parts) == 2
synset = parts[0]
human = parts[1]
synset_to_human[synset] = human
return synset_to_human
def _build_bounding_box_lookup(bounding_box_file):
"""Build a lookup from image file to bounding boxes.
Args:
bounding_box_file: string, path to file with bounding boxes annotations.
Assumes each line of the file looks like:
n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
where each line corresponds to one bounding box annotation associated
with an image. Each line can be parsed as:
<JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
Note that there might exist multiple bounding box annotations associated
with an image file. This file is the output of process_bounding_boxes.py.
Returns:
Dictionary mapping image file names to a list of bounding boxes. This list
contains 0+ bounding boxes.
"""
lines = tf.gfile.FastGFile(bounding_box_file, 'r').readlines()
images_to_bboxes = {}
num_bbox = 0
num_image = 0
for l in lines:
if l:
parts = l.split(',')
assert len(parts) == 5, ('Failed to parse: %s' % l)
filename = parts[0]
xmin = float(parts[1])
ymin = float(parts[2])
xmax = float(parts[3])
ymax = float(parts[4])
box = [xmin, ymin, xmax, ymax]
if filename not in images_to_bboxes:
images_to_bboxes[filename] = []
num_image += 1
images_to_bboxes[filename].append(box)
num_bbox += 1
print('Successfully read %d bounding boxes '
'across %d images.' % (num_bbox, num_image))
return images_to_bboxes
def main(unused_argv):
assert not FLAGS.train_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
assert not FLAGS.validation_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with '
'FLAGS.validation_shards')
print('Saving results to %s' % FLAGS.output_directory)
# Build a map from synset to human-readable label.
synset_to_human = _build_synset_lookup(FLAGS.imagenet_metadata_file)
image_to_bboxes = _build_bounding_box_lookup(FLAGS.bounding_box_file)
# Run it!
_process_dataset('validation', FLAGS.validation_directory,
FLAGS.validation_shards, synset_to_human, image_to_bboxes)
_process_dataset('train', FLAGS.train_directory, FLAGS.train_shards,
synset_to_human, image_to_bboxes)
if __name__ == '__main__':
tf.app.run()
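
The bounding-box fields this ImageNet converter writes can be read back with the TF1 parsing ops. A hedged sketch follows, with a feature spec mirroring the fields from the module docstring; the shard path is an assumption based on the default flags rather than anything in this diff:

# Parse one validation shard written by the ImageNet converter above.
# VarLenFeature is used for the bbox lists because each image may carry
# zero or more annotations. The shard path is assumed, not taken from the diff.
import tensorflow as tf

feature_spec = {
    'image/encoded': tf.FixedLenFeature([], tf.string),
    'image/class/label': tf.FixedLenFeature([], tf.int64),
    'image/class/synset': tf.FixedLenFeature([], tf.string),
    'image/object/bbox/xmin': tf.VarLenFeature(tf.float32),
    'image/object/bbox/ymin': tf.VarLenFeature(tf.float32),
    'image/object/bbox/xmax': tf.VarLenFeature(tf.float32),
    'image/object/bbox/ymax': tf.VarLenFeature(tf.float32),
}

dataset = tf.data.TFRecordDataset('/tmp/validation-00000-of-00128')
parsed = dataset.map(lambda rec: tf.parse_single_example(rec, feature_spec))
iterator = parsed.make_one_shot_iterator()
with tf.Session() as sess:
    first = sess.run(iterator.get_next())
    print(first['image/class/synset'], first['image/class/label'])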

View file

@@ -0,0 +1,618 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Converts ImageNet data to TFRecords file format with Example protos.
The raw ImageNet data set is expected to reside in JPEG files located in the
following directory structure.
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
...
where 'n01440764' is the unique synset label associated with
these images.
The training data set consists of 1000 sub-directories (i.e. labels)
each containing 1200 JPEG images for a total of 1.2M JPEG images.
The evaluation data set consists of 1000 sub-directories (i.e. labels)
each containing 50 JPEG images for a total of 50K JPEG images.
This TensorFlow script converts the training and evaluation data into
a sharded data set consisting of 1024 and 128 TFRecord files, respectively.
train_directory/train-00000-of-01024
train_directory/train-00001-of-01024
...
train_directory/train-01023-of-01024
and
validation_directory/validation-00000-of-00128
validation_directory/validation-00001-of-00128
...
validation_directory/validation-00127-of-00128
Each validation TFRecord file contains ~390 records. Each training TFRecord
file contains ~1250 records. Each record within the TFRecord file is a
serialized Example proto. The Example proto contains the following fields:
image/encoded: string containing JPEG encoded image in RGB colorspace
image/height: integer, image height in pixels
image/width: integer, image width in pixels
image/colorspace: string, specifying the colorspace, always 'RGB'
image/channels: integer, specifying the number of channels, always 3
image/format: string, specifying the format, always 'JPEG'
image/filename: string containing the basename of the image file
e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
image/class/label: integer specifying the index in a classification layer.
The label ranges from [1, 1000] where 0 is not used.
image/class/synset: string specifying the unique ID of the label,
e.g. 'n01440764'
image/class/text: string specifying the human-readable version of the label
e.g. 'red fox, Vulpes vulpes'
image/object/bbox/xmin: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/xmax: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymin: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymax: list of integers specifying the 0+ human annotated
bounding boxes
image/object/bbox/label: integer specifying the index in a classification
layer. The label ranges from [1, 1000] where 0 is not used. Note this is
always identical to the image label.
Note that the length of xmin is identical to the length of xmax, ymin and ymax
for each example.
Running this script using 16 threads may take around 2.5 hours on an HP Z420.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os
import random
import sys
import threading
import numpy as np
import six
import tensorflow as tf
tf.app.flags.DEFINE_string('train_directory', '/tmp/',
'Training data directory')
tf.app.flags.DEFINE_string('validation_directory', '/tmp/',
'Validation data directory')
tf.app.flags.DEFINE_string('output_directory', '/tmp/',
'Output data directory')
tf.app.flags.DEFINE_integer('train_shards', 1024,
'Number of shards in training TFRecord files.')
tf.app.flags.DEFINE_integer('validation_shards', 128,
'Number of shards in validation TFRecord files.')
tf.app.flags.DEFINE_integer('num_threads', 8,
'Number of threads to preprocess the images.')
# The labels file contains the list of valid labels.
# Assumes that the file contains entries as such:
# n01440764
# n01443537
# n01484850
# where each line corresponds to a label expressed as a synset. We map
# each synset contained in the file to an integer (based on the alphabetical
# ordering). See below for details.
tf.app.flags.DEFINE_string('labels_file',
'imagenet_lsvrc_2015_synsets.txt',
'Labels file')
# This file contains the mapping from synset to human-readable label.
# Assumes each line of the file looks like:
#
# n02119247 black fox
# n02119359 silver fox
# n02119477 red fox, Vulpes fulva
#
# where each line corresponds to a unique mapping. Note that each line is
# formatted as <synset>\t<human readable label>.
tf.app.flags.DEFINE_string('imagenet_metadata_file',
'imagenet_metadata.txt',
'ImageNet metadata file')
FLAGS = tf.app.flags.FLAGS
def _int64_feature(value):
"""Wrapper for inserting int64 features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _float_feature(value):
"""Wrapper for inserting float features into Example proto."""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def _bytes_feature(value):
"""Wrapper for inserting bytes features into Example proto."""
if six.PY3 and isinstance(value, six.text_type):
value = six.binary_type(value, encoding='utf-8')
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _convert_to_example(filename, image_buffer, label, synset, human, bbox,
height, width):
"""Build an Example proto for an example.
Args:
filename: string, path to an image file, e.g., '/path/to/example.JPG'
image_buffer: string, JPEG encoding of RGB image
label: integer, identifier for the ground truth for the network
synset: string, unique WordNet ID specifying the label, e.g., 'n02323233'
human: string, human-readable label, e.g., 'red fox, Vulpes vulpes'
bbox: list of bounding boxes; each box is a list of integers
specifying [xmin, ymin, xmax, ymax]. All boxes are assumed to belong to
the same label as the image label.
height: integer, image height in pixels
width: integer, image width in pixels
Returns:
Example proto
"""
xmin = []
ymin = []
xmax = []
ymax = []
for b in bbox:
assert len(b) == 4
# pylint: disable=expression-not-assigned
[l.append(point) for l, point in zip([xmin, ymin, xmax, ymax], b)]
# pylint: enable=expression-not-assigned
colorspace = 'RGB'
channels = 3
image_format = 'JPEG'
example = tf.train.Example(features=tf.train.Features(feature={
'image/height': _int64_feature(height),
'image/width': _int64_feature(width),
'image/colorspace': _bytes_feature(colorspace),
'image/channels': _int64_feature(channels),
'image/class/label': _int64_feature(label),
'image/class/synset': _bytes_feature(synset),
'image/class/text': _bytes_feature(human),
'image/object/bbox/xmin': _float_feature(xmin),
'image/object/bbox/xmax': _float_feature(xmax),
'image/object/bbox/ymin': _float_feature(ymin),
'image/object/bbox/ymax': _float_feature(ymax),
'image/object/bbox/label': _int64_feature([label] * len(xmin)),
'image/format': _bytes_feature(image_format),
'image/filename': _bytes_feature(os.path.basename(filename)),
'image/encoded': _bytes_feature(image_buffer)}))
return example
class ImageCoder(object):
"""Helper class that provides TensorFlow image coding utilities."""
def __init__(self):
# Create a single Session to run all image coding calls.
self._sess = tf.Session()
# Initializes function that converts PNG to JPEG data.
self._png_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_png(self._png_data, channels=3)
self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that converts CMYK JPEG data to RGB JPEG data.
self._cmyk_data = tf.placeholder(dtype=tf.string)
image = tf.image.decode_jpeg(self._cmyk_data, channels=0)
self._cmyk_to_rgb = tf.image.encode_jpeg(image, format='rgb', quality=100)
# Initializes function that decodes RGB JPEG data.
self._decode_jpeg_data = tf.placeholder(dtype=tf.string)
self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3)
def png_to_jpeg(self, image_data):
return self._sess.run(self._png_to_jpeg,
feed_dict={self._png_data: image_data})
def cmyk_to_rgb(self, image_data):
return self._sess.run(self._cmyk_to_rgb,
feed_dict={self._cmyk_data: image_data})
def decode_jpeg(self, image_data):
image = self._sess.run(self._decode_jpeg,
feed_dict={self._decode_jpeg_data: image_data})
assert len(image.shape) == 3
assert image.shape[2] == 3
return image
def _is_png(filename):
"""Determine if a file contains a PNG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a PNG.
"""
# File list from:
# https://groups.google.com/forum/embed/?place=forum/torch7#!topic/torch7/fOSTXHIESSU
return 'n02105855_2933.JPEG' in filename
def _is_cmyk(filename):
"""Determine if file contains a CMYK JPEG format image.
Args:
filename: string, path of the image file.
Returns:
boolean indicating if the image is a JPEG encoded with CMYK color space.
"""
# File list from:
# https://github.com/cytsai/ilsvrc-cmyk-image-list
blacklist = ['n01739381_1309.JPEG', 'n02077923_14822.JPEG',
'n02447366_23489.JPEG', 'n02492035_15739.JPEG',
'n02747177_10752.JPEG', 'n03018349_4028.JPEG',
'n03062245_4620.JPEG', 'n03347037_9675.JPEG',
'n03467068_12171.JPEG', 'n03529860_11437.JPEG',
'n03544143_17228.JPEG', 'n03633091_5218.JPEG',
'n03710637_5125.JPEG', 'n03961711_5286.JPEG',
'n04033995_2932.JPEG', 'n04258138_17003.JPEG',
'n04264628_27969.JPEG', 'n04336792_7448.JPEG',
'n04371774_5854.JPEG', 'n04596742_4225.JPEG',
'n07583066_647.JPEG', 'n13037406_4650.JPEG']
return filename.split('/')[-1] in blacklist
def _process_image(filename, coder):
"""Process a single image file.
Args:
filename: string, path to an image file e.g., '/path/to/example.JPG'.
coder: instance of ImageCoder to provide TensorFlow image coding utils.
Returns:
image_buffer: string, JPEG encoding of RGB image.
height: integer, image height in pixels.
width: integer, image width in pixels.
"""
# Read the image file.
with tf.gfile.FastGFile(filename, 'rb') as f:
image_data = f.read()
# Clean the dirty data.
if _is_png(filename):
# 1 image is a PNG.
print('Converting PNG to JPEG for %s' % filename)
image_data = coder.png_to_jpeg(image_data)
elif _is_cmyk(filename):
# 22 JPEG images are in CMYK colorspace.
print('Converting CMYK to RGB for %s' % filename)
image_data = coder.cmyk_to_rgb(image_data)
# Decode the RGB JPEG.
image = coder.decode_jpeg(image_data)
# Check that image converted to RGB
assert len(image.shape) == 3
height = image.shape[0]
width = image.shape[1]
assert image.shape[2] == 3
return image_data, height, width
def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
synsets, labels, humans, bboxes, num_shards):
"""Processes and saves list of images as TFRecord in 1 thread.
Args:
coder: instance of ImageCoder to provide TensorFlow image coding utils.
thread_index: integer, unique index of the batch to run, within [0, len(ranges)).
ranges: list of pairs of integers specifying the range of images each
batch analyzes in parallel.
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
synsets: list of strings; each string is a unique WordNet ID
labels: list of integer; each integer identifies the ground truth
humans: list of strings; each string is a human-readable label
bboxes: list of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
num_shards: integer number of shards for this data set.
"""
# Each thread produces N shards where N = int(num_shards / num_threads).
# For instance, if num_shards = 128, and the num_threads = 2, then the first
# thread would produce shards [0, 64).
num_threads = len(ranges)
assert not num_shards % num_threads
num_shards_per_batch = int(num_shards / num_threads)
shard_ranges = np.linspace(ranges[thread_index][0],
ranges[thread_index][1],
num_shards_per_batch + 1).astype(int)
num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
counter = 0
for s in range(num_shards_per_batch):
# Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
shard = thread_index * num_shards_per_batch + s
output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
output_file = os.path.join(FLAGS.output_directory, output_filename)
writer = tf.python_io.TFRecordWriter(output_file)
shard_counter = 0
files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
for i in files_in_shard:
filename = filenames[i]
label = labels[i]
synset = synsets[i]
human = humans[i]
#bbox = bboxes[i]
image_buffer, height, width = _process_image(filename, coder)
example = _convert_to_example(filename, image_buffer, label,
synset, human, [[0, 0, 1, 1]],
height, width)
writer.write(example.SerializeToString())
shard_counter += 1
counter += 1
if not counter % 1000:
print('%s [thread %d]: Processed %d of %d images in thread batch.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
writer.close()
print('%s [thread %d]: Wrote %d images to %s' %
(datetime.now(), thread_index, shard_counter, output_file))
sys.stdout.flush()
shard_counter = 0
print('%s [thread %d]: Wrote %d images to %d shards.' %
(datetime.now(), thread_index, counter, num_files_in_thread))
sys.stdout.flush()
def _process_image_files(name, filenames, synsets, labels, humans,
bboxes, num_shards):
"""Process and save list of images as TFRecord of Example protos.
Args:
name: string, unique identifier specifying the data set
filenames: list of strings; each string is a path to an image file
synsets: list of strings; each string is a unique WordNet ID
labels: list of integer; each integer identifies the ground truth
humans: list of strings; each string is a human-readable label
bboxes: list of bounding boxes for each image. Note that each entry in this
list might contain from 0+ entries corresponding to the number of bounding
box annotations for the image.
num_shards: integer number of shards for this data set.
"""
assert len(filenames) == len(synsets)
assert len(filenames) == len(labels)
assert len(filenames) == len(humans)
#assert len(filenames) == len(bboxes)
# Break all images into batches with a [ranges[i][0], ranges[i][1]].
spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int)
ranges = []
threads = []
for i in range(len(spacing) - 1):
ranges.append([spacing[i], spacing[i + 1]])
# Launch a thread for each batch.
print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
sys.stdout.flush()
# Create a mechanism for monitoring when all threads are finished.
coord = tf.train.Coordinator()
# Create a generic TensorFlow-based utility for converting all image codings.
coder = ImageCoder()
threads = []
for thread_index in range(len(ranges)):
args = (coder, thread_index, ranges, name, filenames,
synsets, labels, humans, bboxes, num_shards)
t = threading.Thread(target=_process_image_files_batch, args=args)
t.start()
threads.append(t)
# Wait for all the threads to terminate.
coord.join(threads)
print('%s: Finished writing all %d images in data set.' %
(datetime.now(), len(filenames)))
sys.stdout.flush()
def _find_image_files(data_dir, labels_file):
"""Build a list of all images files and labels in the data set.
Args:
data_dir: string, path to the root directory of images.
Assumes that the ImageNet data set resides in JPEG files located in
the following directory structure.
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
where 'n01440764' is the unique synset label associated with these images.
labels_file: string, path to the labels file.
The list of valid labels is held in this file. Assumes that the file
contains entries such as:
n01440764
n01443537
n01484850
where each line corresponds to a label expressed as a synset. We map
each synset contained in the file to an integer (based on the alphabetical
ordering) starting with the integer 1 corresponding to the synset
contained in the first line.
The reason we start the integer labels at 1 is to reserve label 0 as an
unused background class.
Returns:
filenames: list of strings; each string is a path to an image file.
synsets: list of strings; each string is a unique WordNet ID.
labels: list of integer; each integer identifies the ground truth.
"""
print('Determining list of input files and labels from %s.' % data_dir)
challenge_synsets = [l.strip() for l in
tf.gfile.FastGFile(labels_file, 'r').readlines()]
labels = []
filenames = []
synsets = []
# Leave label index 0 empty as a background class.
label_index = 1
# Construct the list of JPEG files and labels.
for synset in challenge_synsets:
jpeg_file_path = '%s/%s/*.JPEG' % (data_dir, synset)
matching_files = tf.gfile.Glob(jpeg_file_path)
labels.extend([label_index] * len(matching_files))
synsets.extend([synset] * len(matching_files))
filenames.extend(matching_files)
if not label_index % 100:
print('Finished finding files in %d of %d classes.' % (
label_index, len(challenge_synsets)))
label_index += 1
# Shuffle the ordering of all image files in order to guarantee
# random ordering of the images with respect to label in the
# saved TFRecord files. Make the randomization repeatable.
shuffled_index = list(range(len(filenames)))
random.seed(12345)
random.shuffle(shuffled_index)
filenames = [filenames[i] for i in shuffled_index]
synsets = [synsets[i] for i in shuffled_index]
labels = [labels[i] for i in shuffled_index]
print('Found %d JPEG files across %d labels inside %s.' %
(len(filenames), len(challenge_synsets), data_dir))
return filenames, synsets, labels
def _find_human_readable_labels(synsets, synset_to_human):
"""Build a list of human-readable labels.
Args:
synsets: list of strings; each string is a unique WordNet ID.
synset_to_human: dict of synset to human labels, e.g.,
'n02119022' --> 'red fox, Vulpes vulpes'
Returns:
List of human-readable strings corresponding to each synset.
"""
humans = []
for s in synsets:
assert s in synset_to_human, ('Failed to find: %s' % s)
humans.append(synset_to_human[s])
return humans
def _process_dataset(name, directory, num_shards, synset_to_human,
image_to_bboxes):
"""Process a complete data set and save it as a TFRecord.
Args:
name: string, unique identifier specifying the data set.
directory: string, root path to the data set.
num_shards: integer number of shards for this data set.
synset_to_human: dict of synset to human labels, e.g.,
'n02119022' --> 'red fox, Vulpes vulpes'
image_to_bboxes: dictionary mapping image file names to a list of
bounding boxes. This list contains 0+ bounding boxes.
"""
filenames, synsets, labels = _find_image_files(directory, FLAGS.labels_file)
humans = _find_human_readable_labels(synsets, synset_to_human)
#bboxes = _find_image_bounding_boxes(filenames, image_to_bboxes)
bboxes = []
_process_image_files(name, filenames, synsets, labels,
humans, bboxes, num_shards)
def _build_synset_lookup(imagenet_metadata_file):
"""Build lookup for synset to human-readable label.
Args:
imagenet_metadata_file: string, path to file containing mapping from
synset to human-readable label.
Assumes each line of the file looks like:
n02119247 black fox
n02119359 silver fox
n02119477 red fox, Vulpes fulva
where each line corresponds to a unique mapping. Note that each line is
formatted as <synset>\t<human readable label>.
Returns:
Dictionary of synset to human labels, such as:
'n02119022' --> 'red fox, Vulpes vulpes'
"""
lines = tf.gfile.FastGFile(imagenet_metadata_file, 'r').readlines()
synset_to_human = {}
for l in lines:
if l:
parts = l.strip().split('\t')
assert len(parts) == 2
synset = parts[0]
human = parts[1]
synset_to_human[synset] = human
return synset_to_human
def main(unused_argv):
assert not FLAGS.train_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
assert not FLAGS.validation_shards % FLAGS.num_threads, (
'Please make the FLAGS.num_threads commensurate with '
'FLAGS.validation_shards')
print('Saving results to %s' % FLAGS.output_directory)
# Build a map from synset to human-readable label.
synset_to_human = _build_synset_lookup(FLAGS.imagenet_metadata_file)
# Run it!
_process_dataset('validation', FLAGS.validation_directory,
FLAGS.validation_shards, synset_to_human, None)
_process_dataset('train', FLAGS.train_directory, FLAGS.train_shards,
synset_to_human, None)
if __name__ == '__main__':
tf.app.run()

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,10 @@
n02086240
n02087394
n02088364
n02089973
n02093754
n02096294
n02099601
n02105641
n02111889
n02115641

View file

@ -0,0 +1,82 @@
#!/bin/bash
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Script to download and preprocess ImageNet Challenge 2012
# training and validation data set.
#
# The final output of this script are sharded TFRecord files containing
# serialized Example protocol buffers. See build_imagenet_data.py for
# details of how the Example protocol buffers contain the ImageNet data.
#
# The final output of this script appears as such:
#
# data_dir/train-00000-of-01024
# data_dir/train-00001-of-01024
# ...
# data_dir/train-01023-of-01024
#
# and
#
# data_dir/validation-00000-of-00128
# data_dir/validation-00001-of-00128
# ...
# data_dir/validation-00127-of-00128
#
# Note that this script may take several hours to run to completion. The
# conversion of the ImageNet data to TFRecords alone takes 2-3 hours depending
# on the speed of your machine. Please be patient.
#
# **IMPORTANT**
# To download the raw images, the user must create an account with image-net.org
# and generate a username and access_key. The latter two are required for
# downloading the raw images.
#
# usage:
# ./preprocess_imagenet.sh [data-dir]
set -e
if [ -z "$1" ]; then
echo "Usage: preprocess_imagenet.sh [data dir]"
exit
fi
DATA_DIR="${1%/}"
SCRATCH_DIR="${DATA_DIR}/raw-data/"
mkdir -p ${SCRATCH_DIR}
# Convert the XML files for bounding box annotations into a single CSV.
echo "Extracting bounding box information from XML."
BOUNDING_BOX_SCRIPT="./dataprep/process_bounding_boxes.py"
BOUNDING_BOX_FILE="${DATA_DIR}/imagenet_2012_bounding_boxes.csv"
BOUNDING_BOX_DIR="${DATA_DIR}/bounding_boxes/"
LABELS_FILE="./dataprep/imagenet_lsvrc_2015_synsets.txt"
"${BOUNDING_BOX_SCRIPT}" "${BOUNDING_BOX_DIR}" "${LABELS_FILE}" \
| sort > "${BOUNDING_BOX_FILE}"
echo "preprocessing the ImageNet data."
# Build the TFRecords version of the ImageNet data.
OUTPUT_DIRECTORY="${DATA_DIR}"
IMAGENET_METADATA_FILE="./dataprep/imagenet_metadata.txt"
python ./dataprep/build_imagenet_data.py \
--train_directory="${DATA_DIR}/train" \
--validation_directory="${DATA_DIR}/val" \
--output_directory="${DATA_DIR}/result" \
--imagenet_metadata_file="${IMAGENET_METADATA_FILE}" \
--labels_file="${LABELS_FILE}" \
--bounding_box_file="${BOUNDING_BOX_FILE}"

View file

@ -0,0 +1,89 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Process the ImageNet Challenge bounding boxes for TensorFlow model training.
Associate the ImageNet 2012 Challenge validation data set with labels.
The raw ImageNet validation data set is expected to reside in JPEG files
located in the following directory structure.
data_dir/ILSVRC2012_val_00000001.JPEG
data_dir/ILSVRC2012_val_00000002.JPEG
...
data_dir/ILSVRC2012_val_00050000.JPEG
This script moves the files into a directory structure like such:
data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
...
where 'n01440764' is the unique synset label associated with
these images.
This directory reorganization requires a mapping from validation image
number (i.e. suffix of the original file) to the associated label. This
is provided in the ImageNet development kit via a Matlab file.
In order to make life easier and divorce ourselves from Matlab, we instead
supply a custom text file that provides this mapping for us.
Sample usage:
./preprocess_imagenet_validation_data.py ILSVRC2012_img_val \
imagenet_2012_validation_synset_labels.txt
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import errno
import os.path
import sys
if __name__ == '__main__':
if len(sys.argv) < 3:
print('Invalid usage\n'
'usage: preprocess_imagenet_validation_data.py '
'<validation data dir> <validation labels file>')
sys.exit(-1)
data_dir = sys.argv[1]
validation_labels_file = sys.argv[2]
# Read in the 50000 synsets associated with the validation data set.
labels = [l.strip() for l in open(validation_labels_file).readlines()]
unique_labels = set(labels)
# Make all sub-directories in the validation data dir.
for label in unique_labels:
labeled_data_dir = os.path.join(data_dir, label)
# Catch error if sub-directory exists
try:
os.makedirs(labeled_data_dir)
except OSError as e:
# Raise all errors but 'EEXIST'
if e.errno != errno.EEXIST:
raise
# Move all of the image to the appropriate sub-directory.
for i in range(len(labels)):
basename = 'ILSVRC2012_val_000%.5d.JPEG' % (i + 1)
original_filename = os.path.join(data_dir, basename)
if not os.path.exists(original_filename):
print('Failed to find: %s' % original_filename)
sys.exit(-1)
new_filename = os.path.join(data_dir, labels[i], basename)
os.rename(original_filename, new_filename)

View file

@ -0,0 +1,254 @@
#!/usr/bin/python
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Process the ImageNet Challenge bounding boxes for TensorFlow model training.
This script is called as
process_bounding_boxes.py <dir> [synsets-file]
Where <dir> is a directory containing the downloaded and unpacked bounding box
data. If [synsets-file] is supplied, then only the bounding boxes whose
synsets are contained within this file are returned. Note that the
[synsets-file] file contains synset ids, one per line.
The script dumps out a CSV text file in which each line contains an entry.
n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
The entry can be read as:
<JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
The bounding box for <JPEG file name> contains two points (xmin, ymin) and
(xmax, ymax) specifying the lower-left corner and upper-right corner of a
bounding box in *relative* coordinates.
The user supplies a directory where the XML files reside. The directory
structure in the directory <dir> is assumed to look like this:
<dir>/nXXXXXXXX/nXXXXXXXX_YYYY.xml
Each XML file contains a bounding box annotation. The script:
(1) Parses the XML file and extracts the filename, label and bounding box info.
(2) The bounding box is specified in the XML files as integer (xmin, ymin) and
(xmax, ymax) *relative* to image size displayed to the human annotator. The
size of the image displayed to the human annotator is stored in the XML file
as integer (height, width).
Note that the displayed size will differ from the actual size of the image
downloaded from image-net.org. To make the bounding box annotation usable,
we convert bounding box to floating point numbers relative to displayed
height and width of the image.
Note that each XML file might contain N bounding box annotations.
Note that the points are all clamped at a range of [0.0, 1.0] because some
human annotations extend outside the range of the supplied image.
See details here: http://image-net.org/download-bboxes
(3) By default, the script outputs all valid bounding boxes. If a
[synsets-file] is supplied, only the subset of bounding boxes associated
with those synsets are outputted. Importantly, one can supply a list of
synsets in the ImageNet Challenge and output the list of bounding boxes
associated with the training images of the ILSVRC.
We use these bounding boxes to inform the random distortion of images
supplied to the network.
If you run this script successfully, you will see the following output
to stderr:
> Finished processing 544546 XML files.
> Skipped 0 XML files not in ImageNet Challenge.
> Skipped 0 bounding boxes not in ImageNet Challenge.
> Wrote 615299 bounding boxes from 544546 annotated images.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import glob
import os.path
import sys
import xml.etree.ElementTree as ET
class BoundingBox(object):
pass
def GetItem(name, root, index=0):
count = 0
for item in root.iter(name):
if count == index:
return item.text
count += 1
# Failed to find "index" occurrence of item.
return -1
def GetInt(name, root, index=0):
# In some XML annotation files, the point values are not integers, but floats.
# So we add a float function to avoid ValueError.
return int(float(GetItem(name, root, index)))
def FindNumberBoundingBoxes(root):
index = 0
while True:
if GetInt('xmin', root, index) == -1:
break
index += 1
return index
def ProcessXMLAnnotation(xml_file):
"""Process a single XML file containing a bounding box."""
# pylint: disable=broad-except
try:
tree = ET.parse(xml_file)
except Exception:
print('Failed to parse: ' + xml_file, file=sys.stderr)
return None
# pylint: enable=broad-except
root = tree.getroot()
num_boxes = FindNumberBoundingBoxes(root)
boxes = []
for index in range(num_boxes):
box = BoundingBox()
# Grab the 'index' annotation.
box.xmin = GetInt('xmin', root, index)
box.ymin = GetInt('ymin', root, index)
box.xmax = GetInt('xmax', root, index)
box.ymax = GetInt('ymax', root, index)
box.width = GetInt('width', root)
box.height = GetInt('height', root)
box.filename = GetItem('filename', root) + '.JPEG'
box.label = GetItem('name', root)
xmin = float(box.xmin) / float(box.width)
xmax = float(box.xmax) / float(box.width)
ymin = float(box.ymin) / float(box.height)
ymax = float(box.ymax) / float(box.height)
# Some images contain bounding box annotations that
# extend outside of the supplied image. See, e.g.
# n03127925/n03127925_147.xml
# Additionally, for some bounding boxes, the min > max
# or the box is entirely outside of the image.
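# Illustrative example: xmin=-10 with width=500 yields min_x=-0.02, which the
# clamping below maps to xmin_scaled=0.0.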
min_x = min(xmin, xmax)
max_x = max(xmin, xmax)
box.xmin_scaled = min(max(min_x, 0.0), 1.0)
box.xmax_scaled = min(max(max_x, 0.0), 1.0)
min_y = min(ymin, ymax)
max_y = max(ymin, ymax)
box.ymin_scaled = min(max(min_y, 0.0), 1.0)
box.ymax_scaled = min(max(max_y, 0.0), 1.0)
boxes.append(box)
return boxes
if __name__ == '__main__':
if len(sys.argv) < 2 or len(sys.argv) > 3:
print('Invalid usage\n'
'usage: process_bounding_boxes.py <dir> [synsets-file]',
file=sys.stderr)
sys.exit(-1)
xml_files = glob.glob(sys.argv[1] + '/*/*.xml')
print('Identified %d XML files in %s' % (len(xml_files), sys.argv[1]),
file=sys.stderr)
if len(sys.argv) == 3:
labels = set([l.strip() for l in open(sys.argv[2]).readlines()])
print('Identified %d synset IDs in %s' % (len(labels), sys.argv[2]),
file=sys.stderr)
else:
labels = None
skipped_boxes = 0
skipped_files = 0
saved_boxes = 0
saved_files = 0
for file_index, one_file in enumerate(xml_files):
# Example: <...>/n06470073/n00141669_6790.xml
label = os.path.basename(os.path.dirname(one_file))
# Determine if the annotation is from an ImageNet Challenge label.
if labels is not None and label not in labels:
skipped_files += 1
continue
bboxes = ProcessXMLAnnotation(one_file)
assert bboxes is not None, 'No bounding boxes found in ' + one_file
found_box = False
for bbox in bboxes:
if labels is not None:
if bbox.label != label:
# Note: There is a slight bug in the bounding box annotation data.
# Many of the dog labels have the human label 'Scottish_deerhound'
# instead of the synset ID 'n02092002' in the bbox.label field. As a
# simple hack to overcome this issue, we only exclude bbox labels
# *which are synset ID's* that do not match original synset label for
# the XML file.
if bbox.label in labels:
skipped_boxes += 1
continue
# Guard against improperly specified boxes.
if (bbox.xmin_scaled >= bbox.xmax_scaled or
bbox.ymin_scaled >= bbox.ymax_scaled):
skipped_boxes += 1
continue
# Note bbox.filename occasionally contains '%s' in the name. This is
# data set noise that is fixed by just using the basename of the XML file.
image_filename = os.path.splitext(os.path.basename(one_file))[0]
print('%s.JPEG,%.4f,%.4f,%.4f,%.4f' %
(image_filename,
bbox.xmin_scaled, bbox.ymin_scaled,
bbox.xmax_scaled, bbox.ymax_scaled))
saved_boxes += 1
found_box = True
if found_box:
saved_files += 1
else:
skipped_files += 1
if not file_index % 5000:
print('--> processed %d of %d XML files.' %
(file_index + 1, len(xml_files)),
file=sys.stderr)
print('--> skipped %d boxes and %d XML files.' %
(skipped_boxes, skipped_files), file=sys.stderr)
print('Finished processing %d XML files.' % len(xml_files), file=sys.stderr)
print('Skipped %d XML files not in ImageNet Challenge.' % skipped_files,
file=sys.stderr)
print('Skipped %d bounding boxes not in ImageNet Challenge.' % skipped_boxes,
file=sys.stderr)
print('Wrote %d bounding boxes from %d annotated images.' %
(saved_boxes, saved_files),
file=sys.stderr)
print('Finished.', file=sys.stderr)

View file

@ -42,12 +42,10 @@ if __name__ == "__main__":
log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
os.makedirs(FLAGS.results_dir, exist_ok=True)
dllogger.init(
backends=[
dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=log_path),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
]
)
dllogger.init(backends=[
dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=log_path),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
])
else:
dllogger.init(backends=[])
dllogger.log(data=vars(FLAGS), step='PARAMETER')
@ -58,49 +56,46 @@ if __name__ == "__main__":
architecture=FLAGS.arch,
input_format='NHWC',
compute_format=FLAGS.data_format,
dtype=tf.float32 if FLAGS.precision == 'fp32' else tf.float16,
dtype=tf.float32,
n_channels=3,
height=224,
width=224,
height=224 if FLAGS.data_dir else FLAGS.synthetic_data_size,
width=224 if FLAGS.data_dir else FLAGS.synthetic_data_size,
distort_colors=False,
log_dir=FLAGS.results_dir,
model_dir=FLAGS.model_dir if FLAGS.model_dir is not None else FLAGS.results_dir,
data_dir=FLAGS.data_dir,
data_idx_dir=FLAGS.data_idx_dir,
weight_init=FLAGS.weight_init,
use_xla=FLAGS.use_xla,
use_tf_amp=FLAGS.use_tf_amp,
use_dali=FLAGS.use_dali,
use_xla=FLAGS.xla,
use_tf_amp=FLAGS.amp,
use_dali=FLAGS.dali,
gpu_memory_fraction=FLAGS.gpu_memory_fraction,
gpu_id=FLAGS.gpu_id,
seed=FLAGS.seed
)
seed=FLAGS.seed)
if FLAGS.mode in ["train", "train_and_evaluate", "training_benchmark"]:
runner.train(
iter_unit=FLAGS.iter_unit,
num_iter=FLAGS.num_iter,
run_iter=FLAGS.run_iter,
batch_size=FLAGS.batch_size,
warmup_steps=FLAGS.warmup_steps,
log_every_n_steps=FLAGS.display_every,
weight_decay=FLAGS.weight_decay,
lr_init=FLAGS.lr_init,
lr_warmup_epochs=FLAGS.lr_warmup_epochs,
momentum=FLAGS.momentum,
loss_scale=FLAGS.loss_scale,
label_smoothing=FLAGS.label_smoothing,
mixup=FLAGS.mixup,
use_static_loss_scaling=FLAGS.use_static_loss_scaling,
use_cosine_lr=FLAGS.use_cosine_lr,
is_benchmark=FLAGS.mode == 'training_benchmark',
use_final_conv=FLAGS.use_final_conv,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
quant_delay = FLAGS.quant_delay,
use_qdq = FLAGS.use_qdq,
finetune_checkpoint=FLAGS.finetune_checkpoint,
)
runner.train(iter_unit=FLAGS.iter_unit,
num_iter=FLAGS.num_iter,
run_iter=FLAGS.run_iter,
batch_size=FLAGS.batch_size,
warmup_steps=FLAGS.warmup_steps,
log_every_n_steps=FLAGS.display_every,
weight_decay=FLAGS.weight_decay,
lr_init=FLAGS.lr_init,
lr_warmup_epochs=FLAGS.lr_warmup_epochs,
momentum=FLAGS.momentum,
loss_scale=FLAGS.static_loss_scale,
label_smoothing=FLAGS.label_smoothing,
mixup=FLAGS.mixup,
use_static_loss_scaling=(FLAGS.static_loss_scale != -1),
use_cosine_lr=FLAGS.cosine_lr,
is_benchmark=FLAGS.mode == 'training_benchmark',
use_final_conv=FLAGS.use_final_conv,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
quant_delay=FLAGS.quant_delay,
use_qdq=FLAGS.use_qdq,
finetune_checkpoint=FLAGS.finetune_checkpoint)
if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:
@ -109,19 +104,17 @@ if __name__ == "__main__":
elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
runner.evaluate(
iter_unit=FLAGS.iter_unit if FLAGS.mode != "train_and_evaluate" else "epoch",
num_iter=FLAGS.num_iter if FLAGS.mode != "train_and_evaluate" else 1,
warmup_steps=FLAGS.warmup_steps,
batch_size=FLAGS.batch_size,
log_every_n_steps=FLAGS.display_every,
is_benchmark=FLAGS.mode == 'inference_benchmark',
export_dir=FLAGS.export_dir,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
use_final_conv=FLAGS.use_final_conv,
use_qdq=FLAGS.use_qdq
)
runner.evaluate(iter_unit=FLAGS.iter_unit if FLAGS.mode != "train_and_evaluate" else "epoch",
num_iter=FLAGS.num_iter if FLAGS.mode != "train_and_evaluate" else 1,
warmup_steps=FLAGS.warmup_steps,
batch_size=FLAGS.batch_size,
log_every_n_steps=FLAGS.display_every,
is_benchmark=FLAGS.mode == 'inference_benchmark',
export_dir=FLAGS.export_dir,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
use_final_conv=FLAGS.use_final_conv,
use_qdq=FLAGS.use_qdq)
if FLAGS.mode == 'predict':
if FLAGS.to_predict is None:
@ -134,4 +127,8 @@ if __name__ == "__main__":
raise NotImplementedError("Only single GPU inference is implemented.")
elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
runner.predict(FLAGS.to_predict, quantize=FLAGS.quantize, symmetric=FLAGS.symmetric, use_qdq=FLAGS.use_qdq, use_final_conv=FLAGS.use_final_conv)
runner.predict(FLAGS.to_predict,
quantize=FLAGS.quantize,
symmetric=FLAGS.symmetric,
use_qdq=FLAGS.use_qdq,
use_final_conv=FLAGS.use_final_conv)

View file

@ -29,7 +29,7 @@ def conv2d(
data_format='NHWC',
dilation_rate=(1, 1),
use_bias=True,
kernel_initializer=tf.variance_scaling_initializer(),
kernel_initializer=tf.compat.v1.variance_scaling_initializer(),
bias_initializer=tf.zeros_initializer(),
trainable=True,
name=None
@ -56,6 +56,5 @@ def conv2d(
activation=None,
name=name
)
return net
return net

View file

@ -22,7 +22,7 @@ def dense(
units,
use_bias=True,
trainable=True,
kernel_initializer=tf.variance_scaling_initializer(),
kernel_initializer=tf.compat.v1.variance_scaling_initializer(),
bias_initializer=tf.zeros_initializer()
):

View file

@ -29,7 +29,7 @@ def squeeze_excitation_layer(
ratio,
training=True,
data_format='NCHW',
kernel_initializer=tf.variance_scaling_initializer(),
kernel_initializer=tf.compat.v1.variance_scaling_initializer(),
bias_initializer=tf.zeros_initializer(),
name="squeeze_excitation_layer"
):

View file

@ -15,7 +15,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import tensorflow as tf
@ -34,7 +33,6 @@ from utils.data_utils import normalized_inputs
from utils.learning_rate import learning_rate_scheduler
from utils.optimizers import FixedLossScalerOptimizer
__all__ = [
'ResnetModel',
]
@ -89,14 +87,14 @@ class ResnetModel(object):
)
self.conv2d_hparams = tf.contrib.training.HParams(
kernel_initializer=tf.variance_scaling_initializer(
kernel_initializer=tf.compat.v1.variance_scaling_initializer(
scale=2.0, distribution='truncated_normal', mode=weight_init
),
bias_initializer=tf.constant_initializer(0.0)
)
self.dense_hparams = tf.contrib.training.HParams(
kernel_initializer=tf.variance_scaling_initializer(
kernel_initializer=tf.compat.v1.variance_scaling_initializer(
scale=2.0, distribution='truncated_normal', mode=weight_init
),
bias_initializer=tf.constant_initializer(0.0)
@ -109,12 +107,13 @@ class ResnetModel(object):
print("Input_format", input_format)
print("dtype", str(dtype))
def __call__(self, features, labels, mode, params):
if mode == tf.estimator.ModeKeys.TRAIN:
mandatory_params = ["batch_size", "lr_init", "num_gpus", "steps_per_epoch",
"momentum", "weight_decay", "loss_scale", "label_smoothing"]
mandatory_params = [
"batch_size", "lr_init", "num_gpus", "steps_per_epoch", "momentum", "weight_decay", "loss_scale",
"label_smoothing"
]
for p in mandatory_params:
if p not in params:
raise RuntimeError("Parameter {} is missing.".format(p))
@ -141,43 +140,46 @@ class ResnetModel(object):
mixup = 0
eta = 0
if mode == tf.estimator.ModeKeys.TRAIN:
if mode == tf.estimator.ModeKeys.TRAIN:
eta = params['label_smoothing']
mixup = params['mixup']
if mode != tf.estimator.ModeKeys.PREDICT:
one_hot_smoothed_labels = tf.one_hot(labels, 1001,
on_value = 1 - eta + eta/1001,
off_value = eta/1001)
if mode != tf.estimator.ModeKeys.PREDICT:
n_cls = self.model_hparams.n_classes
one_hot_smoothed_labels = tf.one_hot(labels, n_cls,
on_value=1 - eta + eta / n_cls, off_value=eta / n_cls)
if mixup != 0:
print("Using mixup training with beta=", params['mixup'])
beta_distribution = tf.distributions.Beta(params['mixup'], params['mixup'])
feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])
feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])
reversed_feature_coefficients = tf.subtract(tf.ones(shape=feature_coefficients.shape), feature_coefficients)
reversed_feature_coefficients = tf.subtract(
tf.ones(shape=feature_coefficients.shape), feature_coefficients
)
rotated_features = tf.reverse(features, axis=[0])
rotated_features = tf.reverse(features, axis=[0])
features = feature_coefficients * features + reversed_feature_coefficients * rotated_features
label_coefficients = tf.squeeze(feature_coefficients, axis=[2, 3])
rotated_labels = tf.reverse(one_hot_smoothed_labels, axis=[0])
rotated_labels = tf.reverse(one_hot_smoothed_labels, axis=[0])
reversed_label_coefficients = tf.subtract(tf.ones(shape=label_coefficients.shape), label_coefficients)
reversed_label_coefficients = tf.subtract(
tf.ones(shape=label_coefficients.shape), label_coefficients
)
one_hot_smoothed_labels = label_coefficients * one_hot_smoothed_labels + reversed_label_coefficients * rotated_labels
# Update Global Step
global_step = tf.train.get_or_create_global_step()
tf.identity(global_step, name="global_step_ref")
tf.identity(features, name="features_ref")
if mode == tf.estimator.ModeKeys.TRAIN:
tf.identity(labels, name="labels_ref")
@ -202,16 +204,31 @@ class ResnetModel(object):
tf.identity(probs, name="probs_ref")
tf.identity(y_preds, name="y_preds_ref")
#if mode == tf.estimator.ModeKeys.TRAIN:
#
# assert (len(tf.trainable_variables()) == 161)
#
#else:
#
# assert (len(tf.trainable_variables()) == 0)
if mode == tf.estimator.ModeKeys.TRAIN and params['quantize']:
dllogger.log(data={"QUANTIZATION AWARE TRAINING ENABLED": True}, step=tuple())
if params['symmetric']:
dllogger.log(data={"MODE":"USING SYMMETRIC MODE"}, step=tuple())
tf.contrib.quantize.experimental_create_training_graph(tf.get_default_graph(), symmetric=True, use_qdq=params['use_qdq'] ,quant_delay=params['quant_delay'])
dllogger.log(data={"MODE": "USING SYMMETRIC MODE"}, step=tuple())
tf.contrib.quantize.experimental_create_training_graph(
tf.get_default_graph(),
symmetric=True,
use_qdq=params['use_qdq'],
quant_delay=params['quant_delay']
)
else:
dllogger.log(data={"MODE":"USING ASSYMETRIC MODE"}, step=tuple())
tf.contrib.quantize.create_training_graph(tf.get_default_graph(), quant_delay=params['quant_delay'], use_qdq=params['use_qdq'])
# Fix for restoring variables during fine-tuning of Resnet-50
dllogger.log(data={"MODE": "USING ASSYMETRIC MODE"}, step=tuple())
tf.contrib.quantize.create_training_graph(
tf.get_default_graph(), quant_delay=params['quant_delay'], use_qdq=params['use_qdq']
)
# Fix for restoring variables during fine-tuning of Resnet
if 'finetune_checkpoint' in params.keys():
train_vars = tf.trainable_variables()
train_var_dict = {}
@ -220,6 +237,13 @@ class ResnetModel(object):
dllogger.log(data={"Restoring variables from checkpoint": params['finetune_checkpoint']}, step=tuple())
tf.train.init_from_checkpoint(params['finetune_checkpoint'], train_var_dict)
with tf.device("/cpu:0"):
if hvd_utils.is_using_hvd():
sync_var = tf.Variable(initial_value=[0], dtype=tf.int32, name="signal_handler_var")
sync_var_assing = sync_var.assign([1], name="signal_handler_var_set")
sync_var_reset = sync_var.assign([0], name="signal_handler_var_reset")
sync_op = hvd.allreduce(sync_var, op=hvd.Sum, name="signal_handler_all_reduce")
if mode == tf.estimator.ModeKeys.PREDICT:
predictions = {'classes': y_preds, 'probabilities': probs}
@ -239,8 +263,12 @@ class ResnetModel(object):
acc_top5 = tf.nn.in_top_k(predictions=logits, targets=labels, k=5)
else:
acc_top1, acc_top1_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=1))
acc_top5, acc_top5_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=5))
acc_top1, acc_top1_update_op = tf.metrics.mean(
tf.nn.in_top_k(predictions=logits, targets=labels, k=1)
)
acc_top5, acc_top5_update_op = tf.metrics.mean(
tf.nn.in_top_k(predictions=logits, targets=labels, k=5)
)
tf.identity(acc_top1, name="acc_top1_ref")
tf.identity(acc_top5, name="acc_top5_ref")
@ -251,20 +279,21 @@ class ResnetModel(object):
'accuracy_top1': acc_top1,
'accuracy_top5': acc_top5
}
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=one_hot_smoothed_labels)
cross_entropy = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=one_hot_smoothed_labels)
assert (cross_entropy.dtype == tf.float32)
tf.identity(cross_entropy, name='cross_entropy_loss_ref')
def loss_filter_fn(name):
"""we don't need to compute L2 loss for BN and bias (eq. to add a cste)"""
return all([
tensor_name not in name.lower()
# for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"]
for tensor_name in ["batchnorm", "batch_norm", "batch_normalization"]
])
return all(
[
tensor_name not in name.lower()
# for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"]
for tensor_name in ["batchnorm", "batch_norm", "batch_normalization"]
]
)
filtered_params = [tf.cast(v, tf.float32) for v in tf.trainable_variables() if loss_filter_fn(v.name)]
@ -287,7 +316,7 @@ class ResnetModel(object):
tf.summary.scalar('cross_entropy', cross_entropy)
tf.summary.scalar('l2_loss', l2_loss)
tf.summary.scalar('total_loss', total_loss)
if mode == tf.estimator.ModeKeys.TRAIN:
with tf.device("/cpu:0"):
@ -317,17 +346,18 @@ class ResnetModel(object):
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if mode != tf.estimator.ModeKeys.TRAIN:
update_ops += [acc_top1_update_op, acc_top5_update_op]
deterministic = True
gate_gradients = (tf.train.Optimizer.GATE_OP if deterministic else tf.train.Optimizer.GATE_NONE)
gate_gradients = (tf.compat.v1.train.Optimizer.GATE_OP if deterministic else tf.compat.v1.train.Optimizer.GATE_NONE)
backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)
if self.model_hparams.use_dali:
train_ops = tf.group(backprop_op, update_ops, name='train_ops')
else:
train_ops = tf.group(backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops')
train_ops = tf.group(
backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops'
)
return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_ops)
@ -338,23 +368,18 @@ class ResnetModel(object):
}
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=total_loss,
eval_metric_ops=eval_metrics
mode=mode, predictions=predictions, loss=total_loss, eval_metric_ops=eval_metrics
)
else:
raise NotImplementedError('Unknown mode {}'.format(mode))
@staticmethod
def _stage(tensors):
"""Stages the given tensors in a StagingArea for asynchronous put/get.
"""
stage_area = tf.contrib.staging.StagingArea(
dtypes=[tensor.dtype for tensor in tensors],
shapes=[tensor.get_shape() for tensor in tensors]
dtypes=[tensor.dtype for tensor in tensors], shapes=[tensor.get_shape() for tensor in tensors]
)
put_op = stage_area.put(tensors)
@ -364,14 +389,11 @@ class ResnetModel(object):
return put_op, get_tensors
def build_model(self, inputs, training=True, reuse=False, use_final_conv=False):
with var_storage.model_variable_scope(
self.model_hparams.model_name,
reuse=reuse,
dtype=self.model_hparams.dtype):
self.model_hparams.model_name, reuse=reuse, dtype=self.model_hparams.dtype
):
with tf.variable_scope("input_reshape"):
if self.model_hparams.input_format == 'NHWC' and self.model_hparams.compute_format == 'NCHW':
@ -426,27 +448,29 @@ class ResnetModel(object):
batch_norm_hparams=self.batch_norm_hparams,
block_name="btlnck_block_%d_%d" % (block_id, layer_id),
use_se=self.model_hparams.use_se,
ratio=self.model_hparams.se_ratio)
ratio=self.model_hparams.se_ratio
)
with tf.variable_scope("output"):
net = layers.reduce_mean(
net, keepdims=use_final_conv, data_format=self.model_hparams.compute_format, name='spatial_mean')
net, keepdims=False, data_format=self.model_hparams.compute_format, name='spatial_mean'
)
if use_final_conv:
logits = layers.conv2d(
net,
n_channels=self.model_hparams.n_classes,
kernel_size=(1, 1),
strides=(1, 1),
padding='SAME',
data_format=self.model_hparams.compute_format,
dilation_rate=(1, 1),
use_bias=True,
kernel_initializer=self.dense_hparams.kernel_initializer,
bias_initializer=self.dense_hparams.bias_initializer,
trainable=training,
name='dense'
)
net,
n_channels=self.model_hparams.n_classes,
kernel_size=(1, 1),
strides=(1, 1),
padding='SAME',
data_format=self.model_hparams.compute_format,
dilation_rate=(1, 1),
use_bias=True,
kernel_initializer=self.dense_hparams.kernel_initializer,
bias_initializer=self.dense_hparams.bias_initializer,
trainable=training,
name='dense'
)
else:
logits = layers.dense(
inputs=net,
@ -454,7 +478,8 @@ class ResnetModel(object):
use_bias=True,
trainable=training,
kernel_initializer=self.dense_hparams.kernel_initializer,
bias_initializer=self.dense_hparams.bias_initializer)
bias_initializer=self.dense_hparams.bias_initializer
)
if logits.dtype != tf.float32:
logits = tf.cast(logits, tf.float32)
@ -464,27 +489,25 @@ class ResnetModel(object):
return probs, logits
model_architectures = {
'resnet50': {
'layers': [3, 4, 6, 3],
'widths': [64, 128, 256, 512],
'expansions': 4,
},
'resnext101-32x4d': {
'layers': [3, 4, 23, 3],
'widths': [128, 256, 512, 1024],
'expansions': 2,
'cardinality': 32,
},
'se-resnext101-32x4d' : {
'cardinality' : 32,
'layers' : [3, 4, 23, 3],
'widths' : [128, 256, 512, 1024],
'expansions' : 2,
'se-resnext101-32x4d': {
'cardinality': 32,
'layers': [3, 4, 23, 3],
'widths': [128, 256, 512, 1024],
'expansions': 2,
'use_se': True,
'se_ratio': 16,
},
}

View file

@ -71,4 +71,4 @@ if __name__=='__main__':
file.write("model_checkpoint_path: "+ "\"" + new_ckpt + "\"")
# Process the input checkpoint, apply transforms and generate a new checkpoint.
process_checkpoint(input_ckpt, new_ckpt_path, args.dense_layer)
process_checkpoint(input_ckpt, new_ckpt_path, args.dense_layer)

View file

@ -244,16 +244,16 @@ For example, to train on DGX-1 for 90 epochs using AMP, run:
Additionally, features like DALI data preprocessing or TensorFlow XLA can be enabled with
following arguments when running those scripts:
`bash ./resnet50v1.5/training/DGX1_RN50_AMP_90E.sh /path/to/result /data --use_xla --use_dali`
`bash ./resnet50v1.5/training/DGX1_RN50_AMP_90E.sh /path/to/result /data --xla --dali`
7. Start validation/evaluation.
To evaluate the validation dataset located in `/data/tfrecords`, run `main.py` with
`--mode=evaluate`. For example:
`python main.py --mode=evaluate --data_dir=/data/tfrecords --batch_size <batch size> --model_dir
<model location> --results_dir <output location> [--use_xla] [--use_tf_amp]`
<model location> --results_dir <output location> [--xla] [--amp]`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during evaluation.
The optional `--xla` and `--amp` flags control XLA and AMP during evaluation.
## Advanced
@ -292,99 +292,116 @@ The `runtime/` directory contains the following module that define the mechanics
The script for training and evaluating the ResNet-50 v1.5 model has a variety of parameters that control these processes.
```
usage: main.py [-h]
[--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
usage: main.py [-h] [--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
[--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}]
[--data_dir DATA_DIR] [--data_idx_dir DATA_IDX_DIR]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
[--batch_size BATCH_SIZE] [--num_iter NUM_ITER]
[--iter_unit {epoch,batch}] [--warmup_steps WARMUP_STEPS]
[--model_dir MODEL_DIR] [--results_dir RESULTS_DIR]
[--log_filename LOG_FILENAME] [--display_every DISPLAY_EVERY]
[--lr_init LR_INIT] [--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--loss_scale LOSS_SCALE]
[--label_smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--use_static_loss_scaling | --nouse_static_loss_scaling]
[--use_xla | --nouse_xla] [--use_dali | --nouse_dali]
[--use_tf_amp | --nouse_tf_amp]
[--use_cosine_lr | --nouse_cosine_lr] [--seed SEED]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
--batch_size BATCH_SIZE [--num_iter NUM_ITER]
[--run_iter RUN_ITER] [--iter_unit {epoch,batch}]
[--warmup_steps WARMUP_STEPS] [--model_dir MODEL_DIR]
[--results_dir RESULTS_DIR] [--log_filename LOG_FILENAME]
[--display_every DISPLAY_EVERY] [--seed SEED]
[--gpu_memory_fraction GPU_MEMORY_FRACTION] [--gpu_id GPU_ID]
JoC-RN50v1.5-TF
optional arguments:
-h, --help Show this help message and exit
[--finetune_checkpoint FINETUNE_CHECKPOINT] [--use_final_conv]
[--quant_delay QUANT_DELAY] [--quantize] [--use_qdq]
[--symmetric] [--data_dir DATA_DIR]
[--data_idx_dir DATA_IDX_DIR] [--dali]
[--synthetic_data_size SYNTHETIC_DATA_SIZE] [--lr_init LR_INIT]
[--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--label_smoothing LABEL_SMOOTHING]
[--mixup MIXUP] [--cosine_lr] [--xla]
[--data_format {NHWC,NCHW}] [--amp]
[--static_loss_scale STATIC_LOSS_SCALE]
JoC-RN50v1.5-TF
optional arguments:
-h, --help show this help message and exit.
--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}
Architecture of model to run (default is resnet50)
Architecture of model to run.
--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}
The execution mode of the script.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--run_iter RUN_ITER Number of training iterations to run on single run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write model. If undefined,
results dir will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by training script for DALI.
--gpu_id GPU_ID Specify ID of the target GPU on multi-device platform.
Effective only for single-GPU mode.
--finetune_checkpoint FINETUNE_CHECKPOINT
Path to pre-trained checkpoint which will be used for
fine-tuning.
--use_final_conv Use convolution operator instead of MLP as last layer.
--quant_delay QUANT_DELAY
Number of steps to be run before quantization starts
to happen.
--quantize Quantize weights and activations during training.
(Defaults to Asymmetric quantization)
--use_qdq Use QDQV3 op instead of FakeQuantWithMinMaxVars op for
quantization. QDQv3 does only scaling.
--symmetric Quantize weights and activations during training using
symmetric quantization.
Dataset arguments:
--data_dir DATA_DIR Path to dataset in TFRecord format. Files should be
named 'train-*' and 'validation-*'.
--data_idx_dir DATA_IDX_DIR
Path to index files for DALI. Files should be named
'train-*' and 'validation-*'.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write the model. If undefined,
results directory will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which the training log will be written.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--dali Enable DALI data input.
--synthetic_data_size SYNTHETIC_DATA_SIZE
Dimension of image for synthetic dataset.
Training arguments:
--lr_init LR_INIT Initial value for the learning rate.
--lr_warmup_epochs LR_WARMUP_EPOCHS
Number of warmup epochs for the learning rate schedule.
Number of warmup epochs for learning rate schedule.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--momentum MOMENTUM SGD momentum value for the momentum optimizer.
--loss_scale LOSS_SCALE
Loss scale for FP16 training and fast math FP32.
--momentum MOMENTUM SGD momentum value for the Momentum optimizer.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--mixup MIXUP The alpha parameter for mixup (if 0 then mixup is not
applied).
--use_static_loss_scaling
Use static loss scaling in FP16 or FP32 AMP.
--nouse_static_loss_scaling
--use_xla Enable XLA (Accelerated Linear Algebra) computation
--cosine_lr Use cosine learning rate schedule.
Generic optimization arguments:
--xla Enable XLA (Accelerated Linear Algebra) computation
for improved performance.
--nouse_xla
--use_dali Enable DALI data input.
--nouse_dali
--use_tf_amp Enable AMP to speedup FP32
computation using Tensor Cores.
--nouse_tf_amp
--use_cosine_lr Use cosine learning rate schedule.
--nouse_cosine_lr
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by the training script for DALI
--gpu_id GPU_ID Specify the ID of the target GPU on a multi-device platform.
Effective only for single-GPU mode.
--quantize Used to add quantization nodes in the graph (Default: Asymmetric quantization)
--symmetric If --quantize mode is used, this option enables symmetric quantization
--use_qdq Use quantize_and_dequantize (QDQ) op instead of FakeQuantWithMinMaxVars op for quantization. QDQ does only scaling.
--finetune_checkpoint Path to pre-trained checkpoint which can be used for fine-tuning
--quant_delay Number of steps to be run before quantization starts to happen
--data_format {NHWC,NCHW}
Data format used to do calculations.
--amp Enable Automatic Mixed Precision to speedup
computation using tensor cores.
Automatic Mixed Precision arguments:
--static_loss_scale STATIC_LOSS_SCALE
Use static loss scaling in FP32 AMP.
```
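For reference, a typical single-GPU mixed-precision training invocation with the renamed flags might look like the following (paths and hyperparameter values are illustrative only):

`python main.py --mode=train_and_evaluate --arch=resnet50 --iter_unit=epoch --num_iter=90 --batch_size=256 --amp --xla --cosine_lr --label_smoothing 0.1 --data_dir=/data/tfrecords --results_dir=/results`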
### Quantization Aware Training
@ -424,12 +441,13 @@ Arguments:
* `--input_format` : Data format of input tensor (Default: NCHW). Use NCHW format to optimize the graph with TensorRT.
* `--compute_format` : Data format of the operations in the network (Default: NCHW). Use NCHW format to optimize the graph with TensorRT.
### Inference process
To run inference on a single example with a checkpoint and a model script, use:
`python main.py --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during inference.
The optional `--xla` and `--amp` flags control XLA and AMP during inference.
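For instance, inference with both features enabled might be invoked as (paths are placeholders):

`python main.py --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results> --amp --xla`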
## Performance
@ -448,7 +466,7 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`python ./main.py --mode=training_benchmark --use_tf_amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --mode=training_benchmark --amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* For multiple GPUs
* FP32 / TF32
@ -457,16 +475,18 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --mode=training_benchmark --use_tf_amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --mode=training_benchmark --amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
with `--xla` and `--dali` flags. For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
Suggested batch sizes for training are 256 for mixed precision training and 128 for single precision training per single V100 16 GB.
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset. The resolution of synthetic images used can be controlled with `--synthetic_data_size` flag.
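For example, a synthetic-data training benchmark with AMP and XLA could be launched as follows (batch size and image resolution are illustrative):

`python ./main.py --mode=training_benchmark --amp --xla --warmup_steps 200 --num_iter 500 --iter_unit batch --batch_size 256 --synthetic_data_size 224 --results_dir=<path to results directory>`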
#### Inference performance benchmark
To benchmark the inference performance on a specific batch size, run:
@ -477,11 +497,10 @@ To benchmark the inference performance on a specific batch size, run:
* AMP
`python ./main.py --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --mode=inference_benchmark --amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
For proper throughput and latency reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnet50v1.5`, by simply running:
@ -490,6 +509,9 @@ The benchmark can be automated with the `inference_benchmark.sh` script provided
The `<data dir>` parameter refers to the input data directory (by default `/data/tfrecords` inside the container).
By default, the benchmark tests the following configurations: **FP32**, **AMP**, **AMP + XLA** with different batch sizes.
When the optional directory with the DALI index files `<data idx dir>` is specified, the benchmark executes an additional **DALI + AMP + XLA** configuration.
For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
For a performance benchmark of the raw model, a synthetic dataset can be used. To use it, pass the `--synthetic_data_size` flag instead of `--data_dir` to specify the input image size.
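As an illustration, an inference benchmark on synthetic data could be run as (values are examples only):

`python ./main.py --mode=inference_benchmark --amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --synthetic_data_size 224 --results_dir=<path to results directory>`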
### Results
@ -568,17 +590,6 @@ on NVIDIA DGX A100 with (8x A100 40G) GPUs.
| 8 | ~2h | ~5h |
##### Training time: NVIDIA DGX A100 (8x A100 40GB)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-a100-8x-a100-40g)
on NVIDIA DGX A100 with (8x A100 40G) GPUs.
| GPUs | Time to train - mixed precision + XLA | Time to train - mixed precision | Time to train - TF32 + XLA | Time to train - TF32 |
|---|--------|---------|---------|-------|
| 1 | ~18h | ~19.5h | ~40h | ~47h |
| 8 | ~2h | ~2.5h | ~5h | ~6h |
##### Training time: NVIDIA DGX-1 (8x V100 16G)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-1-8x-v100-16g)
@ -821,22 +832,25 @@ on NVIDIA T4 with (1x T4 16G) GPU.
* Added benchmark results for DGX-2 and XLA-enabled DGX-1 and DGX-2.
3. July, 2019
* Added Cosine learning rate schedule
3. August, 2019
4. August, 2019
* Added mixup regularization
* Added T4 benchmarks
* Improved inference capabilities
* Added SavedModel export
4. January, 2020
5. January, 2020
* Removed manual checks for dataset paths to facilitate cloud storage solutions
* Move to a new logging solution
* Bump base docker image version
5. March, 2020
6. March, 2020
* Code cleanup and refactor
* Improved training process
6. June, 2020
7. June, 2020
* Added benchmark results for DGX-A100
* Updated benchmark results for DGX-1, DGX-2 and T4
* Updated base docker image version
8. August, 2020
* Updated command line argument names
* Added support for synthetic dataset with different image size
### Known issues
Performance without XLA enabled is low. We recommend using XLA.
Performance without XLA enabled is low due to a BN + ReLU fusion bug.

View file

@ -22,12 +22,12 @@ function test_configuration() {
}
test_configuration "FP32 nodali noxla"
test_configuration "FP32 nodali xla" "--use_xla"
test_configuration "FP16 nodali noxla" "--use_tf_amp"
test_configuration "FP16 nodali xla" "--use_tf_amp --use_xla"
test_configuration "FP32 nodali xla" "--xla"
test_configuration "FP16 nodali noxla" "--amp"
test_configuration "FP16 nodali xla" "--amp --xla"
if [ ! -z $DALI_DIR ]; then
test_configuration "FP16 dali xla" "--use_tf_amp --use_xla --use_dali --data_idx_dir ${DALI_DIR}"
test_configuration "FP16 dali xla" "--amp --xla --dali --data_idx_dir ${DALI_DIR}"
fi
cat $INFERENCE_BENCHMARK

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnet50 \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -1,20 +0,0 @@
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script does Quantization aware training of Resnet-50 by finetuning on the pre-trained model using 1 GPU and a batch size of 32.
# Usage ./GPU1_RN50_QAT.sh <path to the pre-trained model> <path to dataset> <path to results directory>
python main.py --mode=train_and_evaluate --batch_size=32 --lr_warmup_epochs=1 --quantize --symmetric --use_qdq --label_smoothing 0.1 --lr_init=0.00005 --momentum=0.875 --weight_decay=3.0517578125e-05 --finetune_checkpoint=$1 --data_dir=$2 --results_dir=$3 --num_iter 10 --data_format NHWC
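The quantization flags used by this removed helper (`--quantize`, `--symmetric`, `--use_qdq`, `--quant_delay`) remain available in `main.py`. For orientation only, a sketch of the standard TF1 graph rewrite these flags typically correspond to, under the assumption that `tf.contrib.quantize` is the underlying mechanism (the repository's actual hook lives in its model/runner code):
```
import tensorflow as tf

def add_quantization_nodes(quant_delay=0, symmetric=True):
    # Rewrites the default graph in place, inserting fake-quantization ops that
    # become active after `quant_delay` steps of ordinary float training.
    tf.contrib.quantize.experimental_create_training_graph(
        input_graph=tf.get_default_graph(),
        symmetric=symmetric,
        quant_delay=quant_delay)
```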

View file

@ -26,13 +26,13 @@ function run_benchmark() {
MODE_SIZE=$2
if [[ $4 -eq "1" ]]; then
XLA="--use_xla"
XLA="--xla"
else
XLA=""
fi
case $2 in
"amp") MODE_FLAGS="--use_tf_amp --use_static_loss_scaling --loss_scale=128";;
"amp") MODE_FLAGS="--amp --static_loss_scale 128";;
"fp32"|"tf32") MODE_FLAGS="";;
*) echo "Unsupported configuration, use amp, tf32 or fp32";;
esac

View file

@ -251,16 +251,16 @@ For example, to train on DGX-1 for 90 epochs using AMP, run:
Additionally, features like DALI data preprocessing or TensorFlow XLA can be enabled with
following arguments when running those scripts:
`bash ./resnext101-32x4d/training/DGX1_RNxt101-32x4d_AMP_90E.sh /path/to/result /data --use_xla --use_dali`
`bash ./resnext101-32x4d/training/DGX1_RNxt101-32x4d_AMP_90E.sh /path/to/result /data --xla --dali`
7. Start validation/evaluation.
To evaluate the validation dataset located in `/data/tfrecords`, run `main.py` with
`--mode=evaluate`. For example:
`python main.py --arch=resnext101-32x4d --mode=evaluate --data_dir=/data/tfrecords --batch_size <batch size> --model_dir
<model location> --results_dir <output location> [--use_xla] [--use_tf_amp]`
<model location> --results_dir <output location> [--xla] [--amp]`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during evaluation.
The optional `--xla` and `--amp` flags control XLA and AMP during evaluation.
## Advanced
@ -299,95 +299,116 @@ The `runtime/` directory contains the following module that define the mechanics
The script for training and evaluating the ResNext101-32x4d model has a variety of parameters that control these processes.
```
usage: main.py [-h]
[--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
usage: main.py [-h] [--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
[--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}]
[--data_dir DATA_DIR] [--data_idx_dir DATA_IDX_DIR]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
[--batch_size BATCH_SIZE] [--num_iter NUM_ITER]
[--iter_unit {epoch,batch}] [--warmup_steps WARMUP_STEPS]
[--model_dir MODEL_DIR] [--results_dir RESULTS_DIR]
[--log_filename LOG_FILENAME] [--display_every DISPLAY_EVERY]
[--lr_init LR_INIT] [--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--loss_scale LOSS_SCALE]
[--label_smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--use_static_loss_scaling | --nouse_static_loss_scaling]
[--use_xla | --nouse_xla] [--use_dali | --nouse_dali]
[--use_tf_amp | --nouse_tf_amp]
[--use_cosine_lr | --nouse_cosine_lr] [--seed SEED]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
--batch_size BATCH_SIZE [--num_iter NUM_ITER]
[--run_iter RUN_ITER] [--iter_unit {epoch,batch}]
[--warmup_steps WARMUP_STEPS] [--model_dir MODEL_DIR]
[--results_dir RESULTS_DIR] [--log_filename LOG_FILENAME]
[--display_every DISPLAY_EVERY] [--seed SEED]
[--gpu_memory_fraction GPU_MEMORY_FRACTION] [--gpu_id GPU_ID]
JoC-RN50v1.5-TF
optional arguments:
-h, --help Show this help message and exit
[--finetune_checkpoint FINETUNE_CHECKPOINT] [--use_final_conv]
[--quant_delay QUANT_DELAY] [--quantize] [--use_qdq]
[--symmetric] [--data_dir DATA_DIR]
[--data_idx_dir DATA_IDX_DIR] [--dali]
[--synthetic_data_size SYNTHETIC_DATA_SIZE] [--lr_init LR_INIT]
[--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--label_smoothing LABEL_SMOOTHING]
[--mixup MIXUP] [--cosine_lr] [--xla]
[--data_format {NHWC,NCHW}] [--amp]
[--static_loss_scale STATIC_LOSS_SCALE]
JoC-RN50v1.5-TF
optional arguments:
-h, --help show this help message and exit.
--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}
Architecture of model to run (to run Resnext-32x4d set
--arch=resnext101-32x4d)
Architecture of model to run.
--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}
The execution mode of the script.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--run_iter RUN_ITER Number of training iterations to run on single run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write model. If undefined,
results dir will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by training script for DALI.
--gpu_id GPU_ID Specify ID of the target GPU on multi-device platform.
Effective only for single-GPU mode.
--finetune_checkpoint FINETUNE_CHECKPOINT
Path to pre-trained checkpoint which will be used for
fine-tuning.
--use_final_conv Use convolution operator instead of MLP as last layer.
--quant_delay QUANT_DELAY
Number of steps to be run before quantization starts
to happen.
--quantize Quantize weights and activations during training.
(Defaults to asymmetric quantization)
--use_qdq Use QDQV3 op instead of FakeQuantWithMinMaxVars op for
quantization. QDQv3 does only scaling.
--symmetric Quantize weights and activations during training using
symmetric quantization.
Dataset arguments:
--data_dir DATA_DIR Path to dataset in TFRecord format. Files should be
named 'train-*' and 'validation-*'.
--data_idx_dir DATA_IDX_DIR
Path to index files for DALI. Files should be named
'train-*' and 'validation-*'.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write the model. If undefined,
results directory will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--dali Enable DALI data input.
--synthetic_data_size SYNTHETIC_DATA_SIZE
Dimension of image for synthetic dataset.
Training arguments:
--lr_init LR_INIT Initial value for the learning rate.
--lr_warmup_epochs LR_WARMUP_EPOCHS
Number of warmup epochs for the learning rate schedule.
Number of warmup epochs for learning rate schedule.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--momentum MOMENTUM SGD momentum value for the momentum optimizer.
--loss_scale LOSS_SCALE
Loss scale for FP16 training and fast math FP32.
--momentum MOMENTUM SGD momentum value for the Momentum optimizer.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--mixup MIXUP The alpha parameter for mixup (if 0 then mixup is not
applied).
--use_static_loss_scaling
Use static loss scaling in FP16 or FP32 AMP.
--nouse_static_loss_scaling
--use_xla Enable XLA (Accelerated Linear Algebra) computation
--cosine_lr Use cosine learning rate schedule.
Generic optimization arguments:
--xla Enable XLA (Accelerated Linear Algebra) computation
for improved performance.
--nouse_xla
--use_dali Enable DALI data input.
--nouse_dali
--use_tf_amp Enable AMP to speedup FP32
computation using Tensor Cores.
--nouse_tf_amp
--use_cosine_lr Use cosine learning rate schedule.
--nouse_cosine_lr
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by the training script for DALI
--gpu_id GPU_ID Specify the ID of the target GPU on a multi-device platform.
Effective only for single-GPU mode.
--data_format {NHWC,NCHW}
Data format used to do calculations.
--amp Enable Automatic Mixed Precision to speedup
computation using tensor cores.
Automatic Mixed Precision arguments:
--static_loss_scale STATIC_LOSS_SCALE
Use static loss scaling in FP32 AMP.
```
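The grouped help output above suggests the flags are registered in named argparse argument groups. A minimal, illustrative sketch of that layout (names, defaults and the selection of flags are assumptions, not copied from `main.py`):
```
import argparse

parser = argparse.ArgumentParser(description="JoC-RN50v1.5-TF")
parser.add_argument("--arch", choices=["resnet50", "resnext101-32x4d", "se-resnext101-32x4d"],
                    default="resnet50", help="Architecture of model to run.")
parser.add_argument("--batch_size", type=int, required=True,
                    help="Size of each minibatch per GPU.")

data_args = parser.add_argument_group("Dataset arguments")
data_args.add_argument("--data_dir", help="Path to dataset in TFRecord format.")
data_args.add_argument("--dali", action="store_true", help="Enable DALI data input.")
data_args.add_argument("--synthetic_data_size", type=int, default=224,
                       help="Dimension of image for synthetic dataset.")

train_args = parser.add_argument_group("Training arguments")
train_args.add_argument("--cosine_lr", action="store_true",
                        help="Use cosine learning rate schedule.")

opt_args = parser.add_argument_group("Generic optimization arguments")
opt_args.add_argument("--xla", action="store_true", help="Enable XLA computation.")
opt_args.add_argument("--amp", action="store_true",
                      help="Enable Automatic Mixed Precision.")

amp_args = parser.add_argument_group("Automatic Mixed Precision arguments")
amp_args.add_argument("--static_loss_scale", type=float, default=1.0,
                      help="Use static loss scaling in FP32 AMP.")

args = parser.parse_args(["--batch_size", "128", "--amp", "--xla"])
```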
### Inference process
@ -395,7 +416,7 @@ To run inference on a single example with a checkpoint and a model script, use:
`python main.py --arch=resnext101-32x4d --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during inference.
The optional `--xla` and `--amp` flags control XLA and AMP during inference.
## Performance
@ -414,7 +435,7 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --use_tf_amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* For multiple GPUs
* FP32 / TF32
@ -423,16 +444,16 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --use_tf_amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
with the `--xla` and `--dali` flags. For proper throughput reporting, the value of `--num_iter` must be greater than the `--warmup_steps` value.
Suggested batch sizes for training are 128 for mixed precision training and 64 for single precision training per single V100 16 GB.
If no `--data_dir=<path to imagenet>` flag is specified, the benchmarks will use a synthetic dataset. The resolution of the synthetic images can be controlled with the `--synthetic_data_size` flag.
#### Inference performance benchmark
@ -444,11 +465,10 @@ To benchmark the inference performance on a specific batch size, run:
* AMP
`python ./main.py --arch=resnext101-32x4d --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --arch=resnext101-32x4d --mode=inference_benchmark --amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
For proper throughput and latency reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnext101-32x4d`, by simply running:
@ -457,6 +477,9 @@ The benchmark can be automated with the `inference_benchmark.sh` script provided
The `<data dir>` parameter refers to the input data directory (by default `/data/tfrecords` inside the container).
By default, the benchmark tests the following configurations: **FP32**, **AMP**, **AMP + XLA** with different batch sizes.
When the optional directory with the DALI index files `<data idx dir>` is specified, the benchmark executes an additional **DALI + AMP + XLA** configuration.
For proper throughput reporting, the value of `--num_iter` must be greater than the `--warmup_steps` value.
For a performance benchmark of the raw model, a synthetic dataset can be used. To use it, pass the `--synthetic_data_size` flag instead of `--data_dir` to specify the input image size.
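As an illustration (paths, batch size and image size below are placeholders), the same synthetic-data benchmark can be launched from Python:
```
# Hypothetical wrapper around the documented benchmark command; no --data_dir
# is passed, so the benchmark falls back to synthetic 224x224 images.
import subprocess

cmd = [
    "python", "./main.py",
    "--arch=resnext101-32x4d",
    "--mode=inference_benchmark",
    "--amp", "--xla",
    "--warmup_steps", "20",
    "--num_iter", "100", "--iter_unit", "batch",
    "--batch_size", "128",
    "--synthetic_data_size", "224",
    "--results_dir", "/tmp/results",
]
subprocess.run(cmd, check=True)
```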
### Results
@ -769,6 +792,9 @@ on NVIDIA T4 with (1x T4 16G) GPU.
June 2020
- Initial release
August 2020
- Updated command line argument names
- Added support for a synthetic dataset with a configurable image size
### Known issues
Performance without XLA enabled is low. We recommend using XLA.
Performance without XLA enabled is low due to a BN + ReLU fusion bug.
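This corresponds to the grappler remapping workaround that the commit drops from `runtime/runner.py` (visible in the session-config hunk further down in this diff). Should the workaround need to be restored locally, it looked roughly like this:
```
# Sketch of the removed workaround: turn off grappler's remapping pass so the
# problematic BN + ReLU fusion is never generated, trading some performance.
import tensorflow as tf
from tensorflow.core.protobuf import rewriter_config_pb2

config = tf.compat.v1.ConfigProto()
config.graph_options.rewrite_options.remapping = rewriter_config_pb2.RewriterConfig.OFF
sess = tf.compat.v1.Session(config=config)
```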

View file

@ -22,12 +22,12 @@ function test_configuration() {
}
test_configuration "FP32 nodali noxla"
test_configuration "FP32 nodali xla" "--use_xla"
test_configuration "FP16 nodali noxla" "--use_tf_amp"
test_configuration "FP16 nodali xla" "--use_tf_amp --use_xla"
test_configuration "FP32 nodali xla" "--xla"
test_configuration "FP16 nodali noxla" "--amp"
test_configuration "FP16 nodali xla" "--amp --xla"
if [ ! -z $DALI_DIR ]; then
test_configuration "FP16 dali xla" "--use_tf_amp --use_xla --use_dali --data_idx_dir ${DALI_DIR}"
test_configuration "FP16 dali xla" "--amp --xla --dali --data_idx_dir ${DALI_DIR}"
fi
cat $INFERENCE_BENCHMARK

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -26,13 +26,13 @@ function run_benchmark() {
MODE_SIZE=$2
if [[ $4 -eq "1" ]]; then
XLA="--use_xla"
XLA="--xla"
else
XLA=""
fi
case $2 in
"amp") MODE_FLAGS="--use_tf_amp --use_static_loss_scaling --loss_scale=128";;
"amp") MODE_FLAGS="--amp --static_loss_scale 128";;
"fp32"|"tf32") MODE_FLAGS="";;
*) echo "Unsupported configuration, use amp, tf32 or fp32";;
esac

View file

@ -39,36 +39,34 @@ __all__ = [
class Runner(object):
def __init__(
self,
# ========= Model HParams ========= #
n_classes=1001,
architecture='resnet50',
input_format='NHWC', # NCHW or NHWC
compute_format='NCHW', # NCHW or NHWC
dtype=tf.float32, # tf.float32 or tf.float16
n_channels=3,
height=224,
width=224,
distort_colors=False,
model_dir=None,
log_dir=None,
data_dir=None,
data_idx_dir=None,
weight_init="fan_out",
self,
# ========= Model HParams ========= #
n_classes=1001,
architecture='resnet50',
input_format='NHWC', # NCHW or NHWC
compute_format='NCHW', # NCHW or NHWC
dtype=tf.float32, # tf.float32 or tf.float16
n_channels=3,
height=224,
width=224,
distort_colors=False,
model_dir=None,
log_dir=None,
data_dir=None,
data_idx_dir=None,
weight_init="fan_out",
# ======= Optimization HParams ======== #
use_xla=False,
use_tf_amp=False,
use_dali=False,
gpu_memory_fraction=1.0,
gpu_id=0,
# ======= Optimization HParams ======== #
use_xla=False,
use_tf_amp=False,
use_dali=False,
gpu_memory_fraction=1.0,
gpu_id=0,
# ======== Debug Flags ======== #
debug_verbosity=0,
seed=None
):
# ======== Debug Flags ======== #
debug_verbosity=0,
seed=None):
if dtype not in [tf.float32, tf.float16]:
raise ValueError("Unknown dtype received: %s (allowed: `tf.float32` and `tf.float16`)" % dtype)
@ -123,56 +121,49 @@ class Runner(object):
# =================================================
model_hparams = tf.contrib.training.HParams(
width=height,
height=width,
n_channels=n_channels,
n_classes=n_classes,
dtype=dtype,
input_format=input_format,
compute_format=compute_format,
distort_colors=distort_colors,
seed=tf_seed
)
model_hparams = tf.contrib.training.HParams(width=height,
height=width,
n_channels=n_channels,
n_classes=n_classes,
dtype=dtype,
input_format=input_format,
compute_format=compute_format,
distort_colors=distort_colors,
seed=tf_seed)
num_preprocessing_threads = 10 if not use_dali else 4
run_config_performance = tf.contrib.training.HParams(
num_preprocessing_threads=num_preprocessing_threads,
use_tf_amp=use_tf_amp,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id
)
run_config_performance = tf.contrib.training.HParams(num_preprocessing_threads=num_preprocessing_threads,
use_tf_amp=use_tf_amp,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id)
run_config_additional = tf.contrib.training.HParams(
model_dir=model_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
model_dir=model_dir, #if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
log_dir=log_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
data_dir=data_dir,
data_idx_dir=data_idx_dir,
num_preprocessing_threads=num_preprocessing_threads
)
num_preprocessing_threads=num_preprocessing_threads)
self.run_hparams = Runner._build_hparams(model_hparams, run_config_additional, run_config_performance)
model_name = architecture
architecture = resnet.model_architectures[architecture]
self._model = resnet.ResnetModel(
model_name=model_name,
n_classes=model_hparams.n_classes,
layers_count=architecture["layers"],
layers_depth=architecture["widths"],
expansions=architecture["expansions"],
input_format=model_hparams.input_format,
compute_format=model_hparams.compute_format,
dtype=model_hparams.dtype,
weight_init=weight_init,
use_dali=use_dali,
cardinality=architecture['cardinality'] if 'cardinality' in architecture else 1,
use_se=architecture['use_se'] if 'use_se' in architecture else False,
se_ratio=architecture['se_ratio'] if 'se_ratio' in architecture else 1
)
self._model = resnet.ResnetModel(model_name=model_name,
n_classes=model_hparams.n_classes,
layers_count=architecture["layers"],
layers_depth=architecture["widths"],
expansions=architecture["expansions"],
input_format=model_hparams.input_format,
compute_format=model_hparams.compute_format,
dtype=model_hparams.dtype,
weight_init=weight_init,
use_dali=use_dali,
cardinality=architecture['cardinality'] if 'cardinality' in architecture else 1,
use_se=architecture['use_se'] if 'use_se' in architecture else False,
se_ratio=architecture['se_ratio'] if 'se_ratio' in architecture else 1)
if self.run_hparams.seed is not None:
tf.set_random_seed(self.run_hparams.seed)
@ -196,9 +187,7 @@ class Runner(object):
except ValueError:
warnings.warn(
"the parameter `{}` already exists - existing value: {} and duplicated value: {}".format(
key, hparams.get(key), val
)
)
key, hparams.get(key), val))
return hparams
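The duplicate-key warning above belongs to the hparams-merging helper. A simplified, standalone sketch of that pattern (the repository's `_build_hparams` is the authoritative version):
```
import warnings
import tensorflow as tf

def build_hparams(*hparams_objects):
    # Merge several tf.contrib.training.HParams objects, warning on duplicate
    # keys instead of silently overwriting them.
    merged = tf.contrib.training.HParams()
    for hparams in hparams_objects:
        for key, val in hparams.values().items():
            try:
                merged.add_hparam(key, val)
            except ValueError:
                warnings.warn("the parameter `{}` already exists - existing value: {} "
                              "and duplicated value: {}".format(key, merged.get(key), val))
    return merged
```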
@ -214,9 +203,8 @@ class Runner(object):
def _get_session_config(mode, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
if mode not in ["train", 'validation', 'benchmark', 'inference']:
raise ValueError(
"Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode
)
raise ValueError("Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" %
mode)
# Limit available GPU memory (tune the size)
if use_dali:
@ -240,10 +228,6 @@ class Runner(object):
config.gpu_options.force_gpu_compatible = True # Force pinned memory
# Bug - disable bn+relu fusion
from tensorflow.core.protobuf import rewriter_config_pb2
config.graph_options.rewrite_options.remapping = (rewriter_config_pb2.RewriterConfig.OFF)
if mode == 'train':
config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // max(hvd.size(), 8) - 2))
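Condensed, the session-config logic above amounts to the following sketch (simplified: `hvd.size()` is replaced by a plain `num_workers` argument, and the non-DALI memory setting is an assumption):
```
import multiprocessing
import tensorflow as tf

def make_session_config(mode, use_dali, gpu_memory_fraction, num_workers=1):
    config = tf.compat.v1.ConfigProto()
    if use_dali:
        # Leave the rest of the GPU memory to the DALI pipeline.
        config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
    else:
        config.gpu_options.allow_growth = True  # assumption: grow on demand
    config.gpu_options.force_gpu_compatible = True  # force pinned memory
    if mode == 'train':
        config.intra_op_parallelism_threads = 1  # avoid pool of Eigen threads
        config.inter_op_parallelism_threads = max(
            2, multiprocessing.cpu_count() // max(num_workers, 8) - 2)
    return config
```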
@ -254,9 +238,8 @@ class Runner(object):
def _get_run_config(mode, model_dir, use_xla, use_dali, gpu_memory_fraction, gpu_id=0, seed=None):
if mode not in ["train", 'validation', 'benchmark', 'inference']:
raise ValueError(
"Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode
)
raise ValueError("Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" %
mode)
if seed is not None:
if hvd_utils.is_using_hvd():
@ -272,9 +255,11 @@ class Runner(object):
save_summary_steps=100 if mode in ['train', 'validation'] else 1e9, # disabled in benchmark mode
save_checkpoints_steps=None,
save_checkpoints_secs=None,
session_config=Runner._get_session_config(
mode=mode, use_xla=use_xla, use_dali=use_dali, gpu_memory_fraction=gpu_memory_fraction, gpu_id=gpu_id
),
session_config=Runner._get_session_config(mode=mode,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id),
keep_checkpoint_max=5,
keep_checkpoint_every_n_hours=1e6, # disabled
log_step_count_steps=1e9,
@ -282,14 +267,12 @@ class Runner(object):
device_fn=None,
protocol=None,
eval_distribute=None,
experimental_distribute=None
)
experimental_distribute=None)
if mode == 'train':
if hvd_utils.is_using_hvd():
config = config.replace(
save_checkpoints_steps=1000 if hvd.rank() == 0 else None, keep_checkpoint_every_n_hours=3
)
config = config.replace(save_checkpoints_steps=1000 if hvd.rank() == 0 else None,
keep_checkpoint_every_n_hours=3)
else:
config = config.replace(save_checkpoints_steps=1000, keep_checkpoint_every_n_hours=3)
@ -298,49 +281,45 @@ class Runner(object):
def _get_estimator(self, mode, run_params, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
if mode not in ["train", 'validation', 'benchmark', 'inference']:
raise ValueError(
"Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode
)
raise ValueError("Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" %
mode)
run_config = Runner._get_run_config(
mode=mode,
model_dir=self.run_hparams.model_dir,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id,
seed=self.run_hparams.seed
)
run_config = Runner._get_run_config(mode=mode,
model_dir=self.run_hparams.model_dir,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id,
seed=self.run_hparams.seed)
return tf.estimator.Estimator(
model_fn=self._model, model_dir=self.run_hparams.model_dir, config=run_config, params=run_params
)
return tf.estimator.Estimator(model_fn=self._model,
model_dir=self.run_hparams.model_dir,
config=run_config,
params=run_params)
def train(
self,
iter_unit,
num_iter,
run_iter,
batch_size,
warmup_steps=50,
weight_decay=1e-4,
lr_init=0.1,
lr_warmup_epochs=5,
momentum=0.9,
log_every_n_steps=1,
loss_scale=256,
label_smoothing=0.0,
mixup=0.0,
use_cosine_lr=False,
use_static_loss_scaling=False,
is_benchmark=False,
quantize=False,
symmetric=False,
quant_delay=0,
finetune_checkpoint=None,
use_final_conv=False,
use_qdq=False
):
def train(self,
iter_unit,
num_iter,
run_iter,
batch_size,
warmup_steps=50,
weight_decay=1e-4,
lr_init=0.1,
lr_warmup_epochs=5,
momentum=0.9,
log_every_n_steps=1,
loss_scale=256,
label_smoothing=0.0,
mixup=0.0,
use_cosine_lr=False,
use_static_loss_scaling=False,
is_benchmark=False,
quantize=False,
symmetric=False,
quant_delay=0,
finetune_checkpoint=None,
use_final_conv=False,
use_qdq=False):
if iter_unit not in ["epoch", "batch"]:
raise ValueError('`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)
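Reading the constructor parameters and the `train()` signature above together, a hypothetical end-to-end call looks as follows (values are placeholders; in the repository this wiring is done by `main.py` from the parsed flags):
```
import tensorflow as tf
from runtime import Runner  # assumption: the import path used by main.py

runner = Runner(architecture='resnet50',
                input_format='NHWC',
                compute_format='NCHW',
                dtype=tf.float32,
                model_dir='/tmp/model',
                data_dir='/data/tfrecords',
                data_idx_dir='/data/dali_idx',
                use_xla=True,
                use_tf_amp=True,
                use_dali=True,
                gpu_memory_fraction=0.7,
                seed=1)

runner.train(iter_unit='epoch',
             num_iter=90,
             run_iter=-1,  # assumption: negative value meaning "run to completion"
             batch_size=256,
             warmup_steps=100,
             lr_init=0.256,
             lr_warmup_epochs=8,
             momentum=0.875,
             weight_decay=3.0517578125e-05,
             label_smoothing=0.1,
             use_cosine_lr=True,
             use_static_loss_scaling=True,
             loss_scale=128)
```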
@ -383,9 +362,8 @@ class Runner(object):
run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter
if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
idx_filenames = runner_utils.parse_dali_idx_dataset(
data_idx_dir=self.run_hparams.data_idx_dir, mode="train"
)
idx_filenames = runner_utils.parse_dali_idx_dataset(data_idx_dir=self.run_hparams.data_idx_dir,
mode="train")
training_hooks = []
@ -447,14 +425,12 @@ class Runner(object):
if finetune_checkpoint:
estimator_params['finetune_checkpoint'] = finetune_checkpoint
image_classifier = self._get_estimator(
mode='train',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_id=self.run_hparams.gpu_id
)
image_classifier = self._get_estimator(mode='train',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_id=self.run_hparams.gpu_id)
def training_data_fn():
@ -462,30 +438,26 @@ class Runner(object):
if hvd.rank() == 0:
print("Using DALI input... ")
return data_utils.get_dali_input_fn(
filenames=filenames,
idx_filenames=idx_filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=True,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
return data_utils.get_dali_input_fn(filenames=filenames,
idx_filenames=idx_filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=True,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True)
elif self.run_hparams.data_dir is not None:
return data_utils.get_tfrecords_input_fn(
filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=True,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
return data_utils.get_tfrecords_input_fn(filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=True,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True)
else:
if hvd.rank() == 0:
@ -555,14 +527,12 @@ class Runner(object):
'use_qdq': use_qdq,
'use_final_conv': use_final_conv}
image_classifier = self._get_estimator(
mode='validation',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_id=self.run_hparams.gpu_id
)
image_classifier = self._get_estimator(mode='validation',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_id=self.run_hparams.gpu_id)
if self.run_hparams.data_dir is not None:
filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
@ -579,9 +549,8 @@ class Runner(object):
num_steps = num_iter
if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
idx_filenames = runner_utils.parse_dali_idx_dataset(
data_idx_dir=self.run_hparams.data_idx_dir, mode="validation"
)
idx_filenames = runner_utils.parse_dali_idx_dataset(data_idx_dir=self.run_hparams.data_idx_dir,
mode="validation")
eval_hooks = []
@ -603,29 +572,25 @@ class Runner(object):
if hvd.rank() == 0:
print("Using DALI input... ")
return data_utils.get_dali_input_fn(
filenames=filenames,
idx_filenames=idx_filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
return data_utils.get_dali_input_fn(filenames=filenames,
idx_filenames=idx_filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True)
elif self.run_hparams.data_dir is not None:
return data_utils.get_tfrecords_input_fn(
filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
return data_utils.get_tfrecords_input_fn(filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True)
else:
print("Using Synthetic Data ...\n")
@ -651,29 +616,25 @@ class Runner(object):
eval_latencies_q = np.quantile(eval_latencies, q=[0.9, 0.95, 0.99])
eval_latencies_mean = np.mean(eval_latencies)
dllogger.log(
data={
'top1_accuracy': float(eval_results['top1_accuracy']),
'top5_accuracy': float(eval_results['top5_accuracy']),
'eval_throughput': eval_throughput,
'eval_latency_avg': eval_latencies_mean,
'eval_latency_p90': eval_latencies_q[0],
'eval_latency_p95': eval_latencies_q[1],
'eval_latency_p99': eval_latencies_q[2],
},
step=tuple()
)
dllogger.log(data={
'top1_accuracy': float(eval_results['top1_accuracy']),
'top5_accuracy': float(eval_results['top5_accuracy']),
'eval_throughput': eval_throughput,
'eval_latency_avg': eval_latencies_mean,
'eval_latency_p90': eval_latencies_q[0],
'eval_latency_p95': eval_latencies_q[1],
'eval_latency_p99': eval_latencies_q[2],
},
step=tuple())
if export_dir is not None:
dllogger.log(data={'export_dir': export_dir}, step=tuple())
input_receiver_fn = data_utils.get_serving_input_receiver_fn(
batch_size=None,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_channels=self.run_hparams.n_channels,
data_format=self.run_hparams.input_format,
dtype=self.run_hparams.dtype
)
input_receiver_fn = data_utils.get_serving_input_receiver_fn(batch_size=None,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_channels=self.run_hparams.n_channels,
data_format=self.run_hparams.input_format,
dtype=self.run_hparams.dtype)
image_classifier.export_savedmodel(export_dir, input_receiver_fn)
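The evaluation path above can export a SavedModel through the serving input receiver. A hedged smoke test for such an export (the `serve` tag and `serving_default` signature key are the TF1 Estimator defaults, assumed here; the path and input shape are placeholders):
```
import numpy as np
import tensorflow as tf

export_dir = "/tmp/export/1600000000"  # placeholder: timestamped export directory

with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.compat.v1.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    signature = meta_graph.signature_def["serving_default"]
    input_name = list(signature.inputs.values())[0].name
    output_names = [info.name for info in signature.outputs.values()]
    batch = np.random.rand(1, 224, 224, 3).astype(np.float32)
    print(sess.run(output_names, feed_dict={input_name: batch}))
```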
@ -684,33 +645,35 @@ class Runner(object):
def predict(self, to_predict, quantize=False, symmetric=False, use_qdq=False, use_final_conv=False):
estimator_params = {'quantize': quantize, 'symmetric': symmetric, 'use_qdq': use_qdq, 'use_final_conv': use_final_conv}
estimator_params = {
'quantize': quantize,
'symmetric': symmetric,
'use_qdq': use_qdq,
'use_final_conv': use_final_conv
}
if to_predict is not None:
filenames = runner_utils.parse_inference_input(to_predict)
image_classifier = self._get_estimator(
mode='inference',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction
)
image_classifier = self._get_estimator(mode='inference',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction)
inference_hooks = []
def inference_data_fn():
return data_utils.get_inference_input_fn(
filenames=filenames,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_threads=self.run_hparams.num_preprocessing_threads
)
return data_utils.get_inference_input_fn(filenames=filenames,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_threads=self.run_hparams.num_preprocessing_threads)
try:
inference_results = image_classifier.predict(
input_fn=inference_data_fn, predict_keys=None, hooks=inference_hooks, yield_single_examples=True
)
inference_results = image_classifier.predict(input_fn=inference_data_fn,
predict_keys=None,
hooks=inference_hooks,
yield_single_examples=True)
for result in inference_results:
print(result['classes'], str(result['probabilities'][result['classes']]))

View file

@ -48,13 +48,13 @@ def list_filenames_in_dataset(data_dir, mode, count=True):
filename_pattern = os.path.join(data_dir, '%s-*' % mode)
file_list = sorted(tf.gfile.Glob(filename_pattern))
file_list = sorted(tf.compat.v1.gfile.Glob(filename_pattern))
num_samples = 0
if count:
def count_records(tf_record_filename):
count = 0
for _ in tf.python_io.tf_record_iterator(tf_record_filename):
for _ in tf.compat.v1.io.tf_record_iterator(tf_record_filename):
count += 1
return count
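For context, the same glob-and-count logic as a tiny standalone helper (simplified; the repository's function also returns the file list and supports skipping the count):
```
import os
import tensorflow as tf

def count_dataset_samples(data_dir, mode="train"):
    # Mirrors the pattern above: glob 'train-*' / 'validation-*' TFRecord shards
    # and count the records in each.
    pattern = os.path.join(data_dir, "%s-*" % mode)
    files = sorted(tf.compat.v1.gfile.Glob(pattern))
    return sum(sum(1 for _ in tf.compat.v1.io.tf_record_iterator(f)) for f in files)
```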

View file

@ -246,16 +246,16 @@ For example, to train on DGX-1 for 90 epochs using AMP, run:
Additionally, features like DALI data preprocessing or TensorFlow XLA can be enabled with
following arguments when running those scripts:
`bash ./se-resnext101-32x4d/training/DGX1_SE-RNxt101-32x4d_AMP_90E.sh /path/to/result /data/ --use_xla --use_dali`
`bash ./se-resnext101-32x4d/training/DGX1_SE-RNxt101-32x4d_AMP_90E.sh /path/to/result /data/ --xla --dali`
7. Start validation/evaluation.
To evaluate the validation dataset located in `/data/tfrecords`, run `main.py` with
`--mode=evaluate`. For example:
`python main.py --arch=se-resnext101-32x4d --mode=evaluate --data_dir=/data/tfrecords --batch_size <batch size> --model_dir
<model location> --results_dir <output location> [--use_xla] [--use_tf_amp]`
<model location> --results_dir <output location> [--xla] [--amp]`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during evaluation.
The optional `--xla` and `--amp` flags control XLA and AMP during evaluation.
## Advanced
@ -294,95 +294,116 @@ The `runtime/` directory contains the following module that define the mechanics
The script for training and evaluating the ResNext101-32x4d model has a variety of parameters that control these processes.
```
usage: main.py [-h]
[--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
usage: main.py [-h] [--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
[--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}]
[--data_dir DATA_DIR] [--data_idx_dir DATA_IDX_DIR]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
[--batch_size BATCH_SIZE] [--num_iter NUM_ITER]
[--iter_unit {epoch,batch}] [--warmup_steps WARMUP_STEPS]
[--model_dir MODEL_DIR] [--results_dir RESULTS_DIR]
[--log_filename LOG_FILENAME] [--display_every DISPLAY_EVERY]
[--lr_init LR_INIT] [--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--loss_scale LOSS_SCALE]
[--label_smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--use_static_loss_scaling | --nouse_static_loss_scaling]
[--use_xla | --nouse_xla] [--use_dali | --nouse_dali]
[--use_tf_amp | --nouse_tf_amp]
[--use_cosine_lr | --nouse_cosine_lr] [--seed SEED]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
--batch_size BATCH_SIZE [--num_iter NUM_ITER]
[--run_iter RUN_ITER] [--iter_unit {epoch,batch}]
[--warmup_steps WARMUP_STEPS] [--model_dir MODEL_DIR]
[--results_dir RESULTS_DIR] [--log_filename LOG_FILENAME]
[--display_every DISPLAY_EVERY] [--seed SEED]
[--gpu_memory_fraction GPU_MEMORY_FRACTION] [--gpu_id GPU_ID]
JoC-RN50v1.5-TF
optional arguments:
-h, --help Show this help message and exit
[--finetune_checkpoint FINETUNE_CHECKPOINT] [--use_final_conv]
[--quant_delay QUANT_DELAY] [--quantize] [--use_qdq]
[--symmetric] [--data_dir DATA_DIR]
[--data_idx_dir DATA_IDX_DIR] [--dali]
[--synthetic_data_size SYNTHETIC_DATA_SIZE] [--lr_init LR_INIT]
[--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--label_smoothing LABEL_SMOOTHING]
[--mixup MIXUP] [--cosine_lr] [--xla]
[--data_format {NHWC,NCHW}] [--amp]
[--static_loss_scale STATIC_LOSS_SCALE]
JoC-RN50v1.5-TF
optional arguments:
-h, --help show this help message and exit.
--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}
Architecture of model to run (to run se-resnext-32x4d set
--arch=se-resnext101-32x4d)
Architecture of model to run.
--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}
The execution mode of the script.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--run_iter RUN_ITER Number of training iterations to run on single run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write model. If undefined,
results dir will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by training script for DALI.
--gpu_id GPU_ID Specify ID of the target GPU on multi-device platform.
Effective only for single-GPU mode.
--finetune_checkpoint FINETUNE_CHECKPOINT
Path to pre-trained checkpoint which will be used for
fine-tuning.
--use_final_conv Use convolution operator instead of MLP as last layer.
--quant_delay QUANT_DELAY
Number of steps to be run before quantization starts
to happen.
--quantize Quantize weights and activations during training.
(Defaults to asymmetric quantization)
--use_qdq Use QDQV3 op instead of FakeQuantWithMinMaxVars op for
quantization. QDQv3 does only scaling.
--symmetric Quantize weights and activations during training using
symmetric quantization.
Dataset arguments:
--data_dir DATA_DIR Path to dataset in TFRecord format. Files should be
named 'train-*' and 'validation-*'.
--data_idx_dir DATA_IDX_DIR
Path to index files for DALI. Files should be named
'train-*' and 'validation-*'.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write the model. If undefined,
results directory will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which write the training log
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--dali Enable DALI data input.
--synthetic_data_size SYNTHETIC_DATA_SIZE
Dimension of image for synthetic dataset.
Training arguments:
--lr_init LR_INIT Initial value for the learning rate.
--lr_warmup_epochs LR_WARMUP_EPOCHS
Number of warmup epochs for the learning rate schedule.
Number of warmup epochs for learning rate schedule.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--momentum MOMENTUM SGD momentum value for the momentum optimizer.
--loss_scale LOSS_SCALE
Loss scale for FP16 training and fast math FP32.
--momentum MOMENTUM SGD momentum value for the Momentum optimizer.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--mixup MIXUP The alpha parameter for mixup (if 0 then mixup is not
applied).
--use_static_loss_scaling
Use static loss scaling in FP16 or FP32 AMP.
--nouse_static_loss_scaling
--use_xla Enable XLA (Accelerated Linear Algebra) computation
--cosine_lr Use cosine learning rate schedule.
Generic optimization arguments:
--xla Enable XLA (Accelerated Linear Algebra) computation
for improved performance.
--nouse_xla
--use_dali Enable DALI data input.
--nouse_dali
--use_tf_amp Enable AMP to speedup FP32
computation using Tensor Cores.
--nouse_tf_amp
--use_cosine_lr Use cosine learning rate schedule.
--nouse_cosine_lr
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by the training script for DALI
--gpu_id GPU_ID Specify the ID of the target GPU on a multi-device platform.
Effective only for single-GPU mode.
--data_format {NHWC,NCHW}
Data format used to do calculations.
--amp Enable Automatic Mixed Precision to speedup
computation using tensor cores.
Automatic Mixed Precision arguments:
--static_loss_scale STATIC_LOSS_SCALE
Use static loss scaling in FP32 AMP.
```
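`--amp` together with `--static_loss_scale` corresponds to TensorFlow's automatic mixed precision graph rewrite with a fixed loss scale. A generic TF 1.15 sketch of that pattern, for orientation only (not necessarily how this repository enables AMP internally):
```
import tensorflow as tf

# Generic TF1 AMP pattern: wrap the optimizer so Tensor Core-eligible ops run in
# float16, with a fixed (static) loss scale of 128.
optimizer = tf.compat.v1.train.MomentumOptimizer(learning_rate=0.256, momentum=0.875)
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
    optimizer, loss_scale=128)
```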
### Inference process
@ -390,7 +411,7 @@ To run inference on a single example with a checkpoint and a model script, use:
`python main.py --arch=se-resnext101-32x4d --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during inference.
The optional `--xla` and `--amp` flags control XLA and AMP during inference.
## Performance
@ -409,7 +430,7 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --use_tf_amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* For multiple GPUs
* FP32 / TF32
@ -418,16 +439,17 @@ To benchmark the training performance on a specific batch size, run:
* AMP
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --use_tf_amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
with `--use_xla` and `--use_dali` flags. If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
For proper throughput reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
with the `--xla` and `--dali` flags. For proper throughput reporting, the value of `--num_iter` must be greater than the `--warmup_steps` value.
Suggested batch sizes for training are 96 for mixed precision training and 64 for single precision training per single V100 16 GB.
If no `--data_dir=<path to imagenet>` flag is specified, the benchmarks will use a synthetic dataset. The resolution of the synthetic images can be controlled with the `--synthetic_data_size` flag.
#### Inference performance benchmark
@ -439,11 +461,10 @@ To benchmark the inference performance on a specific batch size, run:
* AMP
`python ./main.py --arch=se-resnext101-32x4d --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
`python ./main.py --arch=se-resnext101-32x4d --mode=inference_benchmark --amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
For proper throughput and latency reporting the value of `--num_iter` must be greater than `--warmup_steps` value.
If no `--data_dir=<path to imagenet>` flag is specified then the benchmarks will use a synthetic dataset.
The benchmark can be automated with the `inference_benchmark.sh` script provided in `se-resnext101-32x4d`, by simply running:
@ -452,6 +473,9 @@ The benchmark can be automated with the `inference_benchmark.sh` script provided
The `<data dir>` parameter refers to the input data directory (by default `/data/tfrecords` inside the container).
By default, the benchmark tests the following configurations: **FP32**, **AMP**, **AMP + XLA** with different batch sizes.
When the optional directory with the DALI index files `<data idx dir>` is specified, the benchmark executes an additional **DALI + AMP + XLA** configuration.
For proper throughput reporting, the value of `--num_iter` must be greater than the `--warmup_steps` value.
For a performance benchmark of the raw model, a synthetic dataset can be used. To use it, pass the `--synthetic_data_size` flag instead of `--data_dir` to specify the input image size.
### Results
@ -761,6 +785,9 @@ on NVIDIA T4 with (1x T4 16G) GPU.
April 2020
- Initial release
August 2020
- Updated command line argument names
- Added support for a synthetic dataset with a configurable image size
### Known issues
Performance without XLA enabled is low. We recommend using XLA.
Performance without XLA enabled is low due to a BN + ReLU fusion bug.

View file

@ -22,12 +22,12 @@ function test_configuration() {
}
test_configuration "FP32 nodali noxla"
test_configuration "FP32 nodali xla" "--use_xla"
test_configuration "FP16 nodali noxla" "--use_tf_amp"
test_configuration "FP16 nodali xla" "--use_tf_amp --use_xla"
test_configuration "FP32 nodali xla" "--xla"
test_configuration "FP16 nodali noxla" "--amp"
test_configuration "FP16 nodali xla" "--amp --xla"
if [ ! -z $DALI_DIR ]; then
test_configuration "FP16 dali xla" "--use_tf_amp --use_xla --use_dali --data_idx_dir ${DALI_DIR}"
test_configuration "FP16 dali xla" "--amp --xla --dali --data_idx_dir ${DALI_DIR}"
fi
cat $INFERENCE_BENCHMARK

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=96 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=96 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=96 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=96 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}

View file

@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=96 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=96 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=96 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=96 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp --use_static_loss_scaling --loss_scale 128 \
--amp --static_loss_scale 128 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=250 --mixup=0.2 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 16 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=64 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=64 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,9 +25,9 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=256 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=256 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--use_tf_amp \
--amp \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -25,7 +25,7 @@ fi
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np 8 python3 main.py --arch=se-resnext101-32x4d \
--mode=train_and_evaluate --iter_unit=epoch --num_iter=90 \
--batch_size=128 --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--batch_size=128 --warmup_steps=100 --cosine_lr --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
--data_dir=${DATA_DIR}/tfrecords --data_idx_dir=${DATA_DIR}/dali_idx \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}
View file
@ -26,13 +26,13 @@ function run_benchmark() {
MODE_SIZE=$2
if [[ $4 -eq "1" ]]; then
XLA="--use_xla"
XLA="--xla"
else
XLA=""
fi
case $2 in
"amp") MODE_FLAGS="--use_tf_amp --use_static_loss_scaling --loss_scale=128";;
"amp") MODE_FLAGS="--amp --static_loss_scale=128";;
"fp32"|"tf32") MODE_FLAGS="";;
*) echo "Unsupported configuration, use amp, tf32 or fp32";;
esac
View file
@ -0,0 +1,687 @@
# Deploying the ResNet-50 v1.5 model on Triton Inference Server
This folder contains instructions for deployment to run inference
on Triton Inference Server as well as a detailed performance analysis.
The purpose of this document is to help you with achieving
the best inference performance.
## Table of contents
- [Solution overview](#solution-overview)
- [Introduction](#introduction)
- [Deployment process](#deployment-process)
- [Setup](#setup)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
- [Prepare configuration](#prepare-configuration)
- [Latency explanation](#latency-explanation)
- [Performance](#performance)
- [Offline scenario](#offline-scenario)
- [Offline: NVIDIA A40, TF-TRT with FP16](#offline-nvidia-a40-tf-trt-with-fp16)
- [Offline: NVIDIA DGX A100 (1x A100 80GB), TF-TRT with FP16](#offline-nvidia-dgx-a100-1x-a100-80gb-tf-trt-with-fp16)
- [Offline: NVIDIA DGX-1 (1x V100 32GB), TF-TRT with FP16](#offline-nvidia-dgx-1-1x-v100-32gb-tf-trt-with-fp16)
- [Offline: NVIDIA T4, TF-TRT with FP16](#offline-nvidia-t4-tf-trt-with-fp16)
- [Online scenario](#online-scenario)
- [Online: NVIDIA A40, TF-TRT with FP16](#online-nvidia-a40-tf-trt-with-fp16)
- [Online: NVIDIA DGX A100 (1x A100 80GB), TF-TRT with FP16](#online-nvidia-dgx-a100-1x-a100-80gb-tf-trt-with-fp16)
- [Online: NVIDIA DGX-1 (1x V100 32GB), TF-TRT with FP16](#online-nvidia-dgx-1-1x-v100-32gb-tf-trt-with-fp16)
- [Online: NVIDIA T4, TF-TRT with FP16](#online-nvidia-t4-tf-trt-with-fp16)
- [Release Notes](#release-notes)
- [Changelog](#changelog)
- [Known issues](#known-issues)
## Solution overview
### Introduction
The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server)
provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs.
The server provides an inference service via an HTTP or gRPC endpoint,
allowing remote clients to request inferencing for any number of GPU
or CPU models being managed by the server.
This README provides step-by-step deployment instructions for models generated
during training (as described in the [model README](../README.md)).
Additionally, this README provides the corresponding deployment scripts that
ensure optimal GPU utilization during inferencing on Triton Inference Server.
### Deployment process
The deployment process consists of two steps:
1. Conversion. The purpose of conversion is to find the best performing model
format supported by Triton Inference Server.
Triton Inference Server uses a number of runtime backends such as
[TensorRT](https://developer.nvidia.com/tensorrt),
[TensorFlow](https://github.com/triton-inference-server/tensorflow_backend) and
[ONNX Runtime](https://github.com/triton-inference-server/onnxruntime_backend)
to support various model types. Refer to
[Triton documentation](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton)
for a list of available backends.
2. Configuration. Model configuration on Triton Inference Server, which generates
necessary [configuration files](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md).
To run benchmarks measuring the model performance in inference,
perform the following steps:
1. Start the Triton Inference Server.
The Triton Inference Server is started
in a separate (possibly remote) container, and its ports for the gRPC or REST API are exposed.
2. Run accuracy tests.
Produce results which are tested against given accuracy thresholds.
Refer to step 8 in the [Quick Start Guide](#quick-start-guide).
3. Run performance tests.
Produce latency and throughput results for offline (static batching)
and online (dynamic batching) scenarios.
Refer to step 11 in the [Quick Start Guide](#quick-start-guide).
## Setup
Ensure you have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [TensorFlow1 NGC container 20.12](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
* [Triton Inference Server NGC container 20.12](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver)
* [NVIDIA CUDA repository](https://docs.nvidia.com/cuda/archive/11.1.1/index.html)
* [NVIDIA Ampere](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/), [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
## Quick Start Guide
Running the following scripts will build and launch the container with all
required dependencies for native TensorFlow as well as Triton Inference Server.
This is necessary for running inference and can also be used for data download,
processing, and training of the model.
1. Clone the repository.
IMPORTANT: This step is executed on the host computer.
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/TensorFlow/Classification/ConvNets
```
2. Set up the environment on the host PC and start Triton Inference Server.
```
source triton/scripts/setup_environment.sh
bash triton/scripts/docker/triton_inference_server.sh
```
3. Build and run a container that extends the NGC TensorFlow container with
the Triton Inference Server client libraries and dependencies.
```
bash triton/scripts/docker/build.sh
bash triton/scripts/docker/interactive.sh
```
4. Prepare the deployment configuration and create folders in Docker.
IMPORTANT: These and the following commands must be executed in the TensorFlow NGC container.
```
source triton/scripts/setup_environment.sh
```
5. Download and pre-process the dataset.
```
bash triton/scripts/download_data.sh
bash triton/scripts/process_dataset.sh
```
6. Set up the parameters for deployment.
```
source triton/scripts/setup_parameters.sh
```
7. Convert the model from training to inference format (e.g. TensorRT).
```
python3 triton/convert_model.py \
--input-path triton/rn50_model.py \
--input-type tf-estimator \
--output-path ${SHARED_DIR}/model \
--output-type ${FORMAT} \
--onnx-opset 12 \
--onnx-optimized 1 \
--max-batch-size ${MAX_BATCH_SIZE} \
--max-workspace-size 4294967296 \
--ignore-unknown-parameters \
\
--model-dir ${CHECKPOINT_DIR} \
--precision ${PRECISION} \
--dataloader triton/dataloader.py \
--data-dir ${DATASETS_DIR}/imagenet
```
8. Run the model accuracy tests in framework.
```
python3 triton/run_inference_on_fw.py \
--input-path ${SHARED_DIR}/model \
--input-type ${FORMAT} \
--dataloader triton/dataloader.py \
--data-dir ${DATASETS_DIR}/imagenet \
--images-num 256 \
--batch-size ${MAX_BATCH_SIZE} \
--output-dir ${SHARED_DIR}/correctness_dump \
--dump-labels
python3 triton/calculate_metrics.py \
--dump-dir ${SHARED_DIR}/correctness_dump \
--metrics triton/metrics.py \
--output-used-for-metrics classes \
--csv ${SHARED_DIR}/correctness_metrics.csv
cat ${SHARED_DIR}/correctness_metrics.csv
```
9. Configure the model on Triton Inference Server.
Generate the configuration from your model repository.
```
python3 triton/config_model_on_triton.py \
--model-repository ${MODEL_REPOSITORY_PATH} \
--model-path ${SHARED_DIR}/model \
--model-format ${FORMAT} \
--model-name ${MODEL_NAME} \
--model-version 1 \
--max-batch-size ${MAX_BATCH_SIZE} \
--precision ${PRECISION} \
--number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
--max-queue-delay-us ${TRITON_MAX_QUEUE_DELAY} \
--preferred-batch-sizes ${TRITON_PREFERRED_BATCH_SIZES} \
--capture-cuda-graph 0 \
--backend-accelerator ${BACKEND_ACCELERATOR} \
--load-model ${TRITON_LOAD_MODEL_METHOD}
```
10. Run the Triton Inference Server accuracy tests.
```
python3 triton/run_inference_on_triton.py \
--server-url localhost:8001 \
--model-name ${MODEL_NAME} \
--model-version 1 \
--dataloader triton/dataloader.py \
--data-dir ${DATASETS_DIR}/imagenet \
--batch-size ${MAX_BATCH_SIZE} \
--output-dir ${SHARED_DIR}/accuracy_dump \
--dump-labels
python3 triton/calculate_metrics.py \
--dump-dir ${SHARED_DIR}/accuracy_dump \
--metrics triton/metrics.py \
--output-used-for-metrics classes \
--csv ${SHARED_DIR}/accuracy_metrics.csv
cat ${SHARED_DIR}/accuracy_metrics.csv
```
11. Run the Triton Inference Server performance online tests.
We want to maximize throughput within latency budget constraints.
Dynamic batching is a feature of Triton Inference Server that allows
inference requests to be combined by the server, so that a batch is
created dynamically, resulting in a reduced average latency.
You can set the Dynamic Batcher parameter `max_queue_delay_microseconds` to
indicate the maximum amount of time you are willing to wait and
`preferred_batch_size` to indicate the batch sizes that the dynamic batcher should attempt to create
in the Triton Inference Server model configuration. The measurements
presented below set the maximum latency to zero to achieve the best latency
possible with good performance.
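If you want to experiment with different dynamic batching settings, one option is to override the corresponding environment variables and then repeat the model configuration step (step 9). The values below are purely illustrative:
```
export TRITON_MAX_QUEUE_DELAY=100            # microseconds the dynamic batcher may wait
export TRITON_PREFERRED_BATCH_SIZES="32 64"  # batch sizes the batcher should try to build
```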
```
python triton/run_online_performance_test_on_triton.py \
 --server-url ${TRITON_SERVER_URL} \
 --model-name ${MODEL_NAME} \
 --input-data random \
 --batch-sizes ${BATCH_SIZE} \
 --triton-instances ${TRITON_INSTANCES} \
 --number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
 --result-path ${SHARED_DIR}/triton_performance_online.csv
```
12. Run the Triton Inference Server performance offline tests.
We want to maximize throughput. This assumes that your data is already available
for inference or that incoming data saturates the maximum batch size quickly.
Triton Inference Server supports offline scenarios with static batching.
Static batching allows inference requests to be served
as they are received. The largest improvements to throughput come
from increasing the batch size due to efficiency gains in the GPU with larger
batches.
```
python triton/run_offline_performance_test_on_triton.py \
 --server-url ${TRITON_SERVER_URL} \
 --model-name ${MODEL_NAME} \
 --input-data random \
 --batch-sizes ${BATCH_SIZE} \
 --triton-instances ${TRITON_INSTANCES} \
 --result-path ${SHARED_DIR}/triton_performance_offline.csv
```
## Advanced
### Prepare configuration
You can use the environment variables to set the parameters of your inference
configuration.
Triton deployment scripts support several inference runtimes listed in the table below:
| Inference runtime | Mnemonic used in scripts |
|--------------------|--------------------------|
| [TensorFlow SavedModel](https://www.tensorflow.org/guide/saved_model) | `tf-savedmodel` |
| [TensorFlow TensorRT](https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html) | `tf-trt` |
| [ONNX](https://onnx.ai) | `onnx` |
| [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) | `trt` |
The name of the inference runtime should be put into the `FORMAT` variable.
Example values of some key variables in one configuration:
```
PRECISION="fp16"
FORMAT="tf-trt"
BATCH_SIZE="1, 2, 4, 8, 16, 32, 64, 128"
BACKEND_ACCELERATOR="trt"
MAX_BATCH_SIZE="128"
NUMBER_OF_MODEL_INSTANCES="2"
TRITON_MAX_QUEUE_DELAY="1"
TRITON_PREFERRED_BATCH_SIZES="64 128"
```
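For example, to evaluate the ONNX Runtime path instead of TF-TRT, it may be enough to change the runtime mnemonic and repeat the conversion (step 7) and configuration (step 9) steps; this is only a sketch, and the remaining variables are kept as above:
```
FORMAT="onnx"
```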
### Latency explanation
A typical Triton Inference Server pipeline can be broken down into the following steps:
1. The client serializes the inference request into a message and sends it to
the server (Client Send).
2. The message travels over the network from the client to the server (Network).
3. The message arrives at the server and is deserialized (Server Receive).
4. The request is placed on the queue (Server Queue).
5. The request is removed from the queue and computed (Server Compute).
6. The completed request is serialized in a message and sent back to
the client (Server Send).
7. The completed message then travels over the network from the server
to the client (Network).
8. The completed message is deserialized by the client and processed as
a completed inference request (Client Receive).
Generally, for local clients, steps 1-4 and 6-8 will only occupy
a small fraction of time compared to step 5. Because backend deep learning
systems such as ResNet-50 are rarely exposed directly to end users and instead
only interface with local front-end servers, we can consider
all clients to be local for this analysis.
## Performance
### Offline scenario
This table lists the common variable parameters for all performance measurements:
| Parameter Name | Parameter Value |
|:-----------------------------|:------------------|
| Max Batch Size | 128 |
| Number of model instances | 2 |
| Triton Max Queue Delay | 1 |
| Triton Preferred Batch Sizes | 64 128 |
#### Offline: NVIDIA A40, TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA A40
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
|![](plots/graph_performance_offline_3l.svg)|![](plots/graph_performance_offline_3r.svg)|
|-----|-----|
<details>
<summary>
Full tabular data
</summary>
| Precision | Backend Accelerator | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
| FP16 | TensorRT | 1 | 329.5 | 3.23 | 3.43 | 3.973 | 3.031 |
| FP16 | TensorRT | 2 | 513.8 | 4.292 | 4.412 | 4.625 | 3.888 |
| FP16 | TensorRT | 4 | 720.8 | 6.122 | 6.264 | 6.5 | 5.543 |
| FP16 | TensorRT | 8 | 919.2 | 9.145 | 9.664 | 10.3 | 8.701 |
| FP16 | TensorRT | 16 | 1000 | 17.522 | 17.979 | 19.098 | 16.01 |
| FP16 | TensorRT | 32 | 889.6 | 37.49 | 38.481 | 40.316 | 35.946 |
| FP16 | TensorRT | 64 | 992 | 66.837 | 67.923 | 70.324 | 64.645 |
| FP16 | TensorRT | 128 | 896 | 148.461 | 149.854 | 150.05 | 143.684 |
</details>
#### Offline: NVIDIA DGX A100 (1x A100 80GB), TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA DGX A100 (1x A100 80GB)
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
|![](plots/graph_performance_offline_7l.svg)|![](plots/graph_performance_offline_7r.svg)|
|-----|-----|
<details>
<summary>
Full tabular data
</summary>
| Precision | Backend Accelerator | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
| FP16 | TensorRT | 1 | 387.9 | 2.626 | 2.784 | 2.875 | 2.574 |
| FP16 | TensorRT | 2 | 637.2 | 3.454 | 3.506 | 3.547 | 3.135 |
| FP16 | TensorRT | 4 | 982.4 | 4.328 | 4.454 | 4.627 | 4.07 |
| FP16 | TensorRT | 8 | 1181.6 | 7.012 | 7.074 | 7.133 | 6.765 |
| FP16 | TensorRT | 16 | 1446.4 | 11.162 | 11.431 | 11.941 | 11.061 |
| FP16 | TensorRT | 32 | 1353.6 | 24.392 | 24.914 | 25.178 | 23.603 |
| FP16 | TensorRT | 64 | 1478.4 | 45.539 | 46.096 | 47.546 | 43.401 |
| FP16 | TensorRT | 128 | 1331.2 | 97.504 | 100.611 | 101.896 | 96.198 |
</details>
#### Offline: NVIDIA DGX-1 (1x V100 32GB), TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA DGX-1 (1x V100 32GB)
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
|![](plots/graph_performance_offline_11l.svg)|![](plots/graph_performance_offline_11r.svg)|
|-----|-----|
<details>
<summary>
Full tabular data
</summary>
| Precision | Backend Accelerator | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
| FP16 | TensorRT | 1 | 255.6 | 4.032 | 4.061 | 4.141 | 3.909 |
| FP16 | TensorRT | 2 | 419.2 | 4.892 | 4.94 | 5.133 | 4.766 |
| FP16 | TensorRT | 4 | 633.6 | 6.603 | 6.912 | 7.18 | 6.306 |
| FP16 | TensorRT | 8 | 865.6 | 9.657 | 9.73 | 9.834 | 9.236 |
| FP16 | TensorRT | 16 | 950.4 | 18.396 | 20.748 | 23.873 | 16.824 |
| FP16 | TensorRT | 32 | 854.4 | 37.965 | 38.599 | 40.34 | 37.432 |
| FP16 | TensorRT | 64 | 825.6 | 80.118 | 80.758 | 87.374 | 77.596 |
| FP16 | TensorRT | 128 | 704 | 189.198 | 189.87 | 191.259 | 183.205 |
</details>
#### Offline: NVIDIA T4, TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA T4
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
|![](plots/graph_performance_offline_15l.svg)|![](plots/graph_performance_offline_15r.svg)|
|-----|-----|
<details>
<summary>
Full tabular data
</summary>
| Precision | Backend Accelerator | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
| FP16 | TensorRT | 1 | 211.7 | 4.89 | 4.926 | 4.965 | 4.717 |
| FP16 | TensorRT | 2 | 327.8 | 6.258 | 6.309 | 6.436 | 6.094 |
| FP16 | TensorRT | 4 | 468.4 | 8.996 | 9.085 | 9.239 | 8.531 |
| FP16 | TensorRT | 8 | 544.8 | 15.654 | 15.978 | 16.324 | 14.673 |
| FP16 | TensorRT | 16 | 544 | 30.626 | 30.788 | 31.311 | 29.477 |
| FP16 | TensorRT | 32 | 524.8 | 64.527 | 65.35 | 66.13 | 60.943 |
| FP16 | TensorRT | 64 | 556.8 | 115.455 | 115.717 | 116.02 | 113.802 |
| FP16 | TensorRT | 128 | 537.6 | 242.501 | 244.599 | 246.16 | 238.384 |
</details>
### Online scenario
This table lists the common variable parameters for all performance measurements:
| Parameter Name | Parameter Value |
|:-----------------------------|:------------------|
| Max Batch Size | 128 |
| Number of model instances | 2 |
| Triton Max Queue Delay | 1 |
| Triton Preferred Batch Sizes | 64 128 |
#### Online: NVIDIA A40, TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA A40
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
![](plots/graph_performance_online_6.svg)
<details>
<summary>
Full tabular data
</summary>
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 16 | 1421.3 | 0.109 | 4.875 | 1.126 | 0.895 | 4.188 | 0.053 | 0 | 11.046 | 17.34 | 17.851 | 19.013 | 11.246 |
| 32 | 1920 | 0.118 | 8.402 | 1.47 | 1.323 | 5.277 | 0.09 | 0 | 16.328 | 28.052 | 29.871 | 31.932 | 16.68 |
| 48 | 2270.4 | 0.12 | 11.505 | 1.856 | 1.582 | 5.953 | 0.113 | 0 | 22.172 | 31.87 | 35.395 | 41.256 | 21.129 |
| 64 | 2401.9 | 0.12 | 14.443 | 2.299 | 2.358 | 7.285 | 0.149 | 0 | 26.69 | 37.388 | 40.73 | 47.503 | 26.654 |
| 80 | 2823 | 0.126 | 14.917 | 2.71 | 2.406 | 7.977 | 0.174 | 0 | 29.113 | 39.932 | 43.789 | 51.24 | 28.31 |
| 96 | 2903.8 | 0.133 | 18.824 | 2.929 | 2.595 | 8.364 | 0.18 | 0 | 33.951 | 46.785 | 51.878 | 60.37 | 33.025 |
| 112 | 3096.6 | 0.135 | 20.018 | 3.362 | 2.97 | 9.434 | 0.209 | 0 | 37.927 | 50.587 | 55.169 | 63.141 | 36.128 |
| 128 | 3252 | 0.138 | 21.092 | 3.912 | 3.445 | 10.505 | 0.245 | 0 | 41.241 | 53.912 | 58.961 | 68.864 | 39.337 |
| 144 | 3352.4 | 0.137 | 21.407 | 4.527 | 4.237 | 12.363 | 0.293 | 0 | 44.211 | 59.876 | 65.971 | 79.335 | 42.964 |
| 160 | 3387.4 | 0.137 | 22.947 | 5.179 | 4.847 | 13.805 | 0.326 | 0 | 48.423 | 65.393 | 69.568 | 81.288 | 47.241 |
| 176 | 3409.1 | 0.142 | 24.989 | 5.623 | 5.539 | 14.956 | 0.357 | 0 | 52.714 | 71.332 | 78.478 | 99.086 | 51.606 |
| 192 | 3481.8 | 0.143 | 25.661 | 6.079 | 6.666 | 16.442 | 0.372 | 0 | 55.383 | 79.276 | 95.479 | 122.295 | 55.363 |
| 208 | 3523.8 | 0.147 | 27.042 | 6.376 | 7.526 | 17.413 | 0.4 | 0 | 58.823 | 86.375 | 104.134 | 123.278 | 58.904 |
| 224 | 3587.2 | 0.148 | 29.648 | 6.776 | 7.659 | 17.85 | 0.411 | 0 | 61.973 | 91.804 | 107.987 | 130.413 | 62.492 |
| 240 | 3507.4 | 0.153 | 31.079 | 7.987 | 9.246 | 19.342 | 0.426 | 0 | 65.697 | 106.035 | 121.914 | 137.572 | 68.233 |
| 256 | 3504.4 | 0.16 | 34.664 | 8.252 | 9.886 | 19.567 | 0.461 | 0 | 70.708 | 115.965 | 127.808 | 147.327 | 72.99 |
</details>
#### Online: NVIDIA DGX A100 (1x A100 80GB), TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA DGX A100 (1x A100 80GB)
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
![](plots/graph_performance_online_14.svg)
<details>
<summary>
Full tabular data
</summary>
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 16 | 1736.5 | 0.11 | 2.754 | 1.272 | 0.954 | 4.08 | 0.036 | 0 | 9.037 | 12.856 | 13.371 | 15.174 | 9.206 |
| 32 | 2418.9 | 0.114 | 5.15 | 1.494 | 1.361 | 5.031 | 0.072 | 0 | 13.234 | 20.638 | 21.717 | 23.352 | 13.222 |
| 48 | 2891.3 | 0.112 | 7.389 | 1.721 | 1.586 | 5.688 | 0.096 | 0 | 17.089 | 25.946 | 27.611 | 29.784 | 16.592 |
| 64 | 3432.6 | 0.11 | 7.866 | 2.11 | 2.126 | 6.301 | 0.131 | 0 | 19.322 | 25.971 | 28.845 | 34.024 | 18.644 |
| 80 | 3644.6 | 0.116 | 9.665 | 2.33 | 2.493 | 7.185 | 0.146 | 0 | 22.834 | 29.061 | 32.281 | 37.224 | 21.935 |
| 96 | 3902.2 | 0.116 | 11.138 | 2.676 | 2.828 | 7.684 | 0.166 | 0 | 25.589 | 32.572 | 35.307 | 40.123 | 24.608 |
| 112 | 3960.6 | 0.124 | 13.321 | 2.964 | 3.209 | 8.438 | 0.186 | 0 | 29.537 | 37.388 | 40.602 | 46.193 | 28.242 |
| 128 | 4137.7 | 0.124 | 14.325 | 3.372 | 3.646 | 9.244 | 0.219 | 0 | 31.587 | 41.968 | 44.993 | 51.38 | 30.93 |
| 144 | 4139.6 | 0.136 | 15.919 | 3.803 | 4.451 | 10.274 | 0.233 | 0 | 35.696 | 48.301 | 51.345 | 57.414 | 34.816 |
| 160 | 4300.5 | 0.134 | 16.453 | 4.341 | 4.934 | 10.979 | 0.274 | 0 | 38.495 | 50.566 | 53.943 | 61.406 | 37.115 |
| 176 | 4166.6 | 0.143 | 18.436 | 4.959 | 6.081 | 12.321 | 0.309 | 0 | 43.451 | 60.739 | 69.51 | 84.959 | 42.249 |
| 192 | 4281.3 | 0.138 | 19.585 | 5.201 | 6.571 | 13.042 | 0.313 | 0 | 46.175 | 62.718 | 69.46 | 83.032 | 44.85 |
| 208 | 4314.8 | 0.15 | 20.046 | 5.805 | 7.752 | 14.062 | 0.335 | 0 | 47.957 | 73.848 | 84.644 | 96.408 | 48.15 |
| 224 | 4388.2 | 0.141 | 21.393 | 6.105 | 8.236 | 14.85 | 0.343 | 0 | 50.449 | 77.534 | 88.553 | 100.727 | 51.068 |
| 240 | 4371.8 | 0.143 | 22.342 | 6.711 | 9.423 | 15.78 | 0.377 | 0 | 53.216 | 85.983 | 97.756 | 112.48 | 54.776 |
| 256 | 4617.3 | 0.144 | 23.392 | 6.595 | 9.466 | 15.568 | 0.367 | 0 | 54.703 | 86.054 | 93.95 | 105.917 | 55.532 |
</details>
#### Online: NVIDIA DGX-1 (1x V100 32GB), TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA DGX-1 (1x V100 32GB)
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
![](plots/graph_performance_online_22.svg)
<details>
<summary>
Full tabular data
</summary>
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 16 | 1259.7 | 0.121 | 3.735 | 1.999 | 0.803 | 5.998 | 0.034 | 0 | 13.623 | 17.271 | 17.506 | 18.938 | 12.69 |
| 32 | 1686.4 | 0.17 | 6.9 | 2.33 | 2.212 | 7.303 | 0.07 | 0 | 18.836 | 28.302 | 30.423 | 32.916 | 18.985 |
| 48 | 1888.3 | 0.183 | 9.068 | 3.372 | 3.65 | 9.058 | 0.108 | 0.001 | 26.571 | 36.583 | 40.84 | 50.402 | 25.44 |
| 64 | 2103.9 | 0.204 | 12.416 | 3.146 | 4.304 | 10.127 | 0.145 | 0.001 | 32.401 | 37.121 | 41.252 | 49.094 | 30.343 |
| 80 | 2255.2 | 0.211 | 13.753 | 4.074 | 5.455 | 11.776 | 0.192 | 0.001 | 38.298 | 47.082 | 54.476 | 65.412 | 35.462 |
| 96 | 2376.6 | 0.214 | 16.22 | 4.873 | 5.972 | 12.911 | 0.208 | 0.001 | 43.008 | 52.947 | 57.126 | 69.778 | 40.399 |
| 112 | 2445.6 | 0.243 | 18.495 | 5.461 | 7.012 | 14.365 | 0.248 | 0.001 | 48.081 | 62.414 | 68.274 | 85.766 | 45.825 |
| 128 | 2534.2 | 0.261 | 19.294 | 6.486 | 7.925 | 16.312 | 0.282 | 0.001 | 52.894 | 68.475 | 74.852 | 89.979 | 50.561 |
| 144 | 2483.9 | 0.27 | 20.771 | 7.744 | 9.993 | 18.865 | 0.414 | 0.001 | 64.866 | 70.434 | 80.279 | 99.177 | 58.058 |
| 160 | 2512.8 | 0.302 | 24.205 | 7.838 | 11.217 | 19.689 | 0.373 | 0.001 | 69.085 | 85.576 | 95.016 | 109.455 | 63.625 |
| 176 | 2541 | 0.311 | 26.206 | 8.556 | 12.439 | 21.393 | 0.418 | 0.001 | 76.666 | 92.266 | 106.889 | 127.055 | 69.324 |
| 192 | 2623.4 | 0.33 | 27.783 | 9.058 | 13.198 | 22.181 | 0.433 | 0.001 | 79.724 | 97.736 | 111.44 | 142.418 | 72.984 |
| 208 | 2616.2 | 0.353 | 29.667 | 9.759 | 15.693 | 23.567 | 0.444 | 0.001 | 80.571 | 125.202 | 140.527 | 175.331 | 79.484 |
| 224 | 2693.9 | 0.369 | 32.283 | 9.941 | 15.769 | 24.304 | 0.439 | 0.001 | 78.743 | 137.09 | 151.955 | 183.397 | 83.106 |
| 240 | 2700.4 | 0.447 | 32.287 | 11.128 | 18.204 | 26.578 | 0.456 | 0.001 | 82.561 | 155.011 | 177.925 | 191.51 | 89.101 |
| 256 | 2743.8 | 0.481 | 34.688 | 11.834 | 19.087 | 26.597 | 0.459 | 0.001 | 89.387 | 153.866 | 177.805 | 204.319 | 93.147 |
</details>
#### Online: NVIDIA T4, TF-TRT with FP16
Our results were obtained using the following configuration:
* **GPU:** NVIDIA T4
* **Backend:** TensorFlow
* **Model binding:** TF-TRT
* **Precision:** FP16
* **Model format:** TensorFlow SavedModel
![](plots/graph_performance_online_30.svg)
<details>
<summary>
Full tabular data
</summary>
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 16 | 731.4 | 0.271 | 6.9 | 3.745 | 2.073 | 8.802 | 0.081 | 0.001 | 25.064 | 28.863 | 29.7 | 32.01 | 21.873 |
| 32 | 935 | 0.273 | 12.023 | 3.48 | 4.375 | 13.885 | 0.141 | 0.001 | 31.339 | 50.564 | 52.684 | 55.823 | 34.178 |
| 48 | 1253 | 0.298 | 12.331 | 5.313 | 4.623 | 15.634 | 0.178 | 0.001 | 38.099 | 60.665 | 64.537 | 72.38 | 38.378 |
| 64 | 1368.3 | 0.303 | 15.3 | 6.926 | 4.9 | 19.118 | 0.2 | 0.001 | 48.758 | 66.391 | 73.271 | 81.537 | 46.748 |
| 80 | 1410.7 | 0.296 | 15.525 | 11.06 | 6.934 | 22.476 | 0.286 | 0.001 | 60.346 | 65.664 | 76.055 | 84.643 | 56.578 |
| 96 | 1473.1 | 0.309 | 18.846 | 11.746 | 7.825 | 26.165 | 0.319 | 0.001 | 69.785 | 77.337 | 91.586 | 100.918 | 65.211 |
| 112 | 1475.5 | 0.316 | 23.275 | 12.412 | 8.954 | 30.724 | 0.338 | 0.001 | 79.904 | 106.324 | 111.382 | 126.559 | 76.02 |
| 128 | 1535.9 | 0.328 | 23.486 | 14.64 | 10.057 | 34.534 | 0.352 | 0.001 | 89.451 | 110.789 | 121.814 | 140.139 | 83.398 |
| 144 | 1512.3 | 0.336 | 25.79 | 18.7 | 12.205 | 37.909 | 0.435 | 0.001 | 103.388 | 108.917 | 114.44 | 136.469 | 95.376 |
| 160 | 1533.6 | 0.406 | 29.825 | 17.67 | 13.751 | 42.259 | 0.44 | 0.001 | 111.899 | 140.67 | 154.76 | 191.391 | 104.352 |
| 176 | 1515.1 | 0.438 | 34.286 | 17.867 | 16.42 | 46.792 | 0.461 | 0.001 | 120.503 | 187.317 | 205.71 | 223.391 | 116.265 |
| 192 | 1532.2 | 0.476 | 34.796 | 18.86 | 19.071 | 51.446 | 0.483 | 0.001 | 124.044 | 211.466 | 226.921 | 237.664 | 125.133 |
| 208 | 1616.7 | 0.697 | 32.363 | 21.465 | 18.315 | 55.539 | 0.516 | 0.001 | 127.891 | 200.478 | 221.404 | 250.348 | 128.896 |
| 224 | 1541.5 | 0.702 | 35.932 | 22.786 | 22.138 | 62.657 | 0.527 | 0.001 | 141.32 | 248.069 | 263.661 | 276.579 | 144.743 |
| 240 | 1631.7 | 0.79 | 37.581 | 22.791 | 21.651 | 64.278 | 0.549 | 0.001 | 141.393 | 250.354 | 272.17 | 289.926 | 147.641 |
| 256 | 1607.4 | 0.801 | 39.342 | 29.09 | 23.416 | 66.866 | 0.593 | 0.001 | 157.87 | 262.818 | 280.921 | 310.504 | 160.109 |
</details>
## Release Notes
We're constantly refining and improving our performance on AI
and HPC workloads, even on the same hardware, with frequent updates
to our software stack. For our latest performance data, refer
to these pages for
[AI](https://developer.nvidia.com/deep-learning-performance-training-inference)
and [HPC](https://developer.nvidia.com/hpc-application-performance) benchmarks.
### Changelog
July 2020
- Initial release
April 2021
- NVIDIA A100 results added
### Known issues
There are no known issues with this model.
View file
@ -0,0 +1,134 @@
#!/usr/bin/env python3
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
Using the `calculate_metrics.py` script, you can obtain model accuracy/error metrics using a user-defined `MetricsCalculator` class.
See the [documentation](https://gitlab-master.nvidia.com/dl/JoC/bermuda-api/-/blob/develop/bermuda_api_toolset/docs/metrics.md) on preparing this class.
Data provided to `MetricsCalculator` is obtained from [npz dump files](https://gitlab-master.nvidia.com/dl/JoC/bermuda-api/-/blob/develop/bermuda_api_toolset/docs/dump_files.md)
stored in the directory pointed to by the `--dump-dir` argument.
The above files are prepared by the `run_inference_on_fw.py` and `run_inference_on_triton.py` scripts.
Output data is stored in the csv file pointed to by the `--csv` argument.
Example call:
```shell script
python ./triton/calculate_metrics.py \
--dump-dir /results/dump_triton \
--csv /results/accuracy_results.csv \
--metrics metrics.py \
--metric-class-param1 value
```
"""
import argparse
import csv
import logging
import string
from pathlib import Path
import numpy as np
# method from PEP-366 to support relative import in executed modules
if __package__ is None:
__package__ = Path(__file__).parent.name
from .deployment_toolkit.args import ArgParserGenerator
from .deployment_toolkit.core import BaseMetricsCalculator, load_from_file
from .deployment_toolkit.dump import pad_except_batch_axis
LOGGER = logging.getLogger("calculate_metrics")
TOTAL_COLUMN_NAME = "_total_"
def get_data(dump_dir, prefix):
"""Loads and concatenates dump files for given prefix (ex. inputs, outputs, labels, ids)"""
dump_dir = Path(dump_dir)
npz_files = sorted(dump_dir.glob(f"{prefix}*.npz"))
data = None
if npz_files:
# assume that all npz files with given prefix contain same set of names
names = list(np.load(npz_files[0].as_posix()).keys())
# calculate target shape
target_shape = {
name: tuple(np.max([np.load(npz_file.as_posix())[name].shape for npz_file in npz_files], axis=0))
for name in names
}
# pad and concatenate data
data = {
name: np.concatenate(
[pad_except_batch_axis(np.load(npz_file.as_posix())[name], target_shape[name]) for npz_file in npz_files]
)
for name in names
}
return data
def main():
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description="Run models with given dataloader", allow_abbrev=False)
parser.add_argument("--metrics", help=f"Path to python module containing metrics calculator", required=True)
parser.add_argument("--csv", help="Path to csv file", required=True)
parser.add_argument("--dump-dir", help="Path to directory with dumped outputs (and labels)", required=True)
args, *_ = parser.parse_known_args()
MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator")
ArgParserGenerator(MetricsCalculator).update_argparser(parser)
args = parser.parse_args()
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator")
metrics_calculator: BaseMetricsCalculator = ArgParserGenerator(MetricsCalculator).from_args(args)
ids = get_data(args.dump_dir, "ids")["ids"]
x = get_data(args.dump_dir, "inputs")
y_true = get_data(args.dump_dir, "labels")
y_pred = get_data(args.dump_dir, "outputs")
common_keys = list({k for k in (y_true or [])} & {k for k in (y_pred or [])})
for key in common_keys:
if y_true[key].shape != y_pred[key].shape:
LOGGER.warning(
f"Model predictions and labels shall have equal shapes. "
f"y_pred[{key}].shape={y_pred[key].shape} != "
f"y_true[{key}].shape={y_true[key].shape}"
)
metrics = metrics_calculator.calc(ids=ids, x=x, y_pred=y_pred, y_real=y_true)
metrics = {TOTAL_COLUMN_NAME: len(ids), **metrics}
metric_names_with_space = [name for name in metrics if any([c in string.whitespace for c in name])]
if metric_names_with_space:
raise ValueError(f"Metric names shall have no spaces; Incorrect names: {', '.join(metric_names_with_space)}")
csv_path = Path(args.csv)
csv_path.parent.mkdir(parents=True, exist_ok=True)
with csv_path.open("w") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=list(metrics.keys()))
writer.writeheader()
writer.writerow(metrics)
if __name__ == "__main__":
main()
View file
@ -0,0 +1,193 @@
#!/usr/bin/env python3
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
To deploy a model on Triton, you can use the `deploy_model.py` script.
This will prepare the layout of the Model Repository, including the Model Configuration.
```shell script
python ./triton/deploy_model.py \
--model-repository /model_repository \
--model-path /models/exported/model.onnx \
--model-format onnx \
--model-name ResNet50 \
--model-version 1 \
--max-batch-size 32 \
--precision fp16 \
--backend-accelerator trt \
--load-model \
--timeout 120 \
--verbose
```
If the Triton server for which the model repository is prepared is running in **explicit model control mode**,
use the `--load-model` argument to send a load_model request to Triton Inference Server.
If the server is listening on a non-default address or port, use the `--server-url` argument to point to the server control endpoint.
If the HTTP protocol is required to communicate with the Triton server, use the `--http` argument.
To improve inference throughput you can use
[dynamic batching](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#dynamic-batcher)
for your model by providing `--preferred-batch-sizes` and `--max-queue-delay-us` parameters.
By default, Triton will [automatically obtain input and output definitions](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#auto-generated-model-configuration),
but for TorchScript models the script uses a file with I/O specs. This file is automatically generated
when the model is converted to a ScriptModule (either traced or scripted).
If a path other than the default I/O spec file needs to be passed, use the `--io-spec` CLI argument.
The I/O spec file is a YAML file with the structure below:
```yaml
- inputs:
- name: input
dtype: float32 # np.dtype name
shape: [None, 224, 224, 3]
- outputs:
- name: probabilities
dtype: float32
shape: [None, 1001]
- name: classes
dtype: int32
shape: [None, 1]
```
"""
import argparse
import logging
from service_maker import Accelerator, Format, Precision
from service_maker.args import str2bool
from service_maker.log import dump_arguments, set_logger
from service_maker.triton import ModelConfig, TritonClient, TritonModelStore
LOGGER = logging.getLogger("deploy_model")
def _available_enum_values(my_enum):
return [item.value for item in my_enum]
def main():
parser = argparse.ArgumentParser(
description="Create Triton model repository and model configuration", allow_abbrev=False
)
parser.add_argument("--model-repository", required=True, help="Path to Triton model repository.")
parser.add_argument("--model-path", required=True, help="Path to model to deploy")
# TODO: automation
parser.add_argument(
"--model-format",
required=True,
choices=_available_enum_values(Format),
help="Format of model to deploy",
)
parser.add_argument("--model-name", required=True, help="Model name")
parser.add_argument("--model-version", default="1", help="Version of model (default 1)")
parser.add_argument(
"--max-batch-size",
type=int,
default=32,
help="Maximum batch size allowed for inference. "
"A max_batch_size value of 0 indicates that batching is not allowed for the model",
)
# TODO: automation
parser.add_argument(
"--precision",
type=str,
default=Precision.FP16.value,
choices=_available_enum_values(Precision),
help="Model precision (parameter used only by Tensorflow backend with TensorRT optimization)",
)
# Triton Inference Server endpoint
parser.add_argument(
"--server-url",
type=str,
default="grpc://localhost:8001",
help="Inference server URL in format protocol://host[:port] (default grpc://localhost:8001)",
)
parser.add_argument(
"--load-model",
choices=["none", "poll", "explicit"],
help="Loading model while Triton Server is in given model control mode",
)
parser.add_argument(
"--timeout", default=120, help="Timeout in seconds to wait till model load (default=120)", type=int
)
# optimization related
parser.add_argument(
"--backend-accelerator",
type=str,
choices=_available_enum_values(Accelerator),
default=Accelerator.TRT.value,
help="Select Backend Accelerator used to serve model",
)
parser.add_argument("--number-of-model-instances", type=int, default=1, help="Number of model instances per GPU")
parser.add_argument(
"--preferred-batch-sizes",
type=int,
nargs="*",
help="Batch sizes that the dynamic batcher should attempt to create. "
"In case --max-queue-delay-us is set and this parameter is not, default value will be --max-batch-size",
)
parser.add_argument(
"--max-queue-delay-us",
type=int,
default=0,
help="Max delay time which dynamic batcher shall wait to form a batch (default 0)",
)
parser.add_argument(
"--capture-cuda-graph",
type=int,
default=0,
help="Use cuda capture graph (used only by TensorRT platform)",
)
parser.add_argument("-v", "--verbose", help="Provide verbose logs", type=str2bool, default=False)
args = parser.parse_args()
set_logger(verbose=args.verbose)
dump_arguments(args)
config = ModelConfig.create(
model_path=args.model_path,
# model definition
model_name=args.model_name,
model_version=args.model_version,
model_format=args.model_format,
precision=args.precision,
max_batch_size=args.max_batch_size,
# optimization
accelerator=args.backend_accelerator,
gpu_engine_count=args.number_of_model_instances,
preferred_batch_sizes=args.preferred_batch_sizes or [],
max_queue_delay_us=args.max_queue_delay_us,
capture_cuda_graph=args.capture_cuda_graph,
)
model_store = TritonModelStore(args.model_repository)
model_store.deploy_model(model_config=config, model_path=args.model_path)
if args.load_model != "none":
client = TritonClient(server_url=args.server_url, verbose=args.verbose)
if args.load_model == "explicit":
client.load_model(model_name=args.model_name)
client.wait_for_model(model_name=args.model_name, model_version=args.model_version, timeout_s=args.timeout)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,166 @@
#!/usr/bin/env python3
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
The `convert_model.py` script converts between model formats, with additional model optimizations
for faster inference.
It converts a model obtained from the [`get_model`](https://gitlab-master.nvidia.com/dl/JoC/bermuda-api/-/blob/develop/bermuda_api_toolset/docs/model.md) function.
Currently supported input and output formats are:
- inputs
- `tf-estimator` - `get_model` function returning Tensorflow Estimator
- `tf-keras` - `get_model` function returning Tensorflow Keras Model
- `tf-savedmodel` - Tensorflow SavedModel binary
- `pyt` - `get_model` function returning PyTorch Module
- output
- `tf-savedmodel` - Tensorflow saved model
- `tf-trt` - TF-TRT saved model
- `ts-trace` - PyTorch traced ScriptModule
- `ts-script` - PyTorch scripted ScriptModule
- `onnx` - ONNX
- `trt` - TensorRT plan file
For tf-keras input you can use:
- the `--large-model` flag - helps load a model which exceeds the maximum protobuf size of 2GB
- the `--tf-allow-growth` flag - controls the GPU memory growth limiting feature
(https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth). By default it is disabled.
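Example call (paths and values below are illustrative; see step 7 of the Quick Start Guide in triton/README.md for the full set of arguments used in this repository):
```shell script
python ./triton/convert_model.py \
    --input-path triton/rn50_model.py \
    --input-type tf-estimator \
    --output-path /results/model.savedmodel \
    --output-type tf-savedmodel \
    --precision fp16 \
    --model-dir /results/checkpoint \
    --dataloader triton/dataloader.py \
    --data-dir /data/imagenet
```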
"""
import argparse
import logging
import os
from pathlib import Path
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "1"
# method from PEP-366 to support relative import in executed modules
if __name__ == "__main__" and __package__ is None:
__package__ = Path(__file__).parent.name
from .deployment_toolkit.args import ArgParserGenerator
from .deployment_toolkit.core import (
DATALOADER_FN_NAME,
BaseConverter,
BaseLoader,
BaseSaver,
Format,
Precision,
load_from_file,
)
from .deployment_toolkit.extensions import converters, loaders, savers
LOGGER = logging.getLogger("convert_model")
INPUT_MODEL_TYPES = [Format.TF_ESTIMATOR, Format.TF_KERAS, Format.TF_SAVEDMODEL, Format.PYT]
OUTPUT_MODEL_TYPES = [Format.TF_SAVEDMODEL, Format.TF_TRT, Format.ONNX, Format.TRT, Format.TS_TRACE, Format.TS_SCRIPT]
def _get_args():
parser = argparse.ArgumentParser(description="Script for conversion between model formats.", allow_abbrev=False)
parser.add_argument("--input-path", help="Path to input model file (python module or binary file)", required=True)
parser.add_argument(
"--input-type", help="Input model type", choices=[f.value for f in INPUT_MODEL_TYPES], required=True
)
parser.add_argument("--output-path", help="Path to output model file", required=True)
parser.add_argument(
"--output-type", help="Output model type", choices=[f.value for f in OUTPUT_MODEL_TYPES], required=True
)
parser.add_argument("--dataloader", help="Path to python module containing data loader")
parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
parser.add_argument(
"--ignore-unknown-parameters",
help="Ignore unknown parameters (argument often used in CI where set of arguments is constant)",
action="store_true",
default=False,
)
args, unparsed_args = parser.parse_known_args()
Loader: BaseLoader = loaders.get(args.input_type)
ArgParserGenerator(Loader, module_path=args.input_path).update_argparser(parser)
converter_name = f"{args.input_type}--{args.output_type}"
Converter: BaseConverter = converters.get(converter_name)
if Converter is not None:
ArgParserGenerator(Converter).update_argparser(parser)
Saver: BaseSaver = savers.get(args.output_type)
ArgParserGenerator(Saver).update_argparser(parser)
if args.dataloader is not None:
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
ArgParserGenerator(get_dataloader_fn).update_argparser(parser)
if args.ignore_unknown_parameters:
args, unknown_args = parser.parse_known_args()
LOGGER.warning(f"Got additional args {unknown_args}")
else:
args = parser.parse_args()
return args
def main():
args = _get_args()
log_level = logging.INFO if not args.verbose else logging.DEBUG
log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
logging.basicConfig(level=log_level, format=log_format)
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
requested_model_precision = Precision(args.precision)
dataloader_fn = None
# if conversion is required, temporarily change the model load precision to that required by the converter
# this is for TensorRT converters which require fp32 models for all requested precisions
converter_name = f"{args.input_type}--{args.output_type}"
Converter: BaseConverter = converters.get(converter_name)
if Converter:
args.precision = Converter.required_source_model_precision(requested_model_precision).value
Loader: BaseLoader = loaders.get(args.input_type)
loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args)
model = loader.load(args.input_path)
LOGGER.info("inputs: %s", model.inputs)
LOGGER.info("outputs: %s", model.outputs)
if Converter: # if conversion is needed
# dataloader must match the source model precision - so not recovering it yet
if args.dataloader is not None:
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)
# recover precision to that requested by user
args.precision = requested_model_precision.value
if Converter:
converter = ArgParserGenerator(Converter).from_args(args)
model = converter.convert(model, dataloader_fn=dataloader_fn)
Saver: BaseSaver = savers.get(args.output_type)
saver = ArgParserGenerator(Saver).from_args(args)
saver.save(model, args.output_path)
return 0
if __name__ == "__main__":
main()
View file
@ -0,0 +1,45 @@
import logging
from pathlib import Path
import numpy as np
from PIL import Image
from rn50_model import HEIGHT, WIDTH
LOGGER = logging.getLogger(__name__)
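# get_dataloader_fn returns a generator factory: each yielded batch is a tuple (ids, x, y_real),
# where ids are the image paths, x is {"input": NHWC float32 images} and y_real is
# {"classes": class ids taken from the parent directory name}. Images that do not fill
# a complete final batch are dropped.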
def get_dataloader_fn(
*, data_dir: str, batch_size: int = 1, width: int = WIDTH, height: int = HEIGHT, images_num: int = None
):
image_extensions = [".gif", ".png", ".jpeg", ".jpg"]
image_paths = sorted([p for p in Path(data_dir).rglob("*") if p.suffix.lower() in image_extensions])
if images_num is not None:
image_paths = image_paths[:images_num]
LOGGER.info(
f"Creating PIL dataloader on data_dir={data_dir} #images={len(image_paths)} "
f"image_size=({width}, {height}) batch_size={batch_size}"
)
def _dataloader_fn():
batch = []
for image_path in image_paths:
img = Image.open(image_path.as_posix()).convert('RGB')
img = img.resize((width, height))
img = np.array(img).astype(np.float32)
true_class = np.array([int(image_path.parent.name)])
assert tuple(img.shape) == (height, width, 3)
img = img[np.newaxis, ...]
batch.append((img, image_path.as_posix(), true_class))
if len(batch) >= batch_size:
ids = [image_path for _, image_path, *_ in batch]
x = {
"input": np.concatenate([img for img, *_ in batch]),
}
y_real = {"classes": np.concatenate([class_ for *_, class_ in batch])}
batch = []
yield ids, x, y_real
return _dataloader_fn
View file
@ -0,0 +1 @@
0.4.6-46-g5bc739c
View file
@ -0,0 +1,110 @@
import argparse
import inspect
import logging
from typing import Callable, Dict, Optional, Union
from .core import GET_ARGPARSER_FN_NAME, load_from_file
LOGGER = logging.getLogger(__name__)
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
elif v.lower() in ("no", "false", "f", "n", "0"):
return False
else:
raise argparse.ArgumentTypeError("Boolean value expected.")
def filter_fn_args(args: Union[dict, argparse.Namespace], fn: Callable) -> dict:
signature = inspect.signature(fn)
parameters_names = list(signature.parameters)
if isinstance(args, argparse.Namespace):
args = vars(args)
args = {k: v for k, v in args.items() if k in parameters_names}
return args
def add_args_for_fn_signature(parser, fn) -> argparse.ArgumentParser:
parser.conflict_handler = "resolve"
signature = inspect.signature(fn)
for parameter in signature.parameters.values():
if parameter.name in ["self", "args", "kwargs"]:
continue
argument_kwargs = {}
if parameter.annotation != inspect.Parameter.empty:
if parameter.annotation == bool:
argument_kwargs["type"] = str2bool
argument_kwargs["choices"] = [0, 1]
elif type(parameter.annotation) == type(Union):
types = [type_ for type_ in parameter.annotation.__args__ if not isinstance(None, type_)]
if len(types) != 1:
raise RuntimeError(
f"Could not prepare argument parser for {parameter.name}: {parameter.annotation} in {fn}"
)
argument_kwargs["type"] = types[0]
else:
argument_kwargs["type"] = parameter.annotation
if parameter.default != inspect.Parameter.empty:
if parameter.annotation == bool:
argument_kwargs["default"] = str2bool(parameter.default)
else:
argument_kwargs["default"] = parameter.default
else:
argument_kwargs["required"] = True
name = parameter.name.replace("_", "-")
LOGGER.debug(f"Adding argument {name} with {argument_kwargs}")
parser.add_argument(f"--{name}", **argument_kwargs)
return parser
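# ArgParserGenerator inspects a function's (or class __init__'s) signature, adds matching
# CLI arguments to an argparse parser (update_argparser), and later instantiates or calls
# the target with the parsed values (from_args).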
class ArgParserGenerator:
def __init__(self, cls_or_fn, module_path: Optional[str] = None):
self._cls_or_fn = cls_or_fn
self._handle = cls_or_fn if inspect.isfunction(cls_or_fn) else getattr(cls_or_fn, "__init__")
input_is_python_file = module_path and module_path.endswith(".py")
self._input_path = module_path if input_is_python_file else None
self._required_fn_name_for_signature_parsing = getattr(
cls_or_fn, "required_fn_name_for_signature_parsing", None
)
def update_argparser(self, parser):
name = self._handle.__name__
group_parser = parser.add_argument_group(name)
add_args_for_fn_signature(group_parser, fn=self._handle)
self._update_argparser(group_parser)
def get_args(self, args: argparse.Namespace):
filtered_args = filter_fn_args(args, fn=self._handle)
tmp_parser = argparse.ArgumentParser(allow_abbrev=False)
self._update_argparser(tmp_parser)
custom_names = [
p.dest.replace("-", "_") for p in tmp_parser._actions if not isinstance(p, argparse._HelpAction)
]
custom_params = {n: getattr(args, n) for n in custom_names}
filtered_args = {**filtered_args, **custom_params}
return filtered_args
def from_args(self, args: Union[argparse.Namespace, Dict]):
args = self.get_args(args)
LOGGER.info(f"Initializing {self._cls_or_fn.__name__}({args})")
return self._cls_or_fn(**args)
def _update_argparser(self, parser):
label = "argparser_update"
if self._input_path:
update_argparser_handle = load_from_file(self._input_path, label=label, target=GET_ARGPARSER_FN_NAME)
if update_argparser_handle:
update_argparser_handle(parser)
elif self._required_fn_name_for_signature_parsing:
fn_handle = load_from_file(
self._input_path, label=label, target=self._required_fn_name_for_signature_parsing
)
if fn_handle:
add_args_for_fn_signature(parser, fn_handle)
View file
@ -0,0 +1,223 @@
import logging
from pathlib import Path
from typing import Dict, Optional, Union
import numpy as np
# pytype: disable=import-error
import onnx
import onnx.optimizer
import onnx.shape_inference
import onnxruntime
from google.protobuf import text_format
from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
# pytype: enable=import-error
from ..core import BaseLoader, BaseRunner, BaseRunnerSession, BaseSaver, Format, Model, Precision, TensorSpec
from ..extensions import loaders, runners, savers
from .utils import infer_precision
LOGGER = logging.getLogger(__name__)
def _value_info2tensor_spec(value_info: onnx.ValueInfoProto):
onnx_data_type_map = {"float": "float32", "double": "float64"}
elem_type_name = onnx.TensorProto.DataType.Name(value_info.type.tensor_type.elem_type).lower()
dtype = onnx_data_type_map.get(elem_type_name, elem_type_name)
def _get_dim(dim):
which = dim.WhichOneof("value")
if which is not None: # which is None when dim is None
dim = getattr(dim, which)
return None if isinstance(dim, (str, bytes)) else dim
shape = value_info.type.tensor_type.shape
shape = tuple([_get_dim(d) for d in shape.dim])
return TensorSpec(value_info.name, dtype=dtype, shape=shape)
def _infer_graph_precision(onnx_graph: onnx.GraphProto) -> Optional[Precision]:
import networkx as nx
# build directed graph
nx_graph = nx.DiGraph()
def _get_dtype(vi):
t = vi.type
if hasattr(t, "tensor_type"):
type_id = t.tensor_type.elem_type
else:
raise NotImplementedError("Not implemented yet")
return TENSOR_TYPE_TO_NP_TYPE[type_id]
node_output2type = {vi.name: _get_dtype(vi) for vi in onnx_graph.value_info}
node_outputs2node = {output_name: node for node in onnx_graph.node for output_name in node.output}
node_inputs2node = {input_name: node for node in onnx_graph.node for input_name in node.input}
for node in onnx_graph.node:
node_dtype = node_output2type.get("+".join(node.output), None)
nx_graph.add_node(
node.name,
op=node.op_type,
attr={a.name: a for a in node.attribute},
dtype=node_dtype,
)
for input_name in node.input:
prev_node = node_outputs2node.get(input_name, None)
if prev_node:
nx_graph.add_edge(prev_node.name, node.name)
for input_node in onnx_graph.input:
input_name = input_node.name
nx_graph.add_node(input_name, op="input", dtype=_get_dtype(input_node))
next_node = node_inputs2node.get(input_name, None)
if next_node:
nx_graph.add_edge(input_name, next_node.name)
for output in onnx_graph.output:
output_name = output.name
nx_graph.add_node(output_name, op="output", dtype=_get_dtype(output))
prev_node = node_outputs2node.get(output_name, None)
if prev_node:
nx_graph.add_edge(prev_node.name, output_name)
else:
LOGGER.warning(f"Could not find previous node for {output_name}")
input_names = [n.name for n in onnx_graph.input]
output_names = [n.name for n in onnx_graph.output]
most_common_dtype = infer_precision(nx_graph, input_names, output_names, lambda node: node.get("dtype", None))
if most_common_dtype is not None:
precision = {np.dtype("float32"): Precision.FP32, np.dtype("float16"): Precision.FP16}[most_common_dtype]
else:
precision = None
return precision
class OnnxLoader(BaseLoader):
def load(self, model_path: Union[str, Path], **_) -> Model:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
model = onnx.load(model_path)
onnx.checker.check_model(model)
onnx.helper.strip_doc_string(model)
model = onnx.shape_inference.infer_shapes(model)
# TODO: modifying the ONNX model's inputs/outputs probably causes an error during optimization
# from onnx.utils import polish_model
# model = polish_model(model) # run checker, docs strip, optimizer and shape inference
inputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.input}
outputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.output}
precision = _infer_graph_precision(model.graph)
return Model(model, precision, inputs, outputs)
class OnnxSaver(BaseSaver):
def __init__(self, as_text: bool = False):
self._as_text = as_text
def save(self, model: Model, model_path: Union[str, Path]) -> None:
model_path = Path(model_path)
LOGGER.debug(f"Saving ONNX model to {model_path.as_posix()}")
model_path.parent.mkdir(parents=True, exist_ok=True)
onnx_model: onnx.ModelProto = model.handle
if self._as_text:
with model_path.open("w") as f:
f.write(text_format.MessageToString(onnx_model))
else:
with model_path.open("wb") as f:
f.write(onnx_model.SerializeToString())
"""
ExecutionProviders on onnxruntime 1.4.0
['TensorrtExecutionProvider',
'CUDAExecutionProvider',
'MIGraphXExecutionProvider',
'NGRAPHExecutionProvider',
'OpenVINOExecutionProvider',
'DnnlExecutionProvider',
'NupharExecutionProvider',
'VitisAIExecutionProvider',
'ArmNNExecutionProvider',
'ACLExecutionProvider',
'CPUExecutionProvider']
"""
def _check_providers(providers):
providers = providers or []
if not isinstance(providers, (list, tuple)):
providers = [providers]
available_providers = onnxruntime.get_available_providers()
unavailable = set(providers) - set(available_providers)
if unavailable:
raise RuntimeError(f"Unavailable providers {unavailable}")
return providers
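# Illustrative check (provider names come from the onnxruntime list above):
#   _check_providers("CUDAExecutionProvider")   # -> ["CUDAExecutionProvider"] when available
#   _check_providers(["FooExecutionProvider"])  # -> raises RuntimeError if unavailable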
class OnnxRunner(BaseRunner):
def __init__(self, verbose_runtime_logs: bool = False):
self._providers = None
self._verbose_runtime_logs = verbose_runtime_logs
def init_inference(self, model: Model):
assert isinstance(model.handle, onnx.ModelProto)
return OnnxRunnerSession(
model=model, providers=self._providers, verbose_runtime_logs=self._verbose_runtime_logs
)
class OnnxRunnerSession(BaseRunnerSession):
def __init__(self, model: Model, providers, verbose_runtime_logs: bool = False):
super().__init__(model)
self._input_names = None
self._output_names = None
self._session = None
self._providers = providers
self._verbose_runtime_logs = verbose_runtime_logs
self._old_env_values = {}
def __enter__(self):
self._old_env_values = self._set_env_variables()
sess_options = onnxruntime.SessionOptions() # default session options
if self._verbose_runtime_logs:
sess_options.log_severity_level = 0
sess_options.log_verbosity_level = 1
LOGGER.info(
f"Starting inference session for onnx model providers={self._providers} sess_options={sess_options}"
)
self._input_names = list(self._model.inputs)
self._output_names = list(self._model.outputs)
model_payload = self._model.handle.SerializeToString()
self._session = onnxruntime.InferenceSession(
model_payload, providers=self._providers, sess_options=sess_options
)
return self
def __exit__(self, exc_type, exc_value, traceback):
self._input_names = None
self._output_names = None
self._session = None
self._recover_env_variables(self._old_env_values)
def __call__(self, x: Dict[str, object]):
feed_dict = {k: x[k] for k in self._input_names}
y_pred = self._session.run(self._output_names, feed_dict)
y_pred = dict(zip(self._output_names, y_pred))
return y_pred
loaders.register_extension(Format.ONNX.value, OnnxLoader)
runners.register_extension(Format.ONNX.value, OnnxRunner)
savers.register_extension(Format.ONNX.value, OnnxSaver)

View file

@ -0,0 +1,100 @@
import logging
from typing import Dict, Iterable, Optional
# pytype: disable=import-error
import onnx
import tensorrt as trt
from ..core import BaseConverter, Format, Model, Precision, ShapeSpec
from ..extensions import converters
from .utils import get_input_shapes
# pytype: enable=import-error
LOGGER = logging.getLogger(__name__)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
class Onnx2TRTConverter(BaseConverter):
def __init__(self, *, max_batch_size: int, max_workspace_size: int, precision: str):
self._max_batch_size = max_batch_size
self._max_workspace_size = max_workspace_size
self._precision = Precision(precision)
def convert(self, model: Model, dataloader_fn) -> Model:
input_shapes = get_input_shapes(dataloader_fn(), self._max_batch_size)
cuda_engine = onnx2trt(
model.handle,
shapes=input_shapes,
max_workspace_size=self._max_workspace_size,
max_batch_size=self._max_batch_size,
model_precision=self._precision.value,
)
return model._replace(handle=cuda_engine)
@staticmethod
def required_source_model_precision(requested_model_precision: Precision) -> Precision:
# TensorRT requires source models to be in FP32 precision
return Precision.FP32
def onnx2trt(
onnx_model: onnx.ModelProto,
*,
shapes: Dict[str, ShapeSpec],
max_workspace_size: int,
max_batch_size: int,
model_precision: str,
) -> "trt.ICudaEngine":
"""
Converts onnx model to TensorRT ICudaEngine
Args:
onnx_model: onnx.Model to convert
shapes: dictionary containing min shape, max shape, opt shape for each input name
max_workspace_size: The maximum GPU temporary memory which the CudaEngine can use at execution time.
max_batch_size: The maximum batch size which can be used at execution time,
and also the batch size for which the CudaEngine will be optimized.
model_precision: precision of kernels (possible values: fp16, fp32)
Returns: TensorRT ICudaEngine
"""
# Whether or not 16-bit kernels are permitted.
# During :class:`ICudaEngine` build fp16 kernels will also be tried when this mode is enabled.
fp16_mode = "16" in model_precision
builder = trt.Builder(TRT_LOGGER)
builder.fp16_mode = fp16_mode
builder.max_batch_size = max_batch_size
builder.max_workspace_size = max_workspace_size
# In TensorRT 7.0, the ONNX parser only supports full-dimensions mode,
# meaning that your network definition must be created with the explicitBatch flag set.
# For more information, see
# https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work_dynamic_shapes
flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(flags)
with trt.OnnxParser(network, TRT_LOGGER) as parser:
# onnx model parsing
if not parser.parse(onnx_model.SerializeToString()):
for i in range(parser.num_errors):
LOGGER.error(f"OnnxParser error {i}/{parser.num_errors}: {parser.get_error(i)}")
raise RuntimeError("Error during parsing ONNX model (see logs for details)")
# optimization
config = builder.create_builder_config()
config.flags |= bool(fp16_mode) << int(trt.BuilderFlag.FP16)
config.max_workspace_size = max_workspace_size
profile = builder.create_optimization_profile()
for name, spec in shapes.items():
profile.set_shape(name, **spec._asdict())
config.add_optimization_profile(profile)
engine = builder.build_engine(network, config=config)
return engine
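# Illustrative call (the input name and shapes below are assumptions, not taken from this repository):
#   shapes = {"input": ShapeSpec(min=(1, 224, 224, 3), opt=(32, 224, 224, 3), max=(32, 224, 224, 3))}
#   engine = onnx2trt(onnx_model, shapes=shapes, max_workspace_size=(4 << 30),
#                     max_batch_size=32, model_precision="fp16")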
converters.register_extension(f"{Format.ONNX.value}--{Format.TRT.value}", Onnx2TRTConverter)

View file

@ -0,0 +1,202 @@
import logging
import sys
from pathlib import Path
from typing import Dict, NamedTuple, Optional, Union
import numpy as np
# pytype: disable=import-error
try:
import pycuda.autoinit
import pycuda.driver as cuda
except (ImportError, Exception) as e:
logging.getLogger(__name__).debug(f"Problems with importing pycuda package; {e}")
# pytype: enable=import-error
import tensorrt as trt # pytype: disable=import-error
from ..core import BaseLoader, BaseRunner, BaseRunnerSession, BaseSaver, Format, Model, Precision, TensorSpec
from ..extensions import loaders, runners, savers
LOGGER = logging.getLogger(__name__)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
"""
documentation:
https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html
https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_samples_section
"""
class TensorRTLoader(BaseLoader):
def load(self, model_path: Union[str, Path], **_) -> Model:
model_path = Path(model_path)
LOGGER.debug(f"Loading TensorRT engine from {model_path}")
with model_path.open("rb") as fh, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(fh.read())
if engine is None:
raise RuntimeError(f"Could not load ICudaEngine from {model_path}")
inputs = {}
outputs = {}
for binding_idx in range(engine.num_bindings):
name = engine.get_binding_name(binding_idx)
is_input = engine.binding_is_input(binding_idx)
dtype = engine.get_binding_dtype(binding_idx)
shape = engine.get_binding_shape(binding_idx)
if is_input:
inputs[name] = TensorSpec(name, dtype, shape)
else:
outputs[name] = TensorSpec(name, dtype, shape)
return Model(engine, None, inputs, outputs)
class TensorRTSaver(BaseSaver):
def __init__(self):
pass
def save(self, model: Model, model_path: Union[str, Path]) -> None:
model_path = Path(model_path)
LOGGER.debug(f"Saving TensorRT engine to {model_path.as_posix()}")
model_path.parent.mkdir(parents=True, exist_ok=True)
engine: "trt.ICudaEngine" = model.handle
with model_path.open("wb") as fh:
fh.write(engine.serialize())
class TRTBuffers(NamedTuple):
x_host: Optional[Dict[str, object]]
x_dev: Dict[str, object]
y_pred_host: Dict[str, object]
y_pred_dev: Dict[str, object]
class TensorRTRunner(BaseRunner):
def __init__(self):
pass
def init_inference(self, model: Model):
return TensorRTRunnerSession(model=model)
class TensorRTRunnerSession(BaseRunnerSession):
def __init__(self, model: Model):
super().__init__(model)
assert isinstance(model.handle, trt.ICudaEngine)
self._model = model
self._has_dynamic_shapes = None
self._context = None
self._engine: trt.ICudaEngine = self._model.handle
self._cuda_context = pycuda.autoinit.context
self._input_names = None
self._output_names = None
self._buffers = None
def __enter__(self):
self._context = self._engine.create_execution_context()
self._context.__enter__()
self._input_names = [
self._engine[idx] for idx in range(self._engine.num_bindings) if self._engine.binding_is_input(idx)
]
self._output_names = [
self._engine[idx] for idx in range(self._engine.num_bindings) if not self._engine.binding_is_input(idx)
]
# all_binding_shapes_specified is True for models without dynamic shapes
# so initially this variable is False for models with dynamic shapes
self._has_dynamic_shapes = not self._context.all_binding_shapes_specified
return self
def __exit__(self, exc_type, exc_value, traceback):
self._context.__exit__(exc_type, exc_value, traceback)
self._input_names = None
self._output_names = None
# TODO: are CUDA buffers deallocated automatically?
self._buffers = None
def __call__(self, x):
buffers = self._prepare_buffers_if_needed(x)
bindings = self._update_bindings(buffers)
for name in self._input_names:
cuda.memcpy_htod(buffers.x_dev[name], buffers.x_host[name])
self._cuda_context.push()
self._context.execute_v2(bindings=bindings)
self._cuda_context.pop()
for name in self._output_names:
cuda.memcpy_dtoh(buffers.y_pred_host[name], buffers.y_pred_dev[name])
return buffers.y_pred_host
def _update_bindings(self, buffers: TRTBuffers):
bindings = [None] * self._engine.num_bindings
for name in buffers.y_pred_dev:
binding_idx: int = self._engine[name]
bindings[binding_idx] = buffers.y_pred_dev[name]
for name in buffers.x_dev:
binding_idx: int = self._engine[name]
bindings[binding_idx] = buffers.x_dev[name]
return bindings
def _set_dynamic_input_shapes(self, x_host):
def _is_shape_dynamic(input_shape):
return any([dim is None or dim == -1 for dim in input_shape])
for name in self._input_names:
bindings_idx = self._engine[name]
data_shape = x_host[name].shape # pytype: disable=attribute-error
if self._engine.is_shape_binding(bindings_idx):
input_shape = self._context.get_shape(bindings_idx)
if _is_shape_dynamic(input_shape):
self._context.set_shape_input(bindings_idx, data_shape)
else:
input_shape = self._engine.get_binding_shape(bindings_idx)
if _is_shape_dynamic(input_shape):
self._context.set_binding_shape(bindings_idx, data_shape)
assert self._context.all_binding_shapes_specified and self._context.all_shape_inputs_specified
def _prepare_buffers_if_needed(self, x_host: Dict[str, object]):
# pytype: disable=attribute-error
new_batch_size = list(x_host.values())[0].shape[0]
current_batch_size = list(self._buffers.y_pred_host.values())[0].shape[0] if self._buffers else 0
# pytype: enable=attribute-error
if self._has_dynamic_shapes or new_batch_size != current_batch_size:
# TODO: are CUDA buffers deallocated automatically?
self._set_dynamic_input_shapes(x_host)
y_pred_host = {}
for name in self._output_names:
shape = self._context.get_binding_shape(self._engine[name])
y_pred_host[name] = np.zeros(shape, dtype=trt.nptype(self._model.outputs[name].dtype))
y_pred_dev = {name: cuda.mem_alloc(data.nbytes) for name, data in y_pred_host.items()}
x_dev = {
name: cuda.mem_alloc(host_input.nbytes)
for name, host_input in x_host.items()
if name in self._input_names # pytype: disable=attribute-error
}
self._buffers = TRTBuffers(None, x_dev, y_pred_host, y_pred_dev)
return self._buffers._replace(x_host=x_host)
if "pycuda.driver" in sys.modules:
loaders.register_extension(Format.TRT.value, TensorRTLoader)
runners.register_extension(Format.TRT.value, TensorRTRunner)
savers.register_extension(Format.TRT.value, TensorRTSaver)
else:
LOGGER.debug("Do not register TensorRT extension due problems with importing pycuda.driver package.")

View file

@ -0,0 +1,535 @@
import logging
from pathlib import Path
from typing import Dict, Iterable, Optional, Tuple, Union
import numpy as np
# pytype: disable=import-error
import tensorflow as tf
from tensorflow.python.eager import wrap_function
from tf2onnx.shape_inference import infer_shape
from tf2onnx.tf_loader import (
freeze_session,
from_function,
inputs_without_resource,
is_function,
remove_redundant_inputs,
tf_optimize,
)
# pytype: enable=import-error
from ..args import filter_fn_args
from ..core import (
GET_MODEL_FN_NAME,
GET_SERVING_INPUT_RECEIVER_FN,
BaseConverter,
BaseLoader,
BaseRunner,
BaseRunnerSession,
BaseSaver,
Format,
Model,
Precision,
TensorSpec,
load_from_file,
)
from ..extensions import converters, loaders, runners, savers
from .utils import infer_precision
LOGGER = logging.getLogger(__name__)
def is_tf2():
return tf.__version__.startswith("2.")
def create_session_config(*, allow_growth=False, use_xla=False, gpu_memory_fraction=1.0):
gpu_options = tf.compat.v1.GPUOptions(
per_process_gpu_memory_fraction=gpu_memory_fraction, allow_growth=allow_growth
)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
if use_xla:
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
LOGGER.debug(
f"Using gpu memory fraction: allow_growth={allow_growth} "
f"gpu_memory_fraction={gpu_memory_fraction} "
f"use_xla={use_xla}"
)
return config
class TFTRTConverter(BaseConverter):
def __init__(
self,
*,
is_dynamic_op: bool = False,
minimum_segment_size: int = 3,
max_batch_size: int = 1,
max_workspace_size: int = (4 << 30) - 1000, # ~3.999GB
maximum_cached_engines: int = 1000,
precision: str,
):
self._is_dynamic_op = is_dynamic_op
self._minimum_segment_size = minimum_segment_size
self._max_batch_size = max_batch_size
self._max_workspace_size = max_workspace_size
self._maximum_cached_engines = maximum_cached_engines
self._precision = Precision(precision)
def convert(self, model: Model, dataloader_fn) -> Model:
# https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html
# converting graph_def is not supported in TF2
from tensorflow.python.compiler.tensorrt import trt_convert # pytype: disable=import-error
assert isinstance(model.handle, tf.compat.v1.GraphDef)
session_config = create_session_config(allow_growth=True)
output_node_names = [spec.name.split(":")[0] for spec in model.outputs.values()]
converter = trt_convert.TrtGraphConverter(
input_graph_def=model.handle,
session_config=session_config,
nodes_blacklist=output_node_names,
is_dynamic_op=self._is_dynamic_op,
precision_mode=self._precision.value,
max_workspace_size_bytes=self._max_workspace_size,
maximum_cached_engines=self._maximum_cached_engines,
max_batch_size=self._max_batch_size,
minimum_segment_size=self._minimum_segment_size,
)
graph_def = converter.convert()
return model._replace(handle=graph_def)
@staticmethod
def required_source_model_precision(requested_model_precision: Precision) -> Precision:
# TensorRT requires source models to be in FP32 precision
return Precision.FP32
def _from_saved_model_v1(sess, model_path, tag, signatures):
"""
Load tensorflow graph from saved_model.
NOTICE: Modified version from tf2onnx project
"""
wrn_no_tag = "'--tag' not specified for saved_model. Using --tag serve"
wrn_empty_tag = "'--tag' value is empty string. Using tag =[[]]"
if tag is None:
tag = [tf.saved_model.SERVING]
LOGGER.warning(wrn_no_tag)
if tag == "":
tag = [[]]
LOGGER.warning(wrn_empty_tag)
if not isinstance(tag, list):
tag = [tag]
imported = tf.compat.v1.saved_model.loader.load(sess, tag, model_path)
for k in imported.signature_def.keys():
if k.startswith("_"):
# consider signatures starting with '_' private
continue
signatures.append(k)
try:
from tensorflow.contrib.saved_model.python.saved_model import ( # pytype: disable=import-error
signature_def_utils,
)
# pylint: disable=unnecessary-lambda
get_signature_def = lambda meta_graph_def, k: signature_def_utils.get_signature_def_by_key(meta_graph_def, k)
except ImportError:
# TF1.12 changed the api
get_signature_def = lambda meta_graph_def, k: meta_graph_def.signature_def[k]
inputs = {}
outputs = {}
for k in signatures:
inputs_tensor_info = get_signature_def(imported, k).inputs
for name, input_tensor in inputs_tensor_info.items():
inputs[name] = input_tensor.name
outputs_tensor_info = get_signature_def(imported, k).outputs
for name, output_tensor in outputs_tensor_info.items():
outputs[name] = output_tensor.name
frozen_graph = freeze_session(sess, input_names=list(inputs.values()), output_names=list(outputs.values()))
return frozen_graph, inputs, outputs
def _infer_model_precision(
tf_graph: tf.compat.v1.GraphDef, inputs_dict: Dict[str, TensorSpec], outputs_dict: Dict[str, TensorSpec]
) -> Optional[Precision]:
import networkx as nx
def _get_dtype(node_def):
node_type = node_def.attr.get("T", None) or node_def.attr.get("dtype", None)
if node_type:
if node_type.list.type:
assert len(set(node_type.list.type)) == 1
node_type = tf.dtypes.DType(node_type.list.type[0])
else:
node_type = tf.dtypes.DType(node_type.type)
return np.dtype(node_type.as_numpy_dtype()) if node_type and node_type.is_numpy_compatible else node_type
# build directed graph
nx_graph = nx.DiGraph()
for node_def in tf_graph.node:
nx_graph.add_node(
node_def.name,
op=node_def.op,
**{key: value for key, value in node_def.attr.items() if key not in ["value", "dtype"]},
dtype=_get_dtype(node_def),
)
for input in node_def.input:
nx_graph.add_edge(input, node_def.name)
input_names = [spec.name.split(":")[0] for spec in inputs_dict.values()]
output_names = [spec.name.split(":")[0] for spec in outputs_dict.values()]
most_common_dtype = infer_precision(nx_graph, input_names, output_names, _get_dtype)
if most_common_dtype is not None:
precision = {np.dtype("float32"): Precision.FP32, np.dtype("float16"): Precision.FP16}[most_common_dtype]
else:
precision = None
return precision
class TFEstimatorLoader(BaseLoader):
required_fn_name_for_signature_parsing: Optional[str] = GET_MODEL_FN_NAME
def __init__(self, **kwargs):
self._model_args = kwargs
def load(self, model_path: Union[str, Path], **_) -> Model:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
get_model = load_from_file(model_path, "model", GET_MODEL_FN_NAME)
get_serving_input_receiver_fn = load_from_file(model_path, "model", GET_SERVING_INPUT_RECEIVER_FN)
if get_model is None:
raise RuntimeError(f"Could not find {GET_MODEL_FN_NAME} in {model_path}")
if get_serving_input_receiver_fn is None:
raise RuntimeError(f"Could not find {GET_SERVING_INPUT_RECEIVER_FN} in {model_path}")
model_args = filter_fn_args(self._model_args, fn=get_model)
serving_input_receiver_args = filter_fn_args(self._model_args, fn=get_serving_input_receiver_fn)
session_config = create_session_config(allow_growth=True)
tf.compat.v1.reset_default_graph()
with tf.compat.v1.Session(config=session_config) as sess:
estimator = get_model(**model_args)
serving_input_receiver_fn = get_serving_input_receiver_fn(**serving_input_receiver_args)
input_receiver = serving_input_receiver_fn()
estimator_spec = estimator.model_fn(
features=input_receiver.features,
labels=None,
mode=tf.estimator.ModeKeys.PREDICT,
config=estimator.config,
)
input_tensors_dict = input_receiver.receiver_tensors
output_tensors_dict = estimator_spec.predictions
inputs_dict = {k: tensor2tensor_spec(tensor) for k, tensor in input_tensors_dict.items()}
outputs_dict = {k: tensor2tensor_spec(tensor) for k, tensor in output_tensors_dict.items()}
input_tensor_names = [t.name for t in inputs_dict.values()]
output_tensor_names = [t.name for t in outputs_dict.values()]
graph_saver = estimator_spec.scaffold.saver or tf.compat.v1.train.Saver(sharded=True)
graph_saver.restore(sess, estimator.latest_checkpoint())
input_tensor_names = inputs_without_resource(sess, input_tensor_names)
frozen_graph = freeze_session(sess, input_names=input_tensor_names, output_names=output_tensor_names)
input_tensor_names = remove_redundant_inputs(frozen_graph, input_tensor_names)
tf.compat.v1.reset_default_graph()
with tf.compat.v1.Session(config=estimator.config.session_config):
frozen_graph = tf_optimize(input_tensor_names, output_tensor_names, frozen_graph)
tf.compat.v1.reset_default_graph()
precision = _infer_model_precision(frozen_graph, inputs_dict, outputs_dict)
return Model(frozen_graph, precision, inputs_dict, outputs_dict)
class TFKerasLoader(BaseLoader):
"""
Loads a Keras model from source code.
The large-model flag helps load models which exceed the maximum protobuf size of 2GB. It is disabled by default.
The tf-allow-growth flag controls the GPU memory growth limiting feature
(https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth). It is disabled by default.
"""
required_fn_name_for_signature_parsing: Optional[str] = GET_MODEL_FN_NAME
def __init__(self, large_model: bool = False, tf_allow_growth: bool = False, **kwargs):
self._large_model = large_model
self._allow_growth = tf_allow_growth
self._model_args = kwargs
def load(self, model_path: Union[str, Path], **_) -> Model:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
get_model = load_from_file(model_path, "model", GET_MODEL_FN_NAME)
if get_model is None:
raise RuntimeError(f"Could not find {GET_MODEL_FN_NAME} in {model_path}")
model_args = filter_fn_args(self._model_args, fn=get_model)
if self._allow_growth:
physical_devices = tf.config.experimental.list_physical_devices("GPU")
for device in physical_devices:
tf.config.experimental.set_memory_growth(device, True)
tf.keras.backend.clear_session()
tf.keras.backend.set_learning_phase(False)
eager_model, call_fn = get_model(**model_args)
inputs_dict: Dict[str, TensorSpec] = {
input_name: TensorSpec(t.name, t.dtype.name, tuple(t.shape.as_list()))
for input_name, t in zip(eager_model.input_names, eager_model.inputs)
}
concrete_func = call_fn.get_concrete_function(
*[tf.TensorSpec(shape=spec.shape, dtype=spec.dtype, name=name) for name, spec in inputs_dict.items()]
)
input_tensors_names = [tensor.name for tensor in concrete_func.inputs if tensor.dtype != tf.dtypes.resource]
output_tensors_names = [tensor.name for tensor in concrete_func.outputs]
graph_def = from_function(
concrete_func, input_tensors_names, output_tensors_names, large_model=self._large_model
)
# tensor names change after wrapping with call_fn, so we need to use those from concrete_func
outputs_dict: Dict[str, TensorSpec] = {
output_name: TensorSpec(output_tensor_name, t.dtype.name, tuple(t.shape.as_list()))
for output_name, output_tensor_name, t in zip(
eager_model.output_names, output_tensors_names, eager_model.outputs
)
}
precision = _infer_model_precision(graph_def, inputs_dict, outputs_dict)
tf.keras.backend.clear_session()
tf.keras.backend.set_learning_phase(False)
return Model(graph_def, precision, inputs_dict, outputs_dict)
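# Illustrative use of TFKerasLoader (the model path and kwargs are assumptions):
#   loader = TFKerasLoader(large_model=False, tf_allow_growth=True, precision="fp16")
#   model = loader.load("rn50_model.py")  # the script must define get_model(...)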
class TFSavedModelLoader(BaseLoader):
def load(self, model_path: Union[str, Path], **kwargs) -> Model:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
tf.compat.v1.reset_default_graph()
if is_tf2():
from tf2onnx.tf_loader import _from_saved_model_v2 # pytype: disable=import-error
graph_def, input_names, output_names, concrete_func, imported, initialized_tables = _from_saved_model_v2(
model_path=model_path,
input_names=None,
output_names=None,
tag=None,
signature_def=[],
concrete_function_index=None,
large_model=False,
)
# inspired by https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/saved_model_cli.py#L205
if concrete_func.structured_input_signature:
input_args, input_kwargs = concrete_func.structured_input_signature
input_names = list(input_kwargs)
assert (
not input_args
), f"Not supported args in concrete function signature args={input_args}, kwargs={input_kwargs}"
elif concrete_func._arg_keywords: # pylint: disable=protected-access
# For pure ConcreteFunctions we might have nothing better than _arg_keywords.
assert concrete_func._num_positional_args in [0, 1]
input_names = concrete_func._arg_keywords
input_tensors = [tensor for tensor in concrete_func.inputs if tensor.dtype != tf.dtypes.resource]
inputs = {name: tensor.name for name, tensor in zip(input_names, input_tensors)}
# they are already flattened
output_tensors = [tensor for tensor in concrete_func.outputs if tensor.dtype != tf.dtypes.resource]
output_names = sorted(concrete_func.structured_outputs) # because outputs are in flatten form
outputs = {name: tensor.name for name, tensor in zip(output_names, output_tensors)}
else:
session_config = create_session_config(allow_growth=True)
with tf.compat.v1.Session(config=session_config) as sess:
graph_def, inputs, outputs = _from_saved_model_v1(sess, model_path, tag=None, signatures=[])
inputs, outputs = handle_tensor_specs(graph_def, inputs, outputs)
precision = _infer_model_precision(graph_def, inputs, outputs)
return Model(graph_def, precision, inputs, outputs)
class TFRunner(BaseRunner):
def __init__(self):
pass
def init_inference(self, model: Model):
if is_tf2():
return TF2RunnerSession(model=model)
else:
return TF1RunnerSession(model=model)
class TF1RunnerSession(BaseRunnerSession):
def __init__(self, model: Model):
super().__init__(model)
assert isinstance(model.handle, tf.compat.v1.GraphDef)
self._inputs = None
self._outputs = None
self._session = None
self._old_env_values = {}
def __enter__(self):
self._old_env_values = self._set_env_variables()
tf.compat.v1.reset_default_graph()
session_config = create_session_config(allow_growth=True)
self._session = tf.compat.v1.Session(config=session_config)
self._session.__enter__()
tf.import_graph_def(self._model.handle, name="")
self._inputs = {
name: self._session.graph.get_tensor_by_name(spec.name) for name, spec in self._model.inputs.items()
}
self._outputs = {
name: self._session.graph.get_tensor_by_name(spec.name) for name, spec in self._model.outputs.items()
}
return self
def __exit__(self, exc_type, exc_value, traceback):
self._session.__exit__(exc_type, exc_value, traceback)
tf.compat.v1.reset_default_graph()
self._inputs = None
self._outputs = None
self._session = None
self._recover_env_variables(self._old_env_values)
def __call__(self, x: Dict[str, object]):
feed_dict = {placeholder: x[name] for name, placeholder in self._inputs.items()}
return self._session.run(self._outputs, feed_dict=feed_dict)
class TF2RunnerSession(BaseRunnerSession):
def __init__(self, model: Model):
super().__init__(model)
assert isinstance(model.handle, tf.compat.v1.GraphDef)
self._concrete_func = None
def __enter__(self):
tf.compat.v1.reset_default_graph()
input_tensor_names = [spec.name for spec in self._model.inputs.values()]
output_tensor_names = [spec.name for spec in self._model.outputs.values()]
self._concrete_func = wrap_function.function_from_graph_def(
self._model.handle, input_tensor_names, output_tensor_names
)
self._concrete_func._signature = [
tf.TensorSpec(shape=spec.shape, dtype=spec.dtype, name=name) for name, spec in self._model.inputs.items()
]
return self
def __exit__(self, exc_type, exc_value, traceback):
self._concrete_func = None
tf.compat.v1.reset_default_graph()
def __call__(self, x: Dict[str, object]):
x = tf.nest.map_structure(tf.convert_to_tensor, x)
y_pred = self._concrete_func(**x)
output_struct = {name: spec.name for name, spec in self._model.outputs.items()}
y_pred = tf.nest.map_structure(lambda t: t.numpy(), y_pred)
y_pred = tf.nest.pack_sequence_as(output_struct, y_pred)
return y_pred
class TFSavedModelSaver(BaseSaver):
def save(self, model: Model, model_path: Union[str, Path]) -> None:
if isinstance(model_path, Path):
model_path = model_path.as_posix()
session_config = create_session_config(allow_growth=True)
with tf.compat.v1.Session(config=session_config) as sess:
tf.import_graph_def(model.handle, name="")
is_func = is_function(sess.graph)
if not is_func:
infer_shape(sess.graph, {})
inputs = {name: sess.graph.get_tensor_by_name(spec.name) for name, spec in model.inputs.items()}
outputs = {name: sess.graph.get_tensor_by_name(spec.name) for name, spec in model.outputs.items()}
def _ensure_shape(tensors_dict, tensors_specs):
for name, tensor in tensors_dict.items():
if tensor.shape.rank is None:
tensor.set_shape(tensors_specs[name].shape)
return tensors_dict
inputs = _ensure_shape(inputs, model.inputs)
outputs = _ensure_shape(outputs, model.outputs)
LOGGER.info(inputs)
LOGGER.info(outputs)
tf.compat.v1.saved_model.simple_save(sess, model_path, inputs, outputs, legacy_init_op=None)
def handle_tensor_specs(
graph_def, inputs: Dict[str, str], outputs: Dict[str, str]
) -> Tuple[Dict[str, TensorSpec], Dict[str, TensorSpec]]:
session_config = tf.compat.v1.ConfigProto(graph_options=tf.compat.v1.GraphOptions(infer_shapes=True))
tf.compat.v1.reset_default_graph()
with tf.compat.v1.Session(config=session_config) as sess:
tf.import_graph_def(graph_def, name="")
def _get_spec(tensors_dict):
tensors_dict = {name: sess.graph.get_tensor_by_name(tname) for name, tname in tensors_dict.items()}
return {name: tensor2tensor_spec(tensor) for name, tensor in tensors_dict.items()}
inputs = _get_spec(inputs)
outputs = _get_spec(outputs)
tf.compat.v1.reset_default_graph()
return inputs, outputs
def tensor2tensor_spec(tensor):
shape = tuple([s.value if hasattr(s, "value") else s for s in tensor.shape])
return TensorSpec(tensor.name, tensor.dtype.name, shape)
loaders.register_extension(Format.TF_ESTIMATOR.value, TFEstimatorLoader)
loaders.register_extension(Format.TF_KERAS.value, TFKerasLoader)
loaders.register_extension(Format.TF_SAVEDMODEL.value, TFSavedModelLoader)
loaders.register_extension(Format.TF_TRT.value, TFSavedModelLoader)
converters.register_extension(f"{Format.TF_ESTIMATOR.value}--{Format.TF_SAVEDMODEL.value}", None)
converters.register_extension(f"{Format.TF_KERAS.value}--{Format.TF_SAVEDMODEL.value}", None)
converters.register_extension(f"{Format.TF_SAVEDMODEL.value}--{Format.TF_SAVEDMODEL.value}", None)
converters.register_extension(f"{Format.TF_ESTIMATOR.value}--{Format.TF_TRT.value}", TFTRTConverter)
converters.register_extension(f"{Format.TF_KERAS.value}--{Format.TF_TRT.value}", TFTRTConverter)
converters.register_extension(f"{Format.TF_SAVEDMODEL.value}--{Format.TF_TRT.value}", TFTRTConverter)
savers.register_extension(Format.TF_SAVEDMODEL.value, TFSavedModelSaver)
savers.register_extension(Format.TF_TRT.value, TFSavedModelSaver)
runners.register_extension(Format.TF_ESTIMATOR.value, TFRunner)
runners.register_extension(Format.TF_KERAS.value, TFRunner)
runners.register_extension(Format.TF_SAVEDMODEL.value, TFRunner)
runners.register_extension(Format.TF_TRT.value, TFRunner)

View file

@ -0,0 +1,89 @@
from collections.abc import Iterable
# pytype: disable=import-error
import onnx
import onnx.shape_inference
import tensorflow as tf
from tf2onnx import optimizer, tfonnx
# pytype: enable=import-error
from ..core import BaseConverter, Format, Model
from ..extensions import converters
from .tf import create_session_config
def _replace_io_names(graph_proto, io_type, name2tensor):
tensor2name = {v: k for k, v in name2tensor.items()}
tensor_value_info_list = {"inputs": graph_proto.input, "outputs": graph_proto.output}[io_type]
for tensor_value_info in tensor_value_info_list:
old_name = tensor_value_info.name
new_name = tensor2name.get(old_name)
if new_name is not None and new_name != old_name:
tensor_value_info.name = new_name
# replace other graph nodes I/O
for node in graph_proto.node:
if old_name in node.input:
idx = list(node.input).index(old_name)
node.input[idx] = new_name
if old_name in node.output:
idx = list(node.output).index(old_name)
node.output[idx] = new_name
def tfgraph2onnx(graph_def, inputnames2tensornames, outputnames2tensornames, *, onnx_opset, onnx_optimized=True):
with tf.Graph().as_default() as tf_graph:
tf.import_graph_def(graph_def, name="")
session_config = create_session_config(allow_growth=True)
with tf.compat.v1.Session(graph=tf_graph, config=session_config):
input_tensor_names = list(inputnames2tensornames.values())
output_tensor_names = list(outputnames2tensornames.values())
onnx_graph = tfonnx.process_tf_graph(
tf_graph,
input_names=input_tensor_names,
output_names=output_tensor_names,
opset=onnx_opset,
)
if onnx_optimized:
onnx_graph = optimizer.optimize_graph(onnx_graph)
graph_doc: str = "triton export"
onnx_model = onnx_graph.make_model(graph_doc)
# to match tensorflow savedmodel signature
_replace_io_names(onnx_model.graph, "inputs", inputnames2tensornames)
_replace_io_names(onnx_model.graph, "outputs", outputnames2tensornames)
onnx.checker.check_model(onnx_model)
onnx.helper.strip_doc_string(onnx_model)
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
return onnx_model
class TFGraphDef2ONNXConverter(BaseConverter):
def __init__(self, *, onnx_opset: int, onnx_optimized: bool = True):
self._onnx_opset = onnx_opset
self._onnx_optimized = onnx_optimized
def convert(self, model: Model, dataloader_fn) -> Model:
assert isinstance(model.handle, tf.compat.v1.GraphDef)
inputnames2tensorname = {name: spec.name for name, spec in model.inputs.items()}
outputnames2tensorname = {name: spec.name for name, spec in model.outputs.items()}
onnx_model = tfgraph2onnx(
model.handle,
inputnames2tensorname,
outputnames2tensorname,
onnx_opset=self._onnx_opset,
onnx_optimized=self._onnx_optimized,
)
from .onnx import _infer_graph_precision
precision = _infer_graph_precision(onnx_model.graph)
assert precision == model.precision # for testing precision inference function
return model._replace(handle=onnx_model)
converters.register_extension(f"{Format.TF_ESTIMATOR.value}--{Format.ONNX.value}", TFGraphDef2ONNXConverter)
converters.register_extension(f"{Format.TF_KERAS.value}--{Format.ONNX.value}", TFGraphDef2ONNXConverter)
converters.register_extension(f"{Format.TF_SAVEDMODEL.value}--{Format.ONNX.value}", TFGraphDef2ONNXConverter)

View file

@ -0,0 +1,60 @@
from typing import Iterable
from ..core import BaseConverter, Format, Model, Precision, ShapeSpec
from ..extensions import converters
from .onnx2trt_conv import onnx2trt
from .tf2onnx_conv import tfgraph2onnx
from .utils import get_input_shapes
class TFGraphDef2TRTConverter(BaseConverter):
def __init__(
self,
*,
max_batch_size: int,
max_workspace_size: int,
onnx_opset: int,
onnx_optimized: bool = True,
precision: str,
):
self._max_batch_size = max_batch_size
self._max_workspace_size = max_workspace_size
self._onnx_opset = onnx_opset
self._onnx_optimized = onnx_optimized
self._precision = Precision(precision)
def convert(self, model: Model, dataloader_fn) -> Model:
inputnames2tensorname = {name: spec.name for name, spec in model.inputs.items()}
outputnames2tensorname = {name: spec.name for name, spec in model.outputs.items()}
onnx_model = tfgraph2onnx(
model.handle,
inputnames2tensorname,
outputnames2tensorname,
onnx_opset=self._onnx_opset,
onnx_optimized=self._onnx_optimized,
)
from .onnx import _infer_graph_precision
precision = _infer_graph_precision(onnx_model.graph)
assert precision == model.precision # for testing precision inference function
input_shapes = get_input_shapes(dataloader_fn(), self._max_batch_size)
cuda_engine = onnx2trt(
onnx_model,
shapes=input_shapes,
max_workspace_size=self._max_workspace_size,
max_batch_size=self._max_batch_size,
model_precision=self._precision.value,
)
return model._replace(handle=cuda_engine)
@staticmethod
def required_source_model_precision(requested_model_precision: Precision) -> Precision:
# TensorRT requires source models to be in FP32 precision
return Precision.FP32
converters.register_extension(f"{Format.TF_ESTIMATOR.value}--{Format.TRT.value}", TFGraphDef2TRTConverter)
converters.register_extension(f"{Format.TF_KERAS.value}--{Format.TRT.value}", TFGraphDef2TRTConverter)
converters.register_extension(f"{Format.TF_SAVEDMODEL.value}--{Format.TRT.value}", TFGraphDef2TRTConverter)

View file

@ -0,0 +1,107 @@
from collections import Counter
from typing import Callable, Dict, List
import networkx as nx
from ..core import ShapeSpec
def infer_precision(
nx_graph: nx.Graph,
input_names: List[str],
output_names: List[str],
get_node_dtype_fn: Callable,
):
node_dtypes = [nx_graph.nodes[node_name].get("dtype", None) for node_name in nx_graph.nodes]
node_dtypes = [dt for dt in node_dtypes if dt is None or dt.kind not in ["i", "b"]]
dtypes_counter = Counter(node_dtypes)
return dtypes_counter.most_common()[0][0]
def get_shapes_with_dynamic_axes(dataloader, batch_size_dim=0):
def _set_dynamic_shapes(t, shapes):
for k, v in t.items():
shape = list(v.shape)
for dim, s in enumerate(shape):
if shapes[k][dim] != -1 and shapes[k][dim] != s:
shapes[k][dim] = -1
## get all shapes from input and output tensors
input_shapes = {}
output_shapes = {}
for batch in dataloader:
_, x, y = batch
for k, v in x.items():
input_shapes[k] = list(v.shape)
for k, v in y.items():
output_shapes[k] = list(v.shape)
break
# based on max <max_num_iters> iterations, check which
# dimensions differ to determine dynamic_axes
max_num_iters = 100
for idx, batch in enumerate(dataloader):
if idx >= max_num_iters:
break
_, x, y = batch
_set_dynamic_shapes(x, input_shapes)
_set_dynamic_shapes(y, output_shapes)
return input_shapes, output_shapes
def get_dynamic_axes(dataloader, batch_size_dim=0):
input_shapes, output_shapes = get_shapes_with_dynamic_axes(dataloader, batch_size_dim)
all_shapes = {**input_shapes, **output_shapes}
dynamic_axes = {}
for k, shape in all_shapes.items():
for idx, s in enumerate(shape):
if s == -1:
dynamic_axes[k] = {idx: k + "_" + str(idx)}
for k, v in all_shapes.items():
if k in dynamic_axes:
dynamic_axes[k].update({batch_size_dim: "batch_size_" + str(batch_size_dim)})
else:
dynamic_axes[k] = {batch_size_dim: "batch_size_" + str(batch_size_dim)}
return dynamic_axes
def get_input_shapes(dataloader, max_batch_size=1) -> Dict[str, ShapeSpec]:
def init_counters_and_shapes(x, counters, min_shapes, max_shapes):
for k, v in x.items():
counters[k] = Counter()
min_shapes[k] = [float("inf")] * v.ndim
max_shapes[k] = [float("-inf")] * v.ndim
counters = {}
min_shapes: Dict[str, tuple] = {}
max_shapes: Dict[str, tuple] = {}
for idx, batch in enumerate(dataloader):
ids, x, y = batch
if idx == 0:
init_counters_and_shapes(x, counters, min_shapes, max_shapes)
for k, v in x.items():
shape = v.shape
counters[k][shape] += 1
min_shapes[k] = tuple([min(a, b) for a, b in zip(min_shapes[k], shape)])
max_shapes[k] = tuple([max(a, b) for a, b in zip(max_shapes[k], shape)])
opt_shapes: Dict[str, tuple] = {}
for k, v in counters.items():
opt_shapes[k] = v.most_common(1)[0][0]
shapes = {}
for k in opt_shapes.keys(): # same keys in min_shapes and max_shapes
shapes[k] = ShapeSpec(
min=(1,) + min_shapes[k][1:],
max=(max_batch_size,) + max_shapes[k][1:],
opt=(max_batch_size,) + opt_shapes[k][1:],
)
return shapes
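# Illustrative result (shapes are assumptions): for a dataloader that always yields
# x = {"input": np.ndarray of shape (8, 224, 224, 3)} and max_batch_size=32, this returns
#   {"input": ShapeSpec(min=(1, 224, 224, 3), opt=(32, 224, 224, 3), max=(32, 224, 224, 3))}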

View file

@ -0,0 +1,169 @@
import abc
import importlib
import logging
import os
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union
import numpy as np
LOGGER = logging.getLogger(__name__)
DATALOADER_FN_NAME = "get_dataloader_fn"
GET_MODEL_FN_NAME = "get_model"
GET_SERVING_INPUT_RECEIVER_FN = "get_serving_input_receiver_fn"
GET_ARGPARSER_FN_NAME = "update_argparser"
class TensorSpec(NamedTuple):
name: str
dtype: str
shape: Tuple
class Parameter(Enum):
def __lt__(self, other: "Parameter") -> bool:
return self.value < other.value
class Accelerator(Parameter):
AMP = "amp"
CUDA = "cuda"
TRT = "trt"
class Precision(Parameter):
FP16 = "fp16"
FP32 = "fp32"
TF32 = "tf32" # Deprecated
class Format(Parameter):
TF_GRAPHDEF = "tf-graphdef"
TF_SAVEDMODEL = "tf-savedmodel"
TF_TRT = "tf-trt"
TF_ESTIMATOR = "tf-estimator"
TF_KERAS = "tf-keras"
ONNX = "onnx"
TRT = "trt"
TS_SCRIPT = "ts-script"
TS_TRACE = "ts-trace"
PYT = "pyt"
class Model(NamedTuple):
handle: object
precision: Optional[Precision]
inputs: Dict[str, TensorSpec]
outputs: Dict[str, TensorSpec]
def load_from_file(file_path, label, target):
spec = importlib.util.spec_from_file_location(name=label, location=file_path)
my_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(my_module) # pytype: disable=attribute-error
return getattr(my_module, target, None)
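# Illustrative use (the file name is an assumption): load the get_model function from a model
# script, returning None when the attribute is missing:
#   get_model = load_from_file("rn50_model.py", label="model", target=GET_MODEL_FN_NAME)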
class BaseLoader(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def load(self, model_path: Union[str, Path], **kwargs) -> Model:
"""
Loads and processes a model from a file based on the given set of args
"""
pass
class BaseSaver(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def save(self, model: Model, model_path: Union[str, Path]) -> None:
"""
Save model to file
"""
pass
class BaseRunner(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def init_inference(self, model: Model):
raise NotImplementedError
class BaseRunnerSession(abc.ABC):
def __init__(self, model: Model):
self._model = model
@abc.abstractmethod
def __enter__(self):
raise NotImplementedError()
@abc.abstractmethod
def __exit__(self, exc_type, exc_value, traceback):
raise NotImplementedError()
@abc.abstractmethod
def __call__(self, x: Dict[str, object]):
raise NotImplementedError()
def _set_env_variables(self) -> Dict[str, object]:
"""this method not remove values; fix it if needed"""
to_set = {}
old_values = {k: os.environ.pop(k, None) for k in to_set}
os.environ.update(to_set)
return old_values
def _recover_env_variables(self, old_envs: Dict[str, object]):
for name, value in old_envs.items():
if value is None:
del os.environ[name]
else:
os.environ[name] = str(value)
class BaseConverter(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def convert(self, model: Model, dataloader_fn) -> Model:
raise NotImplementedError()
@staticmethod
def required_source_model_precision(requested_model_precision: Precision) -> Precision:
return requested_model_precision
class BaseMetricsCalculator(abc.ABC):
required_fn_name_for_signature_parsing: Optional[str] = None
@abc.abstractmethod
def calc(
self,
*,
ids: List[Any],
y_pred: Dict[str, np.ndarray],
x: Optional[Dict[str, np.ndarray]],
y_real: Optional[Dict[str, np.ndarray]],
) -> Dict[str, float]:
"""
Calculates error/accuracy metrics
Args:
ids: List of ids identifying each sample in the batch
y_pred: model output as dict where key is output name and value is output value
x: model input as dict where key is input name and value is input value
y_real: input ground truth as dict where key is output name and value is output value
Returns:
dictionary where key is metric name and value is its value
"""
pass
class ShapeSpec(NamedTuple):
min: Tuple
opt: Tuple
max: Tuple

View file

@ -0,0 +1,133 @@
from pathlib import Path
from typing import Dict, Iterable
import numpy as np
MB2B = 2 ** 20
B2MB = 1 / MB2B
FLUSH_THRESHOLD_B = 256 * MB2B
def pad_except_batch_axis(data: np.ndarray, target_shape_with_batch_axis: Iterable[int]):
assert all(
[current_size <= target_size for target_size, current_size in zip(target_shape_with_batch_axis, data.shape)]
), "target_shape should have equal or greater all dimensions comparing to data.shape"
padding = [(0, 0)] + [ # (0, 0) - do not pad on batch_axis (with index 0)
(0, target_size - current_size)
for target_size, current_size in zip(target_shape_with_batch_axis[1:], data.shape[1:])
]
return np.pad(data, padding, "constant", constant_values=np.nan)
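# Illustrative behaviour: pad_except_batch_axis(np.ones((2, 3)), (2, 5)) returns an array of
# shape (2, 5) whose two appended columns are filled with np.nan; axis 0 (batch) is never padded.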
class NpzWriter:
"""
Dumps dicts of numpy arrays into npz files
It can (and should) be used as a context manager:
```
with NpzWriter('mydir') as writer:
    writer.write(outputs={'classes': np.zeros(8), 'probs': np.zeros((8, 4))},
                 labels={'classes': np.zeros(8)},
                 inputs={'input': np.zeros((8, 240, 240, 3))})
```
## Variable size data
Only a dynamic last axis is handled. Data is padded with the np.nan value.
Also, each generated file may have a different size of the dynamic axis.
"""
def __init__(self, output_dir, compress=False):
self._output_dir = Path(output_dir)
self._items_cache: Dict[str, Dict[str, np.ndarray]] = {}
self._items_counters: Dict[str, int] = {}
self._flush_threshold_b = FLUSH_THRESHOLD_B
self._compress = compress
@property
def cache_size(self):
return {name: sum([a.nbytes for a in data.values()]) for name, data in self._items_cache.items()}
def _append_to_cache(self, prefix, data):
if data is None:
return
if not isinstance(data, dict):
raise ValueError(f"{prefix} data to store shall be dict")
cached_data = self._items_cache.get(prefix, {})
for name, value in data.items():
assert isinstance(
value, (list, np.ndarray)
), f"Values shall be lists or np.ndarrays; current type {type(value)}"
if not isinstance(value, np.ndarray):
value = np.array(value)
assert value.dtype.kind in ["S", "U"] or not np.any(
np.isnan(value)
), f"Values with np.nan is not supported; {name}={value}"
cached_value = cached_data.get(name, None)
if cached_value is not None:
target_shape = np.max([cached_value.shape, value.shape], axis=0)
cached_value = pad_except_batch_axis(cached_value, target_shape)
value = pad_except_batch_axis(value, target_shape)
value = np.concatenate((cached_value, value))
cached_data[name] = value
self._items_cache[prefix] = cached_data
def write(self, **kwargs):
"""
Writes named dictionaries of np.ndarrays.
The keyword names are later used as prefixes of the npz files in which those dictionaries are stored.
ex. writer.write(inputs={'input': np.zeros((2, 10))},
outputs={'classes': np.zeros((2,)), 'probabilities': np.zeros((2, 32))},
labels={'classes': np.zeros((2,))})
Args:
**kwargs: named list of dictionaries of np.ndarrays to store
"""
for prefix, data in kwargs.items():
self._append_to_cache(prefix, data)
biggest_item_size = max(self.cache_size.values())
if biggest_item_size > self._flush_threshold_b:
self.flush()
def flush(self):
for prefix, data in self._items_cache.items():
self._dump(prefix, data)
self._items_cache = {}
def _dump(self, prefix, data):
idx = self._items_counters.setdefault(prefix, 0)
filename = f"{prefix}-{idx:012d}.npz"
output_path = self._output_dir / filename
if self._compress:
np.savez_compressed(output_path, **data)
else:
np.savez(output_path, **data)
nitems = len(list(data.values())[0])
msg_for_labels = (
"If these are correct shapes - consider moving loading of them into metrics.py."
if prefix == "labels"
else ""
)
shapes = {name: value.shape if isinstance(value, np.ndarray) else (len(value),) for name, value in data.items()}
assert all(len(v) == nitems for v in data.values()), (
f'All items in "{prefix}" shall have the same size on axis 0, equal to the batch size. {msg_for_labels}'
f'{", ".join(f"{name}: {shape}" for name, shape in shapes.items())}'
)
self._items_counters[prefix] += nitems
def __enter__(self):
if self._output_dir.exists() and len(list(self._output_dir.iterdir())):
raise ValueError(f"{self._output_dir.as_posix()} is not empty")
self._output_dir.mkdir(parents=True, exist_ok=True)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.flush()

View file

@ -0,0 +1,69 @@
import importlib
import logging
import os
import re
from pathlib import Path
from typing import List
LOGGER = logging.getLogger(__name__)
class ExtensionManager:
def __init__(self, name: str):
self._name = name
self._registry = {}
def register_extension(self, extension: str, clazz):
already_registered_class = self._registry.get(extension, None)
if already_registered_class and already_registered_class.__module__ != clazz.__module__:
raise RuntimeError(
f"Conflicting extension {self._name}/{extension}; "
f"{already_registered_class.__module__}.{already_registered_class.__name} "
f"and "
f"{clazz.__module__}.{clazz.__name__}"
)
elif already_registered_class is None:
clazz_full_name = f"{clazz.__module__}.{clazz.__name__}" if clazz is not None else "None"
LOGGER.debug(f"Registering extension {self._name}/{extension}: {clazz_full_name}")
self._registry[extension] = clazz
def get(self, extension):
if extension not in self._registry:
raise RuntimeError(f"Missing extension {self._name}/{extension}")
return self._registry[extension]
@property
def supported_extensions(self):
return list(self._registry)
@staticmethod
def scan_for_extensions(extension_dirs: List[Path]):
register_pattern = r".*\.register_extension\(.*"
for extension_dir in extension_dirs:
for python_path in extension_dir.rglob("*.py"):
if not python_path.is_file():
continue
payload = python_path.read_text()
if re.findall(register_pattern, payload):
import_path = python_path.relative_to(toolkit_root_dir.parent)
package = import_path.parent.as_posix().replace(os.sep, ".")
package_with_module = f"{package}.{import_path.stem}"
spec = importlib.util.spec_from_file_location(name=package_with_module, location=python_path)
my_module = importlib.util.module_from_spec(spec)
my_module.__package__ = package
try:
spec.loader.exec_module(my_module) # pytype: disable=attribute-error
except ModuleNotFoundError as e:
LOGGER.error(
f"Could not load extensions from {import_path} due to missing python packages; {e}"
)
runners = ExtensionManager("runners")
loaders = ExtensionManager("loaders")
savers = ExtensionManager("savers")
converters = ExtensionManager("converters")
toolkit_root_dir = (Path(__file__).parent / "..").resolve()
ExtensionManager.scan_for_extensions([toolkit_root_dir])
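# Illustrative lookup (extension names are the Format values defined in core.py):
#   loader_cls = loaders.get("onnx")   # returns the registered OnnxLoader class
#   loaders.get("unknown")             # would raise RuntimeError("Missing extension loaders/unknown")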

View file

@ -0,0 +1,47 @@
import csv
import re
from typing import Dict, List
from natsort import natsorted
from tabulate import tabulate
def sort_results(results: List):
results = natsorted(results, key=lambda item: [item[key] for key in item.keys()])
return results
def save_results(filename: str, data: List, formatted: bool = False):
data = format_data(data=data) if formatted else data
with open(filename, "a") as csvfile:
fieldnames = data[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in data:
writer.writerow(row)
def format_data(data: List[Dict]) -> List[Dict]:
formatted_data = list()
for item in data:
formatted_item = format_keys(data=item)
formatted_data.append(formatted_item)
return formatted_data
def format_keys(data: Dict) -> Dict:
keys = {format_key(key=key): value for key, value in data.items()}
return keys
def format_key(key: str) -> str:
key = " ".join([k.capitalize() for k in re.split("_| ", key)])
return key
def show_results(results: List[Dict]):
headers = list(results[0].keys())
summary = map(lambda x: list(map(lambda item: item[1], x.items())), results)
print(tabulate(summary, headers=headers))
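# Illustrative formatting (values are assumptions):
#   format_key("batch_size")  # -> "Batch Size"
#   show_results([{"batch_size": 1, "throughput": 1536.0}])  # prints a two-column table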

View file

@ -0,0 +1,47 @@
import os
from typing import List, Optional
def warmup(
model_name: str,
batch_sizes: List[int],
triton_instances: int = 1,
profiling_data: str = "random",
input_shapes: Optional[List[str]] = None,
server_url: str = "localhost",
measurement_window: int = 10000,
):
print("\n")
print(f"==== Warmup start ====")
print("\n")
input_shapes = " ".join(map(lambda shape: f" --shape {shape}", input_shapes)) if input_shapes else ""
bs = set()
bs.add(min(batch_sizes))
bs.add(max(batch_sizes))
measurement_window = 6 * measurement_window
for batch_size in bs:
exec_args = f"""-max-threads {triton_instances} \
-m {model_name} \
-x 1 \
-c {triton_instances} \
-t {triton_instances} \
-p {measurement_window} \
-v \
-i http \
-u {server_url}:8000 \
-b {batch_size} \
--input-data {profiling_data} {input_shapes}
"""
result = os.system(f"perf_client {exec_args}")
if result != 0:
print(f"Failed running performance tests. Perf client failed with exit code {result}")
exit(1)
print("\n")
print(f"==== Warmup done ====")
print("\n")

View file

@ -0,0 +1,18 @@
from typing import Any, Dict, List, Optional
import numpy as np
from deployment_toolkit.core import BaseMetricsCalculator
class MetricsCalculator(BaseMetricsCalculator):
def __init__(self, output_used_for_metrics: str = "classes"):
self._output_used_for_metrics = output_used_for_metrics
def calc(self, *, y_pred: Dict[str, np.ndarray], y_real: Optional[Dict[str, np.ndarray]], **_) -> Dict[str, float]:
y_true = y_real[self._output_used_for_metrics]
y_pred = y_pred[self._output_used_for_metrics]
y_true = np.squeeze(y_true)
y_pred = np.squeeze(y_pred)
assert y_true.shape == y_pred.shape
return {"accuracy": (y_true == y_pred).mean()}

View file

@ -0,0 +1,992 @@
[figure: "Performance offline" (matplotlib SVG line chart): Inferences/second vs. Client Batch Size, batch sizes 1-128]

Rendered size: 32 KiB. Six further performance-plot SVGs (32, 33, 30, 32, 31, and 34 KiB) are added in this commit; their diffs are suppressed as too large to display.

View file

@ -0,0 +1,980 @@
[figure: "Performance offline" (matplotlib SVG bar chart): Avg Latency vs. Client Batch Size, batch sizes 1-128]

[4 additional SVG charts added in this commit; their diffs were suppressed as too large (92-94 KiB each)]

View file

@@ -0,0 +1,127 @@
#!/usr/bin/env python3
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tarfile
from pathlib import Path
from typing import Tuple, Dict, List
from PIL import Image
from tqdm import tqdm
DATASETS_DIR = os.environ.get("DATASETS_DIR", None)
IMAGENET_DIRNAME = "imagenet"
IMAGE_ARCHIVE_FILENAME = "ILSVRC2012_img_val.tar"
DEVKIT_ARCHIVE_FILENAME = "ILSVRC2012_devkit_t12.tar.gz"
LABELS_REL_PATH = "ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt"
META_REL_PATH = "ILSVRC2012_devkit_t12/data/meta.mat"
TARGET_SIZE = (224, 224) # (width, height)
_RESIZE_MIN = 256  # resize, preserving aspect ratio, so that the shorter side equals this size
def parse_meta_mat(metafile) -> Dict[int, str]:
import scipy.io
meta = scipy.io.loadmat(metafile, squeeze_me=True)["synsets"]
nums_children = list(zip(*meta))[4]
meta = [meta[idx] for idx, num_children in enumerate(nums_children) if num_children == 0]
idcs, wnids = list(zip(*meta))[:2]
idx_to_wnid = {idx: wnid for idx, wnid in zip(idcs, wnids)}
return idx_to_wnid
def _process_image(image_file, target_size):
image = Image.open(image_file)
original_size = image.size
# scale image, preserving aspect ratio, so that its shorter side equals _RESIZE_MIN
scale_factor = max(_RESIZE_MIN / original_size[0], _RESIZE_MIN / original_size[1])
resize_to = int(original_size[0] * scale_factor), int(original_size[1] * scale_factor)
resized_image = image.resize(resize_to)
# central crop of image to target_size
left, upper = (resize_to[0] - target_size[0]) // 2, (resize_to[1] - target_size[1]) // 2
cropped_image = resized_image.crop((left, upper, left + target_size[0], upper + target_size[1]))
return cropped_image
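# Illustrative worked example (not part of the original script): for a 500x375
# input and TARGET_SIZE=(224, 224), scale_factor = max(256/500, 256/375) ~= 0.683,
# so the image is resized to (341, 256); the central crop then starts at
# left = (341 - 224) // 2 = 58, upper = (256 - 224) // 2 = 16 and covers 224x224 pixels.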
def main():
import argparse
parser = argparse.ArgumentParser(description="Preprocess the ImageNet validation archive: resize, center-crop and sort images into per-class directories.")
parser.add_argument(
"--dataset-dir",
help="Path to dataset directory where imagenet archives are stored and processed files will be saved.",
required=False,
default=DATASETS_DIR,
)
parser.add_argument(
"--target-size",
help="Size of target image. Format it as <width>,<height>.",
required=False,
default=",".join(map(str, TARGET_SIZE)),
)
args = parser.parse_args()
if args.dataset_dir is None:
raise ValueError(
"Please set $DATASETS_DIR env variable to point dataset dir with original dataset archives "
"and where processed files should be stored. Alternatively provide --dataset-dir CLI argument"
)
datasets_dir = Path(args.dataset_dir)
target_size = tuple(map(int, args.target_size.split(",")))
image_archive_path = datasets_dir / IMAGE_ARCHIVE_FILENAME
if not image_archive_path.exists():
raise RuntimeError(
f"There should be {IMAGE_ARCHIVE_FILENAME} file in {datasets_dir}."
f"You need to download the dataset from http://www.image-net.org/download."
)
devkit_archive_path = datasets_dir / DEVKIT_ARCHIVE_FILENAME
if not devkit_archive_path.exists():
raise RuntimeError(
f"There should be {DEVKIT_ARCHIVE_FILENAME} file in {datasets_dir}."
f"You need to download the dataset from http://www.image-net.org/download."
)
with tarfile.open(devkit_archive_path, mode="r") as devkit_archive_file:
labels_file = devkit_archive_file.extractfile(LABELS_REL_PATH)
labels = list(map(int, labels_file.readlines()))
# map validation labels (indices from LABELS_REL_PATH) to WNIDs compatible with the training set
meta_file = devkit_archive_file.extractfile(META_REL_PATH)
idx_to_wnid = parse_meta_mat(meta_file)
labels_wnid = [idx_to_wnid[idx] for idx in labels]
# remap each WNID to its index in the sorted list of all WNIDs - this matches how the network indexes output classes
available_wnids = sorted(set(labels_wnid))
wnid_to_newidx = {wnid: new_cls for new_cls, wnid in enumerate(available_wnids)}
labels = [wnid_to_newidx[wnid] for wnid in labels_wnid]
output_dir = datasets_dir / IMAGENET_DIRNAME
with tarfile.open(image_archive_path, mode="r") as image_archive_file:
image_rel_paths = sorted(image_archive_file.getnames())
for cls, image_rel_path in tqdm(zip(labels, image_rel_paths), total=len(image_rel_paths)):
output_path = output_dir / str(cls) / image_rel_path
original_image_file = image_archive_file.extractfile(image_rel_path)
processed_image = _process_image(original_image_file, target_size)
output_path.parent.mkdir(parents=True, exist_ok=True)
processed_image.save(output_path.as_posix())
if __name__ == "__main__":
main()
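# Example invocation (illustrative, not part of the original script; the script
# file name and the archive directory are placeholders):
#   DATASETS_DIR=/data/imagenet-archives python3 process_imagenet_val.py --target-size 224,224
# Processed images end up under $DATASETS_DIR/imagenet/<class_index>/<original_image_name>.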

View file

@@ -0,0 +1,12 @@
networkx==2.5
numpy<1.20.0,>=1.16.0  # numpy 1.20+ requires py37+
onnx==1.8.0
onnxruntime==1.6.0
pycuda>=2019.1.2
PyYAML>=5.2
tqdm>=4.44.1
tf2onnx==1.8.3
tabulate>=0.8.7
natsort>=7.0.0
# use tags instead of branch names - a Docker cache hit could otherwise prevent fetching the most recent changes from a branch
service_maker @ git+https://access-token:usVyg8b11sn9gCacsVCf@gitlab-master.nvidia.com/dl/JoC/service_maker.git@1b83b96#egg=service_maker

View file

@@ -0,0 +1,86 @@
import logging
import tensorflow as tf
from utils import data_utils
LOGGER = logging.getLogger(__name__)
NCLASSES = 1001
WIDTH = 224
HEIGHT = 224
NCHANNELS = 3
INPUT_FORMAT = "NHWC"
COMPUTE_FORMAT = "NHWC"
def get_model(
*,
model_dir: str,
arch: str = "resnet50",
precision: str = "fp32",
use_xla: bool = True,
use_tf_amp: bool = False,
use_dali: bool = False,
gpu_memory_fraction=0.7,
):
import horovod.tensorflow as hvd
from runtime import Runner
hvd.init()
try:
dtype = {"fp16": tf.float16, "fp32": tf.float32}[precision.lower()]
except KeyError:
raise ValueError(f"Uknown precision {precision}. Allowed values: fp16|fp32")
LOGGER.info(
f"Creating model arch={arch} precision={precision} xla={use_xla}"
f"tf_amp={use_tf_amp}, dali={use_dali}, gpu_memory_frac={gpu_memory_fraction}"
)
runner = Runner(
n_classes=NCLASSES,
architecture=arch,
input_format=INPUT_FORMAT,
compute_format=COMPUTE_FORMAT,
dtype=dtype,
n_channels=NCHANNELS,
height=HEIGHT,
width=WIDTH,
use_xla=use_xla,
use_tf_amp=use_tf_amp,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=0,
model_dir=model_dir,
)
# removed params not used in inference
estimator_params = {"use_final_conv": False} # TODO: Why not moved to model constructor?
estimator = runner._get_estimator(
mode="inference",
run_params=estimator_params,
use_xla=use_xla,
use_dali=use_dali,
gpu_memory_fraction=gpu_memory_fraction,
)
return estimator
def get_serving_input_receiver_fn(
batch_size: int = None,
input_dtype: str = "fp32",
width: int = WIDTH,
height: int = HEIGHT,
nchannels: int = NCHANNELS,
):
input_dtype = tf.float16 if input_dtype and "16" in input_dtype else tf.float32
serving_input_receiver_fn = data_utils.get_serving_input_receiver_fn(
batch_size=batch_size,
height=height,
width=width,
num_channels=nchannels,
data_format=INPUT_FORMAT,
dtype=input_dtype,
)
return serving_input_receiver_fn
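# Minimal sketch (not part of the original file) of how these two helpers could be
# combined to export a SavedModel for Triton; the checkpoint and output paths are
# placeholders, and the actual export flow in this repo is driven by triton/convert_model.py:
#   estimator = get_model(model_dir="/checkpoints/rn50", arch="resnet50", precision="fp16")
#   receiver_fn = get_serving_input_receiver_fn(batch_size=None, input_dtype="fp16")
#   estimator.export_saved_model("/models/rn50_savedmodel", receiver_fn)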

View file

@@ -0,0 +1,220 @@
#!/usr/bin/env python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# method from PEP-366 to support relative import in executed modules
import argparse
import logging
from pathlib import Path
from typing import List
if __name__ == "__main__" and __package__ is None:
__package__ = Path(__file__).parent.name
from .benchmark.benchmark import Benchmark
from .benchmark.checkpoints import HttpCheckpoint
from .benchmark.core import LOGGER
from .benchmark.executor import DockerExecutor
from .deployment_toolkit.core import Accelerator, Format, Precision
AVAILABLE_MODEL_FORMATS = [f.value for f in Format]
AVAILABLE_MODEL_PRECISIONS = [p.value for p in Precision]
AVAILABLE_MODEL_ACCELERATORS = [a.value for a in Accelerator]
def run_benchmark(
devices: List[str],
model_name: str,
model_version: int,
model_format: str,
container_version: str,
checkpoint: str,
max_batch_size: int,
precision: str,
number_of_model_instances: int,
preferred_batch_sizes: List[int],
max_queue_delay_us: int,
backend_accelerator: str,
verbose: bool,
**kwargs
):
benchmark = Benchmark(
devices=devices,
model_name=model_name,
model_version=model_version,
framework="TensorFlow1",
container_version=container_version,
checkpoint=HttpCheckpoint(checkpoint),
verbose=verbose
)
benchmark.model_conversion(
cmds=(
r"""
python3 triton/convert_model.py \
--input-path triton/rn50_model.py \
--input-type tf-estimator \
--output-path ${SHARED_DIR}/model \
--output-type ${FORMAT} \
--onnx-opset 12 \
--onnx-optimized 1 \
--max-batch-size ${MAX_BATCH_SIZE} \
--max-workspace-size 4294967296 \
--ignore-unknown-parameters \
\
--model-dir ${CHECKPOINT_DIR} \
--precision ${PRECISION} \
--dataloader triton/dataloader.py \
--data-dir ${DATASETS_DIR}/imagenet
""",
)
)
benchmark.model_deploy(
cmds=(
r"""
python3 triton/deploy_model.py \
--model-repository ${MODEL_REPOSITORY_PATH} \
--model-path ${SHARED_DIR}/model \
--model-format ${FORMAT} \
--model-name ${MODEL_NAME} \
--model-version 1 \
--max-batch-size ${MAX_BATCH_SIZE} \
--precision ${PRECISION} \
--number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
--max-queue-delay-us ${TRITON_MAX_QUEUE_DELAY} \
--preferred-batch-sizes ${TRITON_PREFERRED_BATCH_SIZES} \
--capture-cuda-graph 0 \
--backend-accelerator ${BACKEND_ACCELERATOR} \
--load-model ${TRITON_LOAD_MODEL_METHOD}
""",
)
)
benchmark.triton_performance_offline_tests(
cmds=(
r"""
python triton/run_offline_performance_test_on_triton.py \
--server-url ${TRITON_SERVER_URL} \
--model-name ${MODEL_NAME} \
--number-of-warmup-iterations 5 \
--input-data random \
--batch-sizes ${BATCH_SIZE} \
--triton-instances ${TRITON_INSTANCES} \
--result-path ${SHARED_DIR}/triton_performance_offline.csv
""",
),
result_path="${SHARED_DIR}/triton_performance_offline.csv",
)
benchmark.triton_performance_online_tests(
cmds=(
r"""
python triton/run_online_performance_test_on_triton.py \
--server-url ${TRITON_SERVER_URL} \
--model-name ${MODEL_NAME} \
--number-of-warmup-iterations 5 \
--input-data random \
--batch-sizes ${BATCH_SIZE} \
--triton-instances ${TRITON_INSTANCES} \
--number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
--result-path ${SHARED_DIR}/triton_performance_online.csv
""",
),
result_path="${SHARED_DIR}/triton_performance_online.csv",
)
benchmark.configuration(
precision=precision,
max_batch_size=max_batch_size,
format=model_format,
accelerator=backend_accelerator,
triton_gpu_engine_count=number_of_model_instances,
triton_preferred_batch_sizes=preferred_batch_sizes,
triton_max_queue_delay_us=max_queue_delay_us,
**kwargs
)
executor = DockerExecutor()
executor.run(benchmark)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run benchmark for model.")
parser.add_argument("--devices", help="NVIDIA GPU device ID on which Triton Inference Server is ran. Accept multiple values", nargs="*", required=False)
parser.add_argument("--model-name", help="Model name. Default: ResNet50", default="ResNet50", required=False)
parser.add_argument("--model-version", default="1", help="Version of model. Default: 1", required=False)
parser.add_argument("--checkpoint", default="https://api.ngc.nvidia.com/v2/models/nvidia/rn50_tf_amp_ckpt/versions/20.06.0/zip", help="Checkpoint url. Default: https://api.ngc.nvidia.com/v2/models/nvidia/rn50_tf_amp_ckpt/versions/20.06.0/zip", required=False)
parser.add_argument("--container-version", help="Version of container for Triton Inference Server. Default: 20.12", default="20.12", required=False)
parser.add_argument(
"--model-format",
choices=AVAILABLE_MODEL_FORMATS,
help="Format of exported model. Default: tf-savedmodel",
default="tf-savedmodel",
required=False
)
parser.add_argument(
"--precision",
type=str,
default="fp16",
choices=AVAILABLE_MODEL_PRECISIONS,
help="Model precision (parameter used only by Tensorflow backend with TensorRT optimization). Default: fp16",
required=False
)
parser.add_argument(
"--max-batch-size",
type=int,
default=32,
help="Batch size used for benchmark. Maximal batch size which is used to convert model. Default: 32",
required=False
)
parser.add_argument(
"--number-of-model-instances",
type=int,
default=2,
help="Number of model instances per GPU (model instances). Default: 2",
required=False
)
parser.add_argument(
"--preferred-batch-sizes",
type=int,
nargs="*",
help="Batch sizes that the dynamic batching should attempt to create. "
"In case --max-queue-delay-us is set and this parameter is not, default value will be calculated based on --max-batch-size",
required=False
)
parser.add_argument(
"--max-queue-delay-us",
type=int,
default=100,
help="Max delay time which dynamic batch shall wait to form a batch. Default: 100",
required=False
)
parser.add_argument(
"--backend-accelerator",
choices=AVAILABLE_MODEL_ACCELERATORS,
type=str,
default="cuda",
help="Select backend accelerator used for model. Default: cuda",
required=False
)
parser.add_argument("--verbose", action="store_true", default=False, help="Provide verbose output")
args = parser.parse_args()
log_level = logging.INFO if not args.verbose else logging.DEBUG
LOGGER.setLevel(log_level)
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
run_benchmark(**vars(args))
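# Example invocation (illustrative; the script path and argument values are placeholders,
# parser defaults are used for any flags that are omitted):
#   python3 triton/run_benchmark.py \
#       --devices 0 \
#       --model-format tf-savedmodel \
#       --precision fp16 \
#       --max-batch-size 32 \
#       --backend-accelerator cuda \
#       --verbose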

View file

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
To infer the model on framework runtime, you can use `run_inference_on_fw.py` script.
It runs inference locally on data obtained from the pointed data loader and saves the received outputs into
[npz files](https://gitlab-master.nvidia.com/dl/JoC/bermuda-api/-/blob/develop/bermuda_api_toolset/docs/dump_files.md).
Those files are stored in the directory pointed to by the `--output-dir` argument.
Example call:
```shell script
python ./triton/run_inference_on_fw.py \
--input-path /models/exported/model.onnx \
--input-type onnx \
--dataloader triton/dataloader.py \
--data-dir /data/imagenet \
--batch-size 32 \
--output-dir /results/dump_local \
--dump-labels
```
"""
import argparse
import logging
import os
from pathlib import Path
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "0"
from tqdm import tqdm
# method from PEP-366 to support relative import in executed modules
if __package__ is None:
__package__ = Path(__file__).parent.name
from .deployment_toolkit.args import ArgParserGenerator
from .deployment_toolkit.core import DATALOADER_FN_NAME, BaseLoader, BaseRunner, Format, load_from_file
from .deployment_toolkit.dump import NpzWriter
from .deployment_toolkit.extensions import loaders, runners
LOGGER = logging.getLogger("run_inference_on_fw")
def _verify_and_format_dump(args, ids, x, y_pred, y_real):
data = {"outputs": y_pred, "ids": {"ids": ids}}
if args.dump_inputs:
data["inputs"] = x
if args.dump_labels:
if not y_real:
raise ValueError(
"Found empty label values. Please provide labels in dataloader_fn or do not use --dump-labels argument"
)
data["labels"] = y_real
return data
def _parse_and_validate_args():
supported_inputs = set(runners.supported_extensions) & set(loaders.supported_extensions)
parser = argparse.ArgumentParser(description="Dump local inference output of given model", allow_abbrev=False)
parser.add_argument("--input-path", help="Path to input model", required=True)
parser.add_argument("--input-type", help="Input model type", choices=supported_inputs, required=True)
parser.add_argument("--dataloader", help="Path to python file containing dataloader.", required=True)
parser.add_argument("--output-dir", help="Path to dir where output files will be stored", required=True)
parser.add_argument("--dump-labels", help="Dump labels to output dir", action="store_true", default=False)
parser.add_argument("--dump-inputs", help="Dump inputs to output dir", action="store_true", default=False)
parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
args, *_ = parser.parse_known_args()
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
ArgParserGenerator(get_dataloader_fn).update_argparser(parser)
Loader: BaseLoader = loaders.get(args.input_type)
ArgParserGenerator(Loader, module_path=args.input_path).update_argparser(parser)
Runner: BaseRunner = runners.get(args.input_type)
ArgParserGenerator(Runner).update_argparser(parser)
args = parser.parse_args()
types_requiring_io_params = []
if args.input_type in types_requiring_io_params and not all(p for p in [args.inputs, args.outputs]):
parser.error(f"For {args.input_type} input provide --inputs and --outputs parameters")
return args
def main():
args = _parse_and_validate_args()
log_level = logging.INFO if not args.verbose else logging.DEBUG
log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
logging.basicConfig(level=log_level, format=log_format)
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
Loader: BaseLoader = loaders.get(args.input_type)
Runner: BaseRunner = runners.get(args.input_type)
loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args)
runner = ArgParserGenerator(Runner).from_args(args)
LOGGER.info(f"Loading {args.input_path}")
model = loader.load(args.input_path)
with runner.init_inference(model=model) as runner_session, NpzWriter(args.output_dir) as writer:
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)
LOGGER.info(f"Data loader initialized; Running inference")
for ids, x, y_real in tqdm(dataloader_fn(), unit="batch", mininterval=10):
y_pred = runner_session(x)
data = _verify_and_format_dump(args, ids=ids, x=x, y_pred=y_pred, y_real=y_real)
writer.write(**data)
LOGGER.info(f"Inference finished")
if __name__ == "__main__":
main()

Some files were not shown because too many files have changed in this diff.