Merge pull request #561 from NVIDIA/nvpstr/fed7ba

Nvpstr/fed7ba
nv-kkudrynski 2020-06-12 14:48:24 +02:00 committed by GitHub
commit 6c1d562eb9
152 changed files with 3314 additions and 4165 deletions

View file

@@ -29,7 +29,7 @@ class PositionalEmbedding(nn.Module):
def forward(self, pos_seq, bsz=None):
sinusoid_inp = torch.ger(pos_seq, self.inv_freq)
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1)
if bsz is not None:
return pos_emb[None, :, :].expand(bsz, -1, -1)
else:

View file

@@ -36,7 +36,7 @@ class PositionalEmbedding(nn.Module):
def forward(self, pos_seq, bsz: Optional[int] = None):
sinusoid_inp = torch.ger(pos_seq, self.inv_freq)
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1)
if bsz is not None:
return pos_emb[None, :, :].expand(bsz, -1, -1)
else:
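
Both hunks above make the same change in two files: `dim=-1` becomes the explicit `dim=1`, and the scripted variant annotates `bsz` as `Optional[int]`. Since `sinusoid_inp` is 2-D, the two axes are numerically identical; the explicit annotations presumably keep TorchScript's type checking happy. A minimal sketch of the module for context, assuming the standard Transformer-XL-style definition (the `__init__` below is a reconstruction, not part of the diff):

```python
import torch
from torch import nn
from typing import Optional

class PositionalEmbedding(nn.Module):
    def __init__(self, demb):
        super(PositionalEmbedding, self).__init__()
        # Reconstructed: standard sinusoidal inverse frequencies.
        inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, pos_seq, bsz: Optional[int] = None):
        sinusoid_inp = torch.ger(pos_seq, self.inv_freq)  # (seq_len, demb // 2)
        # dim=1 == dim=-1 on this 2-D tensor; the positive axis is explicit
        # for TorchScript's benefit.
        pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1)
        if bsz is not None:
            return pos_emb[None, :, :].expand(bsz, -1, -1)
        else:
            return pos_emb[None, :, :]
```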

View file

@@ -296,6 +296,9 @@ def main():
'pitch_tgt': None,
'pitch_transform': build_pitch_transformation(args)}
if args.torchscript:
gen_kw.pop('pitch_transform')
all_utterances = 0
all_samples = 0
all_letters = 0

View file

@@ -101,8 +101,7 @@ def get_model(model_name, model_config, device,
class FastPitch__forward_is_infer(_FastPitchJIT):
def forward(self, inputs, input_lengths, pace: float = 1.0,
dur_tgt: Optional[torch.Tensor] = None,
pitch_tgt: Optional[torch.Tensor] = None,
pitch_transform: Optional[bool] = None):
pitch_tgt: Optional[torch.Tensor] = None):
return self.infer(inputs, input_lengths, pace=pace,
dur_tgt=dur_tgt, pitch_tgt=pitch_tgt)
else:
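
This hunk pairs with the previous one: `pitch_transform` is built at runtime as a Python closure (`build_pitch_transformation(args)`), which TorchScript cannot compile, so the TorchScript path pops it from the generator kwargs and the scripted wrapper drops the corresponding (unused) parameter. A hedged usage sketch, assuming `model` is the wrapper returned by `get_model`:

```python
import torch

model = torch.jit.script(model)        # compiles now that the closure is gone
mel = model(inputs, input_lengths)[0]  # infer() runs without a pitch transform
```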

View file

@@ -21,6 +21,7 @@ for PRECISION in fp16 fp32; do
--include-warmup \
--batch-size ${BSZ} \
--repeats 1000 \
--torchscript \
-i phrases/benchmark_8_128.tsv
done
done

View file

@@ -15,9 +15,9 @@ These examples, along with our NVIDIA deep learning software stack, are provided
The examples are organized first by framework, such as TensorFlow, PyTorch, etc. and second by use case, such as computer vision, natural language processing, etc. We hope this structure enables you to quickly locate the example networks that best suit your needs. Here are the currently supported models:
### Computer Vision
- __ResNet-50__ [[MXNet](https://github.com/NVIDIA/DeepLearningExamples/tree/master/MxNet/Classification/RN50v1.5)] [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/RN50v1.5)]
- __ResNext__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets)]
- __SE-ResNext__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets)]
- __ResNet-50__ [[MXNet](https://github.com/NVIDIA/DeepLearningExamples/tree/master/MxNet/Classification/RN50v1.5)] [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/ConvNets)]
- __ResNext__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/ConvNets)]
- __SE-ResNext__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/ConvNets)]
- __SSD__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Detection/SSD)]
- __Mask R-CNN__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/MaskRCNN)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/MaskRCNN)] [[TensorFlow 2](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/MaskRCNN)]
- __U-Net(industrial)__ [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Segmentation/UNet_Industrial)]
@@ -65,7 +65,7 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc.
## Feature Matrix
| Models | Framework | DALI | AMP | Multi-GPU | Multi-Node | TensorRT | ONNX | Triton | TF-TRT |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |------------- |------------- |------------- |------------- |
| [ResNet50 v1.5](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5) |PyTorch | Yes | Yes | Yes | - | - | - | - | - |
| [ResNet-50 v1.5](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5) |PyTorch | Yes | Yes | Yes | - | - | - | - | - |
| [ResNeXt101-32x4d](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnext101-32x4d) |PyTorch | Yes | Yes | Yes | - | - | - | - | - |
| [SE-ResNeXt101-32x4d](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/se-resnext101-32x4d) |PyTorch | Yes | Yes | Yes | - | - | - | - | - |
| [SSD300 v1.1](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD) |PyTorch | Yes | Yes | Yes | - | - | - | - | - |
@@ -79,7 +79,9 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc.
| [FastPitch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/FastPitch) | PyTorch | N/A | Yes | Yes | - | - | - | - | - |
| [GNMT v2](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/GNMT) |PyTorch | N/A | Yes | Yes | - | - | - | - | - |
| [Transformer](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/Transformer) |PyTorch | N/A | Yes | Yes | - | - | - | - | - |
| [ResNet-50 v1.5](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/RN50v1.5) |TensorFlow | Yes | Yes | Yes | - | - | - | - | - |
| [ResNet-50 v1.5](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/ConvNets/resnet50v1.5) |TensorFlow | Yes | Yes | Yes | - | - | - | - | - |
| [ResNeXt101-32x4d](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/ConvNets/resnext101-32x4d) |TensorFlow | Yes | Yes | Yes | - | - | - | - | - |
| [SE-ResNeXt101-32x4d](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/ConvNets/se-resnext101-32x4d) |TensorFlow | Yes | Yes | Yes | - | - | - | - | - |
| [SSD320 v1.2](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Detection/SSD) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [BERT](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT) |TensorFlow | N/A | Yes | Yes | Yes | Yes | - | [Yes](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/triton) | Yes |
| [BioBert](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/biobert) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - |

View file

@@ -0,0 +1,8 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.03-tf1-py3
FROM ${FROM_IMAGE_NAME}
ADD requirements.txt .
RUN pip install -r requirements.txt
ADD . /workspace/rn50v15_tf
WORKDIR /workspace/rn50v15_tf

View file

@@ -0,0 +1,68 @@
# ResNet-family Convolutional Neural Networks for Image Classification in TensorFlow
In this repository you will find implementations of ResNet and its variants for image
classification
## Table Of Contents
* [Models](#models)
* [Validation accuracy results](#validation-accuracy-results)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Release notes](#release-notes)
* [Changelog](#changelog)
## Models
The following table provides links to where you can find additional information on each model:
| **Model** | **Link**|
|-----------|---------|
| resnet50 | [README](./resnet50v1.5/README.md) |
| resnext101-32x4d | [README](./resnext101-32x4d/README.md) |
| se-resnext101-32x4d | [README](./se-resnext101-32x4d/README.md) |
## Validation accuracy results
Our results were obtained by running the applicable training scripts in the tensorflow-20.03-tf1-py3 NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs. The specific training script that was run is documented in the corresponding model's README.
The following table shows the validation accuracy results of the
three classification models side-by-side.
| **arch** | **AMP Top1** | **AMP Top5** | **FP32 Top1** | **FP32 Top5** |
|:-:|:-:|:-:|:-:|:-:|
| resnet50 | 78.35 | 94.21 | 78.34 | 94.21 |
| resnext101-32x4d | 80.21 | 95.00 | 80.21 | 94.99 |
| se-resnext101-32x4d | 80.87 | 95.35 | 80.84 | 95.37 |
## Training performance results
### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the applicable
training scripts in the tensorflow-20.03-tf1-py3 NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
Performance numbers (in images per second)
were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training performance results of the
three classification models side-by-side.
| **arch** | **Mixed Precision** | **Mixed Precision XLA** | **FP32** | **Mixed Precision speedup** | **XLA Mixed Precision speedup**|
|:-:|:-:|:-:|:-:|:-:|:-:|
| resnet50 | 8277.91 img/s | 9485.21 img/s | 2785.81 img/s | 2.97x | 1.14x |
| resnext101-32x4d | 3151.81 img/s | 4231.42 img/s | 1055.82 img/s | 2.98x | 1.34x |
| se-resnext101-32x4d | 2168.40 img/s | 3297.39 img/s | 921.38 img/s | 2.35x | 1.52x |
## Release notes
### Changelog
June 2020
- ConvNets repo restructuring
- Initial release of ResNext and SE-ResNext

View file

@@ -0,0 +1,127 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import warnings
warnings.simplefilter("ignore")
import tensorflow as tf
import horovod.tensorflow as hvd
import dllogger
from utils import hvd_utils
from runtime import Runner
from model.resnet import model_architectures
from utils.cmdline_helper import parse_cmdline
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.ERROR)
FLAGS = parse_cmdline(model_architectures.keys())
hvd.init()
if hvd.rank() == 0:
log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
os.makedirs(FLAGS.results_dir, exist_ok=True)
dllogger.init(
backends=[
dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=log_path),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
]
)
else:
dllogger.init(backends=[])
dllogger.log(data=vars(FLAGS), step='PARAMETER')
runner = Runner(
# ========= Model HParams ========= #
n_classes=1001,
architecture=FLAGS.arch,
input_format='NHWC',
compute_format=FLAGS.data_format,
dtype=tf.float32 if FLAGS.precision == 'fp32' else tf.float16,
n_channels=3,
height=224,
width=224,
distort_colors=False,
log_dir=FLAGS.results_dir,
model_dir=FLAGS.model_dir if FLAGS.model_dir is not None else FLAGS.results_dir,
data_dir=FLAGS.data_dir,
data_idx_dir=FLAGS.data_idx_dir,
weight_init=FLAGS.weight_init,
use_xla=FLAGS.use_xla,
use_tf_amp=FLAGS.use_tf_amp,
use_dali=FLAGS.use_dali,
gpu_memory_fraction=FLAGS.gpu_memory_fraction,
gpu_id=FLAGS.gpu_id,
seed=FLAGS.seed
)
if FLAGS.mode in ["train", "train_and_evaluate", "training_benchmark"]:
runner.train(
iter_unit=FLAGS.iter_unit,
num_iter=FLAGS.num_iter,
run_iter=FLAGS.run_iter,
batch_size=FLAGS.batch_size,
warmup_steps=FLAGS.warmup_steps,
log_every_n_steps=FLAGS.display_every,
weight_decay=FLAGS.weight_decay,
lr_init=FLAGS.lr_init,
lr_warmup_epochs=FLAGS.lr_warmup_epochs,
momentum=FLAGS.momentum,
loss_scale=FLAGS.loss_scale,
label_smoothing=FLAGS.label_smoothing,
mixup=FLAGS.mixup,
use_static_loss_scaling=FLAGS.use_static_loss_scaling,
use_cosine_lr=FLAGS.use_cosine_lr,
is_benchmark=FLAGS.mode == 'training_benchmark',
)
if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:
if FLAGS.mode == 'inference_benchmark' and hvd_utils.is_using_hvd():
raise NotImplementedError("Only single GPU inference is implemented.")
elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
runner.evaluate(
iter_unit=FLAGS.iter_unit if FLAGS.mode != "train_and_evaluate" else "epoch",
num_iter=FLAGS.num_iter if FLAGS.mode != "train_and_evaluate" else 1,
warmup_steps=FLAGS.warmup_steps,
batch_size=FLAGS.batch_size,
log_every_n_steps=FLAGS.display_every,
is_benchmark=FLAGS.mode == 'inference_benchmark',
export_dir=FLAGS.export_dir
)
if FLAGS.mode == 'predict':
if FLAGS.to_predict is None:
raise ValueError("No data to predict on.")
if not os.path.isfile(FLAGS.to_predict):
raise ValueError("Only prediction on single images is supported!")
if hvd_utils.is_using_hvd():
raise NotImplementedError("Only single GPU inference is implemented.")
elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
runner.predict(FLAGS.to_predict)

View file

@@ -14,4 +14,4 @@
from model import layers
from model import blocks
from model import resnet_v1_5
from model import resnet

View file

@@ -34,7 +34,8 @@ def conv2d_block(
data_format='NHWC',
conv2d_hparams=None,
batch_norm_hparams=None,
name='conv2d'
name='conv2d',
cardinality=1,
):
if not isinstance(conv2d_hparams, tf.contrib.training.HParams):
@@ -44,8 +45,7 @@ def conv2d_block(
raise ValueError("The paramater `conv2d_hparams` is not of type `HParams`")
with tf.variable_scope(name):
if mode != 'SAME_RESNET':
if cardinality == 1:
net = layers.conv2d(
inputs,
n_channels=n_channels,
@@ -56,58 +56,18 @@ def conv2d_block(
use_bias=not use_batch_norm,
trainable=is_training,
kernel_initializer=conv2d_hparams.kernel_initializer,
bias_initializer=conv2d_hparams.bias_initializer,
)
else: # Special padding mode for ResNet models
if strides == (1, 1):
net = layers.conv2d(
inputs,
n_channels=n_channels,
kernel_size=kernel_size,
strides=strides,
padding='SAME',
data_format=data_format,
use_bias=not use_batch_norm,
trainable=is_training,
kernel_initializer=conv2d_hparams.kernel_initializer,
bias_initializer=conv2d_hparams.bias_initializer,
)
else:
rate = 1 # Unused (for 'a trous' convolutions)
kernel_height_effective = kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)
pad_h_beg = (kernel_height_effective - 1) // 2
pad_h_end = kernel_height_effective - 1 - pad_h_beg
kernel_width_effective = kernel_size[1] + (kernel_size[1] - 1) * (rate - 1)
pad_w_beg = (kernel_width_effective - 1) // 2
pad_w_end = kernel_width_effective - 1 - pad_w_beg
padding = [[0, 0], [pad_h_beg, pad_h_end], [pad_w_beg, pad_w_end], [0, 0]]
if data_format == 'NCHW':
padding = [padding[0], padding[3], padding[1], padding[2]]
padded_inputs = tf.pad(inputs, padding)
net = layers.conv2d(
padded_inputs, # inputs,
n_channels=n_channels,
kernel_size=kernel_size,
strides=strides,
padding='VALID',
data_format=data_format,
use_bias=not use_batch_norm,
trainable=is_training,
kernel_initializer=conv2d_hparams.kernel_initializer,
bias_initializer=conv2d_hparams.bias_initializer,
)
bias_initializer=conv2d_hparams.bias_initializer)
else:
group_filter = tf.get_variable(
name=name + 'group_filter',
shape=[3, 3, n_channels // cardinality, n_channels],
trainable=is_training,
dtype=tf.float32)
net = tf.nn.conv2d(inputs,
group_filter,
strides=strides,
padding='SAME',
data_format=data_format)
if use_batch_norm:
net = layers.batch_norm(
net,

View file

@@ -30,11 +30,14 @@ def bottleneck_block(
depth,
depth_bottleneck,
stride,
cardinality=1,
training=True,
data_format='NCHW',
conv2d_hparams=None,
batch_norm_hparams=None,
block_name="bottleneck_block"
block_name="bottleneck_block",
use_se=False,
ratio=1
):
if data_format not in ['NHWC', 'NCHW']:
@@ -44,7 +47,7 @@ def bottleneck_block(
raise ValueError("The paramater `conv2d_hparams` is not of type `HParams`")
if not isinstance(batch_norm_hparams, tf.contrib.training.HParams):
raise ValueError("The paramater `conv2d_hparams` is not of type `HParams`")
raise ValueError("The paramater `batch_norm_hparams` is not of type `HParams`")
in_shape = inputs.get_shape()
@@ -54,23 +57,17 @@ def bottleneck_block(
with tf.variable_scope("shortcut"):
if depth == in_size:
if stride == 1:
shortcut = tf.identity(inputs)
else:
shortcut = model.layers.average_pooling2d(
inputs,
pool_size=(1, 1),
strides=(stride, stride),
padding='valid',
data_format='channels_first' if data_format == 'NCHW' else 'channels_last',
name="average_pooling2d",
)
name="average_pooling2d")
else:
shortcut = model.blocks.conv2d_block(
inputs,
n_channels=depth,
@@ -85,8 +82,13 @@ def bottleneck_block(
batch_norm_hparams=batch_norm_hparams
)
#cardinality_to_bottleneck_width = { 1:64, 2:40, 4:24, 8:14, 32:4, 64:4 }
#cardinality_to_grouped_conv_width = { 1:64, 2:80, 4:96, 8:112, 32:128, 64:256 }
#per_group_ck = cardinality_to_bottleneck_width[cardinality] * depth_bottleneck / 64
bottleneck = model.blocks.conv2d_block(
inputs,
#n_channels=per_group_ck * cardinality if cardinality != 1 else depth_bottleneck,
n_channels=depth_bottleneck,
kernel_size=(1, 1),
strides=(1, 1),
@@ -97,23 +99,22 @@ def bottleneck_block(
data_format=data_format,
conv2d_hparams=conv2d_hparams,
batch_norm_hparams=batch_norm_hparams,
name='bottleneck_1'
)
name='bottleneck_1')
bottleneck = model.blocks.conv2d_block(
bottleneck,
n_channels=depth_bottleneck,
kernel_size=(3, 3),
strides=(stride, stride),
mode='SAME_RESNET',
mode='SAME',
use_batch_norm=True,
activation='relu',
is_training=training,
data_format=data_format,
conv2d_hparams=conv2d_hparams,
batch_norm_hparams=batch_norm_hparams,
name='bottleneck_2'
)
name='bottleneck_2',
cardinality=cardinality)
bottleneck = model.blocks.conv2d_block(
bottleneck,
@@ -130,4 +131,12 @@ def bottleneck_block(
name='bottleneck_3'
)
if use_se:
bottleneck = model.layers.squeeze_excitation_layer(
inputs=bottleneck,
ratio=ratio,
training=training,
data_format=data_format,
name='bottleneck_se_layer')
return model.layers.relu(shortcut + bottleneck, name='relu')

View file

@@ -18,6 +18,7 @@
from model.layers.activation import relu
from model.layers.activation import softmax
from model.layers.activation import tanh
from model.layers.activation import sigmoid
from model.layers.conv2d import conv2d
@@ -31,6 +32,7 @@ from model.layers.padding import pad
from model.layers.pooling import average_pooling2d
from model.layers.pooling import max_pooling2d
from model.layers.squeeze_excitation_layer import squeeze_excitation_layer
__all__ = [
@@ -38,6 +40,7 @@ __all__ = [
'relu',
'softmax',
'tanh',
'sigmoid',
# conv layers
'conv2d',
@@ -56,5 +59,7 @@ __all__ = [
# pooling layers
'average_pooling2d',
'max_pooling2d'
'max_pooling2d',
'squeeze_excitation_layer'
]

View file

@@ -17,7 +17,7 @@
import tensorflow as tf
__all__ = ['relu', 'softmax', 'tanh']
__all__ = ['relu', 'softmax', 'tanh', 'sigmoid']
def relu(inputs, name='relu'):
@@ -43,3 +43,9 @@ def tanh(inputs, name='tanh'):
net = tf.math.tanh(inputs, name=name)
return net
def sigmoid(inputs, name='sigmoid'):
net = tf.math.sigmoid(inputs, name=name)
return net

View file

@@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
from model import layers
from model import blocks
__all__ = ['squeeze_excitation_layer']
def squeeze_excitation_layer(
inputs,
ratio,
training=True,
data_format='NCHW',
kernel_initializer=tf.variance_scaling_initializer(),
bias_initializer=tf.zeros_initializer(),
name="squeeze_excitation_layer"
):
if data_format not in ['NHWC', 'NCHW']:
raise ValueError("Unknown data format: `%s` (accepted: ['NHWC', 'NCHW'])" % data_format)
in_shape = inputs.get_shape()
num_channels = in_shape[1] if data_format == "NCHW" else in_shape[-1]
with tf.variable_scope(name):
net = inputs
# squeeze
squeeze = layers.reduce_mean(
net,
keepdims=False,
data_format=data_format,
name='squeeze_spatial_mean'
)
# fc + relu
excitation = layers.dense(
inputs=squeeze,
units=num_channels // ratio,
use_bias=True,
trainable=training,
kernel_initializer=kernel_initializer,
bias_initializer=bias_initializer
)
excitation = layers.relu(excitation)
# fc + sigmoid
excitation = layers.dense(
inputs=excitation,
units=num_channels,
use_bias=True,
trainable=training,
kernel_initializer=kernel_initializer,
bias_initializer=bias_initializer
)
excitation = layers.sigmoid(excitation)
out_shape = [-1, num_channels, 1, 1] if data_format == "NCHW" else [-1, 1, 1, num_channels]
excitation = tf.reshape(excitation, out_shape)
net = net * excitation
return net

View file

@@ -33,7 +33,6 @@ from utils.data_utils import normalized_inputs
from utils.learning_rate import learning_rate_scheduler
from utils.optimizers import FixedLossScalerOptimizer
from dllogger.logger import LOGGER
__all__ = [
'ResnetModel',
@@ -47,10 +46,17 @@ class ResnetModel(object):
self,
model_name,
n_classes,
layers_count,
layers_depth,
expansions,
compute_format='NCHW',
input_format='NHWC',
weight_init='fan_out',
dtype=tf.float32,
use_dali=False,
cardinality=1,
use_se=False,
se_ratio=1,
):
self.model_hparams = tf.contrib.training.HParams(
@@ -58,9 +64,14 @@ class ResnetModel(object):
compute_format=compute_format,
input_format=input_format,
dtype=dtype,
layer_counts=(3, 4, 6, 3),
layers_count=layers_count,
layers_depth=layers_depth,
expansions=expansions,
model_name=model_name,
use_dali=use_dali
use_dali=use_dali,
cardinality=cardinality,
use_se=use_se,
se_ratio=se_ratio
)
self.batch_norm_hparams = tf.contrib.training.HParams(
@@ -78,63 +89,44 @@ class ResnetModel(object):
self.conv2d_hparams = tf.contrib.training.HParams(
kernel_initializer=tf.variance_scaling_initializer(
scale=2.0, distribution='truncated_normal', mode='fan_out'
scale=2.0, distribution='truncated_normal', mode=weight_init
),
bias_initializer=tf.constant_initializer(0.0)
)
self.dense_hparams = tf.contrib.training.HParams(
kernel_initializer=tf.variance_scaling_initializer(
scale=2.0, distribution='truncated_normal', mode='fan_out'
scale=2.0, distribution='truncated_normal', mode=weight_init
),
bias_initializer=tf.constant_initializer(0.0)
)
if hvd.rank() == 0:
LOGGER.log("Model HParams:")
LOGGER.log("Name", model_name)
LOGGER.log("Number of classes", n_classes)
LOGGER.log("Compute_format", compute_format)
LOGGER.log("Input_format", input_format)
LOGGER.log("dtype", str(dtype))
print("Model HParams:")
print("Name", model_name)
print("Number of classes", n_classes)
print("Compute_format", compute_format)
print("Input_format", input_format)
print("dtype", str(dtype))
def __call__(self, features, labels, mode, params):
if mode == tf.estimator.ModeKeys.TRAIN:
mandatory_params = ["batch_size", "lr_init", "num_gpus", "steps_per_epoch",
"momentum", "weight_decay", "loss_scale", "label_smoothing"]
for p in mandatory_params:
if p not in params:
raise RuntimeError("Parameter {} is missing.".format(p))
if "batch_size" not in params.keys():
raise RuntimeError("Parameter `batch_size` is missing...")
if "lr_init" not in params.keys():
raise RuntimeError("Parameter `lr_init` is missing...")
if "num_gpus" not in params.keys():
raise RuntimeError("Parameter `num_gpus` is missing...")
if "steps_per_epoch" not in params.keys():
raise RuntimeError("Parameter `steps_per_epoch` is missing...")
if "momentum" not in params.keys():
raise RuntimeError("Parameter `momentum` is missing...")
if "weight_decay" not in params.keys():
raise RuntimeError("Parameter `weight_decay` is missing...")
if "loss_scale" not in params.keys():
raise RuntimeError("Parameter `loss_scale` is missing...")
if "label_smoothing" not in params.keys():
raise RuntimeError("Parameter `label_smoothing` is missing...")
if mode == tf.estimator.ModeKeys.TRAIN and not self.model_hparams.use_dali:
with tf.device('/cpu:0'):
# Stage inputs on the host
cpu_prefetch_op, (features, labels) = ResnetModel._stage([features, labels])
cpu_prefetch_op, (features, labels) = self._stage([features, labels])
with tf.device('/gpu:0'):
# Stage inputs to the device
gpu_prefetch_op, (features, labels) = ResnetModel._stage([features, labels])
gpu_prefetch_op, (features, labels) = self._stage([features, labels])
with tf.device("/gpu:0"):
@@ -159,7 +151,7 @@ class ResnetModel(object):
off_value = eta/1001)
if mixup != 0:
LOGGER.log("Using mixup training with beta=", params['mixup'])
print("Using mixup training with beta=", params['mixup'])
beta_distribution = tf.distributions.Beta(params['mixup'], params['mixup'])
feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])
@@ -205,13 +197,13 @@ class ResnetModel(object):
tf.identity(probs, name="probs_ref")
tf.identity(y_preds, name="y_preds_ref")
if mode == tf.estimator.ModeKeys.TRAIN:
assert (len(tf.trainable_variables()) == 161)
else:
assert (len(tf.trainable_variables()) == 0)
#if mode == tf.estimator.ModeKeys.TRAIN:
#
# assert (len(tf.trainable_variables()) == 161)
#
#else:
#
# assert (len(tf.trainable_variables()) == 0)
if mode == tf.estimator.ModeKeys.PREDICT:
@@ -303,11 +295,9 @@ class ResnetModel(object):
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"])
if params["apply_loss_scaling"]:
optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"])
if hvd_utils.is_using_hvd():
optimizer = hvd.DistributedOptimizer(optimizer)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
@@ -321,15 +311,13 @@ class ResnetModel(object):
if self.model_hparams.use_dali:
train_ops = tf.group(backprop_op, update_ops, name='train_ops')
else:
train_ops = tf.group(backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops')
return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_ops)
elif mode == tf.estimator.ModeKeys.EVAL:
eval_metrics = {
"top1_accuracy": (acc_top1, acc_top1_update_op),
"top5_accuracy": (acc_top5, acc_top5_update_op)
@@ -363,22 +351,20 @@ class ResnetModel(object):
return put_op, get_tensors
def build_model(self, inputs, training=True, reuse=False):
with var_storage.model_variable_scope(
self.model_hparams.model_name,
reuse=reuse,
dtype=self.model_hparams.dtype
):
dtype=self.model_hparams.dtype):
with tf.variable_scope("input_reshape"):
if self.model_hparams.input_format == 'NHWC' and self.model_hparams.compute_format == 'NCHW':
# Reshape inputs: NHWC => NCHW
inputs = tf.transpose(inputs, [0, 3, 1, 2])
elif self.model_hparams.input_format == 'NCHW' and self.model_hparams.compute_format == 'NHWC':
# Reshape inputs: NCHW => NHWC
inputs = tf.transpose(inputs, [0, 2, 3, 1])
@@ -390,7 +376,7 @@ class ResnetModel(object):
n_channels=64,
kernel_size=(7, 7),
strides=(2, 2),
mode='SAME_RESNET',
mode='SAME',
use_batch_norm=True,
activation='relu',
is_training=training,
@@ -409,74 +395,28 @@ class ResnetModel(object):
name="max_pooling2d",
)
for block_id, _ in enumerate(range(self.model_hparams.layer_counts[0])):
model_bottlenecks = self.model_hparams.layers_depth
for block_id, block_bottleneck in enumerate(model_bottlenecks):
for layer_id in range(self.model_hparams.layers_count[block_id]):
stride = 2 if (layer_id == 0 and block_id != 0) else 1
net = blocks.bottleneck_block(
inputs=net,
depth=256,
depth_bottleneck=64,
stride=1,
training=training,
data_format=self.model_hparams.compute_format,
conv2d_hparams=self.conv2d_hparams,
batch_norm_hparams=self.batch_norm_hparams,
block_name="btlnck_block_1_%d" % (block_id + 1)
)
for block_id, i in enumerate(range(self.model_hparams.layer_counts[1])):
stride = 2 if i == 0 else 1
net = blocks.bottleneck_block(
inputs=net,
depth=512,
depth_bottleneck=128,
stride=stride,
training=training,
data_format=self.model_hparams.compute_format,
conv2d_hparams=self.conv2d_hparams,
batch_norm_hparams=self.batch_norm_hparams,
block_name="btlnck_block_2_%d" % (block_id + 1)
)
for block_id, i in enumerate(range(self.model_hparams.layer_counts[2])):
block_id += 1
stride = 2 if i == 0 else 1
net = blocks.bottleneck_block(
inputs=net,
depth=1024,
depth_bottleneck=256,
stride=stride,
training=training,
data_format=self.model_hparams.compute_format,
conv2d_hparams=self.conv2d_hparams,
batch_norm_hparams=self.batch_norm_hparams,
block_name="btlnck_block_3_%d" % (block_id + 1)
)
for block_id, i in enumerate(range(self.model_hparams.layer_counts[3])):
stride = 2 if i == 0 else 1
net = blocks.bottleneck_block(
inputs=net,
depth=2048,
depth_bottleneck=512,
stride=stride,
training=training,
data_format=self.model_hparams.compute_format,
conv2d_hparams=self.conv2d_hparams,
batch_norm_hparams=self.batch_norm_hparams,
block_name="btlnck_block_4_%d" % (block_id + 1)
)
net = blocks.bottleneck_block(
inputs=net,
depth=block_bottleneck * self.model_hparams.expansions,
depth_bottleneck=block_bottleneck,
cardinality=self.model_hparams.cardinality,
stride=stride,
training=training,
data_format=self.model_hparams.compute_format,
conv2d_hparams=self.conv2d_hparams,
batch_norm_hparams=self.batch_norm_hparams,
block_name="btlnck_block_%d_%d" % (block_id, layer_id),
use_se=self.model_hparams.use_se,
ratio=self.model_hparams.se_ratio)
with tf.variable_scope("output"):
net = layers.reduce_mean(
net, keepdims=False, data_format=self.model_hparams.compute_format, name='spatial_mean'
)
net, keepdims=False, data_format=self.model_hparams.compute_format, name='spatial_mean')
logits = layers.dense(
inputs=net,
@@ -484,8 +424,7 @@ class ResnetModel(object):
use_bias=True,
trainable=training,
kernel_initializer=self.dense_hparams.kernel_initializer,
bias_initializer=self.dense_hparams.bias_initializer
)
bias_initializer=self.dense_hparams.bias_initializer)
if logits.dtype != tf.float32:
logits = tf.cast(logits, tf.float32)
@@ -493,3 +432,28 @@ class ResnetModel(object):
probs = layers.softmax(logits, name="softmax", axis=1)
return probs, logits
model_architectures = {
'resnet50': {
'layers': [3, 4, 6, 3],
'widths': [64, 128, 256, 512],
'expansions': 4,
},
'resnext101-32x4d': {
'layers': [3, 4, 23, 3],
'widths': [128, 256, 512, 1024],
'expansions': 2,
'cardinality': 32,
},
'se-resnext101-32x4d' : {
'cardinality' : 32,
'layers' : [3, 4, 23, 3],
'widths' : [128, 256, 512, 1024],
'expansions' : 2,
'use_se': True,
'se_ratio': 16,
},
}

View file

@@ -0,0 +1 @@
-e git://github.com/NVIDIA/dllogger#egg=dllogger

View file

@@ -0,0 +1,663 @@
# ResNet-50 v1.5 for TensorFlow
This repository provides a script and recipe to train the ResNet-50 v1.5 model to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [Model overview](#model-overview)
* [Default configuration](#default-configuration)
* [Optimizer](#optimizer)
* [Data augmentation](#data-augmentation)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [The `main.py` script](#the-mainpy-script)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
* [Training time for 90 Epochs](#training-time-for-90-epochs)
* [Training time: NVIDIA DGX-1 (8x V100 16G)](#training-time-nvidia-dgx-1-8x-v100-16g)
* [Training time: NVIDIA DGX-2 (16x V100 32G)](#training-time-nvidia-dgx-2-16x-v100-32g)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
* [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
* [Inference performance: NVIDIA T4 (1x T4)](#inference-performance-nvidia-t4-1x-t4-16g)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
The difference between v1 and v1.5 is in the bottleneck blocks that require
downsampling: v1 has stride = 2 in the first 1x1 convolution, whereas v1.5 has stride = 2 in the 3x3 convolution.
This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1,
but comes with a small performance drawback (~5% imgs/sec).
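For reference, a tiny sketch of where that stride sits in a downsampling bottleneck (illustrative only; the repository expresses this inside its bottleneck block implementation):

```python
def bottleneck_strides(version, downsampling):
    """Strides for the (1x1, 3x3, 1x1) convolutions of a bottleneck block."""
    s = 2 if downsampling else 1
    if version == "v1":
        return (s, 1, 1)   # v1: stride 2 in the first 1x1 convolution
    return (1, s, 1)       # v1.5: stride 2 moved to the 3x3 convolution
```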
The following performance optimizations were implemented in this model:
* JIT graph compilation with [XLA](https://www.tensorflow.org/xla)
* Multi-GPU training with [Horovod](https://github.com/horovod/horovod)
* NVIDIA Data Loading Library ([DALI](https://github.com/NVIDIA/DALI)).
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Default configuration
The following sections highlight the default configuration for the ResNet50 model.
#### Optimizer
This model uses the SGD optimizer with the following hyperparameters:
* Momentum (0.875).
* Learning rate (LR) = 0.256 for a batch size of 256; for other batch sizes, we scale the learning
rate linearly.
* Learning rate schedule - we use a cosine LR schedule.
* For bigger batch sizes (512 and up) we use linear warmup of the learning rate
during the first 5 epochs according to [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677); a sketch of the full schedule follows this list.
* Weight decay: 3.0517578125e-05 (1/32768).
* We do not apply Weight decay on batch norm trainable parameters (gamma/bias).
* Label Smoothing: 0.1.
* We train for:
* 90 Epochs -> 90 epochs is a standard for ResNet50
* 250 Epochs -> best possible accuracy.
* For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
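As referenced above, a hedged sketch of the full schedule (names and structure are illustrative; the actual implementation lives in `utils/learning_rate.py`):

```python
import numpy as np

def lr_at_step(step, steps_per_epoch, epochs=90, batch_size=256,
               base_lr=0.256, base_batch=256, warmup_epochs=5):
    lr = base_lr * batch_size / base_batch                # linear scaling rule
    warmup = warmup_epochs * steps_per_epoch if batch_size >= 512 else 0
    total = epochs * steps_per_epoch
    if step < warmup:
        return lr * (step + 1) / warmup                   # linear warmup
    progress = (step - warmup) / max(1, total - warmup)
    return 0.5 * lr * (1 + np.cos(np.pi * progress))      # cosine decay
```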
#### Data Augmentation
This model uses the following data augmentation (a code sketch follows the list):
* For training:
* Normalization.
* Random resized crop to 224x224.
* Scale from 8% to 100%.
* Aspect ratio from 3/4 to 4/3.
* Random horizontal flip.
* For inference:
* Normalization.
* Scale to 256x256.
* Center crop to 224x224.
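A rough TF1-style sketch of the training-side augmentation above (an assumption of how such a pipeline looks, not the repository's actual `image_processing.py`):

```python
import tensorflow as tf

def preprocess_for_train(image, height=224, width=224):
    # Random resized crop: scale 8%-100%, aspect ratio 3/4 to 4/3.
    begin, size, _ = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        bounding_boxes=tf.zeros([1, 0, 4]),  # no boxes: crop anywhere
        min_object_covered=0.1,
        aspect_ratio_range=(3.0 / 4.0, 4.0 / 3.0),
        area_range=(0.08, 1.0),
        use_image_if_no_bounding_boxes=True)
    image = tf.slice(image, begin, size)
    image = tf.image.resize_images(image, [height, width])
    image = tf.image.random_flip_left_right(image)
    # Normalization (the real pipeline uses per-channel mean/std).
    return tf.cast(image, tf.float32) / 255.0
```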
### Feature support matrix
The following features are supported by this model.
| Feature | ResNet-50 v1.5 TensorFlow |
|-----------------------|---------------------------|
|Multi-GPU training with [Horovod](https://github.com/horovod/horovod) | Yes |
|[NVIDIA DALI](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html) | Yes |
|Automatic mixed precision (AMP) | Yes |
#### Features
Multi-GPU training with Horovod - Our model uses Horovod to implement efficient multi-GPU training with NCCL.
For details, refer to the example sources in this repository or the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage).
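The core Horovod pattern used across this repo, as a minimal sketch (optimizer hyperparameters taken from the defaults above):

```python
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()                                         # one process per GPU
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

opt = tf.train.MomentumOptimizer(learning_rate=0.256, momentum=0.875)
opt = hvd.DistributedOptimizer(opt)                # NCCL allreduce of gradients
hooks = [hvd.BroadcastGlobalVariablesHook(0)]      # rank 0 syncs initial weights
```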
NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
with the DALI library. For details, refer to the example sources in this repository or the [DALI documentation](https://docs.nvidia.com/deeplearning/dali/index.html).
Automatic mixed precision (AMP) - The computation graph can be modified by TensorFlow at runtime to support mixed precision training.
A detailed explanation of mixed precision can be found in the next section.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
For information about:
* How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
* How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
* Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
#### Enabling mixed precision
Mixed precision is enabled in TensorFlow by using the Automatic Mixed Precision (TF-AMP) extension which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In TensorFlow, loss scaling can be applied statically by using simple multiplication of loss by a constant value or automatically, by TF-AMP. Automatic mixed precision makes all the adjustments internally in TensorFlow, providing two benefits over manual operations. First, programmers need not modify network model code, reducing development and maintenance effort. Second, using AMP maintains forward and backward compatibility with all the APIs for defining and running TensorFlow models.
To enable mixed precision, you can simply set the following environment variables inside your training script:
- Enable TF-AMP graph rewrite:
```
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
```
- Enable Automated Mixed Precision:
```
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
```
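
For the static alternative mentioned above, a hedged standalone sketch of fixed loss scaling (the repository wraps this pattern in `FixedLossScalerOptimizer` from `utils/optimizers.py`; the toy loss below is purely illustrative):

```python
import tensorflow as tf

loss_scale = 128.0                                       # fixed power of two
features = tf.placeholder(tf.float32, [None, 10])
weights = tf.get_variable("w", [10, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(features, weights)))

opt = tf.train.MomentumOptimizer(learning_rate=0.256, momentum=0.875)
grads_and_vars = opt.compute_gradients(loss * loss_scale)  # scale the loss up
unscaled = [(g / loss_scale, v)                            # scale grads back
            for g, v in grads_and_vars if g is not None]
train_op = opt.apply_gradients(unscaled)
```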
## Setup
The following section lists the requirements that you need to meet in order to use the ResNet50 v1.5 model.
### Requirements
This repository contains a Dockerfile which extends the TensorFlow NGC container and encapsulates all dependencies. Aside from these dependencies, ensure you have the following software:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html),
* [Accessing And Pulling From The NGC container registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry),
* [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running).
For those unable to use the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or FP32, perform the following steps using the default parameters of the ResNet-50 v1.5 model on the [ImageNet](http://www.image-net.org/) dataset. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/TensorFlow/Classification/RN50v1.5
```
2. Download and preprocess the dataset.
The ResNet50 v1.5 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/master/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
3. Build the ResNet-50 v1.5 TensorFlow NGC container.
```bash
docker build . -t nvidia_rn50
```
4. Start an interactive session in the NGC container to run training/inference.
After you build the container image, you can start an interactive CLI session with
```bash
nvidia-docker run --rm -it -v <path to imagenet>:/data/tfrecords --ipc=host nvidia_rn50
```
5. (Optional) Create index files to use DALI.
To allow proper sharding in a multi-GPU environment, DALI has to create index files for the dataset. To create index files, run inside the container:
```bash
bash ./utils/dali_index.sh /data/tfrecords <index file store location>
```
Index files can be created once and then reused. It is highly recommended to save them into a persistent location.
6. Start training.
To run training for a standard configuration (as described in [Default
configuration](#default-configuration): DGX1V, DGX2V, single GPU, FP16, FP32, 50, 90, and 250 epochs), run
one of the scripts in the `resnet50v1.5/training` directory. Ensure ImageNet is mounted in the
`/data/tfrecords` directory.
For example, to train on DGX-1 for 90 epochs using AMP, run:
`bash ./resnet50v1.5/training/AMP/DGX1_RN50_AMP_90E.sh`
Additionally, features like DALI data preprocessing or TensorFlow XLA can be enabled with
environmental variables when running those scripts:
`USE_XLA=1 USE_DALI=1 bash ./resnet50v1.5/training/AMP/DGX1_RN50_AMP_90E.sh`
To store results in a specific location, add a location as a first argument:
`bash ./resnet50v1.5/training/AMP/DGX1_RN50_AMP_90E.sh <location to store>`
7. Start validation/evaluation.
To evaluate the validation dataset located in `/data/tfrecords`, run `main.py` with
`--mode=evaluate`. For example:
`python main.py --mode=evaluate --data_dir=/data/tfrecords --batch_size <batch size> --model_dir
<model location> --result_dir <output location> [--use_xla] [--use_tf_amp]`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during evaluation.
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
In the root directory, the most important files are:
- `main.py`: the script that controls the logic of training and validation of the ResNet-like models
- `Dockerfile`: Instructions for Docker to build a container with the basic set of dependencies to run ResNet-like models for image classification
- `requirements.txt`: a set of extra Python requirements for running ResNet-like models
The `model/` directory contains the following modules used to define ResNet family models:
- `resnet.py`: the definition of the ResNet, ResNext, and SE-ResNext models
- `blocks/conv2d_block.py`: the definition of 2D convolution block
- `blocks/resnet_bottleneck_block.py`: the definition of ResNet-like bottleneck block
- `layers/*.py`: definitions of specific layers used in the ResNet-like model
The `utils/` directory contains the following utility modules:
- `cmdline_helper.py`: helper module for command line processing
- `data_utils.py`: module defining input data pipelines
- `dali_utils.py`: helper module for DALI
- `hvd_utils.py`: helper module for Horovod
- `image_processing.py`: image processing and data augmentation functions
- `learning_rate.py`: definition of the learning rate schedule used
- `optimizers.py`: definitions of the custom optimizers used
- `hooks/*.py`: definitions of specific hooks allowing logging of training and inference process
The `runtime/` directory contains the following module that defines the mechanics of the training process:
- `runner.py`: module encapsulating the training, inference and evaluation
### Parameters
#### The `main.py` script
The script for training and evaluating the ResNet-50 v1.5 model has a variety of parameters that control these processes.
```
usage: main.py [-h]
[--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
[--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}]
[--data_dir DATA_DIR] [--data_idx_dir DATA_IDX_DIR]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
[--batch_size BATCH_SIZE] [--num_iter NUM_ITER]
[--iter_unit {epoch,batch}] [--warmup_steps WARMUP_STEPS]
[--model_dir MODEL_DIR] [--results_dir RESULTS_DIR]
[--log_filename LOG_FILENAME] [--display_every DISPLAY_EVERY]
[--lr_init LR_INIT] [--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--loss_scale LOSS_SCALE]
[--label_smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--use_static_loss_scaling | --nouse_static_loss_scaling]
[--use_xla | --nouse_xla] [--use_dali | --nouse_dali]
[--use_tf_amp | --nouse_tf_amp]
[--use_cosine_lr | --nouse_cosine_lr] [--seed SEED]
[--gpu_memory_fraction GPU_MEMORY_FRACTION] [--gpu_id GPU_ID]
JoC-RN50v1.5-TF
optional arguments:
-h, --help Show this help message and exit
--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}
Architecture of model to run (default is resnet50)
--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}
The execution mode of the script.
--data_dir DATA_DIR Path to dataset in TFRecord format. Files should be
named 'train-*' and 'validation-*'.
--data_idx_dir DATA_IDX_DIR
Path to index files for DALI. Files should be named
'train-*' and 'validation-*'.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write the model. If undefined,
results directory will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which the training log is written.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--lr_init LR_INIT Initial value for the learning rate.
--lr_warmup_epochs LR_WARMUP_EPOCHS
Number of warmup epochs for the learning rate schedule.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--momentum MOMENTUM SGD momentum value for the momentum optimizer.
--loss_scale LOSS_SCALE
Loss scale for FP16 training and fast math FP32.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--mixup MIXUP The alpha parameter for mixup (if 0 then mixup is not
applied).
--use_static_loss_scaling
Use static loss scaling in FP16 or FP32 AMP.
--nouse_static_loss_scaling
--use_xla Enable XLA (Accelerated Linear Algebra) computation
for improved performance.
--nouse_xla
--use_dali Enable DALI data input.
--nouse_dali
--use_tf_amp Enable AMP to speed up FP32
computation using Tensor Cores.
--nouse_tf_amp
--use_cosine_lr Use cosine learning rate schedule.
--nouse_cosine_lr
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by the training script for DALI
--gpu_id GPU_ID Specify the ID of the target GPU on a multi-device platform.
Effective only for single-GPU mode.
```
### Inference process
To run inference on a single example with a checkpoint and a model script, use:
`python main.py --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during inference.
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark the training performance on a specific batch size, run:
* For 1 GPU
* FP32
`python ./main.py --mode=training_benchmark --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* FP16
`python ./main.py --mode=training_benchmark --use_tf_amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* For multiple GPUs
* FP32
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --mode=training_benchmark --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* FP16
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --mode=training_benchmark --use_tf_amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
with `--use_xla` and `--use_dali` flags.
Suggested batch sizes for training are 256 for mixed precision training and 128 for single precision training.
#### Inference performance benchmark
To benchmark the inference performance on a specific batch size, run:
* FP32
`python ./main.py --mode=inference_benchmark --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* FP16
`python ./main.py --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `/resnet50v1.5/training/{PRECISION}/DGX1_RN50_{PRECISION}_{EPOCHS}E.sh`
training script in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
| Epochs | Batch Size / GPU | Accuracy - FP32 | Accuracy - mixed precision |
|--------|------------------|-----------------|----------------------------|
| 50 | 128 (FP32) / 256 (AMP) | 76.06 | 75.96 |
| 90 | 128 (FP32) / 256 (AMP) | 77.08 | 77.01 |
| 250 | 128 (FP32) / 256 (AMP) | 78.34 | 78.35 |
**Example training loss plot**
![TrainingLoss](./imgs/train_loss.png)
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the steps from [Training performance benchmark](#training-performance-benchmark) in the
[TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in images per second) were averaged over an entire training epoch.
| GPUs | Batch Size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
| 1 | 128 (FP32) / 256 (AMP) | 369.62 img/s | 1099.55 img/s | 2.97x | 1.00x | 1.00x |
| 8 | 128 (FP32) / 256 (AMP) | 2785.81 img/s | 8277.91 img/s | 2.97x | 7.54x | 7.53x |
**XLA Enabled**
| GPUs | Batch Size / GPU | Throughput - mixed precision | Throughput - mixed precision + XLA | Throughput speedup (mixed precision - XLA) |
|----|------------|---------------|---------------------|-----------|
| 1 | 256 | 1099.55 img/s |1217.90 img/s | 1.11x |
| 8 | 256 | 8277.91 img/s |9485.21 img/s | 1.14x |
##### Training performance: NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running the steps from [Training performance benchmark](#training-performance-benchmark) in the
[TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in images per second) were averaged over an entire training epoch.
| GPUs | Batch Size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|----|---------------|---------------|-------------------------|-------|--------|--------|
| 1 | 128 (FP32) / 256 (AMP) | 385.26 img/s | 1020.80 img/s | 2.64x | 1.00x | 1.00x |
| 16 | 128 (FP32) / 256 (AMP) | 5347.83 img/s | 16290.42 img/s | 3.04x | 13.88x | 15.95x |
**XLA Enabled**
| GPUs | Batch Size / GPU | Throughput - mixed precision | Throughput - mixed precision + XLA | Throughput speedup (mixed precision - XLA) |
|----|-----|----------|---------------------|-----------|
| 1 | 256 | 1020.80 img/s | 1157.97 img/s |1.13x |
| 16 | 256 | 16290.42 img/s | 18304.71 img/s |1.12x |
#### Training time for 90 Epochs
##### Training time: NVIDIA DGX-1 (8x V100 16G)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-1-8x-v100-16g)
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
| GPUs | Time to train - mixed precision + XLA | Time to train - mixed precision | Time to train - FP32 |
|---|--------|---------|---------|
| 1 | ~26h | ~29h | ~86h |
| 8 | ~3.5h | ~4h | ~11.5h |
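These estimates follow directly from the throughput numbers above. For example, 90 epochs over the ~1.28M ImageNet training images at 8277.91 img/s (8 GPUs, mixed precision) works out to roughly 3.9 hours, consistent with the ~4h entry.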
##### Training time: NVIDIA DGX-2 (16x V100 32G)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-2-16x-v100-32g)
on NVIDIA DGX-2 with (16x V100 32G) GPUs.
| GPUs | Time to train - mixed precision + XLA | Time to train - mixed precision | Time to train - FP32 |
|----|-------|--------|-------|
| 1 | ~27h | ~31h | ~83h |
| 16 | ~1.7h | ~2h | ~6h |
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
Our results were obtained by running the `inference_benchmark.sh` benchmarking script
in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-1 with (1x V100 16G) GPU.
**FP32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |173.35 img/s |5.79 ms |5.90 ms |5.95 ms |6.04 ms |
|2 |303.65 img/s |6.61 ms |6.80 ms |6.87 ms |7.01 ms |
|4 |562.35 img/s |7.12 ms |7.32 ms |7.42 ms |7.69 ms |
|8 |783.24 img/s |10.22 ms |10.37 ms |10.44 ms |10.60 ms |
|16 |1003.10 img/s |15.99 ms |16.07 ms |16.12 ms |16.29 ms |
|32 |1140.12 img/s |28.19 ms |28.27 ms |28.38 ms |28.54 ms |
|64 |1252.06 img/s |51.12 ms |51.82 ms |52.75 ms |53.45 ms |
|128 |1324.91 img/s |96.61 ms |97.02 ms |97.25 ms |99.08 ms |
|256 |1348.52 img/s |189.85 ms |191.16 ms |191.77 ms |192.47 ms|
**Mixed Precision Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |237.35 img/s |4.25 ms |4.39 ms |4.54 ms |5.30 ms |
|2 |464.94 img/s |4.32 ms |4.63 ms |4.83 ms |5.52 ms |
|4 |942.44 img/s |4.26 ms |4.55 ms |4.74 ms |5.45 ms |
|8 |1454.93 img/s |5.57 ms |5.73 ms |5.91 ms |6.51 ms |
|16 |2003.75 img/s |8.13 ms |8.19 ms |8.29 ms |8.50 ms |
|32 |2356.17 img/s |13.69 ms |13.82 ms |13.92 ms |14.26 ms |
|64 |2706.11 img/s |23.86 ms |23.82 ms |23.89 ms |24.10 ms |
|128 |2770.61 img/s |47.04 ms |49.36 ms |62.43 ms |90.05 ms |
|256 |2742.14 img/s |94.67 ms |108.02 ms |119.34 ms |145.55 ms|
**Mixed Precision Inference Latency + XLA**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |162.95 img/s |6.16 ms |6.28 ms |6.34 ms |6.50 ms |
|2 |335.63 img/s |5.96 ms |6.10 ms |6.14 ms |6.25 ms |
|4 |637.72 img/s |6.30 ms |6.53 ms |7.17 ms |8.10 ms |
|8 |1153.92 img/s |7.03 ms |7.97 ms |8.22 ms |9.00 ms |
|16 |1906.52 img/s |8.64 ms |9.51 ms |9.88 ms |10.47 ms |
|32 |2492.78 img/s |12.84 ms |13.06 ms |13.13 ms |13.24 ms |
|64 |2910.05 img/s |22.66 ms |21.82 ms |24.71 ms |48.61 ms |
|128 |2964.31 img/s |45.25 ms |59.30 ms |71.42 ms |98.72 ms |
|256 |2898.12 img/s |90.53 ms |106.12 ms |118.12 ms |150.78 ms|
##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
Our results were obtained by running the `inference_benchmark.sh` benchmarking script
in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-2 with (1x V100 32G) GPU.
**FP32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |187.41 img/s |5.374 ms |5.61 ms |5.70 ms |6.33 ms |
|2 |339.52 img/s |5.901 ms |6.16 ms |6.29 ms |6.53 ms |
|4 |577.50 img/s |6.940 ms |7.07 ms |7.24 ms |7.99 ms |
|8 |821.15 img/s |9.751 ms |9.99 ms |10.15 ms |10.80 ms|
|16 |1055.64 img/s |15.209 ms |15.26 ms |15.30 ms |16.14 ms|
|32 |1195.74 img/s |26.772 ms |26.93 ms |26.98 ms |27.80 ms|
|64 |1313.83 img/s |48.796 ms |48.99 ms |49.72 ms |51.83 ms|
|128 |1372.58 img/s |93.262 ms |93.90 ms |94.97 ms |96.57 ms|
|256 |1414.99 img/s |180.923 ms |181.65 ms |181.92 ms |183.37 ms|
**Mixed Precision Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |289.89 img/s |3.50 ms |3.81 ms |3.90 ms |4.19 ms |
|2 |606.27 img/s |3.38 ms |3.56 ms |3.76 ms |4.25 ms |
|4 |982.92 img/s |4.09 ms |4.42 ms |4.53 ms |4.81 ms |
|8 |1553.34 img/s |5.22 ms |5.31 ms |5.50 ms |6.74 ms |
|16 |2091.27 img/s |7.82 ms |7.77 ms |7.82 ms |8.77 ms |
|32 |2457.61 img/s |13.14 ms |13.15 ms |13.21 ms |13.37 ms|
|64 |2746.11 img/s |23.31 ms |23.50 ms |23.56 ms |24.31 ms|
|128 |2937.20 img/s |43.58 ms |43.76 ms |43.82 ms |44.37 ms|
|256 |3009.83 img/s |85.06 ms |86.23 ms |87.37 ms |88.67 ms|
**Mixed Precision Inference Latency + XLA**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |240.66 img/s |4.22 ms |4.59 ms |4.69 ms |4.84 ms |
|2 |428.60 img/s |4.70 ms |5.11 ms |5.44 ms |6.01 ms |
|4 |945.38 img/s |4.26 ms |4.35 ms |4.42 ms |4.74 ms |
|8 |1518.66 img/s |5.33 ms |5.50 ms |5.63 ms |5.88 ms |
|16 |2091.66 img/s |7.83 ms |7.74 ms |7.79 ms |8.88 ms |
|32 |2604.17 img/s |12.40 ms |12.45 ms |12.51 ms |12.61 ms|
|64 |3101.15 img/s |20.64 ms |20.93 ms |21.00 ms |21.17 ms|
|128 |3408.72 img/s |37.55 ms |37.93 ms |38.05 ms |38.53 ms|
|256 |3633.85 img/s |70.85 ms |70.93 ms |71.12 ms |71.45 ms|
##### Inference performance: NVIDIA T4 (1x T4 16G)
Our results were obtained by running the `inference_benchmark.sh` benchmarking script
in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA T4 with (1x T4 16G) GPU.
**FP32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |136.44 img/s |7.34 ms |7.43 ms |7.47 ms |7.54 ms |
|2 |215.38 img/s |9.29 ms |9.42 ms |9.46 ms |9.59 ms |
|4 |289.29 img/s |13.83 ms |14.08 ms |14.16 ms |14.40 ms |
|8 |341.77 img/s |23.41 ms |23.79 ms |23.86 ms |24.11 ms |
|16 |394.36 img/s |40.58 ms |40.87 ms |40.98 ms |41.41 ms |
|32 |414.66 img/s |77.18 ms |78.05 ms |78.29 ms |78.67 ms |
|64 |424.42 img/s |150.82 ms |152.99 ms |153.44 ms |154.34 ms |
|128 |429.83 img/s |297.82 ms |301.09 ms |301.60 ms |302.51 ms |
|256 |425.72 img/s |601.37 ms |605.74 ms |606.47 ms |608.74 ms |
**Mixed Precision Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |211.04 img/s |4.77 ms |5.05 ms |5.08 ms |5.15 ms |
|2 |381.23 img/s |5.27 ms |5.40 ms |5.45 ms |5.52 ms |
|4 |593.13 img/s |6.75 ms |6.89 ms |6.956 ms |7.02 ms |
|8 |791.12 img/s |10.16 ms |10.35 ms |10.43 ms |10.68 ms |
|16 |914.26 img/s |17.55 ms |17.80 ms |17.89 ms |18.19 ms |
|32 |972.36 img/s |32.92 ms |33.33 ms |33.46 ms |33.61 ms |
|64 |991.39 img/s |64.56 ms |65.62 ms |65.92 ms |66.35 ms |
|128 |995.81 img/s |128.55 ms |130.03 ms |130.37 ms |131.08 ms |
|256 |993.39 img/s |257.71 ms |259.26 ms |259.62 ms |260.36 ms |
**Mixed Precision Inference Latency + XLA**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |167.01 img/s |6.01 ms |6.12 ms |6.14 ms |6.18 ms |
|2 |333.67 img/s |6.03 ms |6.11 ms |6.15 ms |6.23 ms |
|4 |605.94 img/s |6.63 ms |6.79 ms |6.86 ms |7.02 ms |
|8 |802.13 img/s |9.98 ms |10.14 ms |10.22 ms |10.36 ms |
|16 |986.85 img/s |16.27 ms |16.36 ms |16.42 ms |16.52 ms |
|32 |1090.38 img/s |29.35 ms |29.68 ms |29.79 ms |30.07 ms |
|64 |1131.56 img/s |56.63 ms |57.22 ms |57.41 ms |57.76 ms |
|128 |1167.62 img/s |109.77 ms |111.06 ms |111.27 ms |111.85 ms|
|256 |1193.74 img/s |214.46 ms |216.28 ms |216.86 ms |217.80 ms|
## Release notes
### Changelog
1. March, 2019
* Initial release
2. May, 2019
* Added DALI support
* Added scripts for DGX-2
* Added benchmark results for DGX-2 and XLA-enabled DGX-1 and DGX-2
3. July, 2019
* Added cosine learning rate schedule
4. August, 2019
* Added mixup regularization
* Added T4 benchmarks
* Improved inference capabilities
* Added SavedModel export
5. January, 2020
* Removed manual checks for dataset paths to facilitate cloud storage solutions
* Moved to a new logging solution
* Bumped base docker image version
6. March, 2020
* Code cleanup and refactoring
* Improved training process
7. June, 2020
* Added ResNext and SE-ResNext architectures
### Known issues
There are no known issues with this model.

Binary file not shown (new image, 59 KiB).

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
8 250 256 amp --mixup=0.2

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
8 50 256 amp

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
8 90 256 amp

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
16 250 256 amp --mixup=0.2

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
16 50 256 amp

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
16 90 256 amp

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
1 250 256 amp --mixup=0.2

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
1 50 256 amp

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
1 90 256 amp

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
8 250 256 fp16 --mixup=0.2

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
8 50 256 fp16

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
8 90 256 fp16

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
16 250 256 fp16 --mixup=0.2

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
16 50 256 fp16

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
16 90 256 fp16

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
1 250 256 fp16 --mixup=0.2

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
1 50 256 fp16

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
1 90 256 fp16

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
8 250 128 fp32 --mixup=0.2

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
8 50 128 fp32

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
8 90 128 fp32

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
16 250 128 fp32 --mixup=0.2

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
16 50 128 fp32

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
16 90 128 fp32

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
1 250 128 fp32 --mixup=0.2

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
1 50 128 fp32

View file

@ -0,0 +1,5 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
bash ${WORKSPACE}/resnet50v1.5/training/GENERIC.sh ${WORKSPACE} ${DATA_DIR} \
1 90 128 fp32

View file

@ -0,0 +1,39 @@
WORKSPACE=${1:-"/workspace/rn50v15_tf"}
DATA_DIR=${2:-"/data"}
GPU_COUNT=${3:-8}
ITER_COUNT=${4:-50}
BATCH_SIZE=${5:-128}
PRECISION=${6:-"fp32"}
OTHER=${@:7}
if [[ ! -z "${BIND_TO_SOCKET}" ]]; then
BIND_TO_SOCKET="--bind-to socket"
fi
if [[ ! -z "${USE_DALI}" ]]; then
USE_DALI="--use_dali --data_idx_dir=${DATA_DIR}/dali_idx"
fi
if [[ ! -z "${USE_XLA}" ]]; then
USE_XLA="--use_xla"
fi
CMD=""
case $PRECISION in
"fp32") CMD+="--precision=fp32";;
"fp16") CMD+="--precision=fp16 --use_static_loss_scaling --loss_scale=128";;
"amp") CMD+="--precision=fp32 --use_tf_amp --use_static_loss_scaling --loss_scale=128";;
esac
CMD="--arch=resnet50 --mode=train_and_evaluate --iter_unit=epoch --num_iter=${ITER_COUNT} \
--batch_size=${BATCH_SIZE} --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \
${CMD} --data_dir=${DATA_DIR}/tfrecords ${USE_DALI} ${USE_XLA} \
--results_dir=${WORKSPACE}/results --weight_init=fan_in ${OTHER}"
if [[ ${GPU_COUNT} -eq 1 ]]; then
python3 main.py ${CMD}
else
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np ${GPU_COUNT} python3 main.py ${CMD}
fi

View file

@ -0,0 +1,654 @@
# ResNext101-32x4d for TensorFlow
This repository provides a script and recipe to train the ResNext101-32x4d model to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Optimizer](#optimizer)
* [Data augmentation](#data-augmentation)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [The `main.py` script](#the-mainpy-script)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
* [Training time for 90 Epochs](#training-time-for-90-epochs)
* [Training time: NVIDIA DGX-1 (8x V100 16G)](#training-time-nvidia-dgx-1-8x-v100-16g)
* [Training time: NVIDIA DGX-2 (16x V100 32G)](#training-time-nvidia-dgx-2-16x-v100-32g)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
* [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
* [Inference performance: NVIDIA T4 (1x T4)](#inference-performance-nvidia-t4-1x-t4-16g)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The ResNeXt101-32x4d model was introduced in the [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf) paper.
It is based on a regular ResNet model, replacing the 3x3 convolutions inside the bottleneck block with 3x3 grouped convolutions.
The following performance optimizations were implemented in this model:
* JIT graph compilation with [XLA](https://www.tensorflow.org/xla)
* Multi-GPU training with [Horovod](https://github.com/horovod/horovod)
* Automatic mixed precision ([AMP](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html))
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
![ResNextArch](./imgs/ResNeXtArch.png)
_Image source: [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/pdf/1611.05431.pdf)_
The image shows the difference between the ResNet bottleneck block and the ResNeXt bottleneck block.
The ResNeXt bottleneck block splits a single convolution into multiple smaller, parallel convolutions.
The ResNeXt101-32x4d model has a cardinality of 32 and a bottleneck width of 4. This means that instead of a single convolution with 64 filters,
32 parallel convolutions with only 4 filters each are used.
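As a rough illustration of why grouped convolutions are cheaper, compare the weight counts of the 3x3 stage in the first bottleneck block, which has 128 input and output channels (32 groups x 4 filters). The back-of-the-envelope arithmetic below is an illustrative sketch, not code from this repository:
```python
# Weight counts for the 3x3 stage of the first bottleneck block
# (128 input/output channels, cardinality 32, bottleneck width 4).
k, c_in, c_out, groups = 3, 128, 128, 32

dense_params = k * k * c_in * c_out                  # ordinary 3x3 convolution
grouped_params = k * k * (c_in // groups) * c_out    # 32 parallel 3x3 convs, 4 input channels each

print(dense_params)    # 147456
print(grouped_params)  # 4608 -> 32x fewer weights at the same width
```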
### Default configuration
The following sections highlight the default configuration for the ResNext101-32x4d model.
#### Optimizer
This model uses the SGD optimizer with the following hyperparameters:
* Momentum: 0.875.
* Learning rate (LR): 0.256 for a batch size of 256; for other batch sizes we linearly scale the learning rate (see the sketch after this list).
* Learning rate schedule - we use a cosine LR schedule.
* For larger batch sizes (512 and up) we use linear warmup of the learning rate during the first 5 epochs, following [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677).
* Weight decay: 6.103515625e-05 (1/16384).
* We do not apply weight decay to batch norm trainable parameters (gamma/bias).
* Label smoothing: 0.1.
* We train for:
* 90 epochs -> the standard for ImageNet networks.
* 250 epochs -> best possible accuracy.
* For 250-epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
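The linear scaling rule and the warmup-plus-cosine schedule above can be summarized in a few lines. This is an illustrative sketch only; the repository's actual schedule lives in `utils/learning_rate.py`:
```python
import math

def learning_rate(step, steps_per_epoch, total_epochs, batch_size,
                  base_lr=0.256, base_batch=256, warmup_epochs=5):
    """Linearly scaled LR with linear warmup and cosine decay (sketch)."""
    lr_init = base_lr * batch_size / base_batch       # linear scaling rule
    warmup_steps = warmup_epochs * steps_per_epoch
    total_steps = total_epochs * steps_per_epoch
    if step < warmup_steps:                           # linear warmup
        return lr_init * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    return 0.5 * lr_init * (1.0 + math.cos(math.pi * progress))  # cosine decay

# e.g. global batch 1024 (8 GPUs x 128) -> lr_init = 0.256 * 1024 / 256 = 1.024
```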
#### Data augmentation
This model uses the following data augmentation:
* For training:
* Normalization.
* Random resized crop to 224x224.
* Scale from 8% to 100%.
* Aspect ratio from 3/4 to 4/3.
* Random horizontal flip.
* For inference:
* Normalization.
* Scale to 256x256.
* Center crop to 224x224.
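For the 250-epoch runs, mixup (enabled with `--mixup=0.2` in the training scripts) additionally blends random pairs of training examples and their labels. A minimal NumPy sketch of the idea, assuming one-hot labels (illustrative, not the repository's implementation):
```python
import numpy as np

def mixup_batch(images, labels, alpha=0.2):
    """Blend a batch with a shuffled copy of itself (illustrative sketch)."""
    lam = np.random.beta(alpha, alpha)           # mixing coefficient
    idx = np.random.permutation(len(images))     # random pairing
    mixed_images = lam * images + (1.0 - lam) * images[idx]
    mixed_labels = lam * labels + (1.0 - lam) * labels[idx]
    return mixed_images, mixed_labels
```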
### Feature support matrix
The following features are supported by this model.
| Feature | ResNext101-32x4d TensorFlow |
|-----------------------|-----------------------------|
|Multi-GPU training with [Horovod](https://github.com/horovod/horovod) | Yes |
|[NVIDIA DALI](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html) | Yes |
|Automatic mixed precision (AMP) | Yes |
#### Features
Multi-GPU training with Horovod - Our model uses Horovod to implement efficient multi-GPU training with NCCL.
For details, refer to the example sources in this repository or the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage).
NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
with the DALI library. For details, refer to the example sources in this repository or the [DALI documentation](https://docs.nvidia.com/deeplearning/dali/index.html).
Automatic mixed precision (AMP) - the computation graph can be modified by TensorFlow at runtime to support mixed precision training.
A detailed explanation of mixed precision can be found in the next section.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
For information about:
* How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
* How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
* Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
#### Enabling mixed precision
Mixed precision is enabled in TensorFlow by using the Automatic Mixed Precision (TF-AMP) extension which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In TensorFlow, loss scaling can be applied statically by using simple multiplication of loss by a constant value or automatically, by TF-AMP. Automatic mixed precision makes all the adjustments internally in TensorFlow, providing two benefits over manual operations. First, programmers need not modify network model code, reducing development and maintenance effort. Second, using AMP maintains forward and backward compatibility with all the APIs for defining and running TensorFlow models.
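As a concrete illustration of static loss scaling, the TF1-style sketch below scales the loss before gradient computation and unscales the gradients before the update. This is illustrative only, not this repository's implementation (the training scripts pass `--use_static_loss_scaling --loss_scale=128`):
```python
import tensorflow as tf  # TF1-style graph API, as used in this repository

# Static loss scaling sketch: a toy variable and loss stand in for the model.
w = tf.Variable(1.0)
loss = tf.square(w - 3.0)

loss_scale = 128.0
opt = tf.train.MomentumOptimizer(learning_rate=0.256, momentum=0.875)
grads_and_vars = opt.compute_gradients(loss * loss_scale)        # scale up
unscaled = [(g / loss_scale, v) for g, v in grads_and_vars]      # scale back down
train_op = opt.apply_gradients(unscaled)
```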
To enable mixed precision, you can simply set the following environment variables inside your training script:
- Enable TF-AMP graph rewrite:
```
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
```
- Enable Automated Mixed Precision:
```
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
```
## Setup
The following section lists the requirements that you need to meet in order to use the ResNext101-32x4d model.
### Requirements
This repository contains a Dockerfile that extends the TensorFlow NGC container and encapsulates all dependencies. Aside from these dependencies, ensure you have the following software:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html),
* [Accessing And Pulling From The NGC container registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry),
* [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running).
For those unable to use the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or FP32, perform the following steps using the default parameters of the ResNext101-32x4d model on the [ImageNet](http://www.image-net.org/) dataset. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/TensorFlow/Classification/RN50v1.5
```
2. Download and preprocess the dataset.
The ResNext101-32x4d script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/master/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
3. Build the ResNext101-32x4d TensorFlow NGC container.
```bash
docker build . -t nvidia_rn50
```
4. Start an interactive session in the NGC container to run training/inference.
After you build the container image, you can start an interactive CLI session with
```bash
nvidia-docker run --rm -it -v <path to imagenet>:/data/tfrecords --ipc=host nvidia_rn50
```
5. (Optional) Create index files to use DALI.
To allow proper sharding in a multi-GPU environment, DALI has to create index files for the dataset. To create index files, run inside the container:
```bash
bash ./utils/dali_index.sh /data/tfrecords <index file store location>
```
Index files can be created once and then reused. It is highly recommended to save them into a persistent location.
6. Start training.
To run training for a standard configuration (as described in [Default
configuration](#default-configuration), DGX1V, DGX2V, single GPU, FP16, FP32, 90, and 250 epochs), run
one of the scripts in the `resnext101-32x4d/training` directory. Ensure ImageNet is mounted in the
`/data/tfrecords` directory.
For example, to train on DGX-1 for 90 epochs using AMP, run:
`bash ./resnext101-32x4d/training/AMP/DGX1_RNxt101-32x4d_AMP_90E.sh`
Additionally, features like DALI data preprocessing or TensorFlow XLA can be enabled with
environment variables when running those scripts:
`USE_XLA=1 USE_DALI=1 bash ./resnext101-32x4d/training/AMP/DGX1_RNxt101-32x4d_AMP_90E.sh`
To store results in a specific location, pass the output location as the first argument:
`bash ./resnext101-32x4d/training/AMP/DGX1_RNxt101-32x4d_AMP_90E.sh <location to store>`
7. Start validation/evaluation.
To evaluate the validation dataset located in `/data/tfrecords`, run `main.py` with
`--mode=evaluate`. For example:
`python main.py --arch=resnext101-32x4d --mode=evaluate --data_dir=/data/tfrecords --batch_size <batch size> --model_dir
<model location> --result_dir <output location> [--use_xla] [--use_tf_amp]`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during evaluation.
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
In the root directory, the most important files are:
- `main.py`: the script that controls the logic of training and validation of the ResNet-like models
- `Dockerfile`: instructions for Docker to build a container with the basic set of dependencies to run ResNet-like models for image classification
- `requirements.txt`: a set of extra Python requirements for running ResNet-like models
The `model/` directory contains the following modules used to define ResNet family models:
- `resnet.py`: the definition of the ResNet, ResNext, and SE-ResNext models
- `blocks/conv2d_block.py`: the definition of the 2D convolution block
- `blocks/resnet_bottleneck_block.py`: the definition of the ResNet-like bottleneck block
- `layers/*.py`: definitions of specific layers used in the ResNet-like model
The `utils/` directory contains the following utility modules:
- `cmdline_helper.py`: helper module for command line processing
- `data_utils.py`: module defining input data pipelines
- `dali_utils.py`: helper module for DALI
- `hvd_utils.py`: helper module for Horovod
- `image_processing.py`: image processing and data augmentation functions
- `learning_rate.py`: definition of the learning rate schedule used
- `optimizers.py`: definitions of the custom optimizers used
- `hooks/*.py`: definitions of hooks that enable logging of the training and inference process
The `runtime/` directory contains the following module that defines the mechanics of the training process:
- `runner.py`: module encapsulating the training, inference, and evaluation logic
### Parameters
#### The `main.py` script
The script for training and evaluating the ResNext101-32x4d model has a variety of parameters that control these processes.
```
usage: main.py [-h]
[--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
[--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}]
[--data_dir DATA_DIR] [--data_idx_dir DATA_IDX_DIR]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
[--batch_size BATCH_SIZE] [--num_iter NUM_ITER]
[--iter_unit {epoch,batch}] [--warmup_steps WARMUP_STEPS]
[--model_dir MODEL_DIR] [--results_dir RESULTS_DIR]
[--log_filename LOG_FILENAME] [--display_every DISPLAY_EVERY]
[--lr_init LR_INIT] [--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--loss_scale LOSS_SCALE]
[--label_smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--use_static_loss_scaling | --nouse_static_loss_scaling]
[--use_xla | --nouse_xla] [--use_dali | --nouse_dali]
[--use_tf_amp | --nouse_tf_amp]
[--use_cosine_lr | --nouse_cosine_lr] [--seed SEED]
[--gpu_memory_fraction GPU_MEMORY_FRACTION] [--gpu_id GPU_ID]
JoC-RN50v1.5-TF
optional arguments:
-h, --help Show this help message and exit
--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}
Architecture of model to run (to run ResNext101-32x4d set
--arch=resnext101-32x4d)
--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}
The execution mode of the script.
--data_dir DATA_DIR Path to dataset in TFRecord format. Files should be
named 'train-*' and 'validation-*'.
--data_idx_dir DATA_IDX_DIR
Path to index files for DALI. Files should be named
'train-*' and 'validation-*'.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write the model. If undefined,
results directory will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which the training log is written.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--lr_init LR_INIT Initial value for the learning rate.
--lr_warmup_epochs LR_WARMUP_EPOCHS
Number of warmup epochs for the learning rate schedule.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--momentum MOMENTUM SGD momentum value for the momentum optimizer.
--loss_scale LOSS_SCALE
Loss scale for FP16 training and fast math FP32.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--mixup MIXUP The alpha parameter for mixup (if 0 then mixup is not
applied).
--use_static_loss_scaling
Use static loss scaling in FP16 or FP32 AMP.
--nouse_static_loss_scaling
--use_xla Enable XLA (Accelerated Linear Algebra) computation
for improved performance.
--nouse_xla
--use_dali Enable DALI data input.
--nouse_dali
--use_tf_amp Enable AMP to speed up FP32
computation using Tensor Cores.
--nouse_tf_amp
--use_cosine_lr Use cosine learning rate schedule.
--nouse_cosine_lr
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by the training script for DALI
--gpu_id GPU_ID Specify the ID of the target GPU on a multi-device platform.
Effective only for single-GPU mode.
```
### Inference process
To run inference on a single example with a checkpoint and a model script, use:
`python main.py --arch=resnext101-32x4d --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during inference.
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark the training performance on a specific batch size, run:
* For 1 GPU
* FP32
`python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* FP16
`python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --use_tf_amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* For multiple GPUs
* FP32
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* FP16
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=resnext101-32x4d --mode=training_benchmark --use_tf_amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
with the `--use_xla` and `--use_dali` flags.
Suggested batch sizes for training are 128 for mixed precision training and 64 for single precision training per single V100 16 GB.
#### Inference performance benchmark
To benchmark the inference performance on a specific batch size, run:
* FP32
`python ./main.py --arch=resnext101-32x4d --mode=inference_benchmark --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* FP16
`python ./main.py --arch=resnext101-32x4d --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
The benchmark can be automated with the `inference_benchmark.sh` script provided in `resnext101-32x4d`, by simply running:
`bash ./resnext101-32x4d/inference_benchmark.sh <data dir> <data idx dir>`
The `<data dir>` parameter refers to the input data directory (by default `/data/tfrecords` inside the container).
By default, the benchmark tests the following configurations: **FP32**, **AMP**, **AMP + XLA** with different batch sizes.
When the optional directory with the DALI index files `<data idx dir>` is specified, the benchmark executes an additional **DALI + AMP + XLA** configuration.
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `/resnext101-32x4d/training/{PRECISION}/DGX1_RNxt101-32x4d_{PRECISION}_{EPOCHS}E.sh`
training script in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
| Epochs | Batch Size / GPU | Accuracy - FP32 | Accuracy - mixed precision |
|--------|------------------|-----------------|----------------------------|
| 90 | 64 (FP32) / 128 (AMP) | 79.34 | 79.31 |
| 250 | 64 (FP32) / 128 (AMP) | 80.21 | 80.21 |
**Example training loss plot**
![TrainingLoss](./imgs/train_loss.png)
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the steps from [Training performance benchmark](#training-performance-benchmark) in the
[TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in images per second) were averaged over an entire training epoch.
| GPUs | Batch Size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|----|---------------|---------------|------------------------|-----------------|-----------|-------------------|
| 1 | 64 (FP32) / 128 (AMP) | 142.10 img/s | 423.19 img/s | 2.97x | 1.00x | 1.00x |
| 8 | 64 (FP32) / 128 (AMP) | 1055.82 img/s | 3151.81 img/s | 2.98x | 7.43x | 7.44x |
**XLA Enabled**
| GPUs | Batch Size / GPU | Throughput - mixed precision | Throughput - mixed precision + XLA | Throughput speedup (mixed precision - XLA) |
|----|------------|---------------|---------------------|-----------|
| 1 | 128 | 423.19 img/s | 588.49 img/s | 1.39x |
| 8 | 128 | 3151.81 img/s | 4231.42 img/s | 1.34x |
##### Training performance: NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running the steps from [Training performance benchmark](#training-performance-benchmark) in the
[TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in images per second) were averaged over an entire training epoch.
| GPUs | Batch Size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|----|---------------|---------------|-------------------------|-------|--------|--------|
| 1 | 64 (FP32) / 128 (AMP) | 148.19 img/s | 403.13 img/s | 2.72x | 1.00x | 1.00x |
| 16 | 64 (FP32) / 128 (AMP) | 1961.31 img/s | 5601.13 img/s | 2.86x | 13.23x | 13.89x |
**XLA Enabled**
| GPUs | Batch Size / GPU | Throughput - mixed precision | Throughput - mixed precision + XLA | Throughput speedup (mixed precision - XLA) |
|----|-----|----------|---------------------|-----------|
| 1 | 128 | 403.13 img/s | 555.33 img/s |1.13x |
| 16 | 128 | 5601.13 img/s | 7617.25 img/s |1.36x |
#### Training time for 90 Epochs
##### Training time: NVIDIA DGX-1 (8x V100 16G)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-1-8x-v100-16g)
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
| GPUs | Time to train - mixed precision + XLA | Time to train - mixed precision | Time to train - FP32 |
|---|--------|---------|---------|
| 1 | ~54h | ~75h | ~225h |
| 8 | ~7.5h | ~10h | ~30h |
##### Training time: NVIDIA DGX-2 (16x V100 32G)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-2-16x-v100-32g)
on NVIDIA DGX-2 with (16x V100 32G) GPUs.
| GPUs | Time to train - mixed precision + XLA | Time to train - mixed precision | Time to train - FP32 |
|----|-------|--------|-------|
| 1 | ~57h | ~79h | ~216h |
| 16 | ~4.2h | ~6h | ~16h |
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
Our results were obtained by running the `inference_benchmark.sh` benchmarking script
in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-1 with (1x V100 16G) GPU.
**FP32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |61.19 img/s |16.36 ms |16.66 ms |16.87 ms |17.31 ms |
|2 |120.52 img/s |16.60 ms |16.91 ms |17.00 ms |17.60 ms |
|4 |179.63 img/s |22.26 ms |22.44 ms |22.50 ms |22.73 ms |
|8 |287.94 img/s |27.78 ms |27.97 ms |28.08 ms |28.30 ms |
|16 |403.04 img/s |39.72 ms |39.93 ms |40.01 ms |40.29 ms |
|32 |463.61 img/s |69.03 ms |69.68 ms |70.99 ms |71.48 ms |
|64 |530.00 img/s |120.75 ms |121.12 ms |121.38 ms |123.17 ms |
|128 |570.60 img/s |224.32 ms |224.84 ms |224.98 ms |225.72 ms |
**Mixed Precision Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |164.92 img/s |6.10 ms |6.17 ms |6.26 ms |7.73 ms |
|2 |326.59 img/s |6.14 ms |6.32 ms |6.39 ms |6.62 ms |
|4 |607.20 img/s |6.60 ms |6.77 ms |6.88 ms |8.08 ms |
|8 |892.31 img/s |8.97 ms |9.13 ms |9.49 ms |9.86 ms |
|16 |1259.92 img/s |12.82 ms |13.31 ms |13.44 ms |13.58 ms |
|32 |1508.73 img/s |21.30 ms |21.70 ms |21.86 ms |22.02 ms |
|64 |1618.77 img/s |39.55 ms |40.71 ms |41.33 ms |41.94 ms |
|128 |1730.40 img/s |73.98 ms |74.27 ms |76.01 ms |76.74 ms |
**Mixed Precision Inference Latency + XLA**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |97.24 img/s |10.31 ms |10.48 ms |10.57 ms |10.81 ms |
|2 |191.68 img/s |10.44 ms |10.74 ms |10.84 ms |11.42 ms |
|4 |381.19 img/s |10.50 ms |10.85 ms |10.98 ms |11.74 ms |
|8 |744.11 img/s |10.77 ms |11.42 ms |11.85 ms |12.44 ms |
|16 |1174.29 img/s |13.83 ms |13.87 ms |14.29 ms |15.53 ms |
|32 |1439.07 img/s |22.33 ms |22.67 ms |22.84 ms |23.06 ms |
|64 |1712.76 img/s |37.37 ms |37.91 ms |38.09 ms |38.74 ms |
|128 |1883.71 img/s |67.95 ms |68.48 ms |68.63 ms |68.86 ms |
##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
Our results were obtained by running the `inference_benchmark.sh` benchmarking script
in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-2 with (1x V100 32G) GPU.
**FP32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |62.02 img/s |16.22 ms |17.62 ms |17.92 ms |19.21 ms |
|2 |97.98 img/s |20.54 ms |20.59 ms |20.72 ms |23.21 ms |
|4 |168.16 img/s |23.79 ms |24.12 ms |24.24 ms |26.94 ms |
|8 |269.89 img/s |29.66 ms |30.01 ms |30.35 ms |34.05 ms|
|16 |379.81 img/s |42.14 ms |42.47 ms |42.85 ms |47.63 ms|
|32 |466.04 img/s |68.67 ms |68.99 ms |69.26 ms |74.87 ms|
|64 |547.64 img/s |117.01 ms |117.59 ms |118.37 ms |122.83 ms|
|128 |603.44 img/s |212.21 ms |212.92 ms |214.09 ms |217.06 ms|
**Mixed Precision Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |198.53 img/s |5.14 ms |5.23 ms |5.41 ms |5.54 ms |
|2 |343.00 img/s |6.14 ms |6.08 ms |6.26 ms |7.72 ms |
|4 |592.25 img/s |6.77 ms |7.06 ms |7.18 ms |8.70 ms |
|8 |918.45 img/s |8.72 ms |8.90 ms |9.09 ms |9.77 ms |
|16 |1306.53 img/s |12.60 ms |12.65 ms |12.91 ms |17.06 ms |
|32 |1483.83 img/s |21.56 ms |21.61 ms |21.84 ms |27.05 ms|
|64 |1668.63 img/s |38.39 ms |38.50 ms |40.15 ms |43.15 ms|
|128 |1748.25 img/s |73.35 ms |75.23 ms |78.82 ms |80.17 ms|
**Mixed Precision Inference Latency + XLA**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |143.65 img/s |6.97 ms |7.15 ms |7.24 ms |7.95 ms |
|2 |282.21 img/s |7.09 ms |7.32 ms |7.56 ms |7.97 ms |
|4 |511.55 img/s |7.85 ms |8.42 ms |8.62 ms |9.02 ms |
|8 |870.60 img/s |9.23 ms |9.46 ms |9.54 ms |9.88 ms |
|16 |1179.93 img/s |13.62 ms |14.04 ms |14.19 ms |14.51 ms|
|32 |1512.36 img/s |21.19 ms |21.70 ms |21.80 ms |22.04 ms|
|64 |1805.38 img/s |35.56 ms |36.33 ms |36.48 ms |36.94 ms|
|128 |1947.49 img/s |65.88 ms |66.50 ms |66.72 ms |67.17 ms|
##### Inference performance: NVIDIA T4 (1x T4 16G)
Our results were obtained by running the `inference_benchmark.sh` benchmarking script
in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA T4 with (1x T4 16G) GPU.
**FP32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |54.23 img/s |18.48 ms |19.62 ms |19.78 ms |20.13 ms |
|2 |94.22 img/s |21.24 ms |21.58 ms |21.71 ms |21.97 ms |
|4 |127.71 img/s |31.33 ms |31.90 ms |32.10 ms |32.50 ms |
|8 |151.88 img/s |52.67 ms |53.45 ms |53.80 ms |54.12 ms |
|16 |163.01 img/s |98.16 ms |99.52 ms |99.94 ms |100.49 ms |
|32 |176.13 img/s |181.71 ms |183.91 ms |184.54 ms |185.60 ms |
|64 |183.40 img/s |349.00 ms |352.65 ms |353.55 ms |355.03 ms |
|128 |182.77 img/s |700.35 ms |707.89 ms |708.80 ms |710.28 ms |
**Mixed Precision Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |157.87 img/s |6.36 ms |6.47 ms |6.52 ms |6.64 ms |
|2 |274.10 img/s |7.29 ms |7.41 ms |7.45 ms |7.51 ms |
|4 |395.41 img/s |10.12 ms |10.35 ms |10.41 ms |10.53 ms |
|8 |479.83 img/s |16.68 ms |16.92 ms |17.01 ms |17.15 ms |
|16 |525.83 img/s |30.47 ms |30.80 ms |30.89 ms |31.27 ms |
|32 |536.31 img/s |59.67 ms |60.35 ms |60.51 ms |60.96 ms |
|64 |541.26 img/s |118.25 ms |119.51 ms |119.77 ms |120.38 ms |
|128 |538.20 img/s |237.84 ms |240.41 ms |240.82 ms |241.72 ms |
**Mixed Precision Inference Latency + XLA**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |104.10 img/s |9.63 ms |9.75 ms |9.78 ms |9.86 ms |
|2 |220.23 img/s |9.08 ms |9.22 ms |9.26 ms |9.35 ms |
|4 |361.55 img/s |11.06 ms |11.19 ms |11.29 ms |11.68 ms |
|8 |452.95 img/s |17.66 ms |17.92 ms |18.00 ms |18.12 ms |
|16 |522.64 img/s |30.65 ms |30.92 ms |31.04 ms |31.36 ms |
|32 |542.06 img/s |59.03 ms |59.63 ms |59.77 ms |60.25 ms |
|64 |536.14 img/s |119.37 ms |120.31 ms |120.68 ms |121.39 ms |
|128 |548.43 img/s |233.50 ms |234.83 ms |235.31 ms |236.29 ms|
## Release notes
### Changelog
June 2020
- Initial release
### Known issues
There are no known issues with this model.

Binary file not shown (new image, 78 KiB).

Binary file not shown (new image, 50 KiB).

View file

@ -0,0 +1,33 @@
#!/bin/bash
DATA_DIR=${1:-"/data/tfrecords"}
DALI_DIR=${2}
BATCH_SIZE_TO_TEST="1 2 4 8 16 32 64 128"
INFERENCE_BENCHMARK=$(mktemp /tmp/inference-benchmark.XXXXXX)
function test_configuration() {
echo "Testing configuration: $1" | tee -a $INFERENCE_BENCHMARK
for BATCH in $BATCH_SIZE_TO_TEST; do
python ./main.py --arch=resnext101-32x4d --mode=inference_benchmark --warmup_steps 50 --num_iter 400 --iter_unit batch \
--batch_size $BATCH --data_dir=$DATA_DIR --results_dir=/tmp/results $2 | tail -n2 | head -n1 | sed \
's/^DLL \([0-9]*-\)*[0-9]* \([0-9]*:\)*[0-9]*.[0-9]* - ()/Results for BS='$BATCH'/' | tee -a $INFERENCE_BENCHMARK
if [ ! $? -eq 0 ]; then
echo "Failed test on batch size $BATCH_SIZE"
exit 1
fi
done
}
test_configuration "FP32 nodali noxla"
test_configuration "FP16 nodali noxla" "--use_tf_amp"
test_configuration "FP16 nodali xla" "--use_tf_amp --use_xla"
if [ ! -z $DALI_DIR ]; then
test_configuration "FP16 dali xla" "--use_tf_amp --use_xla --use_dali --data_idx_dir ${DALI_DIR}"
fi
cat $INFERENCE_BENCHMARK
rm $INFERENCE_BENCHMARK

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
8 250 128 amp --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
8 90 128 amp

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
16 250 128 amp --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
16 90 128 amp

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
1 250 128 amp --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
1 90 128 amp

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
8 250 64 fp32 --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
8 90 64 fp32

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
16 250 64 fp32 --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
16 90 64 fp32

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
1 250 64 fp32 --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
1 90 64 fp32

View file

@ -0,0 +1,39 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
DATA_DIR=${2:-"/data"}
GPU_COUNT=${3:-8}
ITER_COUNT=${4:-50}
BATCH_SIZE=${5:-128}
PRECISION=${6:-"fp32"}
OTHER=${@:7}
if [[ ! -z "${BIND_TO_SOCKET}" ]]; then
BIND_TO_SOCKET="--bind-to socket"
fi
if [[ ! -z "${USE_DALI}" ]]; then
USE_DALI="--use_dali --data_idx_dir=${DATA_DIR}/dali_idx"
fi
if [[ ! -z "${USE_XLA}" ]]; then
USE_XLA="--use_xla"
fi
CMD=""
case $PRECISION in
"fp32") CMD+="--precision=fp32";;
"fp16") CMD+="--precision=fp16 --use_static_loss_scaling --loss_scale=128";;
"amp") CMD+="--precision=fp32 --use_tf_amp --use_static_loss_scaling --loss_scale=128";;
esac
CMD="--arch=resnext101-32x4d --mode=train_and_evaluate --iter_unit=epoch --num_iter=${ITER_COUNT} \
--batch_size=${BATCH_SIZE} --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
${CMD} --data_dir=${DATA_DIR}/tfrecords ${USE_DALI} ${USE_XLA} \
--results_dir=${RESULT_DIR} --weight_init=fan_in ${OTHER}"
if [[ ${GPU_COUNT} -eq 1 ]]; then
python3 main.py ${CMD}
else
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np ${GPU_COUNT} python3 main.py ${CMD}
fi

View file

@ -19,10 +19,11 @@ import multiprocessing
import warnings
import tensorflow as tf
import numpy as np
import horovod.tensorflow as hvd
from model import resnet_v1_5
from model import resnet
from utils import hooks
from utils import data_utils
@ -30,21 +31,23 @@ from utils import hvd_utils
from runtime import runner_utils
from dllogger.logger import LOGGER
import dllogger
__all__ = [
'Runner',
]
class Runner(object):
def __init__(
self,
# ========= Model HParams ========= #
n_classes=1001,
input_format='NHWC', # NCHW or NHWC
architecture='resnet50',
input_format='NHWC', # NCHW or NHWC
compute_format='NCHW', # NCHW or NHWC
dtype=tf.float32, # tf.float32 or tf.float16
dtype=tf.float32, # tf.float32 or tf.float16
n_channels=3,
height=224,
width=224,
@ -53,6 +56,7 @@ class Runner(object):
log_dir=None,
data_dir=None,
data_idx_dir=None,
weight_init="fan_out",
# ======= Optimization HParams ======== #
use_xla=False,
@ -60,7 +64,7 @@ class Runner(object):
use_dali=False,
gpu_memory_fraction=1.0,
gpu_id=0,
# ======== Debug Flags ======== #
debug_verbosity=0,
seed=None
@ -78,10 +82,6 @@ class Runner(object):
if n_channels not in [1, 3]:
raise ValueError("Unsupported number of channels: %d (allowed: 1 (grayscale) and 3 (color))" % n_channels)
if data_dir is not None and not os.path.exists(data_dir):
raise ValueError("The `data_dir` received does not exists: %s" % data_dir)
hvd.init()
tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None
# ============================================
@ -106,21 +106,20 @@ class Runner(object):
os.environ['TF_SYNC_ON_FINISH'] = '0'
os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
os.environ['TF_DISABLE_NVTX_RANGES'] = '1'
os.environ["TF_XLA_FLAGS"] = (os.environ.get("TF_XLA_FLAGS", "") + " --tf_xla_enable_lazy_compilation=false")
# ============================================
# TF-AMP Setup - Do not remove
# ============================================
if dtype == tf.float16:
if use_tf_amp:
raise RuntimeError("TF AMP can not be activated for FP16 precision")
elif use_tf_amp:
if hvd.rank() == 0:
LOGGER.log("TF AMP is activated - Experimental Feature")
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
else:
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "0"
# =================================================
@ -135,12 +134,8 @@ class Runner(object):
distort_colors=distort_colors,
seed=tf_seed
)
if use_dali:
num_preprocessing_threads=4
else:
num_preprocessing_threads=10
num_preprocessing_threads = 10 if not use_dali else 4
run_config_performance = tf.contrib.training.HParams(
num_preprocessing_threads=num_preprocessing_threads,
use_tf_amp=use_tf_amp,
@ -160,20 +155,30 @@ class Runner(object):
self.run_hparams = Runner._build_hparams(model_hparams, run_config_additional, run_config_performance)
self._model = resnet_v1_5.ResnetModel(
model_name="resnet50_v1.5",
model_name = architecture
architecture = resnet.model_architectures[architecture]
self._model = resnet.ResnetModel(
model_name=model_name,
n_classes=model_hparams.n_classes,
layers_count=architecture["layers"],
layers_depth=architecture["widths"],
expansions=architecture["expansions"],
input_format=model_hparams.input_format,
compute_format=model_hparams.compute_format,
dtype=model_hparams.dtype,
use_dali=use_dali
weight_init=weight_init,
use_dali=use_dali,
cardinality=architecture['cardinality'] if 'cardinality' in architecture else 1,
use_se=architecture['use_se'] if 'use_se' in architecture else False,
se_ratio=architecture['se_ratio'] if 'se_ratio' in architecture else 1
)
if self.run_hparams.seed is not None:
if hvd.rank() == 0:
LOGGER.log("Deterministic Run - Seed: %d" % seed)
tf.set_random_seed(self.run_hparams.seed)
self.training_logging_hook = None
self.eval_logging_hook = None
@staticmethod
def _build_hparams(*args):
@ -209,34 +214,36 @@ class Runner(object):
def _get_session_config(mode, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
if mode not in ["train", 'validation', 'benchmark', 'inference']:
raise ValueError("Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode)
raise ValueError(
"Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode
)
# Limit available GPU memory (tune the size)
if use_dali:
LOGGER.log("DALI is activated, GPU memory fraction used for training is limited to", gpu_memory_fraction)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_memory_fraction)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_memory_fraction)
config = tf.ConfigProto(gpu_options=gpu_options)
config.gpu_options.allow_growth = False
else:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
config.log_device_placement = False
config.gpu_options.visible_device_list = str(gpu_id)
if hvd_utils.is_using_hvd():
config.gpu_options.visible_device_list = str(hvd.local_rank())
if use_xla:
LOGGER.log("XLA is activated - Experimental Feature")
if use_xla:
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
config.gpu_options.force_gpu_compatible = True # Force pinned memory
# Bug 2939231 - disable bn+relu fusion
from tensorflow.core.protobuf import rewriter_config_pb2
config.graph_options.rewrite_options.remapping = (rewriter_config_pb2.RewriterConfig.OFF)
if mode == 'train':
config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
@ -251,7 +258,9 @@ class Runner(object):
def _get_run_config(mode, model_dir, use_xla, use_dali, gpu_memory_fraction, gpu_id=0, seed=None):
if mode not in ["train", 'validation', 'benchmark', 'inference']:
raise ValueError("Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode)
raise ValueError(
"Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode
)
if seed is not None:
if hvd_utils.is_using_hvd():
@ -267,7 +276,9 @@ class Runner(object):
save_summary_steps=100 if mode in ['train', 'validation'] else 1e9, # disabled in benchmark mode
save_checkpoints_steps=None,
save_checkpoints_secs=None,
session_config=Runner._get_session_config(mode=mode, use_xla=use_xla, use_dali=use_dali, gpu_memory_fraction=gpu_memory_fraction, gpu_id=gpu_id),
session_config=Runner._get_session_config(
mode=mode, use_xla=use_xla, use_dali=use_dali, gpu_memory_fraction=gpu_memory_fraction, gpu_id=gpu_id
),
keep_checkpoint_max=5,
keep_checkpoint_every_n_hours=1e6, # disabled
log_step_count_steps=1e9,
@ -291,7 +302,9 @@ class Runner(object):
def _get_estimator(self, mode, run_params, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
if mode not in ["train", 'validation', 'benchmark', 'inference']:
raise ValueError("Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode)
raise ValueError(
"Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode
)
run_config = Runner._get_run_config(
mode=mode,
@ -301,20 +314,17 @@ class Runner(object):
gpu_memory_fraction=gpu_memory_fraction,
gpu_id=gpu_id,
seed=self.run_hparams.seed
)
return tf.estimator.Estimator(
model_fn=self._model,
model_dir=self.run_hparams.model_dir,
config=run_config,
params=run_params
model_fn=self._model, model_dir=self.run_hparams.model_dir, config=run_config, params=run_params
)
def train(
self,
iter_unit,
num_iter,
run_iter,
batch_size,
warmup_steps=50,
weight_decay=1e-4,
@ -340,7 +350,6 @@ class Runner(object):
if use_static_loss_scaling:
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
else:
LOGGER.log("TF Loss Auto Scaling is activated")
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
else:
use_static_loss_scaling = False # Make sure it hasn't been set to True on FP32 training
@ -349,7 +358,7 @@ class Runner(object):
global_batch_size = batch_size * num_gpus
if self.run_hparams.data_dir is not None:
filenames,num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
data_dir=self.run_hparams.data_dir,
mode="train",
iter_unit=iter_unit,
@ -366,51 +375,43 @@ class Runner(object):
num_decay_steps = num_steps
num_samples = num_steps * batch_size
if run_iter == -1:
run_iter = num_steps
else:
run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter
if self.run_hparams.data_idx_dir is not None:
idx_filenames = runner_utils.parse_dali_idx_dataset(
data_idx_dir=self.run_hparams.data_idx_dir,
mode="train"
data_idx_dir=self.run_hparams.data_idx_dir, mode="train"
)
training_hooks = []
if hvd.rank() == 0:
LOGGER.log('Starting Model Training...')
LOGGER.log("Training Epochs", num_epochs)
LOGGER.log("Total Steps", num_steps)
LOGGER.log("Steps per Epoch", steps_per_epoch)
LOGGER.log("Decay Steps", num_decay_steps)
LOGGER.log("Weight Decay Factor", weight_decay)
LOGGER.log("Init Learning Rate", lr_init)
LOGGER.log("Momentum", momentum)
LOGGER.log("Num GPUs", num_gpus)
LOGGER.log("Per-GPU Batch Size", batch_size)
print('Starting Model Training...')
print("Training Epochs", num_epochs)
print("Total Steps", num_steps)
print("Steps per Epoch", steps_per_epoch)
print("Decay Steps", num_decay_steps)
print("Weight Decay Factor", weight_decay)
print("Init Learning Rate", lr_init)
print("Momentum", momentum)
print("Num GPUs", num_gpus)
print("Per-GPU Batch Size", batch_size)
if is_benchmark:
benchmark_logging_hook = hooks.BenchmarkLoggingHook(
log_file_path=os.path.join(self.run_hparams.log_dir, "training_benchmark.json"),
global_batch_size=global_batch_size,
log_every=log_every_n_steps,
warmup_steps=warmup_steps
self.training_logging_hook = hooks.BenchmarkLoggingHook(
global_batch_size=global_batch_size, warmup_steps=warmup_steps
)
training_hooks.append(benchmark_logging_hook)
else:
training_logging_hook = hooks.TrainingLoggingHook(
log_file_path=os.path.join(self.run_hparams.log_dir, "training.json"),
self.training_logging_hook = hooks.TrainingLoggingHook(
global_batch_size=global_batch_size,
num_steps=num_steps,
num_samples=num_samples,
num_epochs=num_epochs,
log_every=log_every_n_steps
steps_per_epoch=steps_per_epoch
)
training_hooks.append(training_logging_hook)
training_hooks.append(self.training_logging_hook)
if hvd_utils.is_using_hvd():
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
@ -418,7 +419,6 @@ class Runner(object):
training_hooks.append(hooks.PrefillStagingAreasHook())
estimator_params = {
'batch_size': batch_size,
'steps_per_epoch': steps_per_epoch,
@ -445,11 +445,11 @@ class Runner(object):
)
def training_data_fn():
if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
if hvd.rank() == 0:
LOGGER.log("Using DALI input... ")
print("Using DALI input... ")
return data_utils.get_dali_input_fn(
filenames=filenames,
idx_filenames=idx_filenames,
@ -461,7 +461,7 @@ class Runner(object):
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
elif self.run_hparams.data_dir is not None:
return data_utils.get_tfrecords_input_fn(
@ -477,7 +477,7 @@ class Runner(object):
else:
if hvd.rank() == 0:
LOGGER.log("Using Synthetic Data ...")
print("Using Synthetic Data ...")
return data_utils.get_synth_input_fn(
batch_size=batch_size,
height=self.run_hparams.height,
@ -488,18 +488,19 @@ class Runner(object):
dtype=self.run_hparams.dtype,
)
try:
image_classifier.train(
input_fn=training_data_fn,
steps=num_steps,
steps=run_iter,
hooks=training_hooks,
)
except KeyboardInterrupt:
print("Keyboard interrupt")
if hvd.rank() == 0:
LOGGER.log('Ending Model Training ...')
print('Ending Model Training ...')
train_throughput = self.training_logging_hook.mean_throughput.value()
dllogger.log(data={'train_throughput': train_throughput}, step=tuple())
def evaluate(
self,
@ -522,13 +523,13 @@ class Runner(object):
raise RuntimeError('Multi-GPU inference is not supported')
estimator_params = {}
image_classifier = self._get_estimator(
mode='validation',
run_params=estimator_params,
use_xla=self.run_hparams.use_xla,
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
gpu_id=self.run_hparams.gpu_id
)
@ -540,44 +541,35 @@ class Runner(object):
num_iter=num_iter,
global_batch_size=batch_size,
)
else:
num_epochs = 1
num_decay_steps = -1
num_steps = num_iter
if self.run_hparams.data_idx_dir is not None:
idx_filenames = runner_utils.parse_dali_idx_dataset(
data_idx_dir=self.run_hparams.data_idx_dir,
mode="validation"
data_idx_dir=self.run_hparams.data_idx_dir, mode="validation"
)
eval_hooks = []
if hvd.rank() == 0:
if is_benchmark:
benchmark_logging_hook = hooks.BenchmarkLoggingHook(
log_file_path=os.path.join(self.run_hparams.log_dir, "eval_benchmark.json"),
global_batch_size=batch_size,
log_every=log_every_n_steps,
warmup_steps=warmup_steps
)
eval_hooks.append(benchmark_logging_hook)
LOGGER.log('Starting Model Evaluation...')
LOGGER.log("Evaluation Epochs", num_epochs)
LOGGER.log("Evaluation Steps", num_steps)
LOGGER.log("Decay Steps", num_decay_steps)
LOGGER.log("Global Batch Size", batch_size)
eval_hooks = []
if hvd.rank() == 0:
self.eval_logging_hook = hooks.BenchmarkLoggingHook(global_batch_size=batch_size, warmup_steps=warmup_steps)
eval_hooks.append(self.eval_logging_hook)
print('Starting Model Evaluation...')
print("Evaluation Epochs", num_epochs)
print("Evaluation Steps", num_steps)
print("Decay Steps", num_decay_steps)
print("Global Batch Size", batch_size)
def evaluation_data_fn():
if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
if hvd.rank() == 0:
LOGGER.log("Using DALI input... ")
print("Using DALI input... ")
return data_utils.get_dali_input_fn(
filenames=filenames,
idx_filenames=idx_filenames,
@ -589,22 +581,21 @@ class Runner(object):
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
elif self.run_hparams.data_dir is not None:
return data_utils.get_tfrecords_input_fn(
filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
filenames=filenames,
batch_size=batch_size,
height=self.run_hparams.height,
width=self.run_hparams.width,
training=False,
distort_color=self.run_hparams.distort_colors,
num_threads=self.run_hparams.num_preprocessing_threads,
deterministic=False if self.run_hparams.seed is None else True
)
else:
LOGGER.log("Using Synthetic Data ...\n")
print("Using Synthetic Data ...\n")
return data_utils.get_synth_input_fn(
batch_size=batch_size,
height=self.run_hparams.height,
@ -621,35 +612,50 @@ class Runner(object):
steps=num_steps,
hooks=eval_hooks,
)
LOGGER.log('Top-1 Accuracy: %.3f' % float(eval_results['top1_accuracy'] * 100))
LOGGER.log('Top-5 Accuracy: %.3f' % float(eval_results['top5_accuracy'] * 100))
#def get_serving_input_receiver_fn(batch_size, height, width, num_channels, data_format, dtype=tf.float32):
eval_throughput = self.eval_logging_hook.mean_throughput.value()
eval_latencies = np.array(self.eval_logging_hook.latencies) * 1000
eval_latencies_q = np.quantile(eval_latencies, q=[0.9, 0.95, 0.99])
eval_latencies_mean = np.mean(eval_latencies)
dllogger.log(
data={
'top1_accuracy': float(eval_results['top1_accuracy']),
'top5_accuracy': float(eval_results['top5_accuracy']),
'eval_throughput': eval_throughput,
'eval_latency_avg': eval_latencies_mean,
'eval_latency_p90': eval_latencies_q[0],
'eval_latency_p95': eval_latencies_q[1],
'eval_latency_p99': eval_latencies_q[2],
},
step=tuple()
)
if export_dir is not None:
LOGGER.log('Exporting to', export_dir)
dllogger.log(data={'export_dir': export_dir}, step=tuple())
input_receiver_fn = data_utils.get_serving_input_receiver_fn(
batch_size=None,
batch_size=None,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_channels=self.run_hparams.n_channels,
data_format=self.run_hparams.input_format,
dtype=self.run_hparams.dtype)
dtype=self.run_hparams.dtype
)
image_classifier.export_savedmodel(export_dir, input_receiver_fn)
except KeyboardInterrupt:
print("Keyboard interrupt")
LOGGER.log('Ending Model Evaluation ...')
print('Model evaluation finished')
def predict(self, to_predict):
estimator_params = {}
if to_predict is not None:
filenames = runner_utils.parse_inference_input(to_predict)
image_classifier = self._get_estimator(
mode='inference',
run_params=estimator_params,
@ -657,28 +663,26 @@ class Runner(object):
use_dali=self.run_hparams.use_dali,
gpu_memory_fraction=self.run_hparams.gpu_memory_fraction
)
inference_hooks = []
def inference_data_fn():
return data_utils.get_inference_input_fn(
filenames=filenames,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_threads=self.run_hparams.num_preprocessing_threads
)
filenames=filenames,
height=self.run_hparams.height,
width=self.run_hparams.width,
num_threads=self.run_hparams.num_preprocessing_threads
)
try:
inference_results = image_classifier.predict(
input_fn=inference_data_fn,
predict_keys=None,
hooks=inference_hooks,
yield_single_examples=True
input_fn=inference_data_fn, predict_keys=None, hooks=inference_hooks, yield_single_examples=True
)
for result in inference_results:
LOGGER.log(result['classes'], str(result['probabilities'][result['classes']]))
print(result['classes'], str(result['probabilities'][result['classes']]))
except KeyboardInterrupt:
print("Keyboard interrupt")
LOGGER.log('Ending Inference ...')
print('Ending Inference ...')

View file

@ -46,9 +46,6 @@ def list_filenames_in_dataset(data_dir, mode, count=True):
if mode not in ["train", 'validation']:
raise ValueError("Unknown mode received: %s" % mode)
if not os.path.exists(data_dir):
raise FileNotFoundError("The data directory: `%s` can't be found" % data_dir)
filename_pattern = os.path.join(data_dir, '%s-*' % mode)
file_list = sorted(tf.gfile.Glob(filename_pattern))

View file

@ -0,0 +1,647 @@
# SE-ResNext101-32x4d for TensorFlow
This repository provides a script and recipe to train the SE-ResNext101-32x4d model to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Optimizer](#optimizer)
* [Data augmentation](#data-augmentation)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [The `main.py` script](#the-mainpy-script)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
* [Training time for 90 Epochs](#training-time-for-90-epochs)
* [Training time: NVIDIA DGX-1 (8x V100 16G)](#training-time-nvidia-dgx-1-8x-v100-16g)
* [Training time: NVIDIA DGX-2 (16x V100 32G)](#training-time-nvidia-dgx-2-16x-v100-32g)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
* [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
* [Inference performance: NVIDIA T4 (1x T4 16G)](#inference-performance-nvidia-t4-1x-t4-16g)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The SE-ResNeXt101-32x4d is a [ResNeXt101-32x4d](https://arxiv.org/pdf/1611.05431.pdf)
model with added Squeeze-and-Excitation module introduced in the [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf) paper.
The following performance optimizations were implemented in this model:
* JIT graph compilation with [XLA](https://www.tensorflow.org/xla)
* Multi-GPU training with [Horovod](https://github.com/horovod/horovod)
* Automated mixed precision [AMP](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html)
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
Here is a diagram of the Squeeze and Excitation module architecture for ResNet-type models:
![SEArch](./imgs/SEArch.png)
_Image source: [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf)_
This image shows the architecture of the SE block and where it is placed in the ResNet bottleneck block.
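In code, the SE block reduces to a global average pool (squeeze), a two-layer bottleneck MLP ending in a sigmoid (excitation), and a channel-wise rescale of the input. Below is a minimal sketch assuming NHWC tensors and the paper's reduction ratio of 16; it is illustrative, not this repository's exact implementation:
```python
import tensorflow as tf

def squeeze_excite(x, ratio=16):
    # Squeeze: global average pool over the spatial dimensions (assumes NHWC).
    c = int(x.shape[-1])
    s = tf.reduce_mean(x, axis=[1, 2])
    # Excitation: bottleneck FC -> ReLU -> FC -> sigmoid yields per-channel gates.
    e = tf.layers.dense(s, c // ratio, activation=tf.nn.relu)
    e = tf.layers.dense(e, c, activation=tf.nn.sigmoid)
    # Scale: reweight each channel of the input feature map.
    return x * tf.reshape(e, [-1, 1, 1, c])
```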
### Default configuration
The following sections highlight the default configuration for the SE-ResNext101-32x4d model.
#### Optimizer
This model uses the SGD optimizer with the following hyperparameters:
* Momentum (0.875).
* Learning rate (LR) = 0.256 for a batch size of 256; for other batch sizes we linearly scale the learning rate.
* Learning rate schedule - we use a cosine LR schedule, as sketched below.
* For bigger batch sizes (512 and up) we use a linear warmup of the learning rate during the first 5 epochs, according to [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677).
* Weight decay: 6.103515625e-05 (1/16384).
* We do not apply weight decay to batch norm trainable parameters (gamma/bias).
* Label Smoothing: 0.1.
* We train for:
    * 90 epochs -> the standard schedule for ImageNet networks.
    * 250 epochs -> the best possible accuracy.
* For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
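Concretely, the policy above combines the linear batch-size scaling rule, a linear warmup, and a cosine decay. Here is a minimal sketch; the function name, `base_batch_size`, and the step accounting are illustrative assumptions, not the repository's `utils/learning_rate.py` API:
```python
import math

def lr_at_step(step, total_steps, warmup_steps, global_batch_size,
               base_lr=0.256, base_batch_size=256):
    lr = base_lr * global_batch_size / base_batch_size       # linear scaling rule
    if step < warmup_steps:
        return lr * (step + 1) / warmup_steps                # linear warmup
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return 0.5 * lr * (1.0 + math.cos(math.pi * progress))   # cosine decay to zero
```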
#### Data Augmentation
This model uses the following data augmentation (see the sketch after this list):
* For training:
* Normalization.
* Random resized crop to 224x224.
* Scale from 8% to 100%.
* Aspect ratio from 3/4 to 4/3.
* Random horizontal flip.
* For inference:
* Normalization.
* Scale to 256x256.
* Center crop to 224x224.
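The training-time random resized crop above is the standard Inception-style crop. A minimal sketch with stock TensorFlow ops follows; the function name and defaults are illustrative, and the repository's actual implementation lives in `utils/image_processing.py`:
```python
import tensorflow as tf

def random_resized_crop(image, size=224):
    # Sample a crop window covering 8%-100% of the area, aspect ratio 3/4-4/3.
    bbox = tf.constant([0.0, 0.0, 1.0, 1.0], shape=[1, 1, 4])  # whole image
    begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
        tf.shape(image), bounding_boxes=bbox,
        area_range=(0.08, 1.0), aspect_ratio_range=(3. / 4., 4. / 3.),
        use_image_if_no_bounding_boxes=True)
    image = tf.slice(image, begin, crop_size)
    image = tf.image.random_flip_left_right(image)   # random horizontal flip
    return tf.image.resize_images(image, [size, size])
```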
### Feature support matrix
The following features are supported by this model.
| Feature | SE-ResNext101-32x4d TensorFlow |
|-----------------------|--------------------------------|
|Multi-GPU training with [Horovod](https://github.com/horovod/horovod) | Yes |
|[NVIDIA DALI](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html) | Yes |
|Automatic mixed precision (AMP) | Yes |
#### Features
Multi-GPU training with Horovod - Our model uses Horovod to implement efficient multi-GPU training with NCCL.
For details, refer to the example sources in this repository or the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage).
NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
with the DALI library. For details, refer to the example sources in this repository or the [DALI documentation](https://docs.nvidia.com/deeplearning/dali/index.html).
Automatic mixed precision (AMP) - The computation graph can be modified by TensorFlow at runtime to support mixed precision training.
A detailed explanation of mixed precision can be found in the next section.
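For orientation, the Horovod pattern behind the multi-GPU support boils down to a few lines. This is a minimal sketch; the repository wires the equivalent logic through `tf.estimator` in `runtime/runner.py`:
```python
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()                                          # one process per GPU
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())  # pin this rank's GPU

opt = tf.train.MomentumOptimizer(learning_rate=0.256, momentum=0.875)
opt = hvd.DistributedOptimizer(opt)                 # allreduce gradients via NCCL
hooks = [hvd.BroadcastGlobalVariablesHook(0)]       # sync initial weights from rank 0
```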
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
For information about:
* How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
* How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
* Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
#### Enabling mixed precision
Mixed precision is enabled in TensorFlow by using the Automatic Mixed Precision (TF-AMP) extension which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In TensorFlow, loss scaling can be applied statically by using simple multiplication of loss by a constant value or automatically, by TF-AMP. Automatic mixed precision makes all the adjustments internally in TensorFlow, providing two benefits over manual operations. First, programmers need not modify network model code, reducing development and maintenance effort. Second, using AMP maintains forward and backward compatibility with all the APIs for defining and running TensorFlow models.
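For reference, static loss scaling (what the `--use_static_loss_scaling` and `--loss_scale` flags amount to) is a multiply before gradient computation and a divide before the weight update. A minimal sketch, assuming `optimizer` and `loss` are already built:
```python
def apply_with_static_loss_scale(optimizer, loss, loss_scale=128.0):
    # Scale the loss so small FP16 gradients stay representable.
    grads_and_vars = optimizer.compute_gradients(loss * loss_scale)
    # Unscale gradients before applying the update.
    unscaled = [(g / loss_scale, v) for g, v in grads_and_vars if g is not None]
    return optimizer.apply_gradients(unscaled)
```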
To enable mixed precision, you can simply add the values to the environmental variables inside your training script:
- Enable TF-AMP graph rewrite:
```
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
```
- Enable Automated Mixed Precision:
```
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
```
## Setup
The following section lists the requirements that you need to meet in order to use the SE-ResNext101-32x4d model.
### Requirements
This repository contains a Dockerfile that extends the TensorFlow NGC container and encapsulates all dependencies. Aside from these dependencies, ensure you have the following software:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html),
* [Accessing And Pulling From The NGC container registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry),
* [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running).
For those unable to use the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow), to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or FP32, perform the following steps using the default parameters of the SE-ResNext101-32x4d model on the [ImageNet](http://www.image-net.org/) dataset. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/TensorFlow/Classification/RN50v1.5
```
2. Download and preprocess the dataset.
The SE-ResNext101-32x4d script operates on ImageNet 1k, a widely used image classification dataset from the ILSVRC challenge.
To download and preprocess the dataset, use the [Generate ImageNet for TensorFlow](https://github.com/tensorflow/models/blob/master/research/inception/inception/data/download_and_preprocess_imagenet.sh) script. The dataset will be downloaded to a directory specified as the first parameter of the script.
3. Build the SE-ResNext101-32x4d TensorFlow NGC container.
```bash
docker build . -t nvidia_rn50
```
4. Start an interactive session in the NGC container to run training/inference.
After you build the container image, you can start an interactive CLI session with
```bash
nvidia-docker run --rm -it -v <path to imagenet>:/data/tfrecords --ipc=host nvidia_rn50
```
5. (Optional) Create index files to use DALI.
To allow proper sharding in a multi-GPU environment, DALI has to create index files for the dataset. To create index files, run inside the container:
```bash
bash ./utils/dali_index.sh /data/tfrecords <index file store location>
```
Index files can be created once and then reused. It is highly recommended to save them into a persistent location.
6. Start training.
To run training for a standard configuration (as described in [Default
configuration](#default-configuration), DGX1V, DGX2V, single GPU, FP16, FP32, 90, and 250 epochs), run
one of the scripts in the `se-resnext101-32x4d/training` directory. Ensure ImageNet is mounted in the
`/data/tfrecords` directory.
For example, to train on DGX-1 for 90 epochs using AMP, run:
`bash ./se-resnext101-32x4d/training/AMP/DGX1_SE-RNxt101-32x4d_AMP_90E.sh`
Additionally, features like DALI data preprocessing or TensorFlow XLA can be enabled with
environmental variables when running those scripts:
`USE_XLA=1 USE_DALI=1 bash ./se-resnext101-32x4d/training/AMP/DGX1_SE-RNxt101-32x4d_AMP_90E.sh`
To store results in a specific location, add a location as a first argument:
`bash ./se-resnext101-32x4d/training/AMP/DGX1_SE-RNxt101-32x4d_AMP_90E.sh <location to store>`
7. Start validation/evaluation.
To evaluate the validation dataset located in `/data/tfrecords`, run `main.py` with
`--mode=evaluate`. For example:
`python main.py --arch=se-resnext101-32x4d --mode=evaluate --data_dir=/data/tfrecords --batch_size <batch size> --model_dir
<model location> --results_dir <output location> [--use_xla] [--use_tf_amp]`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during evaluation.
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
In the root directory, the most important files are:
- `main.py`: the script that controls the logic of training and validation of the ResNet-like models
- `Dockerfile`: Instructions for Docker to build a container with the basic set of dependencies to run ResNet like models for image classification
- `requirements.txt`: a set of extra Python requirements for running ResNet-like models
The `model/` directory contains the following modules used to define ResNet family models:
- `resnet.py`: the definition of ResNet, ResNext, and SE-ResNext model
- `blocks/conv2d_block.py`: the definition of 2D convolution block
- `blocks/resnet_bottleneck_block.py`: the definition of ResNet-like bottleneck block
- `layers/*.py`: definitions of specific layers used in the ResNet-like model
The `utils/` directory contains the following utility modules:
- `cmdline_helper.py`: helper module for command line processing
- `data_utils.py`: module defining input data pipelines
- `dali_utils.py`: helper module for DALI
- `hvd_utils.py`: helper module for Horovod
- `image_processing.py`: image processing and data augmentation functions
- `learning_rate.py`: definition of used learning rate schedule
- `optimizers.py`: definition of used custom optimizers
- `hooks/*.py`: definitions of specific hooks allowing logging of training and inference process
The `runtime/` directory contains the following module that define the mechanics of the training process:
- `runner.py`: module encapsulating the training, inference and evaluation
### Parameters
#### The `main.py` script
The script for training and evaluating the SE-ResNext101-32x4d model has a variety of parameters that control these processes.
```
usage: main.py [-h]
[--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}]
[--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}]
[--data_dir DATA_DIR] [--data_idx_dir DATA_IDX_DIR]
[--export_dir EXPORT_DIR] [--to_predict TO_PREDICT]
[--batch_size BATCH_SIZE] [--num_iter NUM_ITER]
[--iter_unit {epoch,batch}] [--warmup_steps WARMUP_STEPS]
[--model_dir MODEL_DIR] [--results_dir RESULTS_DIR]
[--log_filename LOG_FILENAME] [--display_every DISPLAY_EVERY]
[--lr_init LR_INIT] [--lr_warmup_epochs LR_WARMUP_EPOCHS]
[--weight_decay WEIGHT_DECAY] [--weight_init {fan_in,fan_out}]
[--momentum MOMENTUM] [--loss_scale LOSS_SCALE]
[--label_smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--use_static_loss_scaling | --nouse_static_loss_scaling]
[--use_xla | --nouse_xla] [--use_dali | --nouse_dali]
[--use_tf_amp | --nouse_tf_amp]
[--use_cosine_lr | --nouse_cosine_lr] [--seed SEED]
[--gpu_memory_fraction GPU_MEMORY_FRACTION] [--gpu_id GPU_ID]
JoC-RN50v1.5-TF
optional arguments:
-h, --help Show this help message and exit
--arch {resnet50,resnext101-32x4d,se-resnext101-32x4d}
Architecture of model to run (to run se-resnext101-32x4d set
--arch=se-resnext101-32x4d)
--mode {train,train_and_evaluate,evaluate,predict,training_benchmark,inference_benchmark}
The execution mode of the script.
--data_dir DATA_DIR Path to dataset in TFRecord format. Files should be
named 'train-*' and 'validation-*'.
--data_idx_dir DATA_IDX_DIR
Path to index files for DALI. Files should be named
'train-*' and 'validation-*'.
--export_dir EXPORT_DIR
Directory in which to write exported SavedModel.
--to_predict TO_PREDICT
Path to file or directory of files to run prediction
on.
--batch_size BATCH_SIZE
Size of each minibatch per GPU.
--num_iter NUM_ITER Number of iterations to run.
--iter_unit {epoch,batch}
Unit of iterations.
--warmup_steps WARMUP_STEPS
Number of steps considered as warmup and not taken
into account for performance measurements.
--model_dir MODEL_DIR
Directory in which to write the model. If undefined,
results directory will be used.
--results_dir RESULTS_DIR
Directory in which to write training logs, summaries
and checkpoints.
--log_filename LOG_FILENAME
Name of the JSON file to which the training log is written.
--display_every DISPLAY_EVERY
How often (in batches) to print out running
information.
--lr_init LR_INIT Initial value for the learning rate.
--lr_warmup_epochs LR_WARMUP_EPOCHS
Number of warmup epochs for the learning rate schedule.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--momentum MOMENTUM SGD momentum value for the momentum optimizer.
--loss_scale LOSS_SCALE
Loss scale for FP16 training and fast math FP32.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--mixup MIXUP The alpha parameter for mixup (if 0 then mixup is not
applied).
--use_static_loss_scaling
Use static loss scaling in FP16 or FP32 AMP.
--nouse_static_loss_scaling
--use_xla Enable XLA (Accelerated Linear Algebra) computation
for improved performance.
--nouse_xla
--use_dali Enable DALI data input.
--nouse_dali
--use_tf_amp Enable AMP to speedup FP32
computation using Tensor Cores.
--nouse_tf_amp
--use_cosine_lr Use cosine learning rate schedule.
--nouse_cosine_lr
--seed SEED Random seed.
--gpu_memory_fraction GPU_MEMORY_FRACTION
Limit memory fraction used by the training script for DALI
--gpu_id GPU_ID Specify the ID of the target GPU on a multi-device platform.
Effective only for single-GPU mode.
```
### Inference process
To run inference on a single example with a checkpoint and a model script, use:
`python main.py --arch=se-resnext101-32x4d --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
The optional `--use_xla` and `--use_tf_amp` flags control XLA and AMP during inference.
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark the training performance on a specific batch size, run:
* For 1 GPU
* FP32
`python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* FP16
`python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --use_tf_amp --warmup_steps 200 --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* For multiple GPUs
* FP32
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* FP16
`mpiexec --allow-run-as-root --bind-to socket -np <num_gpus> python ./main.py --arch=se-resnext101-32x4d --mode=training_benchmark --use_tf_amp --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
Each of these scripts runs 200 warm-up iterations and measures the first epoch.
To control warmup and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags. Features like XLA or DALI can be controlled
with `--use_xla` and `--use_dali` flags.
Suggested batch sizes for training are 96 for mixed precision training and 64 for single precision training per single V100 16 GB.
#### Inference performance benchmark
To benchmark the inference performance on a specific batch size, run:
* FP32
`python ./main.py --arch=se-resnext101-32x4d --mode=inference_benchmark --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
* FP16
`python ./main.py --arch=se-resnext101-32x4d --mode=inference_benchmark --use_tf_amp --warmup_steps 20 --num_iter 100 --iter_unit batch --batch_size <batch size> --data_dir=<path to imagenet> --results_dir=<path to results directory>`
By default, each of these scripts runs 20 warm-up iterations and measures the next 80 iterations.
To control warm-up and benchmark length, use the `--warmup_steps`, `--num_iter` and `--iter_unit` flags.
The benchmark can be automated with the `inference_benchmark.sh` script provided in `se-resnext101-32x4d`, by simply running:
`bash ./se-resnext101-32x4d/inference_benchmark.sh <data dir> <data idx dir>`
The `<data dir>` parameter refers to the input data directory (by default `/data/tfrecords` inside the container).
By default, the benchmark tests the following configurations: **FP32**, **AMP**, **AMP + XLA** with different batch sizes.
When the optional directory with the DALI index files `<data idx dir>` is specified, the benchmark executes an additional **DALI + AMP + XLA** configuration.
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `se-resnext101-32x4d/training/{PRECISION}/DGX1_SE-RNxt101-32x4d_{PRECISION}_{EPOCHS}E.sh`
training script in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
| Epochs | Batch Size / GPU | Accuracy - FP32 | Accuracy - mixed precision |
|--------|------------------|-----------------|----------------------------|
| 90 | 64 (FP32) / 96 (AMP) | 79.69 | 79.81 |
| 250 | 64 (FP32) / 96 (AMP) | 80.87 | 80.84 |
**Example training loss plot**
![TrainingLoss](./imgs/train_loss.png)
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the steps from [Training performance benchmark](#training-performance-benchmark) in the
[TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in images per second) were averaged over an entire training epoch.
| GPUs | Batch Size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|----|---------------|---------------|-----------------------|---------------|-----------|-------|
| 1 | 64 (FP32) / 96 (AMP) | 126.44 img/s | 285.99 img/s | 2.26x | 1.00x | 1.00x |
| 8 | 64 (FP32) / 96 (AMP) | 921.38 img/s | 2168.40 img/s | 2.35x | 7.28x | 7.58x |
**XLA Enabled**
| GPUs | Batch Size / GPU | Throughput - mixed precision | Throughput - mixed precision + XLA | Throughput speedup (mixed precision - XLA) |
|----|------------|---------------|---------------------|-----------|
| 1 | 128 | 285.99 img/s |453.39 img/s |1.58x |
| 8 | 128 | 2168.40 img/s |3297.39 img/s |1.52x |
##### Training performance: NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running the steps from [Training performance benchmark](#training-performance-benchmark) in the
[TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in images per second) were averaged over an entire training epoch.
| GPUs | Batch Size / GPU | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|----|---------------|---------------|-------------------------|-------|--------|--------|
| 1 | 64 (FP32) / 96 (AMP) | 128.08 img/s | 309.34 img/s | 2.39x | 1.00x | 1.00x |
| 16 | 64 (FP32) / 96 (AMP) | 1759.44 img/s| 4210.85 img/s | 2.51x | 13.73x | 13.61x |
**XLA Enabled**
| GPUs | Batch Size / GPU | Throughput - mixed precision | Throughput - mixed precision + XLA | Throughput speedup (mixed precision - XLA) |
|----|-----|----------|---------------------|-----------|
| 1 | 96 | 309.34 img/s |520.10 img/s |1.68x |
| 16 | 96 | 4210.85 img/s |6835.66 img/s |1.62x |
#### Training Time for 90 Epochs
##### Training time: NVIDIA DGX-1 (8x V100 16G)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-1-8x-v100-16g)
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
| GPUs | Time to train - mixed precision + XLA | Time to train - mixed precision | Time to train - FP32 |
|---|--------|---------|---------|
| 1 | ~71h | ~112h | ~253h |
| 8 | ~10h | ~15h | ~35h |
##### Training time: NVIDIA DGX-2 (16x V100 32G)
Our results were estimated based on the [training performance results](#training-performance-nvidia-dgx-2-16x-v100-32g)
on NVIDIA DGX-2 with (16x V100 32G) GPUs.
| GPUs | Time to train - mixed precision + XLA | Time to train - mixed precision | Time to train - FP32 |
|----|-------|--------|-------|
| 1 | ~61h | ~103h | ~247h |
| 16 | ~4.7h | ~7.5h | ~19h |
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
Our results were obtained by running the `inference_benchmark.sh` inference benchmarking script
in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-1 with (1x V100 16G) GPU.
**FP32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |54.71 img/s |18.32 ms |18.73 ms |18.89 ms |19.95 ms |
|2 |98.36 img/s |20.37 ms |20.69 ms |20.75 ms |21.03 ms |
|4 |150.60 img/s |26.56 ms |26.83 ms |26.95 ms |27.46 ms |
|8 |235.17 img/s |34.02 ms |34.40 ms |34.57 ms |35.37 ms |
|16 |330.33 img/s |48.43 ms |48.91 ms |49.22 ms |49.79 ms |
|32 |393.96 img/s |81.22 ms |81.72 ms |81.99 ms |82.49 ms |
|64 |446.13 img/s |143.54 ms |144.37 ms |144.74 ms |145.93 ms |
|128 |490.61 img/s |260.89 ms |261.56 ms |261.76 ms |262.71 ms |
**Mixed Precision Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |92.56 img/s |10.88 ms |11.04 ms |11.66 ms |14.34 ms |
|2 |180.61 img/s |11.11 ms |11.35 ms |11.47 ms |11.79 ms |
|4 |354.41 img/s |11.35 ms |11.90 ms |12.13 ms |13.28 ms |
|8 |547.79 img/s |14.63 ms |15.53 ms |15.99 ms |16.38 ms |
|16 |772.41 img/s |20.80 ms |21.76 ms |22.02 ms |23.02 ms |
|32 |965.89 img/s |33.15 ms |33.82 ms |34.24 ms |35.15 ms |
|64 |1086.99 img/s |59.01 ms |59.42 ms |59.56 ms |60.35 ms |
|128 |1162.59 img/s |110.36 ms |110.41 ms |110.64 ms |111.18 ms |
**Mixed Precision Inference Latency + XLA**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |84.98 img/s |11.81 ms |12.00 ms |12.08 ms |12.85 ms |
|2 |150.35 img/s |13.37 ms |14.15 ms |14.56 ms |15.11 ms |
|4 |288.89 img/s |13.90 ms |14.37 ms |14.56 ms |16.19 ms |
|8 |526.94 img/s |15.19 ms |15.61 ms |15.85 ms |17.91 ms |
|16 |818.86 img/s |19.63 ms |19.85 ms |19.97 ms |20.70 ms |
|32 |1134.72 img/s |28.20 ms |28.60 ms |28.82 ms |30.03 ms |
|64 |1359.55 img/s |47.22 ms |47.51 ms |47.84 ms |48.96 ms |
|128 |1515.12 img/s |84.49 ms |85.51 ms |85.82 ms |86.89 ms |
##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
Our results were obtained by running the `inference_benchmark.sh` inference benchmarking script
in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA DGX-2 with (1x V100 32G) GPU.
**FP32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |59.41 img/s |16.86 ms |17.02 ms |17.10 ms |17.47 ms |
|2 |92.74 img/s |21.59 ms |21.88 ms |22.01 ms |22.65 ms |
|4 |141.53 img/s |28.26 ms |28.45 ms |28.55 ms |28.77 ms |
|8 |228.80 img/s |34.96 ms |35.18 ms |35.38 ms |35.72 ms |
|16 |324.11 img/s |49.36 ms |49.61 ms |49.76 ms |50.17 ms |
|32 |397.66 img/s |80.47 ms |80.69 ms |80.82 ms |82.15 ms |
|64 |468.28 img/s |136.67 ms |137.03 ms |137.18 ms |138.11 ms|
|128 |514.25 img/s |248.91 ms |250.42 ms |251.89 ms |253.55 ms|
**Mixed Precision Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |103.55 img/s |9.70 ms |9.94 ms |10.03 ms |10.26 ms |
|2 |227.06 img/s |8.86 ms |9.07 ms |9.19 ms |9.78 ms |
|4 |380.22 img/s |10.53 ms |10.97 ms |11.11 ms |11.64 ms |
|8 |579.59 img/s |13.82 ms |14.07 ms |14.29 ms |14.99 ms |
|16 |833.08 img/s |19.20 ms |19.33 ms |19.37 ms |19.68 ms |
|32 |990.96 img/s |32.30 ms |32.53 ms |32.70 ms |33.45 ms |
|64 |1114.78 img/s |57.41 ms |57.61 ms |57.86 ms |58.78 ms |
|128 |1203.04 img/s |106.40 ms |106.54 ms |106.62 ms |107.93 ms|
**Mixed Precision Inference Latency + XLA**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |92.52 img/s |10.85 ms |10.95 ms |11.12 ms |11.99 ms |
|2 |177.54 img/s |11.32 ms |11.59 ms |11.66 ms |13.16 ms |
|4 |322.44 img/s |12.44 ms |12.63 ms |12.71 ms |12.98 ms |
|8 |548.33 img/s |14.59 ms |14.90 ms |15.58 ms |15.95 ms |
|16 |818.34 img/s |19.55 ms |19.75 ms |19.91 ms |20.30 ms |
|32 |1178.12 img/s |27.16 ms |27.41 ms |27.53 ms |28.24 ms|
|64 |1397.25 img/s |45.96 ms |46.03 ms |46.19 ms |47.01 ms|
|128 |1613.78 img/s |79.32 ms |80.02 ms |80.46 ms |81.71 ms|
##### Inference performance: NVIDIA T4 (1x T4 16G)
Our results were obtained by running the `inference_benchmark.sh` inference benchmarking script
in the [TensorFlow 20.03-tf1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
on NVIDIA T4 with (1x T4 16G) GPU.
**FP32 Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |48.92 img/s |20.47 ms |20.75 ms |20.88 ms |21.23 ms |
|2 |82.59 img/s |24.24 ms |24.42 ms |24.50 ms |24.64 ms |
|4 |111.08 img/s |36.03 ms |36.37 ms |36.45 ms |36.67 ms |
|8 |131.84 img/s |60.68 ms |61.36 ms |61.62 ms |62.21 ms |
|16 |144.27 img/s |110.90 ms |112.04 ms |112.29 ms |112.80 ms |
|32 |156.59 img/s |204.35 ms |206.12 ms |206.78 ms |208.08 ms |
|64 |162.58 img/s |393.66 ms |396.45 ms |396.94 ms |397.52 ms |
|128 |162.41 img/s |788.13 ms |790.86 ms |791.47 ms |792.43 ms |
**Mixed Precision Inference Latency**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |79.57 img/s |12.74 ms |13.31 ms |13.37 ms |13.85 ms |
|2 |190.25 img/s |10.55 ms |10.63 ms |10.67 ms |10.77 ms |
|4 |263.72 img/s |15.16 ms |15.37 ms |15.44 ms |15.58 ms |
|8 |312.07 img/s |25.64 ms |26.14 ms |26.25 ms |26.98 ms |
|16 |347.43 img/s |46.05 ms |46.60 ms |46.82 ms |47.09 ms |
|32 |360.20 img/s |88.84 ms |89.44 ms |89.60 ms |90.20 ms |
|64 |367.23 img/s |174.28 ms |175.81 ms |176.13 ms |176.74 ms |
|128 |362.43 img/s |353.17 ms |354.91 ms |355.52 ms |356.07 ms |
**Mixed Precision Inference Latency + XLA**
|**Batch Size**|**Avg throughput**|**Avg latency**|**90% Latency**|**95% Latency**|**99% Latency**|
|--------------|------------------|---------------|---------------|---------------|---------------|
|1 |89.30 img/s |11.24 ms |11.31 ms |11.38 ms |11.75 ms |
|2 |152.93 img/s |13.11 ms |13.20 ms |13.25 ms |13.58 ms |
|4 |254.87 img/s |15.69 ms |15.84 ms |15.87 ms |15.95 ms |
|8 |356.48 img/s |22.44 ms |22.79 ms |22.86 ms |23.08 ms |
|16 |442.24 img/s |36.18 ms |36.63 ms |36.76 ms |36.76 ms |
|32 |471.28 img/s |67.90 ms |68.62 ms |68.80 ms |69.14 ms |
|64 |483.18 img/s |132.46 ms |133.72 ms |134.08 ms |134.88 ms |
|128 |501.38 img/s |255.31 ms |258.46 ms |259.19 ms |260.17 ms|
## Release notes
### Changelog
June 2020
- Initial release
### Known issues
There are no known issues with this model.


View file

@ -0,0 +1,33 @@
#!/bin/bash
DATA_DIR=${1:-"/data/tfrecords"}
DALI_DIR=${2}
BATCH_SIZE_TO_TEST="1 2 4 8 16 32 64 128"
INFERENCE_BENCHMARK=$(mktemp /tmp/inference-benchmark.XXXXXX)
function test_configuration() {
echo "Testing configuration: $1" | tee -a $INFERENCE_BENCHMARK
for BATCH in $BATCH_SIZE_TO_TEST; do
python ./main.py --arch=se-resnext101-32x4d --mode=inference_benchmark --warmup_steps 50 --num_iter 400 --iter_unit batch \
--batch_size $BATCH --data_dir=$DATA_DIR --results_dir=/tmp/results $2 | tail -n2 | head -n1 | sed \
's/^DLL \([0-9]*-\)*[0-9]* \([0-9]*:\)*[0-9]*.[0-9]* - ()/Results for BS='$BATCH'/' | tee -a $INFERENCE_BENCHMARK
if [ ${PIPESTATUS[0]} -ne 0 ]; then
echo "Failed test on batch size $BATCH"
exit 1
fi
done
}
test_configuration "FP32 nodali noxla"
test_configuration "FP16 nodali noxla" "--use_tf_amp"
test_configuration "FP16 nodali xla" "--use_tf_amp --use_xla"
if [ ! -z $DALI_DIR ]; then
test_configuration "FP16 dali xla" "--use_tf_amp --use_xla --use_dali --data_idx_dir ${DALI_DIR}"
fi
cat $INFERENCE_BENCHMARK
rm $INFERENCE_BENCHMARK

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
8 250 128 amp --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
8 90 128 amp

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
16 250 128 amp --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
16 90 128 amp

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
1 250 128 amp --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
1 90 128 amp

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
8 250 64 fp32 --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
8 90 64 fp32

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
16 250 64 fp32 --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
16 90 64 fp32

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
1 250 64 fp32 --mixup=0.2

View file

@ -0,0 +1,6 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
WORKSPACE=${2:-"/workspace/rn50v15_tf"}
DATA_DIR=${3:-"/data"}
bash ${WORKSPACE}/se-resnext101-32x4d/training/GENERIC.sh ${RESULT_DIR} ${DATA_DIR} \
1 90 64 fp32

View file

@ -0,0 +1,39 @@
RESULT_DIR=${1:-"/workspace/rn50v15_tf/results"}
DATA_DIR=${2:-"/data"}
GPU_COUNT=${3:-8}
ITER_COUNT=${4:-50}
BATCH_SIZE=${5:-128}
PRECISION=${6:-"fp32"}
OTHER=${@:7}
if [[ ! -z "${BIND_TO_SOCKET}" ]]; then
BIND_TO_SOCKET="--bind-to socket"
fi
if [[ ! -z "${USE_DALI}" ]]; then
USE_DALI="--use_dali --data_idx_dir=${DATA_DIR}/dali_idx"
fi
if [[ ! -z "${USE_XLA}" ]]; then
USE_XLA="--use_xla"
fi
CMD=""
case $PRECISION in
"fp32") CMD+="--precision=fp32";;
"fp16") CMD+="--precision=fp16 --use_static_loss_scaling --loss_scale=128";;
"amp") CMD+="--precision=fp32 --use_tf_amp --use_static_loss_scaling --loss_scale=128";;
esac
CMD="--arch=se-resnext101-32x4d --mode=train_and_evaluate --iter_unit=epoch --num_iter=${ITER_COUNT} \
--batch_size=${BATCH_SIZE} --warmup_steps=100 --use_cosine --label_smoothing 0.1 \
--lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=6.103515625e-05 \
${CMD} --data_dir=${DATA_DIR}/tfrecords ${USE_DALI} ${USE_XLA} \
--results_dir=${RESULT_DIR} --weight_init=fan_in ${OTHER}"
if [[ ${GPU_COUNT} -eq 1 ]]; then
python3 main.py ${CMD}
else
mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np ${GPU_COUNT} python3 main.py ${CMD}
fi
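
For orientation, here is a hedged sketch of invoking the generic launcher above by hand. The positional arguments map to RESULT_DIR, DATA_DIR, GPU_COUNT, ITER_COUNT, BATCH_SIZE and PRECISION, anything after them is forwarded via OTHER, and the three environment switches are optional (any non-empty value enables them). The paths follow the container layout used by the wrapper scripts earlier in this diff; this is an illustration, not a command from the commit.
# Sketch: 8 GPUs, 90 epochs, per-GPU batch 128, AMP, with DALI, XLA and socket binding
USE_DALI=1 USE_XLA=1 BIND_TO_SOCKET=1 \
bash /workspace/rn50v15_tf/se-resnext101-32x4d/training/GENERIC.sh \
    /workspace/rn50v15_tf/results /data 8 90 128 amp --mixup=0.2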

View file

@@ -27,4 +27,4 @@ from utils import image_processing
from utils import learning_rate
from utils import dali_utils
from utils import dali_utils

View file

@@ -30,10 +30,19 @@ def _add_bool_argument(parser, name=None, default=False, required=False, help=No
feature_parser.set_defaults(name=default)
def parse_cmdline():
def parse_cmdline(available_arch):
p = argparse.ArgumentParser(description="JoC-RN50v1.5-TF")
p.add_argument(
'--arch',
choices=available_arch,
type=str,
default='resnet50',
required=False,
help="""Architecture of model to run"""
)
p.add_argument(
'--mode',
choices=['train', 'train_and_evaluate', 'evaluate', 'predict', 'training_benchmark', 'inference_benchmark'],
@@ -58,43 +67,35 @@ def parse_cmdline():
type=str,
help="Path to index files for DALI. Files should be named 'train-*' and 'validation-*'."
)
p.add_argument(
'--export_dir',
required=False,
default=None,
type=str,
help="Directory in which to write exported SavedModel."
'--export_dir', required=False, default=None, type=str, help="Directory in which to write exported SavedModel."
)
p.add_argument(
p.add_argument(
'--to_predict',
required=False,
default=None,
type=str,
help="Path to file or directory of files to run prediction on."
)
p.add_argument(
'--batch_size',
type=int,
required=False,
help="""Size of each minibatch per GPU."""
)
p.add_argument('--batch_size', type=int, required=True, help="""Size of each minibatch per GPU.""")
p.add_argument('--num_iter', type=int, required=False, default=1, help="""Number of iterations to run.""")
p.add_argument(
'--num_iter',
type=int,
required=False,
default=1,
help="""Number of iterations to run."""
'--run_iter',
type=int,
required=False,
default=-1,
help="""Number of training iterations to run on single run."""
)
p.add_argument(
'--iter_unit',
choices=['epoch', 'batch'],
type=str,
required=False,
required=False,
default='epoch',
help="""Unit of iterations."""
)
@@ -109,14 +110,9 @@ def parse_cmdline():
# Tensor format used for the computation.
p.add_argument(
'--data_format',
choices=['NHWC', 'NCHW'],
type=str,
default='NCHW',
required=False,
help=argparse.SUPPRESS
'--data_format', choices=['NHWC', 'NCHW'], type=str, default='NCHW', required=False, help=argparse.SUPPRESS
)
p.add_argument(
'--model_dir',
type=str,
@@ -125,30 +121,34 @@ def parse_cmdline():
help="""Directory in which to write model. If undefined, results dir will be used."""
)
p.add_argument(
'--results_dir',
type=str,
required=True,
required=False,
default='.',
help="""Directory in which to write training logs, summaries and checkpoints."""
)
p.add_argument(
'--display_every',
'--log_filename',
type=str,
required=False,
default='log.json',
help="Name of the JSON file to which write the training log"
)
p.add_argument(
'--display_every',
default=10,
type=int,
required=False,
type=int,
required=False,
help="""How often (in batches) to print out running information."""
)
p.add_argument(
'--lr_init',
default=0.1,
type=float,
required=False,
help="""Initial value for the learning rate."""
'--lr_init', default=0.1, type=float, required=False, help="""Initial value for the learning rate."""
)
p.add_argument(
'--lr_warmup_epochs',
default=5,
@@ -157,12 +157,15 @@ def parse_cmdline():
help="""Number of warmup epochs for learning rate schedule."""
)
p.add_argument('--weight_decay', default=1e-4, type=float, required=False, help="""Weight Decay scale factor.""")
p.add_argument(
'--weight_decay',
default=1e-4,
type=float,
required=False,
help="""Weight Decay scale factor."""
'--weight_init',
default='fan_out',
choices=['fan_in', 'fan_out'],
type=str,
required=False,
help="""Model weight initialization method."""
)
p.add_argument(
@@ -172,15 +175,10 @@ def parse_cmdline():
required=False,
help="""SGD momentum value for the Momentum optimizer."""
)
#Select fp32 or non-AMP fp16 precision arithmetic.
p.add_argument(
'--precision',
choices=['fp32', 'fp16'],
type=str,
default='fp32',
required=False,
help=argparse.SUPPRESS
'--precision', choices=['fp32', 'fp16'], type=str, default='fp32', required=False, help=argparse.SUPPRESS
)
p.add_argument(
@@ -190,15 +188,11 @@ def parse_cmdline():
required=False,
help="""Loss scale for FP16 Training and Fast Math FP32."""
)
p.add_argument(
'--label_smoothing',
type=float,
default=0.0,
required=False,
help="""The value of label smoothing."""
'--label_smoothing', type=float, default=0.0, required=False, help="""The value of label smoothing."""
)
p.add_argument(
'--mixup',
type=float,
@@ -206,7 +200,7 @@ def parse_cmdline():
required=False,
help="""The alpha parameter for mixup (if 0 then mixup is not applied)."""
)
_add_bool_argument(
parser=p,
name="use_static_loss_scaling",
@@ -223,13 +217,7 @@ def parse_cmdline():
help="Enable XLA (Accelerated Linear Algebra) computation for improved performance."
)
_add_bool_argument(
parser=p,
name="use_dali",
default=False,
required=False,
help="Enable DALI data input."
)
_add_bool_argument(parser=p, name="use_dali", default=False, required=False, help="Enable DALI data input.")
_add_bool_argument(
parser=p,
@@ -238,37 +226,27 @@ def parse_cmdline():
required=False,
help="Enable Automatic Mixed Precision to speedup FP32 computation using tensor cores."
)
_add_bool_argument(
parser=p,
name="use_cosine_lr",
default=False,
required=False,
help="Use cosine learning rate schedule."
parser=p, name="use_cosine_lr", default=False, required=False, help="Use cosine learning rate schedule."
)
p.add_argument(
'--seed',
type=int,
default=1,
help="""Random seed."""
)
p.add_argument('--seed', type=int, default=None, help="""Random seed.""")
p.add_argument(
'--gpu_memory_fraction',
type=float,
default=0.7,
help="""Limit memory fraction used by training script for DALI"""
)
p.add_argument(
'--gpu_id',
type=int,
default=0,
default=0,
help="""Specify ID of the target GPU on multi-device platform. Effective only for single-GPU mode."""
)
FLAGS, unknown_args = p.parse_known_args()
if len(unknown_args) > 0:
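
As a hedged illustration of the new --arch switch introduced above, the sketch below combines it with flags that appear elsewhere in this diff; the paths and iteration counts are placeholders, not values from the commit.
# Sketch: single-GPU training benchmark for the newly selectable architecture
python3 main.py --arch=se-resnext101-32x4d --mode=training_benchmark \
    --iter_unit=batch --num_iter=200 --batch_size=128 \
    --precision=fp16 --use_static_loss_scaling --loss_scale=128 \
    --data_dir=/data/tfrecords --results_dir=/tmp/results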

View file

@@ -0,0 +1,22 @@
#!/bin/bash
SRC_DIR=${1}
DST_DIR=${2}
echo "Creating training file indexes"
for file in ${SRC_DIR}/train-*; do
BASENAME=$(basename $file)
DST_NAME=$DST_DIR/$BASENAME
echo "Creating index $DST_NAME for $file"
tfrecord2idx $file $DST_NAME
done
echo "Creating validation file indexes"
for file in ${SRC_DIR}/validation-*; do
BASENAME=$(basename $file)
DST_NAME=$DST_DIR/$BASENAME
echo "Creating index $DST_NAME for $file"
tfrecord2idx $file $DST_NAME
done
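
A hedged usage sketch for the index-builder above: the diff does not show this script's filename, so make_dali_idx.sh below is an assumed name, and the destination matches the ${DATA_DIR}/dali_idx layout that GENERIC.sh expects when USE_DALI is set.
mkdir -p /data/dali_idx
bash make_dali_idx.sh /data/tfrecords /data/dali_idx   # one index file per train-* / validation-* shard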

View file

@@ -30,24 +30,29 @@ __all__ = ["get_synth_input_fn", "normalized_inputs"]
class HybridPipe(dali.pipeline.Pipeline):
def __init__(self,
tfrec_filenames,
tfrec_idx_filenames,
height, width,
batch_size,
num_threads,
device_id,
shard_id,
num_gpus,
deterministic=False,
dali_cpu=True,
training=True):
def __init__(
self,
tfrec_filenames,
tfrec_idx_filenames,
height,
width,
batch_size,
num_threads,
device_id,
shard_id,
num_gpus,
deterministic=False,
dali_cpu=True,
training=True
):
kwargs = dict()
if deterministic:
kwargs['seed'] = 7 * (1 + hvd.rank())
super(HybridPipe, self).__init__(batch_size, num_threads, device_id, **kwargs)
self.training = training
self.input = dali.ops.TFRecordReader(
path=tfrec_filenames,
index_path=tfrec_idx_filenames,
@@ -56,43 +61,39 @@ class HybridPipe(dali.pipeline.Pipeline):
num_shards=num_gpus,
initial_fill=10000,
features={
'image/encoded':dali.tfrecord.FixedLenFeature((), dali.tfrecord.string, ""),
'image/class/label':dali.tfrecord.FixedLenFeature([1], dali.tfrecord.int64, -1),
'image/class/text':dali.tfrecord.FixedLenFeature([ ], dali.tfrecord.string, ''),
'image/object/bbox/xmin':dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
'image/object/bbox/ymin':dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
'image/object/bbox/xmax':dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
'image/object/bbox/ymax':dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0)})
if dali_cpu:
self.decode = dali.ops.HostDecoder(device="cpu", output_type=dali.types.RGB)
resize_device = "cpu"
else:
self.decode = dali.ops.nvJPEGDecoder(
device="mixed",
output_type=dali.types.RGB)
resize_device = "gpu"
'image/encoded': dali.tfrecord.FixedLenFeature((), dali.tfrecord.string, ""),
'image/class/label': dali.tfrecord.FixedLenFeature([1], dali.tfrecord.int64, -1),
'image/class/text': dali.tfrecord.FixedLenFeature([], dali.tfrecord.string, ''),
'image/object/bbox/xmin': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
'image/object/bbox/ymin': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
'image/object/bbox/xmax': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
'image/object/bbox/ymax': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0)
}
)
if training:
self.resize = dali.ops.RandomResizedCrop(
device=resize_device,
size=[height, width],
interp_type=dali.types.INTERP_LINEAR,
random_aspect_ratio=[0.8, 1.25],
random_area=[0.1, 1.0],
num_attempts=100)
if self.training:
self.decode = dali.ops.ImageDecoderRandomCrop(
device="cpu" if dali_cpu else "mixed",
output_type=dali.types.RGB,
random_aspect_ratio=[0.75, 1.33],
random_area=[0.05, 1.0],
num_attempts=100
)
self.resize = dali.ops.Resize(device="cpu" if dali_cpu else "gpu", resize_x=width, resize_y=height)
else:
self.decode = dali.ops.ImageDecoder(device="cpu" if dali_cpu else "mixed", output_type=dali.types.RGB)
# Make sure the shorter side of every image is > 224 for CropMirrorNormalize
self.resize = dali.ops.Resize (device=resize_device, resize_shorter=256)
self.resize = dali.ops.Resize(device="cpu" if dali_cpu else "gpu", resize_shorter=256)
self.normalize = dali.ops.CropMirrorNormalize(
device="gpu",
output_dtype=dali.types.FLOAT,
crop=(height, width),
image_type=dali.types.RGB,
mean=[121., 115., 100.],
std=[70., 68., 71.],
output_layout=dali.types.NHWC)
self.uniform = dali.ops.Uniform(range=(0.0, 1.0))
mean=[123.68, 116.28, 103.53],
std=[58.395, 57.120, 57.385],
output_layout=dali.types.NHWC
)
self.cast_float = dali.ops.Cast(device="gpu", dtype=dali.types.FLOAT)
self.mirror = dali.ops.CoinFlip()
self.iter = 0
@@ -106,21 +107,26 @@ class HybridPipe(dali.pipeline.Pipeline):
# Decode and augmentation
images = self.decode(images)
images = self.resize(images)
images = self.normalize(images.gpu(), mirror=self.mirror())
images = self.normalize(images.gpu(), mirror=self.mirror() if self.training else None)
return (images, labels)
class DALIPreprocessor(object):
def __init__(self,
filenames,
idx_filenames,
height, width,
batch_size,
num_threads,
dtype=tf.uint8,
dali_cpu=True,
deterministic=False,
training=False):
def __init__(
self,
filenames,
idx_filenames,
height,
width,
batch_size,
num_threads,
dtype=tf.uint8,
dali_cpu=True,
deterministic=False,
training=False
):
device_id = hvd.local_rank()
shard_id = hvd.rank()
num_gpus = hvd.size()
@@ -136,8 +142,9 @@ class DALIPreprocessor(object):
num_gpus=num_gpus,
deterministic=deterministic,
dali_cpu=dali_cpu,
training=training)
training=training
)
daliop = dali_tf.DALIIterator()
with tf.device("/gpu:0"):
@@ -145,10 +152,11 @@ class DALIPreprocessor(object):
pipeline=pipe,
shapes=[(batch_size, height, width, 3), (batch_size, 1)],
dtypes=[tf.float32, tf.int64],
device_id=device_id)
device_id=device_id
)
def get_device_minibatches(self):
with tf.device("/gpu:0"):
self.labels -= 1 # Change to 0-based (don't use background class)
self.labels = tf.squeeze(self.labels)
return self.images, self.labels
self.labels -= 1 # Change to 0-based (don't use background class)
self.labels = tf.squeeze(self.labels, axis=-1)
return self.images, self.labels

View file

@@ -27,9 +27,10 @@ from utils import dali_utils
__all__ = ["get_synth_input_fn", "normalized_inputs"]
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_G_MEAN = 116.28
_B_MEAN = 103.53
_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
_CHANNEL_STDS = [58.395, 57.120, 57.385]
_NUM_CHANNELS = 3
@@ -109,12 +110,8 @@ def get_tfrecords_input_fn(filenames, batch_size, height, width, training, disto
def preproc_func(record, counter_):
return image_processing.preprocess_image_record(record, height, width, _NUM_CHANNELS, training)
ds = ds.cache()
if training:
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=shuffle_buffer_size, seed=seed))
else:
ds = ds.repeat()
@@ -131,8 +128,9 @@ def get_tfrecords_input_fn(filenames, batch_size, height, width, training, disto
return ds
def get_inference_input_fn(filenames, height, width, num_threads):
ds = tf.data.Dataset.from_tensor_slices(filenames)
counter = tf.data.Dataset.range(sys.maxsize)
@@ -140,38 +138,37 @@ def get_inference_input_fn(filenames, height, width, num_threads):
def preproc_func(record, counter_):
return image_processing.preprocess_image_file(record, height, width, _NUM_CHANNELS, is_training=False)
ds = ds.apply(
tf.data.experimental.map_and_batch(
map_func=preproc_func,
num_parallel_calls=num_threads,
batch_size=1
)
tf.data.experimental.map_and_batch(map_func=preproc_func, num_parallel_calls=num_threads, batch_size=1)
)
ds = ds.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)
return ds
def get_dali_input_fn(filenames, idx_filenames, batch_size, height, width, training, distort_color, num_threads, deterministic):
def get_dali_input_fn(
filenames, idx_filenames, batch_size, height, width, training, distort_color, num_threads, deterministic
):
if idx_filenames is None:
raise ValueError("Must provide idx_filenames for DALI's reader")
preprocessor = dali_utils.DALIPreprocessor(
filenames,
idx_filenames,
height, width,
height,
width,
batch_size,
num_threads,
dali_cpu=False,
deterministic=deterministic,
training=training)
training=training
)
images, labels = preprocessor.get_device_minibatches()
return (images, labels)
@@ -189,12 +186,15 @@ def normalized_inputs(inputs):
means_per_channel = tf.reshape(_CHANNEL_MEANS, [1, 1, num_channels])
means_per_channel = tf.cast(means_per_channel, dtype=inputs.dtype)
inputs = tf.subtract(inputs, means_per_channel)
stds_per_channel = tf.reshape(_CHANNEL_STDS, [1, 1, num_channels])
stds_per_channel = tf.cast(stds_per_channel, dtype=inputs.dtype)
inputs = tf.subtract(inputs, means_per_channel)
return tf.divide(inputs, stds_per_channel)
return tf.divide(inputs, 255.0)
def get_serving_input_receiver_fn(batch_size, height, width, num_channels, data_format, dtype=tf.float32):
if data_format not in ["NHWC", "NCHW"]:
raise ValueError("Unknown data_format: %s" % str(data_format))
@@ -202,9 +202,9 @@ def get_serving_input_receiver_fn(batch_size, height, width, num_channels, data_
input_shape = [batch_size] + [height, width, num_channels]
else:
input_shape = [batch_size] + [num_channels, height, width]
def serving_input_receiver_fn():
features = tf.placeholder(dtype=dtype, shape=input_shape, name='input_tensor')
return tf.estimator.export.TensorServingInputReceiver(features=features, receiver_tensors=features)
return serving_input_receiver_fn

Some files were not shown because too many files have changed in this diff.