[ConvNets/TF2] EfficientNet release

This commit is contained in:
kkudrynski 2021-04-09 23:32:53 +02:00
parent 2bdf2775e3
commit 4a66a008c4
64 changed files with 7278 additions and 0 deletions

View file

@ -0,0 +1,98 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# mypy
.mypy_cache
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
# PyCharm
.idea/
# For mac
.DS_Store

View file

@ -0,0 +1,30 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:21.02-tf2-py3
FROM ${FROM_IMAGE_NAME}
RUN echo ${FROM_IMAGE_NAME}
LABEL Effnet_tf by subhankarg
RUN rm -rf /workspace && mkdir -p /workspace
ADD . /workspace
WORKDIR /workspace
COPY . .
RUN python -m pip install --upgrade pip && \
    pip install --no-cache-dir --user -r requirements.txt
RUN pip install git+https://github.com/NVIDIA/dllogger

View file

@ -0,0 +1,204 @@
Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
Copyright 2016 The TensorFlow Authors. All rights reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2016, The Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@ -0,0 +1,693 @@
# EfficientNet For TensorFlow 2.4
This repository provides a script and recipe to train the EfficientNet model to achieve state-of-the-art accuracy.
The content of the repository is tested and maintained by NVIDIA.
## Table Of Contents
- [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Enabling TF32](#enabling-tf32)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Training process](#training-process)
* [Multi-node](#multi-node)
* [Inference process](#inference-process)
- [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results for EfficientNet-B0](#training-accuracy-results-for-efficientnet-b0)
* [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
* [Training accuracy results for EfficientNet-B4](#training-accuracy-results-for-efficientnet-b4)
* [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb-1)
* [Training accuracy: NVIDIA DGX-1 (8x V100 32GB)](#training-accuracy-nvidia-dgx-1-8x-v100-32gb)
* [Training performance results for EfficientNet-B0](#training-performance-results-for-efficientnet-b0)
* [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
* [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
* [Training performance results for EfficientNet-B4](#training-performance-results-for-efficientnet-b4)
* [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb-1)
* [Training performance: NVIDIA DGX-1 (8x V100 32GB)](#training-performance-nvidia-dgx-1-8x-v100-32gb)
* [Inference performance results for EfficientNet-B0](#inference-performance-results-for-efficientnet-b0)
* [Inference performance: NVIDIA DGX A100 (1x A100 80GB)](#inference-performance-nvidia-dgx-a100-1x-a100-80gb)
* [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb)
* [Inference performance results for EfficientNet-B4](#inference-performance-results-for-efficientnet-b4)
* [Inference performance: NVIDIA DGX A100 (1x A100 80GB)](#inference-performance-nvidia-dgx-a100-1x-a100-80gb-1)
* [Inference performance: NVIDIA DGX-1 (1x V100 32GB)](#inference-performance-nvidia-dgx-1-1x-v100-32gb)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
EfficientNet TensorFlow 2 is a family of image classification models that achieve state-of-the-art accuracy while being an order of magnitude smaller and faster than previous models.
This model is based on [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946).
NVIDIA's implementation of EfficientNet TensorFlow 2 is an optimized version of the [TensorFlow Model Garden](https://github.com/tensorflow/models/tree/master/official/vision/image_classification) implementation.
It leverages mixed precision arithmetic on Volta, Turing, and NVIDIA Ampere GPU architectures for faster training times while maintaining target accuracy.
The major differences between the original implementation of the paper and this version of EfficientNet are as follows:
- Automatic mixed precision (AMP) training support
- Cosine LR decay for better accuracy
- Weight initialization using `fan_out` for better accuracy
- Multi-node training support
- XLA enabled for better performance
- Lightweight logging using [dllogger](https://github.com/NVIDIA/dllogger)
Other publicly available implementations of EfficientNet include:
- [Tensorflow Model Garden](https://github.com/tensorflow/models/tree/master/official/vision/image_classification)
- [PyTorch version](https://github.com/rwightman/pytorch-image-models)
- [Google's implementation for TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet)
This model is trained with mixed precision using Tensor Cores on Volta, Turing, and NVIDIA Ampere GPU architectures. It provides a push-button solution to training on the dataset of your choice.
As a result, researchers can get results 1.5x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly released container to ensure consistent accuracy and performance over time.
### Model architecture
EfficientNets are developed based on AutoML and compound scaling. In particular,
a mobile-size baseline network called EfficientNet-B0 is first developed with the AutoML MNAS Mobile
framework; its main building block is the mobile inverted bottleneck convolution (MBConv) with squeeze-and-excitation optimization.
This baseline is then scaled up with a compound scaling method to obtain EfficientNet-B1
through B7.
![Efficientnet_structure](https://1.bp.blogspot.com/-Cdtb97FtgdA/XO3BHsB7oEI/AAAAAAAAEKE/bmtkonwgs8cmWyI5esVo8wJPnhPLQ5bGQCLcBGAs/s1600/image4.png)
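The compound scaling method jointly scales network depth, width, and input resolution with a single compound coefficient φ. As described in the EfficientNet paper, the scaling coefficients are constrained so that increasing φ by one roughly doubles the FLOPS:
```
d = \alpha^{\phi}, \quad w = \beta^{\phi}, \quad r = \gamma^{\phi}
\text{subject to} \quad \alpha \cdot \beta^{2} \cdot \gamma^{2} \approx 2, \quad \alpha \ge 1,\ \beta \ge 1,\ \gamma \ge 1
```
The paper reports α = 1.2, β = 1.1, and γ = 1.15, found by a small grid search on EfficientNet-B0.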
### Default configuration
Here is the Baseline EfficientNet-B0 structure.
![Efficientnet-B0](https://miro.medium.com/max/1106/1*5oQHqmvS_q9Pq_lZ_Rv51A.png)
The following features were implemented in this model:
- General:
- XLA support
- Mixed precision support
- Multi-GPU support using Horovod
- Multi-node support using Horovod
- Cosine LR Decay
- Inference:
- Support for inference on a single image
- Support for inference on a batch of images
### Feature support matrix
The following features are supported by this model:
| Feature | EfficientNet |
|-----------------------|-------------------------- |
|Horovod Multi-GPU training (NCCL) | Yes |
|Multi-node training | Yes |
|Automatic mixed precision (AMP) | Yes |
|XLA | Yes |
#### Features
**Multi-GPU training with Horovod**
Our model uses Horovod to implement efficient multi-GPU training with NCCL. For details, see the example sources in this repository or the [Horovod usage guide](https://github.com/horovod/horovod/#usage).
**Multi-node training with Horovod**
Our model also uses Horovod to implement efficient multi-node training.
**Automatic Mixed Precision (AMP)**
Computation graphs can be modified by TensorFlow at runtime to support mixed precision training. A detailed explanation of mixed precision can be found in the next section.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta, Turing, and NVIDIA Ampere GPU architectures automatically. The TensorFlow framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
#### Enabling mixed precision
Mixed precision is enabled in TensorFlow by using the Automatic Mixed Precision (TF-AMP) extension which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In TensorFlow, loss scaling can be applied statically by using simple multiplication of loss by a constant value or automatically, by TF-AMP. Automatic mixed precision makes all the adjustments internally in TensorFlow, providing two benefits over manual operations. First, programmers need not modify network model code, reducing development and maintenance effort. Second, using AMP maintains forward and backward compatibility with all the APIs for defining and running TensorFlow models.
To enable mixed precision, you can simply add the `--use_amp` flag to the command line used to run the model. This enables the following code:
```
if params.use_amp:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16', loss_scale='dynamic')
    tf.keras.mixed_precision.experimental.set_policy(policy)
```
#### Enabling TF32
TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
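If you want to verify or disable TF32 programmatically (for example, to compare against full FP32), TensorFlow 2.4 exposes a switch; a minimal sketch:
```
import tensorflow as tf

# TF32 execution of float32 matmuls and convolutions on Ampere Tensor Cores
# is enabled by default; this switch turns it off to force full FP32 math.
tf.config.experimental.enable_tensor_float_32_execution(False)
print(tf.config.experimental.tensor_float_32_execution_enabled())  # False
```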
## Setup
The following section lists the requirements that you need to meet in order to start training the EfficientNet model.
### Requirements
This repository contains a Dockerfile that extends the TensorFlow NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- TensorFlow 21.02-tf2-py3 NGC container or later
- Supported GPUs:
- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
- [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running)
As an alternative to using the TensorFlow 2 NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
For multi-node, the sample provided in this repository requires [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) set up on a [SLURM](https://slurm.schedmd.com) cluster.
## Quick Start Guide
To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the EfficientNet model on the ImageNet dataset. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/TensorFlow2/Classification/ConvNets/efficientnet
```
2. Download and prepare the dataset.
`Runner.py` supports ImageNet with [TensorFlow Datasets (TFDS)](https://www.tensorflow.org/datasets/overview). Refer to the [TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md) for manual download instructions.
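As a rough sketch of the TFDS route (assuming the raw ImageNet archives have already been downloaded manually, since TFDS cannot fetch ImageNet automatically; the paths below are placeholders):
```
import tensorflow_datasets as tfds

# /data/raw must already contain ILSVRC2012_img_train.tar and
# ILSVRC2012_img_val.tar obtained from image-net.org.
builder = tfds.builder("imagenet2012", data_dir="/data/tfds")
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(manual_dir="/data/raw"))
train_ds = builder.as_dataset(split="train", as_supervised=True)
```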
3. Build EfficientNet on top of the NGC container.
`bash ./scripts/docker/build.sh`
4. Start an interactive session in the NGC container to run training/inference.
`bash ./scripts/docker/launch.sh`
5. Start training.
To run training for a standard configuration (DGX A100/DGX-1 V100, AMP/TF32/FP32, 500 Epochs, efficientnet-b0/efficientnet-b4),
run one of the scripts in the `./scripts/{B0, B4}/training` directory called `./scripts/{B0, B4}/training/{AMP, TF32, FP32}/convergence_8x{A100-80G, V100-16G, V100-32G}.sh`.
Ensure ImageNet is mounted in the `/data` directory.
For example:
`bash ./scripts/B0/training/AMP/convergence_8xA100-80G.sh`
6. Start validation/evaluation.
To run validation/evaluation for a standard configuration (DGX A100/DGX-1 V100, AMP/TF32/FP32, efficientnet-b0/efficientnet-b4),
run one of the scripts in the `./scripts/{B0, B4}/evaluation` directory called `./scripts/{B0, B4}/evaluation/evaluation_{AMP, FP32, TF32}_8x{A100-80G, V100-16G, V100-32G}.sh`.
Ensure ImageNet is mounted in the `/data` directory.
(Optional) Place the checkpoint in the `--model_dir` location to evaluate on a checkpoint.
For example:
`bash ./scripts/B0/evaluation/evaluation_AMP_8xA100-80G.sh`
7. Start inference/predictions.
To run inference for a standard configuration (DGX A100/DGX-1 V100, AMP/TF32/FP32, efficientnet-b0/efficientnet-b4, batch size 8),
run one of the scripts in the `./scripts/{B0, B4}/inference` directory called `./scripts/{B0, B4}/inference/inference_{AMP, FP32, TF32}.sh`.
Ensure the JPEG images on which you want to run inference are mounted in the `/infer_data` directory with this folder structure:
```
infer_data
| ├── images
| | ├── image1.JPEG
| | ├── image2.JPEG
```
(Optional) Place the checkpoint in the `--model_dir` location to evaluate on a checkpoint.
For example:
`bash ./scripts/B0/inference/inference_{AMP, FP32}.sh`
Now that you have your model trained and evaluated, you can compare your training results with our [Training accuracy results](#training-accuracy-results). You can also benchmark your performance against the [Training performance results](#training-performance-results) or [Inference performance results](#inference-performance-results). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
The following lists the content for each folder:
- `scripts/` - shell scripts to build and launch the EfficientNet container on top of the NGC container,
and to launch training, evaluation, and inference
- `model/` - building blocks and EfficientNet model definitions
- `runtime/` - detailed procedure for each running mode
- `utils/` - utility functions used by `runner.py`
### Parameters
Important parameters for training are listed below with default values.
- `mode` (`train_and_eval`,`train`,`eval`,`prediction`) - the default is `train_and_eval`.
- `arch` - the default is `efficientnet-b0`
- `model_dir` - The folder where model checkpoints are saved (the default is `/workspace/output`)
- `data_dir` - The folder where data resides (the default is `/data/`)
- `augmenter_name` - Type of Augmentation (the default is `autoaugment`)
- `max_epochs` - The number of training epochs (the default is `300`)
- `warmup_epochs` - The number of epochs of warmup (the default is `5`)
- `train_batch_size` - The training batch size per GPU (the default is `32`)
- `eval_batch_size` - The evaluation batch size per GPU (the default is `32`)
- `lr_init` - The learning rate for a batch size of 128; the effective learning rate is automatically scaled according to the global training batch size (the default is `0.008`), as illustrated in the sketch below
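For example, assuming the scaling is linear in the global batch size with 128 as the reference (a sketch, not the repository's exact code):
```
lr_init = 0.008                                    # reference LR for a global batch size of 128
global_batch_size = 32 * 8                         # per-GPU batch size * number of GPUs
effective_lr = lr_init * global_batch_size / 128   # 0.016
```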
The parameters specific to the main script, `main.py`, are:
```
--model_dir MODEL_DIR
The directory where the model and training/evaluation
summaries are stored.
--save_checkpoint_freq SAVE_CHECKPOINT_FREQ
Number of epochs to save checkpoint.
--data_dir DATA_DIR The location of the input data. Files should be named
`train-*` and `validation-*`.
--mode MODE Mode to run: `train`, `eval`, `train_and_eval`, `predict` or
`export`.
--arch ARCH The type of the model, e.g. EfficientNet, etc.
--dataset DATASET The name of the dataset, e.g. ImageNet, etc.
--log_steps LOG_STEPS
The interval of steps between logging of batch level
stats.
--use_xla Set to True to enable XLA
--use_amp Set to True to enable AMP
--num_classes NUM_CLASSES
Number of classes to train on.
--batch_norm BATCH_NORM
Type of Batch norm used.
--activation ACTIVATION
Type of activation to be used.
--optimizer OPTIMIZER
Optimizer to be used.
--moving_average_decay MOVING_AVERAGE_DECAY
The value of moving average.
--label_smoothing LABEL_SMOOTHING
The value of label smoothing.
--max_epochs MAX_EPOCHS
Number of epochs to train.
--num_epochs_between_eval NUM_EPOCHS_BETWEEN_EVAL
Number of training epochs between evaluations.
--steps_per_epoch STEPS_PER_EPOCH
Number of steps of training.
--warmup_epochs WARMUP_EPOCHS
Number of steps considered as warmup and not taken
into account for performance measurements.
--lr_init LR_INIT Initial value for the learning rate.
--lr_decay LR_DECAY Type of LR Decay.
--lr_decay_rate LR_DECAY_RATE
LR Decay rate.
--lr_decay_epochs LR_DECAY_EPOCHS
LR Decay epoch.
--weight_decay WEIGHT_DECAY
Weight Decay scale factor.
--weight_init {fan_in,fan_out}
Model weight initialization method.
--train_batch_size TRAIN_BATCH_SIZE
Training batch size per GPU.
--augmenter_name AUGMENTER_NAME
Type of Augmentation during preprocessing only during
training.
--eval_batch_size EVAL_BATCH_SIZE
Evaluation batch size per GPU.
--resume_checkpoint Resume from a checkpoint in the model_dir.
--use_dali Use dali for data loading and preprocessing of train
dataset.
--use_dali_eval Use dali for data loading and preprocessing of eval
dataset.
--dtype DTYPE         Data type; only `float32`, `bfloat16`, `float16`, `fp32`, and `bf16` are permitted.
```
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
`python main.py --help`
### Getting the data
Refer to the [TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md) for manual download instructions.
To train on the ImageNet dataset, pass `$path_to_ImageNet_tfrecords` to `$data_dir` on the command line.
Name the TFRecords in the following scheme:
- Training images - `/data/train-*`
- Validation images - `/data/validation-*`
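A minimal input-pipeline sketch that consumes records following this naming scheme is shown below. The feature keys are the standard ImageNet TFRecord keys and the preprocessing is illustrative only; the repository's own pipeline (see `utils/` and the DALI options) may differ:
```
import tensorflow as tf

def make_dataset(pattern="/data/train-*", image_size=224, batch_size=32):
    # Standard ImageNet TFRecord feature keys (assumed).
    features = {
        "image/encoded": tf.io.FixedLenFeature([], tf.string),
        "image/class/label": tf.io.FixedLenFeature([], tf.int64),
    }

    def parse(record):
        example = tf.io.parse_single_example(record, features)
        image = tf.io.decode_jpeg(example["image/encoded"], channels=3)
        image = tf.image.resize(image, (image_size, image_size)) / 255.0
        return image, example["image/class/label"]

    files = tf.data.Dataset.list_files(pattern, shuffle=True)
    return (tf.data.TFRecordDataset(files)
            .map(parse, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .batch(batch_size)
            .prefetch(tf.data.experimental.AUTOTUNE))
```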
### Training process
The training process can start from scratch, or resume from a checkpoint.
By default, the bash scripts `scripts/{B0, B4}/training/{AMP, FP32, TF32}/convergence_8x{A100-80G, V100-16G, V100-32G}.sh` start the training process from scratch with the following settings:
- Uses 8 GPUs with Horovod
- Has XLA enabled
- Saves checkpoints every 5 epochs to the `/workspace/output/` folder
- Uses AMP, FP32, or TF32 depending on the folder `scripts/{B0, B4}/training/{AMP, FP32, TF32}`
To resume from a checkpoint, include `--resume_checkpoint` on the command line and place the checkpoint into `--model_dir`; a generic sketch of what resumption looks like in TF2 follows.
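This is only a generic TF2 sketch of restoring the newest checkpoint in `--model_dir`; the repository's runner may use a different checkpoint layout:
```
import tensorflow as tf

# Stand-in model purely for illustration; the repository builds its own EfficientNet.
model = tf.keras.applications.EfficientNetB0(weights=None)
ckpt = tf.train.Checkpoint(model=model)
latest = tf.train.latest_checkpoint("/workspace/output")
if latest:
    ckpt.restore(latest).expect_partial()
```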
#### Multi-node
Multi-node runs can be launched on a Pyxis/enroot Slurm cluster (see [Requirements](#requirements)) with the `run_{B0, B4}_multinode.sub` script with the following command for a 4-node NVIDIA DGX A100 example:
```
PARTITION=<partition_name> sbatch -N 4 --ntasks-per-node=8 run_B0_multinode.sub
PARTITION=<partition_name> sbatch -N 4 --ntasks-per-node=8 run_B4_multinode.sub
```
A checkpoint is saved to the checkpoint directory every `--save_checkpoint_freq` epochs and is automatically picked up if training needs to be resumed. The cluster partition name has to be provided as `<partition_name>`.
Note that the `run_{B0, B4}_multinode.sub` script is a starting point that has to be adapted to your environment. In particular, variables such as `--container-image` (the container image to train with) and `--datadir` (the location of the ImageNet data) have to be adjusted.
Refer to the file contents for the full list of variables to adjust for your system.
### Inference process
Validation is done every epoch and can also be run separately on a checkpointed model.
`bash ./scripts/{B0, B4}/evaluation/evaluation_{AMP, FP32, TF32}_8x{A100-80G, V100-16G, V100-32G}.sh`
Metrics gathered through this process are as follows:
```
- eval_loss
- eval_accuracy_top_1
- eval_accuracy_top_5
- avg_exp_per_second_eval
- avg_exp_per_second_eval_per_GPU
- avg_time_per_exp_eval : Average Latency
- latency_90pct : 90% Latency
- latency_95pct : 95% Latency
- latency_99pct : 99% Latency
```
To run inference on JPEG images, first place the checkpoint in the `--model_dir` location and store the JPEG images in the following directory structure:
```
infer_data
| ├── images
| | ├── image1.JPEG
| | ├── image2.JPEG
```
Run:
`bash ./scripts/{B0, B4}/inference/inference_{AMP, FP32, TF32}.sh`
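Under the hood, predicting on a directory of JPEGs amounts to decoding, resizing to the network resolution (224x224 for B0, 380x380 for B4), batching, and calling the model. A standalone sketch with assumed preprocessing (not the repository's exact pipeline):
```
import pathlib
import tensorflow as tf

def predict_directory(model, image_dir="/infer_data/images", image_size=224):
    """Run a Keras model over every JPEG in image_dir (illustrative sketch)."""
    paths = sorted(pathlib.Path(image_dir).glob("*.JPEG"))
    images = []
    for path in paths:
        image = tf.io.decode_jpeg(tf.io.read_file(str(path)), channels=3)
        images.append(tf.image.resize(image, (image_size, image_size)) / 255.0)
    probabilities = model.predict(tf.stack(images))
    return dict(zip([p.name for p in paths], probabilities))
```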
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
Training benchmark for EfficientNet-B0 was run on NVIDIA DGX A100 80GB and NVIDIA DGX-1 V100 16GB.
To benchmark training performance with other parameters, run:
`bash ./scripts/B0/training/{AMP, FP32, TF32}/train_benchmark_8x{A100-80G, V100-16G}.sh`
Training benchmark for EfficientNet-B4 was run on NVIDIA DGX A100 80GB and NVIDIA DGX-1 V100 32GB.
`bash ./scripts/B4/training/{AMP, FP32, TF32}/train_benchmark_8x{A100-80G, V100-16G}.sh`
#### Inference performance benchmark
Inference benchmark for EfficientNet-B0 was run on NVIDIA DGX A100 80GB and NVIDIA DGX-1 V100 16GB.
Inference benchmark for EfficientNet-B4 was run on NVIDIA DGX A100 80GB and NVIDIA DGX-1 V100 32GB.
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results for EfficientNet-B0
##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
Our results were obtained by running the training scripts in the tensorflow:21.02-tf2-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs.
| GPUs | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) |
|-------------------|-----------------------|-------------|-------|-------------------|---------------------------------------|
| 8 | 77.38 | 77.43 | 19 | 10.5 | 1.8 |
| 16 | 77.46 | 77.62 | 10 | 5.5 | 1.81 |
##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the training scripts in the tensorflow:21.02-tf2-py3 NGC container on NVIDIA DGX-1 (8x V100 16GB) GPUs.
| GPUs | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) |
|-------------------|-----------------------|-------------|-------|-------------------|---------------------------------------|
| 8 | 77.54 | 77.51 | 11.48 | 11.44 | 1.003 |
| 32 | 77.38 | 77.62 | 48 | 44 | 1.09 |
#### Training accuracy results for EfficientNet-B4
##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
Our results were obtained by running the training scripts in the tensorflow:21.02-tf2-py3 NGC container on multi-node NVIDIA DGX A100 (8x A100 80GB) GPUs.
| GPUs | Accuracy - TF32 | Accuracy - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (TF32 to mixed precision) |
|-------------------|-----------------------|-------------|-------|-------------------|---------------------------------------|
| 32 | 82.69 | 82.69 | 38 | 17.5 | 2.17 |
| 64 | 82.75 | 82.78 | 18 | 8.5 | 2.11 |
##### Training accuracy: NVIDIA DGX-1 (8x V100 32GB)
Our results were obtained by running the training scripts in the tensorflow:21.02-tf2-py3 NGC container on multi-node NVIDIA DGX-1 (8x V100 32GB) GPUs.
| GPUs | Accuracy - FP32 | Accuracy - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (FP32 to mixed precision) |
|-------------------|-----------------------|-------------|-------|-------------------|---------------------------------------|
| 32 | 82.78 | 82.78 | 95 | 39.5 | 2.40 |
| 64 | 82.74 | 82.74 | 53 | 19 | 2.78 |
#### Training performance results for EfficientNet-B0
##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
Our results were obtained by running the training benchmark script in the tensorflow:21.02-tf2-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs. Performance numbers (in images per second) were averaged over 5 entire training epochs.
| GPUs | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
|-----|-----|-----|-----|------|-------|
| 1 | 1206 | 2549 | 2.11 | 1 | 1 |
| 8 | 9365 | 16336 | 1.74 | 7.76 | 6.41 |
| 16 | 18361 | 33000 | 1.79 | 15.223 | 12.95 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the training benchmark script in the tensorflow:21.02-tf2-py3 NGC container on NVIDIA DGX-1 (8x V100 16GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch.
| GPUs | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|-----|-----|-----|-----|------|-------|
| 1 | 629 | 712 | 1.13 | 1 | 1 |
| 8 | 4012 | 4065 | 1.01 | 6.38 | 5.71 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Training performance results for EfficientNet-B4
##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
Our results were obtained by running the training benchmark script in the tensorflow:21.02-tf2-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs. Performance numbers (in images per second) were averaged over 5 entire training epochs.
| GPUs | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
|-----|-----|-----|-----|------|-------|
| 1 | 167 | 394 | 2.34 | 1 | 1 |
| 8 | 1280 | 2984 | 2.33 | 7.66 | 7.57 |
| 32 | 5023 | 11034 | 2.19 | 30.07 | 28.01 |
| 64 | 9838 | 21844 | 2.22 | 58.91 | 55.44 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Training performance: NVIDIA DGX-1 (8x V100 32GB)
Our results were obtained by running the training benchmark script in the tensorflow:21.02-tf2-py3 NGC container on NVIDIA DGX-1 (8x V100 32GB) GPUs. Performance numbers (in images per second) were averaged over an entire training epoch.
| GPUs | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|-----|-----|-----|-----|------|-------|
| 1 | 89 | 193 | 2.16 | 1 | 1 |
| 8 | 643 | 1298 | 2.00 | 7.28 | 6.73 |
| 32 | 2095 | 4892 | 2.33 | 23.54 | 25.35 |
| 64 | 4109 | 9666 | 2.35 | 46.17 | 50.08 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results for EfficientNet-B0
##### Inference performance: NVIDIA DGX A100 (1x A100 80GB)
Our results were obtained by running the inferencing benchmarking script in the tensorflow:21.02-tf2-py3 NGC container on NVIDIA DGX A100 (1x A100 80GB) GPU.
FP16 Inference Latency
| Batch size | Resolution | Throughput Avg | Latency Avg (ms) | Latency 90% (ms) |Latency 95% (ms) |Latency 99% (ms) |
|------------|-----------------|-----|-----|-----|-----|-----|
| 1 | 224x224 | 111 | 8.97 | 8.88 | 8.92 | 8.96 |
| 2 | 224x224 | 233 | 4.28 | 4.22 | 4.25 | 4.27 |
| 4 | 224x224 | 432 | 2.31 | 2.28 | 2.29 | 2.30 |
| 8 | 224x224 | 771 | 1.29 | 1.27 | 1.28 | 1.28 |
| 1024 | 224x224 | 10269 | 0.10 | 0.10 | 0.10 | 0.10 |
TF32 Inference Latency
| Batch size | Resolution | Throughput Avg | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
|------------|-----------------|-----|-----|-----|-----|-----|
| 1 | 224x224 | 101 | 9.87 | 9.78 | 9.82 | 9.86 |
| 2 | 224x224 | 204 | 4.89 | 4.83 | 4.85 | 4.88 |
| 4 | 224x224 | 381 | 2.62 | 2.59 | 2.60 | 2.61 |
| 8 | 224x224 | 584 | 1.71 | 1.69 | 1.70 | 1.71 |
| 512 | 224x224 | 5480 | 0.18 | 0.18 | 0.18 | 0.18 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
Our results were obtained by running the `inference-script-name.sh` inferencing benchmarking script in the TensorFlow NGC container on NVIDIA DGX-1 (1x V100 16GB) GPU.
FP16 Inference Latency
| Batch size | Resolution | Throughput Avg | Latency Avg | Latency 90% |Latency 95% |Latency 99% |
|------------|-----------------|-----|-----|-----|-----|-----|
| 1 | 224x224 | 98.8 | 10.12 | 10.03 | 10.06 | 10.10 |
| 2 | 224x224 | 199.3 | 5.01 | 4.95 | 4.97 | 5.00 |
| 4 | 224x224 | 382.5 | 2.61 | 2.57 | 2.59 | 2.60 |
| 8 | 224x224 | 681.2 | 1.46 | 1.44 | 1.45 | 1.46 |
| 256 | 224x224 | 5271 | 0.19 | 0.18 | 0.18 | 0.19 |
FP32 Inference Latency
| Batch size | Resolution | Throughput Avg | Latency Avg | Latency 90% | Latency 95% | Latency 99% |
|------------|-----------------|-----|-----|-----|-----|-----|
| 1 | 224x224 | 68.39 | 14.62 | 14.45 | 14.51 | 14.56 |
| 2 | 224x224 | 125.62 | 7.96 | 7.89 | 7.91 | 7.94 |
| 4 | 224x224 | 216.41 | 4.62 | 4.56 | 4.60 | 4.61 |
| 8 | 224x224 | 401.60 | 2.49 | 2.45 | 2.47 | 2.48 |
| 128 | 224x224 | 2713 | 0.37 | 0.36 | 0.36 | 0.37 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results for EfficientNet-B4
##### Inference performance: NVIDIA DGX A100 (1x A100 80GB)
Our results were obtained by running the inferencing benchmarking script in the tensorflow:21.02-tf2-py3 NGC container on NVIDIA DGX A100 (1x A100 80GB) GPU.
FP16 Inference Latency
| Batch size | Resolution | Throughput Avg | Latency Avg (ms) | Latency 90% (ms) |Latency 95% (ms) |Latency 99% (ms) |
|------------|-----------------|-----|-----|-----|-----|-----|
| 1 | 380x380 | 57.54 | 17.37 | 17.24 | 17.30 | 17.35 |
| 2 | 380x380 | 112.06 | 8.92 | 8.85 | 8.88 | 8.91 |
| 4 | 380x380 | 219.71 | 4.55 | 4.52 | 4.53 | 4.54 |
| 8 | 380x380 | 383.39 | 2.60 | 2.58 | 2.59 | 2.60 |
| 128 | 380x380 | 1470 | 0.68 | 0.67 | 0.67 | 0.68 |
TF32 Inference Latency
| Batch size | Resolution | Throughput Avg | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
|------------|-----------------|-----|-----|-----|-----|-----|
| 1 | 380x380 | 52.68 | 18.98 | 18.86 | 18.91 | 18.96 |
| 2 | 380x380 | 95.32 | 10.49 | 10.42 | 10.45 | 10.48 |
| 4 | 380x380 | 182.14 | 5.49 | 5.46 | 5.47 | 5.48 |
| 8 | 380x380 | 325.72 | 3.07 | 3.05 | 3.05 | 3.06 |
| 64 | 380x380 | 694 | 1.43 | 1.42 | 1.43 | 1.43 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Inference performance: NVIDIA DGX-1 (1x V100 32GB)
Our results were obtained by running the `inference-script-name.sh` inferencing benchmarking script in the TensorFlow NGC container on an NVIDIA DGX-1 (1x V100 32GB) GPU.
FP16 Inference Latency
| Batch size | Resolution | Throughput Avg | Latency Avg | Latency 90% | Latency 95% | Latency 99% |
|------------|-----------------|-----|-----|-----|-----|-----|
| 1 | 380x380 | 54.27 | 18.35 | 18.20 | 18.25 | 18.32 |
| 2 | 380x380 | 104.27 | 9.59 | 9.51 | 9.54 | 9.58 |
| 4 | 380x380 | 182.61 | 5.47 | 5.41 | 5.43 | 5.46 |
| 8 | 380x380 | 234.06 | 4.27 | 4.24 | 4.25 | 4.26 |
| 64 | 380x380 | 782.47 | 1.28 | 1.25 | 1.26 | 1.27 |
FP32 Inference Latency
| Batch size | Resolution | Throughput Avg | Latency Avg | Latency 90% |Latency 95% |Latency 99% |
|------------|-----------------|-----|-----|-----|-----|-----|
| 1 | 380x380 | 30.48 | 32.80 | 32.86 | 31.83 | 32.60 |
| 2 | 380x380 | 58.59 | 17.06 | 15.96 | 16.51 | 16.95 |
| 4 | 380x380 | 111.35 | 8.98 | 8.75 | 8.78 | 8.92 |
| 8 | 380x380 | 199.00 | 5.03 | 4.84 | 4.88 | 5.00 |
| 32 | 380x380 | 307.04 | 3.25 | 3.25 | 3.25 | 3.25 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
## Release notes
### Changelog
March 2021
- Initial release
### Known issues
- EfficientNet-B0 does not see a training speedup from AMP compared to FP32, because auto-augmentation is CPU bound.

View file

@ -0,0 +1,73 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import warnings
warnings.simplefilter("ignore")
import tensorflow as tf
import horovod.tensorflow as hvd
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
import dllogger as DLLogger
from utils import hvd_utils
from utils.setup import set_flags
from runtime import Runner
from utils.cmdline_helper import parse_cmdline
if __name__ == "__main__":
    hvd.init()
    FLAGS = parse_cmdline()
    set_flags(FLAGS)
    backends = []
    if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
        # Prepare Model Dir
        log_path = os.path.join(FLAGS.model_dir, FLAGS.log_filename)
        os.makedirs(FLAGS.model_dir, exist_ok=True)
        # Setup dlLogger
        backends += [
            JSONStreamBackend(verbosity=Verbosity.VERBOSE, filename=log_path),
            StdOutBackend(verbosity=Verbosity.DEFAULT)
        ]
    DLLogger.init(backends=backends)
    DLLogger.log(data=vars(FLAGS), step='PARAMETER')
    runner = Runner(FLAGS, DLLogger)
    if FLAGS.mode in ["train", "train_and_eval", "training_benchmark"]:
        runner.train()
    if FLAGS.mode in ['eval', 'evaluate', 'inference_benchmark']:
        if FLAGS.mode == 'inference_benchmark' and hvd_utils.is_using_hvd():
            raise NotImplementedError("Only single GPU inference is implemented.")
        elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
            runner.evaluate()
    if FLAGS.mode == 'predict':
        if FLAGS.to_predict is None:
            raise ValueError("No data to predict on.")
        if not os.path.isdir(FLAGS.to_predict):
            raise ValueError("Provide directory with images to infer!")
        if hvd_utils.is_using_hvd():
            raise NotImplementedError("Only single GPU inference is implemented.")
        elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
            runner.predict(FLAGS.to_predict, FLAGS.inference_checkpoint)

View file

@ -0,0 +1,18 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from model.blocks.conv2d_block import conv2d_block
from model.blocks.mb_conv_block import mb_conv_block
__all__ = ['conv2d_block', 'mb_conv_block']

View file

@ -0,0 +1,83 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from typing import Any, Dict, Optional, Text, Tuple
from model.layers import get_batch_norm
__all__ = ['conv2d_block']
CONV_KERNEL_INITIALIZER = {
    'class_name': 'VarianceScaling',
    'config': {
        'scale': 2.0,
        'mode': 'fan_in',
        # Note: this is a truncated normal distribution
        'distribution': 'normal'
    }
}
def conv2d_block(inputs: tf.Tensor,
                 conv_filters: Optional[int],
                 config: dict,
                 kernel_size: Any = (1, 1),
                 strides: Any = (1, 1),
                 use_batch_norm: bool = True,
                 use_bias: bool = False,
                 activation: Any = None,
                 depthwise: bool = False,
                 name: Text = None):
    """A conv2d followed by batch norm and an activation."""
    batch_norm = get_batch_norm(config['batch_norm'])
    bn_momentum = config['bn_momentum']
    bn_epsilon = config['bn_epsilon']
    data_format = tf.keras.backend.image_data_format()
    weight_decay = config['weight_decay']
    name = name or ''
    # Collect args based on what kind of conv2d block is desired
    init_kwargs = {
        'kernel_size': kernel_size,
        'strides': strides,
        'use_bias': use_bias,
        'padding': 'same',
        'name': name + '_conv2d',
        'kernel_regularizer': tf.keras.regularizers.l2(weight_decay),
        'bias_regularizer': tf.keras.regularizers.l2(weight_decay),
    }
    CONV_KERNEL_INITIALIZER['config']['mode'] = config['weight_init']
    if depthwise:
        conv2d = tf.keras.layers.DepthwiseConv2D
        init_kwargs.update({'depthwise_initializer': CONV_KERNEL_INITIALIZER})
    else:
        conv2d = tf.keras.layers.Conv2D
        init_kwargs.update({'filters': conv_filters,
                            'kernel_initializer': CONV_KERNEL_INITIALIZER})
    x = conv2d(**init_kwargs)(inputs)
    if use_batch_norm:
        bn_axis = 1 if data_format == 'channels_first' else -1
        x = batch_norm(axis=bn_axis,
                       momentum=bn_momentum,
                       epsilon=bn_epsilon,
                       name=name + '_bn')(x)
    if activation is not None:
        x = tf.keras.layers.Activation(activation,
                                       name=name + '_activation')(x)
    return x
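# Illustrative usage sketch. The `_config` dictionary below carries only the
# keys that conv2d_block reads; the 'batch_norm' value is an assumption about
# what get_batch_norm() accepts, and the layer sizes are arbitrary.
if __name__ == '__main__':
    _config = {'batch_norm': 'default', 'bn_momentum': 0.99, 'bn_epsilon': 1e-3,
               'weight_decay': 5e-6, 'weight_init': 'fan_out'}
    _inputs = tf.keras.Input(shape=(224, 224, 3))
    _stem = conv2d_block(_inputs, 32, _config, kernel_size=(3, 3),
                         strides=(2, 2), activation='swish', name='stem')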

View file

@ -0,0 +1,138 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from typing import Any, Dict, Optional, Text, Tuple
from model.layers import get_activation
from model.blocks import conv2d_block
__all__ = ['mb_conv_block']
def mb_conv_block(inputs: tf.Tensor,
block: dict,
config: dict,
prefix: Text = None):
"""Mobile Inverted Residual Bottleneck.
Args:
inputs: the Keras input to the block
block: BlockConfig, arguments to create a Block
config: ModelConfig, a set of model parameters
prefix: prefix for naming all layers
Returns:
the output of the block
"""
use_se = config['use_se']
activation = get_activation(config['activation'])
drop_connect_rate = config['drop_connect_rate']
data_format = tf.keras.backend.image_data_format()
use_depthwise = block['conv_type'] != 'no_depthwise'
prefix = prefix or ''
filters = block['input_filters'] * block['expand_ratio']
x = inputs
if block['fused_conv']:
# If we use fused mbconv, skip expansion and use regular conv.
x = conv2d_block(x,
filters,
config,
kernel_size=block['kernel_size'],
strides=block['strides'],
activation=activation,
name=prefix + 'fused')
else:
if block['expand_ratio'] != 1:
# Expansion phase
kernel_size = (1, 1) if use_depthwise else (3, 3)
x = conv2d_block(x,
filters,
config,
kernel_size=kernel_size,
activation=activation,
name=prefix + 'expand')
# Depthwise Convolution
if use_depthwise:
x = conv2d_block(x,
conv_filters=None,
config=config,
kernel_size=block['kernel_size'],
strides=block['strides'],
activation=activation,
depthwise=True,
name=prefix + 'depthwise')
# Squeeze and Excitation phase
if use_se:
assert block['se_ratio'] is not None
assert 0 < block['se_ratio'] <= 1
num_reduced_filters = max(1, int(
block['input_filters'] * block['se_ratio']
))
if data_format == 'channels_first':
se_shape = (filters, 1, 1)
else:
se_shape = (1, 1, filters)
se = tf.keras.layers.GlobalAveragePooling2D(name=prefix + 'se_squeeze')(x)
se = tf.keras.layers.Reshape(se_shape, name=prefix + 'se_reshape')(se)
se = conv2d_block(se,
num_reduced_filters,
config,
use_bias=True,
use_batch_norm=False,
activation=activation,
name=prefix + 'se_reduce')
se = conv2d_block(se,
filters,
config,
use_bias=True,
use_batch_norm=False,
activation='sigmoid',
name=prefix + 'se_expand')
x = tf.keras.layers.multiply([x, se], name=prefix + 'se_excite')
# Output phase
x = conv2d_block(x,
block['output_filters'],
config,
activation=None,
name=prefix + 'project')
# Add identity so that quantization-aware training can insert quantization
# ops correctly.
x = tf.keras.layers.Activation(get_activation('identity'),
name=prefix + 'id')(x)
if (block['id_skip']
and all(s == 1 for s in block['strides'])
and block['input_filters'] == block['output_filters']):
if drop_connect_rate and drop_connect_rate > 0:
# Apply dropconnect
# The only difference between dropout and dropconnect in TF is scaling by
# drop_connect_rate during training. See:
# https://github.com/keras-team/keras/pull/9898#issuecomment-380577612
x = tf.keras.layers.Dropout(drop_connect_rate,
noise_shape=(None, 1, 1, 1),
name=prefix + 'drop')(x)
x = tf.keras.layers.add([x, inputs], name=prefix + 'add')
return x
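# Data flow for the default (non-fused) path, assuming expand_ratio > 1,
# use_se=True and a residual-compatible block (stride 1, equal in/out filters):
#   inputs -> 1x1 expand conv -> depthwise conv -> squeeze-and-excite
#          -> 1x1 project conv -> dropconnect -> add(inputs)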

View file

@ -0,0 +1,77 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common modeling utilities."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import numpy as np
import math
import tensorflow as tf
from typing import Text, Optional
__all__ = ['count_params', 'load_weights', 'round_filters', 'round_repeats']
def count_params(model, trainable_only=True):
"""Returns the count of all model parameters, or just trainable ones."""
if not trainable_only:
return model.count_params()
else:
return int(np.sum([tf.keras.backend.count_params(p)
for p in model.trainable_weights]))
def load_weights(model: tf.keras.Model,
model_weights_path: Text,
weights_format: Text = 'saved_model'):
"""Load model weights from the given file path.
Args:
model: the model to load weights into
model_weights_path: the path of the model weights
weights_format: the model weights format. One of 'saved_model', 'h5',
or 'checkpoint'.
"""
if weights_format == 'saved_model':
loaded_model = tf.keras.models.load_model(model_weights_path)
model.set_weights(loaded_model.get_weights())
else:
model.load_weights(model_weights_path)
def round_filters(filters: int,
config: dict) -> int:
"""Round number of filters based on width coefficient."""
width_coefficient = config['width_coefficient']
min_depth = config['min_depth']
divisor = config['depth_divisor']
orig_filters = filters
if not width_coefficient:
return filters
filters *= width_coefficient
min_depth = min_depth or divisor
new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_filters < 0.9 * filters:
new_filters += divisor
return int(new_filters)
def round_repeats(repeats: int, depth_coefficient: float) -> int:
"""Round number of repeats based on depth coefficient."""
return int(math.ceil(depth_coefficient * repeats))
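# Illustrative usage (not part of the original file), assuming a B4-style
# scaling config:
#   _cfg = {'width_coefficient': 1.4, 'min_depth': None, 'depth_divisor': 8}
#   round_filters(32, _cfg)   # 32 * 1.4 = 44.8, snapped to a multiple of 8 -> 48
#   round_repeats(3, 1.8)     # ceil(3 * 1.8) -> 6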

View file

@ -0,0 +1,323 @@
# Lint as: python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains definitions for EfficientNet model.
[1] Mingxing Tan, Quoc V. Le
EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks.
ICML'19, https://arxiv.org/abs/1905.11946
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
from typing import Any, Dict, Optional, List, Text, Tuple
import copy
import tensorflow as tf
from model.layers import simple_swish, hard_swish, identity, gelu, get_activation
from model.blocks import conv2d_block, mb_conv_block
from model.common_modules import round_filters, round_repeats, load_weights
from utils import preprocessing
def build_dict(name, args=None):
if name == "ModelConfig":
return_dict = copy.deepcopy(ModelConfig)
elif name == "BlockConfig":
return_dict = copy.deepcopy(BlockConfig)
else:
raise ValueError("Name of requested dictionary not found!")
if args is None:
return return_dict
if isinstance(args, dict):
return_dict.update(args)
elif isinstance(args, tuple):
return_dict.update({a: p for a, p in zip(list(return_dict.keys()), args)})
else:
raise ValueError("Expected tuple or dict!")
return return_dict
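# Note: the tuple form of `args` relies on dict insertion order (guaranteed in
# Python 3.7+), so positional values fill the keys of BlockConfig/ModelConfig
# in the order they are declared below, e.g.
#   build_dict("BlockConfig", args=(32, 16, 3, 1, 1, (1, 1), 0.25))
#   # -> input_filters=32, output_filters=16, kernel_size=3, num_repeat=1,
#   #    expand_ratio=1, strides=(1, 1), se_ratio=0.25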
# Config for a single MB Conv Block.
BlockConfig = {
'input_filters': 0,
'output_filters': 0,
'kernel_size': 3,
'num_repeat': 1,
'expand_ratio': 1,
'strides': (1, 1),
'se_ratio': None,
'id_skip': True,
'fused_conv': False,
'conv_type': 'depthwise'
}
# Default Config for Efficientnet-B0.
ModelConfig = {
'width_coefficient': 1.0,
'depth_coefficient': 1.0,
'resolution': 224,
'dropout_rate': 0.2,
'blocks': (
# (input_filters, output_filters, kernel_size, num_repeat,
# expand_ratio, strides, se_ratio)
# pylint: disable=bad-whitespace
build_dict(name="BlockConfig", args=(32, 16, 3, 1, 1, (1, 1), 0.25)),
build_dict(name="BlockConfig", args=(16, 24, 3, 2, 6, (2, 2), 0.25)),
build_dict(name="BlockConfig", args=(24, 40, 5, 2, 6, (2, 2), 0.25)),
build_dict(name="BlockConfig", args=(40, 80, 3, 3, 6, (2, 2), 0.25)),
build_dict(name="BlockConfig", args=(80, 112, 5, 3, 6, (1, 1), 0.25)),
build_dict(name="BlockConfig", args=(112, 192, 5, 4, 6, (2, 2), 0.25)),
build_dict(name="BlockConfig", args=(192, 320, 3, 1, 6, (1, 1), 0.25)),
# pylint: enable=bad-whitespace
),
'stem_base_filters': 32,
'top_base_filters': 1280,
'activation': 'simple_swish',
'batch_norm': 'default',
'bn_momentum': 0.99,
'bn_epsilon': 1e-3,
# The original implementation used a weight decay of 1e-5 with tf.nn.l2_loss,
# which divides the penalty by 2; the Keras l2 regularizer does not, so we use
# half the value (5e-6) to match.
'weight_decay': 5e-6,
'drop_connect_rate': 0.2,
'depth_divisor': 8,
'min_depth': None,
'use_se': True,
'input_channels': 3,
'num_classes': 1000,
'model_name': 'efficientnet',
'rescale_input': True,
'data_format': 'channels_last',
'dtype': 'float32',
'weight_init': 'fan_in',
}
MODEL_CONFIGS = {
# (width, depth, resolution, dropout)
'efficientnet-b0': build_dict(name="ModelConfig", args=(1.0, 1.0, 224, 0.2)),
'efficientnet-b1': build_dict(name="ModelConfig", args=(1.0, 1.1, 240, 0.2)),
'efficientnet-b2': build_dict(name="ModelConfig", args=(1.1, 1.2, 260, 0.3)),
'efficientnet-b3': build_dict(name="ModelConfig", args=(1.2, 1.4, 300, 0.3)),
'efficientnet-b4': build_dict(name="ModelConfig", args=(1.4, 1.8, 380, 0.4)),
'efficientnet-b5': build_dict(name="ModelConfig", args=(1.6, 2.2, 456, 0.4)),
'efficientnet-b6': build_dict(name="ModelConfig", args=(1.8, 2.6, 528, 0.5)),
'efficientnet-b7': build_dict(name="ModelConfig", args=(2.0, 3.1, 600, 0.5)),
'efficientnet-b8': build_dict(name="ModelConfig", args=(2.2, 3.6, 672, 0.5)),
'efficientnet-l2': build_dict(name="ModelConfig", args=(4.3, 5.3, 800, 0.5)),
}
DENSE_KERNEL_INITIALIZER = {
'class_name': 'VarianceScaling',
'config': {
'scale': 1 / 3.0,
'mode': 'fan_in',
'distribution': 'uniform'
}
}
def efficientnet(input: List[tf.keras.layers.Input],
config: dict):
"""Creates an EfficientNet graph given the model parameters.
This function is wrapped by the `EfficientNet` class to make a tf.keras.Model.
Args:
input: a list of Keras inputs: the image batch, optionally followed by a mixup weight tensor
config: the model config
Returns:
the output of efficientnet
"""
depth_coefficient = config['depth_coefficient']
blocks = config['blocks']
stem_base_filters = config['stem_base_filters']
top_base_filters = config['top_base_filters']
activation = get_activation(config['activation'])
dropout_rate = config['dropout_rate']
drop_connect_rate = config['drop_connect_rate']
num_classes = config['num_classes']
input_channels = config['input_channels']
rescale_input = config['rescale_input']
data_format = tf.keras.backend.image_data_format()
dtype = config['dtype']
weight_decay = config['weight_decay']
weight_init = config['weight_init']
# Move the mixup of images to device
images = input[0]
if len(input) > 1:
mix_weight = input[1]
x = (images * mix_weight + images[::-1] * (1. - mix_weight))
else:
x = images
if data_format == 'channels_first':
# Happens on GPU/TPU if available.
x = tf.keras.layers.Permute((3, 1, 2))(x)
if rescale_input:
x = preprocessing.normalize_images(x,
num_channels=input_channels,
dtype=dtype,
data_format=data_format)
# Build stem
x = conv2d_block(x,
round_filters(stem_base_filters, config),
config,
kernel_size=[3, 3],
strides=[2, 2],
activation=activation,
name='stem')
# Build blocks
num_blocks_total = sum(
round_repeats(block['num_repeat'], depth_coefficient) for block in blocks)
block_num = 0
for stack_idx, block in enumerate(blocks):
assert block['num_repeat'] > 0
# Update block input and output filters based on depth multiplier
block.update({
'input_filters': round_filters(block['input_filters'], config),
'output_filters': round_filters(block['output_filters'], config),
'num_repeat': round_repeats(block['num_repeat'], depth_coefficient)})
# The first block needs to take care of stride and filter size increase
drop_rate = drop_connect_rate * float(block_num) / num_blocks_total
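# drop_rate grows linearly with block_num, from 0 for the first block up to
# (almost) drop_connect_rate for the last one (a stochastic depth schedule).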
config.update({'drop_connect_rate': drop_rate}) # TODO(Sugh) replace
block_prefix = 'stack_{}/block_0/'.format(stack_idx)
x = mb_conv_block(x, block, config, block_prefix)
block_num += 1
if block['num_repeat'] > 1:
block.update({
'input_filters': block['output_filters'],
'strides': (1, 1)
})
for block_idx in range(block['num_repeat'] - 1):
drop_rate = drop_connect_rate * float(block_num) / num_blocks_total
config.update({'drop_connect_rate': drop_rate})
block_prefix = 'stack_{}/block_{}/'.format(stack_idx, block_idx + 1)
x = mb_conv_block(x, block, config, prefix=block_prefix)
block_num += 1
# Build top
x = conv2d_block(x,
round_filters(top_base_filters, config),
config,
activation=activation,
name='top')
# Build classifier
DENSE_KERNEL_INITIALIZER['config']['mode'] = weight_init
x = tf.keras.layers.GlobalAveragePooling2D(name='top_pool')(x)
if dropout_rate and dropout_rate > 0:
x = tf.keras.layers.Dropout(dropout_rate, name='top_dropout')(x)
x = tf.keras.layers.Dense(
num_classes,
kernel_initializer=DENSE_KERNEL_INITIALIZER,
kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
bias_regularizer=tf.keras.regularizers.l2(weight_decay),
name='logits')(x)
x = tf.keras.layers.Activation('softmax', name='probs', dtype=tf.float32)(x)
return x
@tf.keras.utils.register_keras_serializable(package='Vision')
class EfficientNet(tf.keras.Model):
"""Wrapper class for an EfficientNet Keras model.
Contains helper methods to build, manage, and save metadata about the model.
"""
def __init__(self,
config: Dict[Text, Any] = None,
overrides: Dict[Text, Any] = None):
"""Create an EfficientNet model.
Args:
config: (optional) the main model parameters to create the model
overrides: (optional) a dict containing keys that can override
config
"""
overrides = overrides or {}
is_training = overrides.pop('is_training', False)
config = config or build_dict(name="ModelConfig")
self.config = config
self.config.update(overrides)
input_channels = self.config['input_channels']
model_name = self.config['model_name']
input_shape = (None, None, input_channels) # Should handle any size image
image_input = tf.keras.layers.Input(shape=input_shape)
if is_training:
beta_input = tf.keras.layers.Input(shape=(1, 1, 1))
inputs = (image_input, beta_input)
output = efficientnet(inputs, self.config)
else:
inputs = [image_input]
output = efficientnet(inputs, self.config)
# Cast to float32 in case we have a different model dtype
output = tf.cast(output, tf.float32)
super(EfficientNet, self).__init__(
inputs=inputs, outputs=output, name=model_name)
@classmethod
def from_name(cls,
model_name: Text,
model_weights_path: Text = None,
weights_format: Text = 'saved_model',
overrides: Dict[Text, Any] = None):
"""Construct an EfficientNet model from a predefined model name.
E.g., `EfficientNet.from_name('efficientnet-b0')`.
Args:
model_name: the predefined model name
model_weights_path: the path to the weights (h5 file or saved model dir)
weights_format: the model weights format. One of 'saved_model', 'h5',
or 'checkpoint'.
overrides: (optional) a dict containing keys that can override config
Returns:
A constructed EfficientNet instance.
"""
model_configs = dict(MODEL_CONFIGS)
overrides = dict(overrides) if overrides else {}
# One can define their own custom models if necessary
model_configs.update(overrides.pop('model_config', {}))
if model_name not in model_configs:
raise ValueError('Unknown model name {}'.format(model_name))
config = model_configs[model_name]
model = cls(config=config, overrides=overrides)
if model_weights_path:
load_weights(model, model_weights_path, weights_format=weights_format)
return model
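# Illustrative usage (not part of the original file): build a B0 graph for a
# 10-class problem by overriding 'num_classes' through `overrides`:
#   model = EfficientNet.from_name('efficientnet-b0',
#                                  overrides={'num_classes': 10})
#   model.summary()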

View file

@ -0,0 +1,18 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from model.layers.activations import simple_swish, hard_swish, identity, gelu, get_activation
from model.layers.normalization import get_batch_norm
__all__ = ['simple_swish', 'hard_swish', 'identity', 'gelu', 'get_activation', 'get_batch_norm']

View file

@ -0,0 +1,122 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Customized Swish activation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import math
import tensorflow as tf
__all__ = ['simple_swish', 'hard_swish', 'identity', 'gelu', 'get_activation']
@tf.keras.utils.register_keras_serializable(package='Text')
def simple_swish(features):
"""Computes the Swish activation function.
The tf.nn.swish operation uses a custom gradient to reduce memory usage.
Since saving custom gradients in SavedModel is currently not supported, and
one would not be able to use an exported TF-Hub module for fine-tuning, we
provide this wrapper that allows selecting whether to use the native
TensorFlow swish operation or a customized operation that uses the default
TensorFlow gradient computation.
Args:
features: A `Tensor` representing preactivation values.
Returns:
The activation value.
"""
features = tf.convert_to_tensor(features)
return features * tf.nn.sigmoid(features)
@tf.keras.utils.register_keras_serializable(package='Text')
def hard_swish(features):
"""Computes a hard version of the swish function.
This operation can be used to reduce computational cost and improve
quantization for edge devices.
Args:
features: A `Tensor` representing preactivation values.
Returns:
The activation value.
"""
features = tf.convert_to_tensor(features)
return features * tf.nn.relu6(features + tf.constant(3.)) * (1. / 6.)
@tf.keras.utils.register_keras_serializable(package='Text')
def identity(features):
"""Computes the identity function.
Useful for helping in quantization.
Args:
features: A `Tensor` representing preactivation values.
Returns:
The activation value.
"""
features = tf.convert_to_tensor(features)
return tf.identity(features)
@tf.keras.utils.register_keras_serializable(package='Text')
def gelu(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the RELU.
Original paper: https://arxiv.org/abs/1606.08415
Args:
x: float Tensor to perform activation.
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
# TODO(hongkuny): consider moving custom string-map lookup to keras api.
def get_activation(identifier):
"""Maps a identifier to a Python function, e.g., "relu" => `tf.nn.relu`.
It checks string first and if it is one of customized activation not in TF,
the corresponding activation will be returned. For non-customized activation
names and callable identifiers, always fallback to tf.keras.activations.get.
Args:
identifier: String name of the activation function or callable.
Returns:
A Python function corresponding to the activation function.
"""
if isinstance(identifier, six.string_types):
name_to_fn = {
"gelu": gelu,
"simple_swish": simple_swish,
"hard_swish": hard_swish,
"identity": identity,
}
identifier = str(identifier).lower()
if identifier in name_to_fn:
return tf.keras.activations.get(name_to_fn[identifier])
return tf.keras.activations.get(identifier)
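# Illustrative usage (not part of the original file):
#   get_activation('simple_swish')  # -> the simple_swish function defined above
#   get_activation('relu')          # -> falls through to tf.keras.activations.get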

View file

@ -0,0 +1,127 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common modeling utilities."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import numpy as np
import tensorflow as tf
import tensorflow.compat.v1 as tf1
from typing import Text, Optional
from tensorflow.python.tpu import tpu_function
__all__ = ['get_batch_norm']
@tf.keras.utils.register_keras_serializable(package='Vision')
class TpuBatchNormalization(tf.keras.layers.BatchNormalization):
"""Cross replica batch normalization."""
def __init__(self, fused: Optional[bool] = False, **kwargs):
if fused in (True, None):
raise ValueError('TpuBatchNormalization does not support fused=True.')
super(TpuBatchNormalization, self).__init__(fused=fused, **kwargs)
def _cross_replica_average(self, t: tf.Tensor, num_shards_per_group: int):
"""Calculates the average value of input tensor across TPU replicas."""
num_shards = tpu_function.get_tpu_context().number_of_shards
group_assignment = None
if num_shards_per_group > 1:
if num_shards % num_shards_per_group != 0:
raise ValueError(
'num_shards: %d mod shards_per_group: %d, should be 0' %
(num_shards, num_shards_per_group))
num_groups = num_shards // num_shards_per_group
group_assignment = [[
x for x in range(num_shards) if x // num_shards_per_group == y
] for y in range(num_groups)]
return tf1.tpu.cross_replica_sum(t, group_assignment) / tf.cast(
num_shards_per_group, t.dtype)
def _moments(self, inputs: tf.Tensor, reduction_axes, keep_dims: bool):
"""Compute the mean and variance: it overrides the original _moments."""
shard_mean, shard_variance = super(TpuBatchNormalization, self)._moments(
inputs, reduction_axes, keep_dims=keep_dims)
num_shards = tpu_function.get_tpu_context().number_of_shards or 1
if num_shards <= 8: # Skip cross_replica for 2x2 or smaller slices.
num_shards_per_group = 1
else:
num_shards_per_group = max(8, num_shards // 8)
if num_shards_per_group > 1:
# Compute variance using: Var[X]= E[X^2] - E[X]^2.
shard_square_of_mean = tf.math.square(shard_mean)
shard_mean_of_square = shard_variance + shard_square_of_mean
group_mean = self._cross_replica_average(shard_mean, num_shards_per_group)
group_mean_of_square = self._cross_replica_average(
shard_mean_of_square, num_shards_per_group)
group_variance = group_mean_of_square - tf.math.square(group_mean)
return (group_mean, group_variance)
else:
return (shard_mean, shard_variance)
@tf.keras.utils.register_keras_serializable(package='Vision')
class SyncBatchNormalization(tf.keras.layers.BatchNormalization):
"""Cross replica batch normalization."""
def __init__(self, **kwargs):
if not kwargs.get('name', None):
kwargs['name'] = 'tpu_batch_normalization'
super(SyncBatchNormalization, self).__init__(**kwargs)
def _moments(self, inputs, reduction_axes, keep_dims):
"""Compute the mean and variance: it overrides the original _moments."""
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
shard_mean, shard_variance = super(SyncBatchNormalization, self)._moments(
inputs, reduction_axes, keep_dims=keep_dims)
num_shards = hvd.size()
if num_shards > 1:
# Compute variance using: Var[X]= E[X^2] - E[X]^2.
shard_square_of_mean = tf.math.square(shard_mean)
shard_mean_of_square = shard_variance + shard_square_of_mean
shard_stack = tf.stack([shard_mean, shard_mean_of_square])
group_mean, group_mean_of_square = tf.unstack(hvd.allreduce(shard_stack))
group_variance = group_mean_of_square - tf.math.square(group_mean)
return (group_mean, group_variance)
else:
return (shard_mean, shard_variance)
def call(self, *args, **kwargs):
outputs = super(SyncBatchNormalization, self).call(*args, **kwargs)
# A temporary hack for tf1 compatibility with keras batch norm.
# for u in self.updates:
# tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, u)
return outputs
def get_batch_norm(batch_norm_type: Text) -> tf.keras.layers.BatchNormalization:
"""A helper to create a batch normalization getter.
Args:
batch_norm_type: The type of batch normalization layer implementation. `tpu`
will use `TpuBatchNormalization`.
Returns:
An instance of `tf.keras.layers.BatchNormalization`.
"""
if batch_norm_type == 'tpu':
return TpuBatchNormalization
if batch_norm_type == 'syncbn':
return SyncBatchNormalization
return tf.keras.layers.BatchNormalization
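# Illustrative usage (not part of the original file). get_batch_norm returns a
# class, which is then instantiated with the usual BatchNormalization kwargs:
#   bn_cls = get_batch_norm('syncbn')          # -> SyncBatchNormalization
#   bn = bn_cls(momentum=0.99, epsilon=1e-3)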

View file

@ -0,0 +1,25 @@
six
google-api-python-client>=1.6.7
google-cloud-bigquery>=0.31.0
kaggle>=1.3.9
numpy>=1.15.4
oauth2client>=4.1.2
pandas>=0.22.0
psutil>=5.4.3
py-cpuinfo>=3.3.0
scipy>=0.19.1
tensorflow-hub>=0.6.0
tensorflow-model-optimization>=0.2.1
tensorflow-datasets
tensorflow-addons
dataclasses
gin-config
tf_slim>=1.1.0
typing
sentencepiece
Cython
matplotlib
opencv-python-headless
pyyaml
Pillow
-e git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI

View file

@ -0,0 +1,37 @@
#!/bin/bash
###SBATCH -t 8:00:00 # wall time
#SBATCH --ntasks-per-node=8 # tasks per node
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --mail-type=FAIL # only send email on failure
#SBATCH --overcommit # Needed for pytorch
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Data dir
readonly datadir="/datasets/imagenet/train-val-tfrecord"
# Path to where trained checkpoints will be saved on the system
readonly checkpointdir="$PWD/B0_mulitnode_AMP/"
CREATE_FOLDER_CMD="if [ ! -d ${checkpointdir} ]; then mkdir -p ${checkpointdir} ; fi && nvidia-smi"
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 sh -c "${CREATE_FOLDER_CMD}"
OUTFILE="${checkpointdir}/slurm-%j.out"
ERRFILE="${checkpointdir}/error-%j.out"
readonly mounts="${datadir}:/data,${checkpointdir}:/model"
srun -p ${PARTITION} -l -o $OUTFILE -e $ERRFILE --container-image nvcr.io/nvidia/efficientnet-tf2:21.02-tf2-py3 --container-mounts ${mounts} --mpi=pmix bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py --mode train_and_eval --arch efficientnet-b0 --model_dir /model --data_dir /data --use_amp --use_xla --lr_decay cosine --weight_init fan_out --max_epochs 500 --log_steps 100 --save_checkpoint_freq 3 --train_batch_size 1024 --eval_batch_size 1024 --lr_init 0.005 --batch_norm syncbn --resume_checkpoint --augmenter_name autoaugment --mixup_alpha 0.0 --weight_decay 5e-6 --epsilon 0.001

View file

@ -0,0 +1,37 @@
#!/bin/bash
###SBATCH -t 8:00:00 # wall time
#SBATCH --ntasks-per-node=8 # tasks per node
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --mail-type=FAIL # only send email on failure
#SBATCH --overcommit # Needed for pytorch
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Data dir
readonly datadir="/datasets/imagenet/train-val-tfrecord"
# Path to where trained checkpoints will be saved on the system
readonly checkpointdir="$PWD/B4_mulitnode_AMP/"
CREATE_FOLDER_CMD="if [ ! -d ${checkpointdir} ]; then mkdir -p ${checkpointdir} ; fi && nvidia-smi"
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 sh -c "${CREATE_FOLDER_CMD}"
OUTFILE="${checkpointdir}/slurm-%j.out"
ERRFILE="${checkpointdir}/error-%j.out"
readonly mounts="${datadir}:/data,${checkpointdir}:/model"
srun -p ${PARTITION} -l -o $OUTFILE -e $ERRFILE --container-image nvcr.io/nvidia/efficientnet-tf2:21.02-tf2-py3 --container-mounts ${mounts} --mpi=pmix bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py --mode train_and_eval --arch efficientnet-b4 --model_dir /model --data_dir /data --use_amp --use_xla --lr_decay cosine --weight_init fan_out --max_epochs 500 --log_steps 100 --save_checkpoint_freq 3 --train_batch_size 128 --eval_batch_size 128 --lr_init 0.005 --batch_norm syncbn --resume_checkpoint --augmenter_name autoaugment --mixup_alpha 0.2 --weight_decay 5e-6 --epsilon 0.001

View file

@ -0,0 +1,15 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from runtime.runner import Runner

View file

@ -0,0 +1,296 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import multiprocessing
import warnings
import yaml
import time
import tensorflow as tf
import numpy as np
import horovod.tensorflow.keras as hvd
from utils import hvd_utils, optimizer_factory
from utils import callbacks as custom_callbacks
from runtime.runner_utils import get_optimizer_params, get_metrics, get_learning_rate_params, \
build_model_params, get_models, get_dataset_builders, build_stats, \
parse_inference_input, preprocess_image_files
__all__ = [
'Runner',
]
DTYPE_MAP = {
'float32': tf.float32,
'bfloat16': tf.bfloat16,
'float16': tf.float16,
'fp32': tf.float32,
'bf16': tf.bfloat16,
}
class Runner(object):
def __init__(self, flags, logger):
self.params = flags
self.logger = logger
if hvd.rank() == 0:
self.serialize_config(model_dir=self.params.model_dir)
# =================================================
# Define Datasets
# =================================================
label_smoothing = flags.label_smoothing
self.one_hot = label_smoothing and label_smoothing > 0
builders = get_dataset_builders(self.params, self.one_hot)
datasets = [builder.build() if builder else None for builder in builders]
self.train_dataset, self.validation_dataset = datasets
self.train_builder, self.validation_builder = builders
self.initialize()
# =================================================
# Define Model
# =================================================
model_params = build_model_params(model_name=self.params.arch,
is_training="predict" not in self.params.mode,
batch_norm=self.params.batch_norm,
num_classes=self.params.num_classes,
activation=self.params.activation,
dtype=DTYPE_MAP[self.params.dtype],
weight_decay=self.params.weight_decay,
weight_init=self.params.weight_init
)
models_dict = get_models()
self.model = [model for model_name, model in models_dict.items() if model_name in self.params.arch][0](**model_params)
self.metrics = ['accuracy', 'top_5']
if self.params.dataset == 'ImageNet':
self.train_num_examples = 1281167
self.eval_num_examples = 50000
def initialize(self):
"""Initializes backend related initializations."""
if tf.config.list_physical_devices('GPU'):
data_format = 'channels_first'
else:
data_format = 'channels_last'
tf.keras.backend.set_image_data_format(data_format)
if self.params.run_eagerly:
# Enable eager execution to allow step-by-step debugging
tf.config.experimental_run_functions_eagerly(True)
def load_model_weights(self, model_dir):
latest_checkpoint = tf.train.latest_checkpoint(model_dir)
if not latest_checkpoint:
return 0
self.model.load_weights(latest_checkpoint)
return self.model.optimizer.iterations
def resume_from_checkpoint(self,
model_dir: str,
train_steps: int) -> int:
"""Resumes from the latest checkpoint, if possible.
Loads the model weights and optimizer settings from a checkpoint.
This function should be used in case of preemption recovery.
Args:
model_dir: The directory where model weights were saved.
train_steps: The number of training steps per epoch, used to convert the
restored optimizer iteration count into an epoch index.
Returns:
The epoch of the latest checkpoint, or 0 if not restoring.
"""
last_iteration = self.load_model_weights(model_dir)
initial_epoch = last_iteration // train_steps
return int(initial_epoch)
def serialize_config(self, model_dir: str):
"""Serializes and saves the experiment config."""
params_save_path = os.path.join(model_dir, 'params.yaml')
with open(params_save_path, 'w') as outfile:
yaml.dump(vars(self.params), outfile, default_flow_style=False)
def train(self):
train_epochs = self.params.max_epochs
train_steps = self.params.steps_per_epoch if self.params.steps_per_epoch is not None else self.train_num_examples // self.train_builder.global_batch_size
if self.validation_builder is not None:
validation_steps = self.eval_num_examples // self.validation_builder.global_batch_size
else:
validation_steps = None
learning_rate = optimizer_factory.build_learning_rate(
params=get_learning_rate_params(name=self.params.lr_decay,
initial_lr=self.params.lr_init,
decay_epochs=self.params.lr_decay_epochs,
decay_rate=self.params.lr_decay_rate,
warmup_epochs=self.params.lr_warmup_epochs),
batch_size=self.train_builder.global_batch_size,
train_steps=train_steps,
max_epochs=train_epochs)
optimizer = optimizer_factory.build_optimizer(
optimizer_name=self.params.optimizer,
base_learning_rate=learning_rate,
params=get_optimizer_params(name=self.params.optimizer,
decay=self.params.decay,
epsilon=self.params.epsilon,
momentum=self.params.momentum,
moving_average_decay=self.params.moving_average_decay,
nesterov=self.params.nesterov,
beta_1=self.params.beta_1,
beta_2=self.params.beta_2)
)
metrics_map = get_metrics(self.one_hot)
metrics = [metrics_map[metric] for metric in self.metrics]
optimizer = hvd.DistributedOptimizer(optimizer, compression=hvd.Compression.fp16)
if self.one_hot:
loss_obj = tf.keras.losses.CategoricalCrossentropy(
label_smoothing=self.params.label_smoothing)
else:
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
# Training
self.model.compile(optimizer=optimizer,
loss=loss_obj,
metrics=metrics,
experimental_run_tf_function=False)
initial_epoch = 0
if self.params.resume_checkpoint:
initial_epoch = self.resume_from_checkpoint(model_dir=self.params.model_dir,
train_steps=train_steps)
# Define callbacks (TODO)
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
callbacks += custom_callbacks.get_callbacks(
model_checkpoint=self.params.enable_checkpoint_and_export,
include_tensorboard=self.params.enable_tensorboard,
time_history=self.params.time_history,
track_lr=True,
write_model_weights=self.params.write_model_weights,
initial_step=initial_epoch * train_steps,
batch_size=self.train_builder.global_batch_size,
log_steps=self.params.log_steps,
model_dir=self.params.model_dir,
save_checkpoint_freq=train_steps * self.params.save_checkpoint_freq,
logger=self.logger)
if "eval" not in self.params.mode:
validation_kwargs = {}
else:
validation_kwargs = {
'validation_data': self.validation_dataset,
'validation_steps': validation_steps,
'validation_freq': self.params.num_epochs_between_eval,
}
history = self.model.fit(
self.train_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
initial_epoch=initial_epoch,
callbacks=callbacks,
verbose=2,
**validation_kwargs)
validation_output = None
eval_callback = None
if not self.params.skip_eval and self.validation_builder is not None:
eval_callback = custom_callbacks.EvalTimeHistory(batch_size=self.params.eval_batch_size, logger=self.logger)
worker_validation_output = self.model.evaluate(
self.validation_dataset, steps=validation_steps, callbacks=eval_callback, verbose=2)
validation_output = list(hvd.allreduce(worker_validation_output,average=True))
build_stats(history, validation_output, callbacks, eval_callback, self.logger)
def evaluate(self):
if self.validation_builder is not None:
validation_steps = self.eval_num_examples // self.validation_builder.global_batch_size
else:
validation_steps = None
metrics_map = get_metrics(self.one_hot)
metrics = [metrics_map[metric] for metric in self.metrics]
if self.one_hot:
loss_obj = tf.keras.losses.CategoricalCrossentropy(
label_smoothing=self.params.label_smoothing)
else:
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
# Training
self.model.compile(optimizer="rmsprop",
loss=loss_obj,
metrics=metrics,
experimental_run_tf_function=False)
_ = self.load_model_weights(self.params.model_dir)
eval_callback = custom_callbacks.EvalTimeHistory(batch_size=self.params.eval_batch_size, logger=self.logger)
results = self.model.evaluate(self.validation_dataset, steps=validation_steps, callbacks=eval_callback, verbose=1)
build_stats(None, results, None, eval_callback, self.logger)
def predict(self, to_predict, checkpoint_name=None, print_results=True):
images = preprocess_image_files(directory_name=to_predict, arch=self.params.arch, batch_size=self.params.predict_batch_size, dtype=DTYPE_MAP[self.params.dtype])
nb_samples = len(images)
if checkpoint_name is not None:
self.model.load_weights(checkpoint_name)
try:
file_names = images.filenames
num_files = len(file_names)
if self.params.benchmark:
nb_samples *= 50
print_results = False
num_files *= 50
start_time = time.time()
inference_results = self.model.predict(images, verbose=1, steps=nb_samples)
total_time = time.time() - start_time
score = tf.nn.softmax(inference_results, axis=1)
if print_results:
for i, name in enumerate(file_names):
print(
"This {} image most likely belongs to {} class with a {} percent confidence."
.format(name, tf.math.argmax(score[i]), 100 * tf.math.reduce_max(score[i]))
)
print("Total time to infer {} images :: {}".format(num_files, total_time))
print("Inference Throughput {}".format(num_files/total_time))
print("Inference Latency {}".format(total_time/num_files))
except KeyboardInterrupt:
print("Keyboard interrupt")
print('Ending Inference ...')
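# Illustrative lifecycle (a sketch; the actual driver is main.py, which is not
# part of this file, so the flag names below are assumptions):
#   runner = Runner(flags, logger)   # flags: parsed CLI namespace, logger: dllogger
#   if 'train' in flags.mode:
#       runner.train()
#   elif flags.mode == 'eval':
#       runner.evaluate()
#   elif flags.mode == 'predict':
#       runner.predict(flags.to_predict)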

View file

@ -0,0 +1,255 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import math
import tensorflow as tf
import horovod.tensorflow as hvd
from model import efficientnet_model
from utils import dataset_factory, hvd_utils, callbacks, preprocessing
__all__ = ['get_optimizer_params', 'get_metrics', 'get_learning_rate_params', 'build_model_params', 'get_models', 'build_augmenter_params', \
'get_image_size_from_model', 'get_dataset_builders', 'build_stats', 'parse_inference_input', 'preprocess_image_files']
def get_optimizer_params(name,
decay,
epsilon,
momentum,
moving_average_decay,
nesterov,
beta_1,
beta_2):
return {
'name': name,
'decay': decay,
'epsilon': epsilon,
'momentum': momentum,
'moving_average_decay': moving_average_decay,
'nesterov': nesterov,
'beta_1': beta_1,
'beta_2': beta_2
}
def get_metrics(one_hot: bool):
"""Get a dict of available metrics to track."""
if one_hot:
return {
# (name, metric_fn)
'acc': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'accuracy': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_1': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_5': tf.keras.metrics.TopKCategoricalAccuracy(
k=5,
name='top_5_accuracy'),
}
else:
return {
# (name, metric_fn)
'acc': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'accuracy': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_1': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_5': tf.keras.metrics.SparseTopKCategoricalAccuracy(
k=5,
name='top_5_accuracy'),
}
def get_learning_rate_params(name,
initial_lr,
decay_epochs,
decay_rate,
warmup_epochs):
return {
'name':name,
'initial_lr': initial_lr,
'decay_epochs': decay_epochs,
'decay_rate': decay_rate,
'warmup_epochs': warmup_epochs,
'examples_per_epoch': None,
'boundaries': None,
'multipliers': None,
'scale_by_batch_size': 1./128.,
'staircase': True
}
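# Note on get_learning_rate_params above (an assumption; optimizer_factory is
# not shown in this diff): 'scale_by_batch_size' = 1/128 is typically consumed as
#   effective_initial_lr = initial_lr * global_batch_size * scale_by_batch_size,
# i.e. the learning rate scales linearly with the global batch size relative to
# a reference batch size of 128.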
def build_model_params(model_name, is_training, batch_norm, num_classes, activation, dtype, weight_decay, weight_init):
return {
'model_name': model_name,
'model_weights_path': '',
'weights_format': 'saved_model',
'overrides': {
'is_training': is_training,
'batch_norm': batch_norm,
'rescale_input': True,
'num_classes': num_classes,
'weight_decay': weight_decay,
'activation': activation,
'dtype': dtype,
'weight_init': weight_init
}
}
def get_models():
"""Returns the mapping from model type name to Keras model."""
return {
'efficientnet': efficientnet_model.EfficientNet.from_name,
}
def build_augmenter_params(augmenter_name, cutout_const, translate_const, num_layers, magnitude, autoaugmentation_name):
if augmenter_name is None or augmenter_name not in ['randaugment', 'autoaugment']:
return {}
augmenter_params = {}
if cutout_const is not None:
augmenter_params['cutout_const'] = cutout_const
if translate_const is not None:
augmenter_params['translate_const'] = translate_const
if augmenter_name == 'randaugment':
if num_layers is not None:
augmenter_params['num_layers'] = num_layers
if magnitude is not None:
augmenter_params['magnitude'] = magnitude
if augmenter_name == 'autoaugment':
if autoaugmentation_name is not None:
augmenter_params['autoaugmentation_name'] = autoaugmentation_name
return augmenter_params
def get_image_size_from_model(arch):
"""If the given model has a preferred image size, return it."""
if 'efficientnet' in arch:
efficientnet_name = arch
if efficientnet_name in efficientnet_model.MODEL_CONFIGS:
return efficientnet_model.MODEL_CONFIGS[efficientnet_name]['resolution']
return None
def get_dataset_builders(params, one_hot):
"""Create and return train and validation dataset builders."""
num_devices = hvd.size() if hvd.size() > 1 else 1  # number of workers (not used further in this function)
image_size = get_image_size_from_model(params.arch)
print("Image size {}".format(image_size))
print("Train batch size {}".format(params.train_batch_size))
builders = []
validation_dataset_builder = None
train_dataset_builder = None
if "train" in params.mode:
train_dataset_builder = dataset_factory.Dataset(data_dir=params.data_dir,
index_file_dir=params.index_file,
split='train',
num_classes=params.num_classes,
image_size=image_size,
batch_size=params.train_batch_size,
one_hot=one_hot,
use_dali=params.use_dali,
augmenter=params.augmenter_name,
augmenter_params=build_augmenter_params(params.augmenter_name,
params.cutout_const,
params.translate_const,
params.num_layers,
params.magnitude,
params.autoaugmentation_name),
mixup_alpha=params.mixup_alpha
)
if "eval" in params.mode:
validation_dataset_builder = dataset_factory.Dataset(data_dir=params.data_dir,
index_file_dir=params.index_file,
split='validation',
num_classes=params.num_classes,
image_size=image_size,
batch_size=params.eval_batch_size,
one_hot=one_hot,
use_dali=params.use_dali_eval)
builders.append(train_dataset_builder)
builders.append(validation_dataset_builder)
return builders
def build_stats(history, validation_output, train_callbacks, eval_callback, logger):
stats = {}
if validation_output:
stats['eval_loss'] = float(validation_output[0])
stats['eval_accuracy_top_1'] = float(validation_output[1])
stats['eval_accuracy_top_5'] = float(validation_output[2])
#This part is train loss on GPU_0
if history and history.history:
train_hist = history.history
#Gets final loss from training.
stats['training_loss'] = float(hvd.allreduce(tf.constant(train_hist['loss'][-1], dtype=tf.float32), average=True))
# Gets top_1 training accuracy.
if 'categorical_accuracy' in train_hist:
stats['training_accuracy_top_1'] = float(hvd.allreduce(tf.constant(train_hist['categorical_accuracy'][-1], dtype=tf.float32), average=True))
elif 'sparse_categorical_accuracy' in train_hist:
stats['training_accuracy_top_1'] = float(hvd.allreduce(tf.constant(train_hist['sparse_categorical_accuracy'][-1], dtype=tf.float32), average=True))
elif 'accuracy' in train_hist:
stats['training_accuracy_top_1'] = float(hvd.allreduce(tf.constant(train_hist['accuracy'][-1], dtype=tf.float32), average=True))
stats['training_accuracy_top_5'] = float(hvd.allreduce(tf.constant(train_hist['top_5_accuracy'][-1], dtype=tf.float32), average=True))
# Look for the time history callback which was used during keras.fit
if train_callbacks:
for callback in train_callbacks:
if isinstance(callback, callbacks.TimeHistory):
if callback.epoch_runtime_log:
stats['avg_exp_per_second_training'] = callback.average_examples_per_second
stats['avg_exp_per_second_training_per_GPU'] = callback.average_examples_per_second / hvd.size()
if eval_callback:
stats['avg_exp_per_second_eval'] = float(eval_callback.average_examples_per_second) * hvd.size()
stats['avg_exp_per_second_eval_per_GPU'] = float(eval_callback.average_examples_per_second)
stats['avg_time_per_exp_eval'] = 1000./stats['avg_exp_per_second_eval']
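# Note: 'latency_Xpct' below is the mean per-example latency (ms) over the
# fastest X% of evaluation batches (batch times sorted ascending).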
batch_time = eval_callback.batch_time
batch_time.sort()
latency_90pct_per_batch = sum(batch_time[:int(0.9 * len(batch_time))]) / int(0.9 * len(batch_time))
stats['latency_90pct'] = 1000.0 * latency_90pct_per_batch / eval_callback.batch_size
latency_95pct_per_batch = sum(batch_time[:int(0.95 * len(batch_time))]) / int(0.95 * len(batch_time))
stats['latency_95pct'] = 1000.0 * latency_95pct_per_batch / eval_callback.batch_size
latency_99pct_per_batch = sum(batch_time[:int(0.99 * len(batch_time))]) / int(0.99 * len(batch_time))
stats['latency_99pct'] = 1000.0 * latency_99pct_per_batch / eval_callback.batch_size
if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
logger.log(step=(), data=stats)
def preprocess_image_files(directory_name, arch, batch_size, num_channels=3, dtype=tf.float32):
image_size = get_image_size_from_model(arch)
datagen = tf.keras.preprocessing.image.ImageDataGenerator(data_format="channels_last")
images = datagen.flow_from_directory(directory_name, class_mode=None, batch_size=batch_size, target_size=(image_size, image_size), shuffle=False)
return images
def parse_inference_input(to_predict):
filenames = []
image_formats = ['.jpg', '.jpeg', '.JPEG', '.JPG', '.png', '.PNG']
if os.path.isdir(to_predict):
filenames = [f for f in os.listdir(to_predict)
if os.path.isfile(os.path.join(to_predict, f))
and os.path.splitext(f)[1] in image_formats]
elif os.path.isfile(to_predict):
filenames.append(to_predict)
return filenames
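# Illustrative usage (not part of the original file):
#   get_image_size_from_model('efficientnet-b4')
#   # -> 380 (the 'resolution' entry of MODEL_CONFIGS['efficientnet-b4'])
#   build_augmenter_params('autoaugment', cutout_const=None, translate_const=None,
#                          num_layers=None, magnitude=None,
#                          autoaugmentation_name='v0')   # 'v0' is a hypothetical policy name
#   # -> {'autoaugmentation_name': 'v0'}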

View file

@ -0,0 +1,31 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--max_epochs 1 \
--eval_batch_size 1024 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005

View file

@ -0,0 +1,31 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--max_epochs 1 \
--eval_batch_size 256 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005

View file

@ -0,0 +1,30 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--max_epochs 1 \
--eval_batch_size 128 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005

View file

@ -0,0 +1,30 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--max_epochs 1 \
--eval_batch_size 512 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005

View file

@ -0,0 +1,26 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/infer_data"
INDX="./index_file"
python3 main.py --mode "predict" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--to_predict "/infer_data/" \
--use_amp \
--use_xla \
--predict_batch_size 8

View file

@ -0,0 +1,25 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/infer_data"
INDX="./index_file"
python3 main.py --mode "predict" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--to_predict "/infer_data/" \
--use_xla \
--predict_batch_size 8

View file

@ -0,0 +1,41 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 1024 \
--eval_batch_size 1024 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.0 \
--weight_decay 5e-6 \
--epsilon 0.001 \
--resume_checkpoint

View file

@ -0,0 +1,41 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 256 \
--eval_batch_size 256 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.0 \
--weight_decay 5e-6 \
--epsilon 0.001 \
--resume_checkpoint

View file

@ -0,0 +1,41 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 3 \
--train_batch_size 1024 \
--eval_batch_size 1024 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.0 \
--weight_decay 5e-6 \
--epsilon 0.001 \
--resume_checkpoint

View file

@ -0,0 +1,41 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 256 \
--eval_batch_size 256 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.0 \
--weight_decay 5e-6 \
--epsilon 0.001 \
--resume_checkpoint

View file

@ -0,0 +1,40 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 128 \
--eval_batch_size 128 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.0 \
--weight_decay 5e-6 \
--epsilon 0.001 \
--resume_checkpoint

View file

@ -0,0 +1,40 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 128 \
--eval_batch_size 128 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.0 \
--weight_decay 5e-6 \
--epsilon 0.001 \
--resume_checkpoint

View file

@ -0,0 +1,40 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 512 \
--eval_batch_size 512 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.0 \
--weight_decay 5e-6 \
--epsilon 0.001 \
--resume_checkpoint

View file

@ -0,0 +1,40 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b0" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 512 \
--eval_batch_size 512 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.0 \
--weight_decay 5e-6 \
--epsilon 0.001 \
--resume_checkpoint

View file

@ -0,0 +1,31 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--max_epochs 1 \
--eval_batch_size 128 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005

View file

@ -0,0 +1,31 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--max_epochs 1 \
--eval_batch_size 64 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005

View file

@ -0,0 +1,30 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--max_epochs 1 \
--eval_batch_size 32 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005

View file

@ -0,0 +1,30 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--max_epochs 1 \
--eval_batch_size 64 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005

View file

@ -0,0 +1,26 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/infer_data"
INDX="./index_file"
python3 main.py --mode "predict" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--to_predict "/infer_data/" \
--use_amp \
--use_xla \
--predict_batch_size 8

View file

@ -0,0 +1,25 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/infer_data"
INDX="./index_file"
python3 main.py --mode "predict" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--to_predict "/infer_data/" \
--use_xla \
--predict_batch_size 8

View file

@ -0,0 +1,40 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 160 \
--eval_batch_size 160 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.2 \
--weight_decay 5e-6 \
--resume_checkpoint

View file

@ -0,0 +1,40 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 64 \
--eval_batch_size 64 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.2 \
--weight_decay 5e-6 \
--resume_checkpoint

View file

@ -0,0 +1,40 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 2 \
--train_batch_size 160 \
--eval_batch_size 160 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.2 \
--weight_decay 5e-6 \
--resume_checkpoint

View file

@ -0,0 +1,40 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_amp \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 2 \
--train_batch_size 64 \
--eval_batch_size 64 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.2 \
--weight_decay 5e-6 \
--resume_checkpoint

View file

@ -0,0 +1,39 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 32 \
--eval_batch_size 32 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.2 \
--weight_decay 5e-6 \
--resume_checkpoint

View file

@ -0,0 +1,39 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 2 \
--train_batch_size 32 \
--eval_batch_size 32 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.2 \
--weight_decay 5e-6 \
--resume_checkpoint

View file

@ -0,0 +1,39 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 80 \
--eval_batch_size 80 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.2 \
--weight_decay 5e-6 \
--resume_checkpoint

View file

@ -0,0 +1,39 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_DIR="./output"
DATA_DIR="/data"
INDX="./index_file"
# TF_XLA_FLAGS=--tf_xla_cpu_global_jit
horovodrun -np 8 bash ./scripts/bind.sh --cpu=exclusive --ib=single -- python3 main.py \
--mode "train_and_eval" \
--arch "efficientnet-b4" \
--model_dir $MODEL_DIR \
--data_dir $DATA_DIR \
--use_xla \
--augmenter_name autoaugment \
--weight_init fan_out \
--lr_decay cosine \
--max_epochs 500 \
--train_batch_size 80 \
--eval_batch_size 80 \
--log_steps 100 \
--save_checkpoint_freq 5 \
--lr_init 0.005 \
--batch_norm syncbn \
--mixup_alpha 0.2 \
--weight_decay 5e-6 \
--resume_checkpoint

View file

@ -0,0 +1,227 @@
#! /bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -euo pipefail
print_usage() {
cat << EOF
${0} [options] [--] COMMAND [ARG...]
Control binding policy for each task. Assumes one rank will be launched for each GPU.
Options:
--cpu=MODE
* exclusive -- bind each rank to an exclusive set of cores near its GPU
* exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
* node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
* *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
* off -- don't bind
--mem=MODE
* node -- bind each rank to the nearest NUMA node [default]
* *.sh -- bind each rank using the bash associative array bind_mem from a file
* off -- don't bind
--ib=MODE
* single -- bind each rank to a single IB device near its GPU
* off -- don't bind [default]
--cluster=CLUSTER
Select which cluster is being used. May be required if system params cannot be detected.
EOF
}
################################################################################
# Argument parsing
################################################################################
cpu_mode='node'
mem_mode='node'
ib_mode='off'
cluster=''
while [ $# -gt 0 ]; do
case "$1" in
-h|--help) print_usage ; exit 0 ;;
--cpu=*) cpu_mode="${1/*=/}"; shift ;;
--cpu) cpu_mode="$2"; shift 2 ;;
--mem=*) mem_mode="${1/*=/}"; shift ;;
--mem) mem_mode="$2"; shift 2 ;;
--ib=*) ib_mode="${1/*=/}"; shift ;;
--ib) ib_mode="$2"; shift 2 ;;
--cluster=*) cluster="${1/*=/}"; shift ;;
--cluster) cluster="$2"; shift 2 ;;
--) shift; break ;;
*) break ;;
esac
done
if [ $# -lt 1 ]; then
echo 'ERROR: no command given' >&2
print_usage
exit 1
fi
################################################################################
# Get system params
################################################################################
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ -z "${local_rank}" ]; then
echo 'ERROR: cannot read LOCAL_RANK from env' >&2
exit 1
fi
num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
if [ "${local_rank}" -ge "${num_gpus}" ]; then
echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
exit 1
fi
get_lscpu_value() {
awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
}
lscpu_out=$(lscpu)
num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")
echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"
readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
if [ ${num_gpus} -gt 1 ]; then
readonly gpus_per_node=$(( num_gpus / num_nodes ))
else
readonly gpus_per_node=1
fi
readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
readonly local_node=$(( local_rank / gpus_per_node ))
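# Illustrative example (assumed topology, not detected values): with 2 sockets
# x 64 cores, 2 NUMA nodes and 8 GPUs, cores_per_node=64, gpus_per_node=4 and
# cores_per_gpu=16, so local rank 5 maps to NUMA node 1.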
declare -a ibdevs=()
case "${cluster}" in
circe)
# Need to specialize for circe because IB detection is hard
ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
;;
selene)
# Need to specialize for selene because IB detection is hard
ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
;;
'')
if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
mapfile -t ibdevs <<< "${ibstat_out}"
fi
;;
*)
echo "ERROR: Unknown cluster '${cluster}'" >&2
exit 1
;;
esac
readonly num_ibdevs="${#ibdevs[@]}"
################################################################################
# Setup for exec
################################################################################
declare -a numactl_args=()
case "${cpu_mode}" in
exclusive)
numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
$(( local_rank * cores_per_gpu )) \
$(( (local_rank + 1) * cores_per_gpu - 1 )) \
$(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
$(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
)" )
;;
exclusive,nosmt)
numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
$(( local_rank * cores_per_gpu )) \
$(( (local_rank + 1) * cores_per_gpu - 1 )) \
)" )
;;
node)
numactl_args+=( "--cpunodebind=${local_node}" )
;;
*.sh)
source "${cpu_mode}"
if [ -n "${bind_cpu_cores:-}" ]; then
numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
elif [ -n "${bind_cpu_nodes:-}" ]; then
numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
else
echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
exit 1
fi
;;
off|'')
;;
*)
echo "ERROR: invalid cpu mode '${cpu_mode}'" 2>&1
print_usage
exit 1
;;
esac
case "${mem_mode}" in
node)
numactl_args+=( "--membind=${local_node}" )
;;
*.sh)
source "${mem_mode}"
if [ -z "${bind_mem:-}" ]; then
echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
exit 1
fi
numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
;;
off|'')
;;
*)
echo "ERROR: invalid mem mode '${mem_mode}'" 2>&1
print_usage
exit 1
;;
esac
case "${ib_mode}" in
single)
if [ "${num_ibdevs}" -eq 0 ]; then
echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." 2>&1
else
readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
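# e.g., with num_gpus=8 and num_ibdevs=4, ranks 0-1 share ibdevs[0], ranks 2-3
# share ibdevs[1], and so on (integer division).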
export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
export UCX_NET_DEVICES="${UCX_NET_DEVICES-$ibdev:1}"
fi
;;
off|'')
;;
*)
echo "ERROR: invalid ib mode '${ib_mode}'" 2>&1
print_usage
exit 1
;;
esac
################################################################################
# Exec
################################################################################
if [ "${#numactl_args[@]}" -gt 0 ] ; then
set -x
exec numactl "${numactl_args[@]}" -- "${@}"
else
exec "${@}"
fi

View file

@ -0,0 +1,38 @@
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SRC_DIR=${1}
DST_DIR=${2}
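# Usage (sketch): bash <this script> SRC_DIR DST_DIR, where SRC_DIR holds the
# train-*/validation-* TFRecord shards and DST_DIR receives the DALI index files.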
echo "Creating training file indexes"
mkdir -p "${DST_DIR}"
for file in "${SRC_DIR}"/train-*; do
BASENAME=$(basename "$file")
DST_NAME="$DST_DIR/$BASENAME"
echo "Creating index $DST_NAME for $file"
tfrecord2idx "$file" "$DST_NAME"
done
echo "Creating validation file indexes"
for file in "${SRC_DIR}"/validation-*; do
BASENAME=$(basename "$file")
DST_NAME="$DST_DIR/$BASENAME"
echo "Creating index $DST_NAME for $file"
tfrecord2idx "$file" "$DST_NAME"
done

View file

@ -0,0 +1,28 @@
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
CONTAINER_TF2x_BASE="nvcr.io/nvidia/tensorflow"
CONTAINER_TF2x_TAG="21.02-tf2-py3"
# ======================== Refresh base image ======================== #
docker pull "${CONTAINER_TF2x_BASE}:${CONTAINER_TF2x_TAG}"
# ========================== Build container ========================= #
echo -e "\n\nBuilding Effnet_test Container\n\n"
sleep 1
docker build -t nvcr.io/nvidia/efficientnet-tf2:21.02-tf2-py3 \
    --build-arg FROM_IMAGE_NAME="${CONTAINER_TF2x_BASE}:${CONTAINER_TF2x_TAG}" \
    .

View file

@ -0,0 +1,21 @@
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
nvidia-docker run -it --rm --net=host --runtime=nvidia --ipc=host --cap-add=SYS_PTRACE --cap-add SYS_ADMIN --cap-add DAC_READ_SEARCH --security-opt seccomp=unconfined \
-v $(pwd)/:/workspace/ \
-v "/imagenet_tfrecords":/data/ \
-v "/imagenet_infer/":/infer_data/images/ \
nvcr.io/nvidia/efficientnet-tf2:21.02-tf2-py3

View file

@ -0,0 +1,123 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import tensorflow as tf
import horovod.tensorflow.keras as hvd
from nvidia import dali
import nvidia.dali.plugin.tf as dali_tf
import numpy as np
class DaliPipeline(dali.pipeline.Pipeline):
def __init__(
self,
tfrec_filenames,
tfrec_idx_filenames,
height,
width,
batch_size,
num_threads,
device_id,
shard_id,
num_gpus,
num_classes,
deterministic=False,
dali_cpu=True,
training=True
):
kwargs = dict()
if deterministic:
kwargs['seed'] = 7 * (1 + hvd.rank())
super(DaliPipeline, self).__init__(batch_size, num_threads, device_id, **kwargs)
self.training = training
self.input = dali.ops.TFRecordReader(
path=tfrec_filenames,
index_path=tfrec_idx_filenames,
random_shuffle=True,
shard_id=shard_id,
num_shards=num_gpus,
initial_fill=10000,
features={
'image/encoded': dali.tfrecord.FixedLenFeature((), dali.tfrecord.string, ""),
'image/class/label': dali.tfrecord.FixedLenFeature([1], dali.tfrecord.int64, -1),
'image/class/text': dali.tfrecord.FixedLenFeature([], dali.tfrecord.string, ''),
'image/object/bbox/xmin': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
'image/object/bbox/ymin': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
'image/object/bbox/xmax': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
'image/object/bbox/ymax': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0)
}
)
if self.training:
self.decode = dali.ops.ImageDecoderRandomCrop(
device="cpu" if dali_cpu else "mixed",
output_type=dali.types.RGB,
random_aspect_ratio=[0.75, 1.33],
random_area=[0.05, 1.0],
num_attempts=100
)
self.resize = dali.ops.Resize(device="cpu" if dali_cpu else "gpu", resize_x=width, resize_y=height)
else:
self.decode = dali.ops.ImageDecoder(
device="cpu",
output_type=dali.types.RGB
)
# Make sure that every image > 224 for CropMirrorNormalize
self.resize = dali.ops.Resize(device="cpu" if dali_cpu else "gpu", resize_x=width, resize_y=height)
self.normalize = dali.ops.CropMirrorNormalize(
device="gpu",
output_dtype=dali.types.FLOAT,
image_type=dali.types.RGB,
output_layout=dali.types.NHWC,
mirror=1 if self.training else 0
)
self.one_hot = dali.ops.OneHot(num_classes=num_classes)
self.shapes = dali.ops.Shapes(type=dali.types.INT32)
self.crop = dali.ops.Crop(device="gpu")
self.cast_float = dali.ops.Cast(dtype=dali.types.FLOAT)
self.extract_h = dali.ops.Slice(normalized_anchor=False, normalized_shape=False, axes=[0])
self.extract_w = dali.ops.Slice(normalized_anchor=False, normalized_shape=False, axes=[0])
def define_graph(self):
# Read images and labels
inputs = self.input(name="Reader")
images = inputs["image/encoded"]
labels = inputs["image/class/label"]
labels -= 1
labels = self.one_hot(labels).gpu()
# Decode and augmentation
images = self.decode(images)
if not self.training:
shapes = self.shapes(images)
h = self.extract_h(shapes, dali.types.Constant(np.array([0], dtype=np.float32)), dali.types.Constant(np.array([1], dtype=np.float32)))
w = self.extract_w(shapes, dali.types.Constant(np.array([1], dtype=np.float32)), dali.types.Constant(np.array([1], dtype=np.float32)))
CROP_PADDING = 32
CROP_H = h * h / (h + CROP_PADDING)
CROP_W = w * w / (w + CROP_PADDING)
CROP_H = self.cast_float(CROP_H)
CROP_W = self.cast_float(CROP_W)
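# The central crop keeps a fraction h/(h+CROP_PADDING) of each side, e.g. a
# 500 px side yields a ~470 px crop (500*500/532), before resizing to the
# target resolution.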
images = images.gpu()
images = self.crop(images, crop_h = CROP_H, crop_w = CROP_W)
images = self.resize(images)
images = self.normalize(images)
return (images, labels)
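# Example instantiation (sketch; the file lists, sizes and sharding values below
# are placeholders, not values taken from elsewhere in this repo):
#   pipe = DaliPipeline(tfrec_filenames=train_files, tfrec_idx_filenames=train_idx,
#                       height=224, width=224, batch_size=256, num_threads=4,
#                       device_id=hvd.local_rank(), shard_id=hvd.rank(),
#                       num_gpus=hvd.size(), num_classes=1000, training=True)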

View file

@ -0,0 +1,999 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""AutoAugment and RandAugment policies for enhanced image preprocessing.
AutoAugment Reference: https://arxiv.org/abs/1805.09501
RandAugment Reference: https://arxiv.org/abs/1909.13719
"""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import math
import tensorflow as tf
from typing import Any, Dict, List, Optional, Text, Tuple
from tensorflow.python.keras.layers.preprocessing import image_preprocessing as image_ops
# This signifies the max integer that the controller RNN could predict for the
# augmentation scheme.
_MAX_LEVEL = 10.
def to_4d(image: tf.Tensor) -> tf.Tensor:
"""Converts an input Tensor to 4 dimensions.
4D image => [N, H, W, C] or [N, C, H, W]
3D image => [1, H, W, C] or [1, C, H, W]
2D image => [1, H, W, 1]
Args:
image: The 2/3/4D input tensor.
Returns:
A 4D image tensor.
Raises:
`TypeError` if `image` is not a 2/3/4D tensor.
"""
shape = tf.shape(image)
original_rank = tf.rank(image)
left_pad = tf.cast(tf.less_equal(original_rank, 3), dtype=tf.int32)
right_pad = tf.cast(tf.equal(original_rank, 2), dtype=tf.int32)
new_shape = tf.concat(
[
tf.ones(shape=left_pad, dtype=tf.int32),
shape,
tf.ones(shape=right_pad, dtype=tf.int32),
],
axis=0,
)
return tf.reshape(image, new_shape)
def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor:
"""Converts a 4D image back to `ndims` rank."""
shape = tf.shape(image)
begin = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32)
end = 4 - tf.cast(tf.equal(ndims, 2), dtype=tf.int32)
new_shape = shape[begin:end]
return tf.reshape(image, new_shape)
def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor:
"""Converts translations to a projective transform.
The translation matrix looks like this:
[[1 0 -dx]
[0 1 -dy]
[0 0 1]]
Args:
translations: The 2-element list representing [dx, dy], or a matrix of
2-element lists representing [dx dy] to translate for each image. The
shape must be static.
Returns:
The transformation matrix of shape (num_images, 8).
Raises:
`TypeError` if
- the shape of `translations` is not known or
- the shape of `translations` is not rank 1 or 2.
"""
translations = tf.convert_to_tensor(translations, dtype=tf.float32)
if translations.get_shape().ndims is None:
raise TypeError('translations rank must be statically known')
elif len(translations.get_shape()) == 1:
translations = translations[None]
elif len(translations.get_shape()) != 2:
raise TypeError('translations should have rank 1 or 2.')
num_translations = tf.shape(translations)[0]
return tf.concat(
values=[
tf.ones((num_translations, 1), tf.dtypes.float32),
tf.zeros((num_translations, 1), tf.dtypes.float32),
-translations[:, 0, None],
tf.zeros((num_translations, 1), tf.dtypes.float32),
tf.ones((num_translations, 1), tf.dtypes.float32),
-translations[:, 1, None],
tf.zeros((num_translations, 2), tf.dtypes.float32),
],
axis=1,
)
def _convert_angles_to_transform(
angles: tf.Tensor,
image_width: tf.Tensor,
image_height: tf.Tensor) -> tf.Tensor:
"""Converts an angle or angles to a projective transform.
Args:
angles: A scalar angle to rotate all images by, or a vector of angles to
rotate a batch of images by.
image_width: The width of the image(s) to be transformed.
image_height: The height of the image(s) to be transformed.
Returns:
A tensor of shape (num_images, 8).
Raises:
`TypeError` if `angles` is not rank 0 or 1.
"""
angles = tf.convert_to_tensor(angles, dtype=tf.float32)
if len(angles.get_shape()) == 0: # pylint:disable=g-explicit-length-test
angles = angles[None]
elif len(angles.get_shape()) != 1:
raise TypeError('Angles should have a rank 0 or 1.')
x_offset = ((image_width - 1) -
(tf.math.cos(angles) * (image_width - 1) - tf.math.sin(angles) *
(image_height - 1))) / 2.0
y_offset = ((image_height - 1) -
(tf.math.sin(angles) * (image_width - 1) + tf.math.cos(angles) *
(image_height - 1))) / 2.0
num_angles = tf.shape(angles)[0]
return tf.concat(
values=[
tf.math.cos(angles)[:, None],
-tf.math.sin(angles)[:, None],
x_offset[:, None],
tf.math.sin(angles)[:, None],
tf.math.cos(angles)[:, None],
y_offset[:, None],
tf.zeros((num_angles, 2), tf.dtypes.float32),
],
axis=1,
)
def transform(image: tf.Tensor, transforms) -> tf.Tensor:
"""Prepares input data for `image_ops.transform`."""
original_ndims = tf.rank(image)
transforms = tf.convert_to_tensor(transforms, dtype=tf.float32)
if transforms.shape.rank == 1:
transforms = transforms[None]
image = to_4d(image)
image = image_ops.transform(
images=image,
transforms=transforms,
interpolation='nearest')
return from_4d(image, original_ndims)
def translate(image: tf.Tensor, translations) -> tf.Tensor:
"""Translates image(s) by provided vectors.
Args:
image: An image Tensor of type uint8.
translations: A vector or matrix representing [dx dy].
Returns:
The translated version of the image.
"""
transforms = _convert_translation_to_transform(translations)
return transform(image, transforms=transforms)
def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor:
"""Rotates the image by degrees either clockwise or counterclockwise.
Args:
image: An image Tensor of type uint8.
degrees: Float, a scalar angle in degrees to rotate all images by. If
degrees is positive the image will be rotated clockwise otherwise it will
be rotated counterclockwise.
Returns:
The rotated version of image.
"""
# Convert from degrees to radians.
degrees_to_radians = math.pi / 180.0
radians = tf.cast(degrees * degrees_to_radians, tf.float32)
original_ndims = tf.rank(image)
image = to_4d(image)
image_height = tf.cast(tf.shape(image)[1], tf.float32)
image_width = tf.cast(tf.shape(image)[2], tf.float32)
transforms = _convert_angles_to_transform(angles=radians,
image_width=image_width,
image_height=image_height)
# In practice, we should randomize the rotation degrees by flipping
# it negatively half the time, but that's done on 'degrees' outside
# of the function.
image = transform(image, transforms=transforms)
return from_4d(image, original_ndims)
def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor:
"""Blend image1 and image2 using 'factor'.
Factor can be above 0.0. A value of 0.0 means only image1 is used.
A value of 1.0 means only image2 is used. A value between 0.0 and
1.0 means we linearly interpolate the pixel values between the two
images. A value greater than 1.0 "extrapolates" the difference
between the two pixel values, and we clip the results to values
between 0 and 255.
Args:
image1: An image Tensor of type uint8.
image2: An image Tensor of type uint8.
factor: A floating point value above 0.0.
Returns:
A blended image Tensor of type uint8.
"""
if factor == 0.0:
return tf.convert_to_tensor(image1)
if factor == 1.0:
return tf.convert_to_tensor(image2)
image1 = tf.cast(image1, tf.float32)
image2 = tf.cast(image2, tf.float32)
difference = image2 - image1
scaled = factor * difference
# Do addition in float.
temp = tf.cast(image1, tf.float32) + scaled
# Interpolate
if factor > 0.0 and factor < 1.0:
# Interpolation means we always stay within 0 and 255.
return tf.cast(temp, tf.uint8)
# Extrapolate:
#
# We need to clip and then cast.
return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8)
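# For example, blend(image1, image2, 0.5) returns the pixel-wise average of the
# two images, cast back to uint8.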
def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor:
"""Apply cutout (https://arxiv.org/abs/1708.04552) to image.
This operation applies a (2*pad_size x 2*pad_size) mask of zeros to
a random location within `img`. The pixel values filled in will be of the
value `replace`. The location where the mask will be applied is randomly
chosen uniformly over the whole image.
Args:
image: An image Tensor of type uint8.
pad_size: Specifies how big the zero mask is that will be generated and
applied to the image. The mask will be of size
(2*pad_size x 2*pad_size).
replace: What pixel value to fill in the image in the area that has
the cutout mask applied to it.
Returns:
An image Tensor that is of type uint8.
"""
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
# Sample the center location in the image where the zero mask will be applied.
cutout_center_height = tf.random.uniform(
shape=[], minval=0, maxval=image_height,
dtype=tf.int32)
cutout_center_width = tf.random.uniform(
shape=[], minval=0, maxval=image_width,
dtype=tf.int32)
lower_pad = tf.maximum(0, cutout_center_height - pad_size)
upper_pad = tf.maximum(0, image_height - cutout_center_height - pad_size)
left_pad = tf.maximum(0, cutout_center_width - pad_size)
right_pad = tf.maximum(0, image_width - cutout_center_width - pad_size)
cutout_shape = [image_height - (lower_pad + upper_pad),
image_width - (left_pad + right_pad)]
padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
mask = tf.pad(
tf.zeros(cutout_shape, dtype=image.dtype),
padding_dims, constant_values=1)
mask = tf.expand_dims(mask, -1)
mask = tf.tile(mask, [1, 1, 3])
image = tf.where(
tf.equal(mask, 0),
tf.ones_like(image, dtype=image.dtype) * replace,
image)
return image
def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor:
# For each pixel in the image, select the pixel
# if the value is less than the threshold.
# Otherwise, subtract 255 from the pixel.
return tf.where(image < threshold, image, 255 - image)
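# e.g., with the default threshold of 128, a pixel value of 200 becomes 55
# (255 - 200), while a value of 100 is left unchanged.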
def solarize_add(image: tf.Tensor,
addition: int = 0,
threshold: int = 128) -> tf.Tensor:
# For each pixel in the image less than threshold
# we add 'addition' amount to it and then clip the
# pixel value to be between 0 and 255. The value
# of 'addition' is between -128 and 128.
added_image = tf.cast(image, tf.int64) + addition
added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8)
return tf.where(image < threshold, added_image, image)
def color(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Color."""
degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image))
return blend(degenerate, image, factor)
def contrast(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Contrast."""
degenerate = tf.image.rgb_to_grayscale(image)
# Cast before calling tf.histogram.
degenerate = tf.cast(degenerate, tf.int32)
# Compute the grayscale histogram, then compute the mean pixel value,
# and create a constant image size of that value. Use that as the
# blending degenerate target of the original image.
hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256)
mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0
degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean
degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8))
return blend(degenerate, image, factor)
def brightness(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Brightness."""
degenerate = tf.zeros_like(image)
return blend(degenerate, image, factor)
def posterize(image: tf.Tensor, bits: int) -> tf.Tensor:
"""Equivalent of PIL Posterize."""
shift = 8 - bits
return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift)
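# e.g., posterize(image, 4) keeps only the top 4 bits of each channel, so a
# pixel value of 173 (0b10101101) becomes 160 (0b10100000).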
def wrapped_rotate(image: tf.Tensor, degrees: float, replace: int) -> tf.Tensor:
"""Applies rotation with wrap/unwrap."""
image = rotate(wrap(image), degrees=degrees)
return unwrap(image, replace)
def translate_x(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
"""Equivalent of PIL Translate in X dimension."""
image = translate(wrap(image), [-pixels, 0])
return unwrap(image, replace)
def translate_y(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
"""Equivalent of PIL Translate in Y dimension."""
image = translate(wrap(image), [0, -pixels])
return unwrap(image, replace)
def shear_x(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
"""Equivalent of PIL Shearing in X dimension."""
# Shear parallel to x axis is a projective transform
# with a matrix form of:
# [1 level
# 0 1].
image = transform(image=wrap(image),
transforms=[1., level, 0., 0., 1., 0., 0., 0.])
return unwrap(image, replace)
def shear_y(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
"""Equivalent of PIL Shearing in Y dimension."""
# Shear parallel to y axis is a projective transform
# with a matrix form of:
# [1 0
# level 1].
image = transform(image=wrap(image),
transforms=[1., 0., 0., level, 1., 0., 0., 0.])
return unwrap(image, replace)
def autocontrast(image: tf.Tensor) -> tf.Tensor:
"""Implements Autocontrast function from PIL using TF ops.
Args:
image: A 3D uint8 tensor.
Returns:
The image after it has had autocontrast applied to it and will be of type
uint8.
"""
def scale_channel(image: tf.Tensor) -> tf.Tensor:
"""Scale the 2D image using the autocontrast rule."""
# A possibly cheaper version can be done using cumsum/unique_with_counts
# over the histogram values, rather than iterating over the entire image
# to compute mins and maxes.
lo = tf.cast(tf.reduce_min(image), tf.float32)
hi = tf.cast(tf.reduce_max(image), tf.float32)
# Scale the image, making the lowest value 0 and the highest value 255.
def scale_values(im):
scale = 255.0 / (hi - lo)
offset = -lo * scale
im = tf.cast(im, tf.float32) * scale + offset
im = tf.clip_by_value(im, 0.0, 255.0)
return tf.cast(im, tf.uint8)
result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image)
return result
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image[:, :, 0])
s2 = scale_channel(image[:, :, 1])
s3 = scale_channel(image[:, :, 2])
image = tf.stack([s1, s2, s3], 2)
return image
def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Implements Sharpness function from PIL using TF ops."""
orig_image = image
image = tf.cast(image, tf.float32)
# Make image 4D for conv operation.
image = tf.expand_dims(image, 0)
# SMOOTH PIL Kernel.
kernel = tf.constant(
[[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=tf.float32,
shape=[3, 3, 1, 1]) / 13.
# Tile across channel dimension.
kernel = tf.tile(kernel, [1, 1, 3, 1])
strides = [1, 1, 1, 1]
degenerate = tf.nn.depthwise_conv2d(
image, kernel, strides, padding='VALID', dilations=[1, 1])
degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0])
# For the borders of the resulting image, fill in the values of the
# original image.
mask = tf.ones_like(degenerate)
padded_mask = tf.pad(mask, [[1, 1], [1, 1], [0, 0]])
padded_degenerate = tf.pad(degenerate, [[1, 1], [1, 1], [0, 0]])
result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image)
# Blend the final result.
return blend(result, orig_image, factor)
def equalize(image: tf.Tensor) -> tf.Tensor:
"""Implements Equalize function from PIL using TF ops."""
def scale_channel(im, c):
"""Scale the data in the channel to implement equalize."""
im = tf.cast(im[:, :, c], tf.int32)
# Compute the histogram of the image channel.
histo = tf.histogram_fixed_width(im, [0, 255], nbins=256)
# For the purposes of computing the step, filter out the nonzeros.
nonzero = tf.where(tf.not_equal(histo, 0))
nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1])
step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255
def build_lut(histo, step):
# Compute the cumulative sum, shifting by step // 2
# and then normalization by step.
lut = (tf.cumsum(histo) + (step // 2)) // step
# Shift lut, prepending with 0.
lut = tf.concat([[0], lut[:-1]], 0)
# Clip the counts to be in range. This is done
# in the C code for image.point.
return tf.clip_by_value(lut, 0, 255)
# If step is zero, return the original image. Otherwise, build
# lut from the full histogram and step and then index from it.
result = tf.cond(tf.equal(step, 0),
lambda: im,
lambda: tf.gather(build_lut(histo, step), im))
return tf.cast(result, tf.uint8)
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image, 0)
s2 = scale_channel(image, 1)
s3 = scale_channel(image, 2)
image = tf.stack([s1, s2, s3], 2)
return image
def invert(image: tf.Tensor) -> tf.Tensor:
"""Inverts the image pixels."""
image = tf.convert_to_tensor(image)
return 255 - image
def wrap(image: tf.Tensor) -> tf.Tensor:
"""Returns 'image' with an extra channel set to all 1s."""
shape = tf.shape(image)
extended_channel = tf.ones([shape[0], shape[1], 1], image.dtype)
extended = tf.concat([image, extended_channel], axis=2)
return extended
def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor:
"""Unwraps an image produced by wrap.
Where there is a 0 in the last channel for every spatial position,
the other three channels at that spatial position are grayed
(set to 128). Operations like translate and shear on a wrapped
Tensor will leave 0s in empty locations. Some transformations look
at the intensity of values to do preprocessing, and we want these
empty pixels to assume the 'average' value, rather than pure black.
Args:
image: A 3D Image Tensor with 4 channels.
replace: A one or three value 1D tensor to fill empty pixels.
Returns:
image: A 3D image Tensor with 3 channels.
"""
image_shape = tf.shape(image)
# Flatten the spatial dimensions.
flattened_image = tf.reshape(image, [-1, image_shape[2]])
# Find all pixels where the last channel is zero.
alpha_channel = tf.expand_dims(flattened_image[:, 3], axis=-1)
replace = tf.concat([replace, tf.ones([1], image.dtype)], 0)
# Where they are zero, fill them in with 'replace'.
flattened_image = tf.where(
tf.equal(alpha_channel, 0),
tf.ones_like(flattened_image, dtype=image.dtype) * replace,
flattened_image)
image = tf.reshape(flattened_image, image_shape)
image = tf.slice(image, [0, 0, 0], [image_shape[0], image_shape[1], 3])
return image
def _randomly_negate_tensor(tensor):
"""With 50% prob turn the tensor negative."""
should_flip = tf.cast(tf.floor(tf.random.uniform([]) + 0.5), tf.bool)
final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor)
return final_tensor
def _rotate_level_to_arg(level: float):
level = (level/_MAX_LEVEL) * 30.
level = _randomly_negate_tensor(level)
return (level,)
def _shrink_level_to_arg(level: float):
"""Converts level to ratio by which we shrink the image content."""
if level == 0:
return (1.0,) # if level is zero, do not shrink the image
# Maximum shrinking ratio is 2.9.
level = 2. / (_MAX_LEVEL / level) + 0.9
return (level,)
def _enhance_level_to_arg(level: float):
return ((level/_MAX_LEVEL) * 1.8 + 0.1,)
def _shear_level_to_arg(level: float):
level = (level/_MAX_LEVEL) * 0.3
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level,)
def _translate_level_to_arg(level: float, translate_const: float):
level = (level/_MAX_LEVEL) * float(translate_const)
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level,)
def _mult_to_arg(level: float, multiplier: float = 1.):
return (int((level / _MAX_LEVEL) * multiplier),)
def _apply_func_with_prob(func: Any,
image: tf.Tensor,
args: Any,
prob: float):
"""Apply `func` to image w/ `args` as input with probability `prob`."""
assert isinstance(args, tuple)
# Apply the function with probability `prob`.
should_apply_op = tf.cast(
tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
augmented_image = tf.cond(
should_apply_op,
lambda: func(image, *args),
lambda: image)
return augmented_image
def select_and_apply_random_policy(policies: Any, image: tf.Tensor):
"""Select a random policy from `policies` and apply it to `image`."""
policy_to_select = tf.random.uniform([], maxval=len(policies), dtype=tf.int32)
# Note that using tf.case instead of tf.conds would result in significantly
# larger graphs and would even break export for some larger policies.
for (i, policy) in enumerate(policies):
image = tf.cond(
tf.equal(i, policy_to_select),
lambda selected_policy=policy: selected_policy(image),
lambda: image)
return image
NAME_TO_FUNC = {
'AutoContrast': autocontrast,
'Equalize': equalize,
'Invert': invert,
'Rotate': wrapped_rotate,
'Posterize': posterize,
'Solarize': solarize,
'SolarizeAdd': solarize_add,
'Color': color,
'Contrast': contrast,
'Brightness': brightness,
'Sharpness': sharpness,
'ShearX': shear_x,
'ShearY': shear_y,
'TranslateX': translate_x,
'TranslateY': translate_y,
'Cutout': cutout,
}
# Functions that have a 'replace' parameter
REPLACE_FUNCS = frozenset({
'Rotate',
'TranslateX',
'ShearX',
'ShearY',
'TranslateY',
'Cutout',
})
def level_to_arg(cutout_const: float, translate_const: float):
"""Creates a dict mapping image operation names to their arguments."""
no_arg = lambda level: ()
posterize_arg = lambda level: _mult_to_arg(level, 4)
solarize_arg = lambda level: _mult_to_arg(level, 256)
solarize_add_arg = lambda level: _mult_to_arg(level, 110)
cutout_arg = lambda level: _mult_to_arg(level, cutout_const)
translate_arg = lambda level: _translate_level_to_arg(level, translate_const)
args = {
'AutoContrast': no_arg,
'Equalize': no_arg,
'Invert': no_arg,
'Rotate': _rotate_level_to_arg,
'Posterize': posterize_arg,
'Solarize': solarize_arg,
'SolarizeAdd': solarize_add_arg,
'Color': _enhance_level_to_arg,
'Contrast': _enhance_level_to_arg,
'Brightness': _enhance_level_to_arg,
'Sharpness': _enhance_level_to_arg,
'ShearX': _shear_level_to_arg,
'ShearY': _shear_level_to_arg,
'Cutout': cutout_arg,
'TranslateX': translate_arg,
'TranslateY': translate_arg,
}
return args
def _parse_policy_info(name: Text,
prob: float,
level: float,
replace_value: List[int],
cutout_const: float,
translate_const: float) -> Tuple[Any, float, Any]:
"""Return the function that corresponds to `name` and update `level` param."""
func = NAME_TO_FUNC[name]
args = level_to_arg(cutout_const, translate_const)[name](level)
if name in REPLACE_FUNCS:
# Add in replace arg if it is required for the function that is called.
args = tuple(list(args) + [replace_value])
return func, prob, args
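# For instance, the policy entry ('Rotate', 0.8, 7) resolves to
# (wrapped_rotate, 0.8, (degrees, replace_value)), where degrees is
# (7/10)*30 = 21 up to a random sign flip.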
class ImageAugment(object):
"""Image augmentation class for applying image distortions."""
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Given an image tensor, returns a distorted image with the same shape.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
Returns:
The augmented version of `image`.
"""
raise NotImplementedError()
class AutoAugment(ImageAugment):
"""Applies the AutoAugment policy to images.
AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
"""
def __init__(self,
augmentation_name: Text = 'v0',
policies: Optional[Dict[Text, Any]] = None,
cutout_const: float = 100,
translate_const: float = 250):
"""Applies the AutoAugment policy to images.
Args:
augmentation_name: The name of the AutoAugment policy to use. The
available options are `v0`, `simple` and `test`. `v0` is the policy used
for all of the results in the paper; `simple` is the same policy with the
custom ops removed, and `test` is a minimal single-sub-policy option
intended for debugging.
policies: list of lists of tuples in the form `(func, prob, level)`,
`func` is a string name of the augmentation function, `prob` is the
probability of applying the `func` operation, `level` is the input
argument for `func`.
cutout_const: multiplier for applying cutout.
translate_const: multiplier for applying translation.
"""
super(AutoAugment, self).__init__()
if policies is None:
self.available_policies = {
'v0': self.policy_v0(),
'test': self.policy_test(),
'simple': self.policy_simple(),
}
if augmentation_name not in self.available_policies:
raise ValueError(
'Invalid augmentation_name: {}'.format(augmentation_name))
self.augmentation_name = augmentation_name
self.policies = self.available_policies[augmentation_name]
self.cutout_const = float(cutout_const)
self.translate_const = float(translate_const)
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Applies the AutoAugment policy to `image`.
AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
Returns:
A version of image that now has data augmentation applied to it based on
the `policies` passed into the function.
"""
input_image_type = image.dtype
if input_image_type != tf.uint8:
image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, dtype=tf.uint8)
replace_value = [128] * 3
# func is the string name of the augmentation function, prob is the
# probability of applying the operation and level is the parameter
# associated with the tf op.
# tf_policies are functions that take in an image and return an augmented
# image.
tf_policies = []
for policy in self.policies:
tf_policy = []
# Link string name to the correct python function and make sure the
# correct argument is passed into that function.
for policy_info in policy:
policy_info = list(policy_info) + [
replace_value, self.cutout_const, self.translate_const
]
tf_policy.append(_parse_policy_info(*policy_info))
# Now build the tf policy that will apply the augmentation procedure
# on image.
def make_final_policy(tf_policy_):
def final_policy(image_):
for func, prob, args in tf_policy_:
image_ = _apply_func_with_prob(func, image_, args, prob)
return image_
return final_policy
tf_policies.append(make_final_policy(tf_policy))
image = select_and_apply_random_policy(tf_policies, image)
image = tf.cast(image, dtype=input_image_type)
return image
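# Usage sketch (the dataset/image names are illustrative, not part of this module):
# augmenter = AutoAugment(augmentation_name='v0')
# dataset = dataset.map(lambda image, label: (augmenter.distort(image), label),
#                       num_parallel_calls=tf.data.experimental.AUTOTUNE)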
@staticmethod
def policy_v0():
"""Autoaugment policy that was used in AutoAugment Paper.
Each tuple is an augmentation operation of the form
(operation, probability, magnitude). Each element in policy is a
sub-policy that will be applied sequentially on the image.
Returns:
the policy.
"""
# TODO(dankondratyuk): tensorflow_addons defines custom ops, which
# for some reason are not included when building/linking
# This results in the error, "Op type not registered
# 'Addons>ImageProjectiveTransformV2' in binary" when running on borg TPUs
policy = [
[('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
[('Color', 0.4, 9), ('Equalize', 0.6, 3)],
[('Color', 0.4, 1), ('Rotate', 0.6, 8)],
[('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
[('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
[('Color', 0.2, 0), ('Equalize', 0.8, 8)],
[('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
[('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
[('Color', 0.6, 1), ('Equalize', 1.0, 2)],
[('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
[('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
[('Color', 0.4, 7), ('Equalize', 0.6, 0)],
[('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
[('Solarize', 0.6, 8), ('Color', 0.6, 9)],
[('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
[('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
[('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
[('ShearY', 0.8, 0), ('Color', 0.6, 4)],
[('Color', 1.0, 0), ('Rotate', 0.6, 2)],
[('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
[('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
[('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
[('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
[('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
[('Color', 0.8, 6), ('Rotate', 0.4, 5)],
]
return policy
@staticmethod
def policy_simple():
"""Same as `policy_v0`, except with custom ops removed."""
policy = [
[('Color', 0.4, 9), ('Equalize', 0.6, 3)],
[('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
[('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
[('Color', 0.2, 0), ('Equalize', 0.8, 8)],
[('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
[('Color', 0.6, 1), ('Equalize', 1.0, 2)],
[('Color', 0.4, 7), ('Equalize', 0.6, 0)],
[('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
[('Solarize', 0.6, 8), ('Color', 0.6, 9)],
[('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
[('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
[('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
[('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
]
return policy
@staticmethod
def policy_test():
"""Autoaugment test policy for debugging."""
policy = [
[('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)],
]
return policy
class RandAugment(ImageAugment):
"""Applies the RandAugment policy to images.
RandAugment is from the paper https://arxiv.org/abs/1909.13719,
"""
def __init__(self,
num_layers: int = 2,
magnitude: float = 10.,
cutout_const: float = 40.,
translate_const: float = 100.):
"""Applies the RandAugment policy to images.
Args:
num_layers: Integer, the number of augmentation transformations to apply
sequentially to an image. Represented as (N) in the paper. Usually best
values will be in the range [1, 3].
magnitude: Float, shared magnitude across all augmentation operations.
Represented as (M) in the paper. Usually best values are in the range
[5, 10].
cutout_const: multiplier for applying cutout.
translate_const: multiplier for applying translation.
"""
super(RandAugment, self).__init__()
self.num_layers = num_layers
self.magnitude = float(magnitude)
self.cutout_const = float(cutout_const)
self.translate_const = float(translate_const)
self.available_ops = [
'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize', 'Solarize',
'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX', 'ShearY',
'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd'
]
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Applies the RandAugment policy to `image`.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
Returns:
The augmented version of `image`.
"""
input_image_type = image.dtype
if input_image_type != tf.uint8:
image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, dtype=tf.uint8)
replace_value = [128] * 3
min_prob, max_prob = 0.2, 0.8
for _ in range(self.num_layers):
op_to_select = tf.random.uniform(
[], maxval=len(self.available_ops) + 1, dtype=tf.int32)
branch_fns = []
for (i, op_name) in enumerate(self.available_ops):
prob = tf.random.uniform([],
minval=min_prob,
maxval=max_prob,
dtype=tf.float32)
func, _, args = _parse_policy_info(op_name,
prob,
self.magnitude,
replace_value,
self.cutout_const,
self.translate_const)
branch_fns.append((
i,
# pylint:disable=g-long-lambda
lambda selected_func=func, selected_args=args: selected_func(
image, *selected_args)))
# pylint:enable=g-long-lambda
image = tf.switch_case(branch_index=op_to_select,
branch_fns=branch_fns,
default=lambda: tf.identity(image))
image = tf.cast(image, dtype=input_image_type)
return image
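# Usage sketch (hyperparameters are illustrative):
# augmenter = RandAugment(num_layers=2, magnitude=10.)
# distorted = augmenter.distort(image)  # `image` is a [height, width, 3] tensor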

View file

@@ -0,0 +1,408 @@
# Lint as: python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common modules for callbacks."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
from typing import Any, List, MutableMapping, Text
import tensorflow as tf
from tensorflow import keras
from utils import optimizer_factory
import horovod.tensorflow as hvd
import time
def get_callbacks(model_checkpoint: bool = True,
include_tensorboard: bool = True,
time_history: bool = True,
track_lr: bool = True,
write_model_weights: bool = True,
initial_step: int = 0,
batch_size: int = 0,
log_steps: int = 100,
model_dir: str = None,
save_checkpoint_freq: int = 0,
logger = None) -> List[tf.keras.callbacks.Callback]:
"""Get all callbacks."""
model_dir = model_dir or ''
callbacks = []
if model_checkpoint and hvd.rank() == 0:
ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
callbacks.append(tf.keras.callbacks.ModelCheckpoint(
ckpt_full_path, save_weights_only=True, verbose=1, save_freq=save_checkpoint_freq))
if time_history and logger is not None and hvd.rank() == 0:
callbacks.append(
TimeHistory(
batch_size,
log_steps,
logdir=model_dir if include_tensorboard else None,
logger=logger))
if include_tensorboard:
callbacks.append(
CustomTensorBoard(
log_dir=model_dir,
track_lr=track_lr,
initial_step=initial_step,
write_images=write_model_weights))
return callbacks
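# Usage sketch (argument values are illustrative; `logger` is expected to expose a
# .log(step=..., data=...) method, as used by TimeHistory below):
# callbacks = get_callbacks(batch_size=global_batch_size, log_steps=100,
#                           model_dir=args.model_dir, save_checkpoint_freq=steps_per_epoch,
#                           logger=logger)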
def get_scalar_from_tensor(t: tf.Tensor) -> int:
"""Utility function to convert a Tensor to a scalar."""
t = tf.keras.backend.get_value(t)
if callable(t):
return t()
else:
return t
class CustomTensorBoard(tf.keras.callbacks.TensorBoard):
"""A customized TensorBoard callback that tracks additional datapoints.
Metrics tracked:
- Global learning rate
Attributes:
log_dir: the path of the directory where to save the log files to be parsed
by TensorBoard.
track_lr: `bool`, whether or not to track the global learning rate.
initial_step: the initial step, used for preemption recovery.
**kwargs: Additional arguments for backwards compatibility. Possible key is
`period`.
"""
# TODO(b/146499062): track params, flops, log lr, l2 loss,
# classification loss
def __init__(self,
log_dir: str,
track_lr: bool = False,
initial_step: int = 0,
**kwargs):
super(CustomTensorBoard, self).__init__(log_dir=log_dir, **kwargs)
self.step = initial_step
self._track_lr = track_lr
def on_batch_begin(self,
epoch: int,
logs: MutableMapping[str, Any] = None) -> None:
self.step += 1
if logs is None:
logs = {}
logs.update(self._calculate_metrics())
super(CustomTensorBoard, self).on_batch_begin(epoch, logs)
def on_epoch_begin(self,
epoch: int,
logs: MutableMapping[str, Any] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
super(CustomTensorBoard, self).on_epoch_begin(epoch, logs)
def on_epoch_end(self,
epoch: int,
logs: MutableMapping[str, Any] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
super(CustomTensorBoard, self).on_epoch_end(epoch, logs)
def _calculate_metrics(self) -> MutableMapping[str, Any]:
logs = {}
# TODO(b/149030439): disable LR reporting.
if self._track_lr:
logs['learning_rate'] = self._calculate_lr()
return logs
def _calculate_lr(self) -> float:
"""Calculates the learning rate given the current step."""
return get_scalar_from_tensor(
self._get_base_optimizer()._decayed_lr(var_dtype=tf.float32)) # pylint:disable=protected-access
def _get_base_optimizer(self) -> tf.keras.optimizers.Optimizer:
"""Get the base optimizer used by the current model."""
optimizer = self.model.optimizer
# The optimizer might be wrapped by another class, so unwrap it
while hasattr(optimizer, '_optimizer'):
optimizer = optimizer._optimizer # pylint:disable=protected-access
return optimizer
class MovingAverageCallback(tf.keras.callbacks.Callback):
"""A Callback to be used with a `MovingAverage` optimizer.
Applies moving average weights to the model during validation time to test
and predict on the averaged weights rather than the current model weights.
Once training is complete, the model weights will be overwritten with the
averaged weights (by default).
Attributes:
overwrite_weights_on_train_end: Whether to overwrite the current model
weights with the averaged weights from the moving average optimizer.
**kwargs: Any additional callback arguments.
"""
def __init__(self,
overwrite_weights_on_train_end: bool = False,
**kwargs):
super(MovingAverageCallback, self).__init__(**kwargs)
self.overwrite_weights_on_train_end = overwrite_weights_on_train_end
def set_model(self, model: tf.keras.Model):
super(MovingAverageCallback, self).set_model(model)
assert isinstance(self.model.optimizer,
optimizer_factory.MovingAverage)
self.model.optimizer.shadow_copy(self.model)
def on_test_begin(self, logs: MutableMapping[Text, Any] = None):
self.model.optimizer.swap_weights()
def on_test_end(self, logs: MutableMapping[Text, Any] = None):
self.model.optimizer.swap_weights()
def on_train_end(self, logs: MutableMapping[Text, Any] = None):
if self.overwrite_weights_on_train_end:
self.model.optimizer.assign_average_vars(self.model.variables)
class AverageModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
"""Saves and, optionally, assigns the averaged weights.
Taken from tfa.callbacks.AverageModelCheckpoint.
Attributes:
update_weights: If True, assign the moving average weights
to the model, and save them. If False, keep the old
non-averaged weights, but the saved model uses the
average weights.
See `tf.keras.callbacks.ModelCheckpoint` for the other args.
"""
def __init__(
self,
update_weights: bool,
filepath: str,
monitor: str = 'val_loss',
verbose: int = 0,
save_best_only: bool = False,
save_weights_only: bool = False,
mode: str = 'auto',
save_freq: str = 'epoch',
**kwargs):
self.update_weights = update_weights
super().__init__(
filepath,
monitor,
verbose,
save_best_only,
save_weights_only,
mode,
save_freq,
**kwargs)
def set_model(self, model):
if not isinstance(model.optimizer, optimizer_factory.MovingAverage):
raise TypeError(
'AverageModelCheckpoint is only used when training '
'with MovingAverage')
return super().set_model(model)
def _save_model(self, epoch, logs):
assert isinstance(self.model.optimizer, optimizer_factory.MovingAverage)
if self.update_weights:
self.model.optimizer.assign_average_vars(self.model.variables)
return super()._save_model(epoch, logs)
else:
# Note: `model.get_weights()` gives us the weights (non-ref)
# whereas `model.variables` returns references to the variables.
non_avg_weights = self.model.get_weights()
self.model.optimizer.assign_average_vars(self.model.variables)
# result is currently None, since `super._save_model` doesn't
# return anything, but this may change in the future.
result = super()._save_model(epoch, logs)
self.model.set_weights(non_avg_weights)
return result
class BatchTimestamp(object):
"""A structure to store batch time stamp."""
def __init__(self, batch_index, timestamp):
self.batch_index = batch_index
self.timestamp = timestamp
def __repr__(self):
return "'BatchTimestamp<batch_index: {}, timestamp: {}>'".format(
self.batch_index, self.timestamp)
class TimeHistory(tf.keras.callbacks.Callback):
"""Callback for Keras models."""
def __init__(self, batch_size, log_steps, logger, logdir=None):
"""Callback for logging performance.
Args:
batch_size: Total batch size.
log_steps: Interval of steps between logging of batch level stats.
logger: Logger object used to report timing and throughput statistics.
logdir: Optional directory to write TensorBoard summaries.
"""
# TODO(wcromar): remove this parameter and rely on `logs` parameter of
# on_train_batch_end()
self.batch_size = batch_size
super(TimeHistory, self).__init__()
self.log_steps = log_steps
self.last_log_step = 0
self.steps_before_epoch = 0
self.steps_in_epoch = 0
self.start_time = None
self.logger = logger
self.step_per_epoch = 0
if logdir:
self.summary_writer = tf.summary.create_file_writer(logdir)
else:
self.summary_writer = None
# Logs start of step 1 then end of each step based on log_steps interval.
self.timestamp_log = []
# Records the time each epoch takes to run from start to finish of epoch.
self.epoch_runtime_log = []
self.throughput = []
@property
def global_steps(self):
"""The current 1-indexed global step."""
return self.steps_before_epoch + self.steps_in_epoch
@property
def average_steps_per_second(self):
"""The average training steps per second across all epochs."""
return (self.global_steps - self.step_per_epoch) / sum(self.epoch_runtime_log[1:])
@property
def average_examples_per_second(self):
"""The average number of training examples per second across all epochs."""
# return self.average_steps_per_second * self.batch_size
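# Skip the first 10% of recorded throughputs so warmup iterations do not skew the average.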
ind = int(0.1*len(self.throughput))
return sum(self.throughput[ind:])/(len(self.throughput[ind:])+1)
def on_train_end(self, logs=None):
self.train_finish_time = time.time()
if self.summary_writer:
self.summary_writer.flush()
def on_epoch_begin(self, epoch, logs=None):
self.epoch_start = time.time()
def on_batch_begin(self, batch, logs=None):
if not self.start_time:
self.start_time = time.time()
# Record the timestamp of the first global step
if not self.timestamp_log:
self.timestamp_log.append(BatchTimestamp(self.global_steps,
self.start_time))
def on_batch_end(self, batch, logs=None):
"""Records elapse time of the batch and calculates examples per second."""
self.steps_in_epoch = batch + 1
steps_since_last_log = self.global_steps - self.last_log_step
if steps_since_last_log >= self.log_steps:
now = time.time()
elapsed_time = now - self.start_time
steps_per_second = steps_since_last_log / elapsed_time
examples_per_second = steps_per_second * self.batch_size
self.timestamp_log.append(BatchTimestamp(self.global_steps, now))
elapsed_time_str='{:.2f} seconds'.format(elapsed_time)
self.logger.log(step='PARAMETER', data={'TimeHistory': elapsed_time_str, 'examples/second': examples_per_second, 'steps': (self.last_log_step, self.global_steps)})
if self.summary_writer:
with self.summary_writer.as_default():
tf.summary.scalar('global_step/sec', steps_per_second,
self.global_steps)
tf.summary.scalar('examples/sec', examples_per_second,
self.global_steps)
self.last_log_step = self.global_steps
self.start_time = None
self.throughput.append(examples_per_second)
def on_epoch_end(self, epoch, logs=None):
if epoch == 0:
self.step_per_epoch = self.steps_in_epoch
epoch_run_time = time.time() - self.epoch_start
self.epoch_runtime_log.append(epoch_run_time)
self.steps_before_epoch += self.steps_in_epoch
self.steps_in_epoch = 0
class EvalTimeHistory(tf.keras.callbacks.Callback):
"""Callback for Keras models."""
def __init__(self, batch_size, logger, logdir=None):
"""Callback for logging performance.
Args:
batch_size: Total batch size.
logger: Logger object used to report evaluation statistics.
logdir: Optional directory to write TensorBoard summaries.
"""
# TODO(wcromar): remove this parameter and rely on `logs` parameter of
# on_train_batch_end()
self.batch_size = batch_size
self.global_steps = 0
self.batch_time = []
self.eval_time = 0
super(EvalTimeHistory, self).__init__()
self.logger = logger
@property
def average_steps_per_second(self):
"""The average training steps per second across all epochs."""
return (self.global_steps - 1) / self.eval_time
@property
def average_examples_per_second(self):
"""The average number of training examples per second across all epochs."""
return self.average_steps_per_second * self.batch_size
def on_test_batch_end(self, batch, logs=None):
self.global_steps += 1
self.batch_time.append(time.time() - self.test_begin)
def on_test_batch_begin(self, epoch, logs=None):
self.test_begin = time.time()
def on_test_end(self, epoch, logs=None):
self.eval_time = sum(self.batch_time) - self.batch_time[0]

View file

@@ -0,0 +1,371 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import yaml
def _add_bool_argument(parser, name=None, default=False, required=False, help=None):
if not isinstance(default, bool):
raise ValueError()
feature_parser = parser.add_mutually_exclusive_group(required=required)
feature_parser.add_argument('--' + name, dest=name, action='store_true', help=help, default=default)
feature_parser.add_argument('--no' + name, dest=name, action='store_false')
feature_parser.set_defaults(**{name: default})  # set the default for the named flag itself
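# Example (sketch; the flag name is illustrative): _add_bool_argument(p, 'use_ema', default=False)
# registers both --use_ema and --nouse_ema on the parser.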
def parse_cmdline():
p = argparse.ArgumentParser(description="JoC-RN50v1.5-TF")
# ====== Define the common flags across models. ======
p.add_argument(
'--model_dir',
type=str,
default=None,
help=('The directory where the model and training/evaluation summaries'
'are stored.'))
p.add_argument(
'--config_file',
type=str,
default=None,
help=('A YAML file which specifies overrides. Note that this file can be '
'used as an override template to override the default parameters '
'specified in Python. If the same parameter is specified in both '
'`--config_file` and `--params_override`, the one in '
'`--params_override` will be used finally.'))
p.add_argument(
'--params_override',
type=str,
default=None,
help=('a YAML/JSON string or a YAML file which specifies additional '
'overrides over the default parameters and those specified in '
'`--config_file`. Note that this is supposed to be used only to '
'override the model parameters, but not the parameters like TPU '
'specific flags. One canonical use case of `--config_file` and '
'`--params_override` is users first define a template config file '
'using `--config_file`, then use `--params_override` to adjust the '
'minimal set of tuning parameters, for example setting up different'
' `train_batch_size`. '
'The final override order of parameters: default_model_params --> '
'params from config_file --> params in params_override.'
'See also the help message of `--config_file`.'))
p.add_argument(
'--save_checkpoint_freq',
type=int,
default=1,
help='Number of epochs to save checkpoint.')
p.add_argument(
'--data_dir',
type=str,
default='.',
required=True,
help='The location of the input data. Files should be named `train-*` and `validation-*`.')
p.add_argument(
'--mode',
type=str,
default='train_and_eval',
required=False,
help='Mode to run: `train`, `eval`, `train_and_eval` or `export`.')
p.add_argument(
'--arch',
type=str,
default='efficientnet-b0',
required=False,
help='The type of the model, e.g. EfficientNet, etc.')
p.add_argument(
'--dataset',
type=str,
default='ImageNet',
required=False,
help='The name of the dataset, e.g. ImageNet, etc.')
p.add_argument(
'--log_steps',
type=int,
default=100,
help='The interval of steps between logging of batch level stats.')
p.add_argument(
'--time_history',
action='store_true',
default=True,
help='Log the time taken by training steps.')
p.add_argument(
'--use_xla',
action='store_true',
default=False,
help='Set to True to enable XLA')
p.add_argument(
'--use_amp',
action='store_true',
default=False,
help='Set to True to enable AMP')
p.add_argument(
'--intraop_threads',
type=str,
default='',
help='intra thread should match the number of CPU cores')
p.add_argument(
'--interop_threads',
type=str,
default='',
help='inter thread should match the number of CPU sockets')
p.add_argument(
'--export_dir', required=False, default=None, type=str, help="Directory in which to write exported SavedModel."
)
p.add_argument(
'--results_dir',
type=str,
required=False,
default='.',
help="Directory in which to write training logs, summaries and checkpoints."
)
p.add_argument(
'--inference_checkpoint',
type=str,
required=False,
default=None,
help="Path to checkpoint to do inference on."
)
p.add_argument(
'--to_predict',
type=str,
required=False,
default=None,
help="Path to image to do inference on."
)
p.add_argument(
'--log_filename',
type=str,
required=False,
default='log.json',
help="Name of the JSON file to which write the training log"
)
p.add_argument(
'--display_every',
default=10,
type=int,
required=False,
help="How often (in batches) to print out running information."
)
#model_params:
p.add_argument(
'--num_classes', type=int, default=1000, required=False, help="Number of classes to train on.")
p.add_argument(
'--batch_norm', type=str, default='default', required=False, help="Type of Batch norm used.")
p.add_argument(
'--activation', type=str, default='swish', required=False, help="Type of activation to be used.")
#optimizer:
p.add_argument(
'--optimizer', type=str, default='rmsprop', required=False, help="Optimizer to be used.")
p.add_argument(
'--momentum', type=float, default=0.9, required=False, help="The value of Momentum.")
p.add_argument(
'--epsilon', type=float, default=0.001, required=False, help="The value of Epsilon for optimizer.")
p.add_argument(
'--decay', type=float, default=0.9, required=False, help="The value of decay.")
p.add_argument(
'--moving_average_decay', type=float, default=0.0, required=False, help="The value of moving average.")
p.add_argument(
'--lookahead', action='store_true', default=False, required=False, help="Lookahead.")
p.add_argument(
'--nesterov', action='store_true', default=False, required=False, help="nesterov bool.")
p.add_argument(
'--beta_1', type=float, default=0.0, required=False, help="beta1 for Adam/AdamW.")
p.add_argument(
'--beta_2', type=float, default=0.0, required=False, help="beta2 for Adam/AdamW.")
#loss:
p.add_argument(
'--label_smoothing', type=float, default=0.1, required=False, help="The value of label smoothing.")
p.add_argument(
'--mixup_alpha', type=float, default=0.0, required=False, help="Mix up alpha")
# Training specific params
p.add_argument(
'--max_epochs',
default=300,
type=int,
required=False,
help="Number of steps of training."
)
p.add_argument(
'--num_epochs_between_eval',
type=int,
default=1,
required=False,
help="Eval after how many steps of training.")
p.add_argument(
'--steps_per_epoch',
default=None,
type=int,
required=False,
help="Number of steps of training."
)
# LR Params
p.add_argument(
'--warmup_epochs',
default=5,
type=int,
required=False,
help="Number of steps considered as warmup and not taken into account for performance measurements."
)
p.add_argument(
'--lr_init', default=0.008, type=float, required=False, help="Initial value for the learning rate."
)
p.add_argument(
'--lr_decay', type=str, default='exponential', required=False, help="Type of LR Decay.")
p.add_argument('--lr_decay_rate', default=0.97, type=float, required=False, help="LR Decay rate.")
p.add_argument('--lr_decay_epochs', default=2.4, type=float, required=False, help="LR Decay epoch.")
p.add_argument(
'--lr_warmup_epochs',
default=5,
type=int,
required=False,
help="Number of warmup epochs for learning rate schedule."
)
p.add_argument('--weight_decay', default=5e-6, type=float, required=False, help="Weight Decay scale factor.")
p.add_argument(
'--weight_init',
default='fan_out',
choices=['fan_in', 'fan_out'],
type=str,
required=False,
help="Model weight initialization method."
)
p.add_argument(
'--train_num_examples', type=int, default=1281167, required=False, help="Training number of examples.")
p.add_argument(
'--train_batch_size', type=int, default=32, required=False, help="Training batch size per GPU.")
p.add_argument(
'--augmenter_name', type=str, default='autoaugment', required=False, help="Type of augmentation applied during preprocessing (training only).")
#Rand-augment params
p.add_argument(
'--num_layers', type=int, default=None, required=False, help="Rand Augmentation parameter.")
p.add_argument(
'--magnitude', type=float, default=None, required=False, help="Rand Augmentation parameter.")
p.add_argument(
'--cutout_const', type=float, default=None, required=False, help="Rand/Auto Augmentation parameter.")
p.add_argument(
'--translate_const', type=float, default=None, required=False, help="Rand/Auto Augmentation parameter.")
#Auto-augment params
p.add_argument(
'--autoaugmentation_name', type=str, default=None, required=False, help="Auto-Augmentation parameter.")
#evaluation:
# Tensor format used for the computation.
p.add_argument(
'--data_format', choices=['NHWC', 'NCHW'], type=str, default='NCHW', required=False, help=argparse.SUPPRESS
)
# validation_dataset:
p.add_argument(
'--eval_num_examples', type=int, default=50000, required=False, help="Evaluation number of examples")
p.add_argument(
'--eval_batch_size', type=int, default=32, required=False, help="Evaluation batch size per GPU.")
p.add_argument(
'--predict_batch_size', type=int, default=32, required=False, help="Predict batch size per GPU.")
p.add_argument(
'--skip_eval', action='store_true', default=False, required=False, help="Skip eval during training.")
p.add_argument(
'--resume_checkpoint', action='store_true', default=False, required=False, help="Resume from a checkpoint in the model_dir.")
p.add_argument('--use_dali', action='store_true', default=False,
help='Use dali for data loading and preprocessing of train dataset.')
p.add_argument('--use_dali_eval', action='store_true', default=False,
help='Use dali for data loading and preprocessing of eval dataset.')
p.add_argument(
'--index_file', type=str, default='', required=False,
help="Path to index file required for dali.")
p.add_argument('--benchmark', action='store_true', default=False, required=False, help="Benchmarking or not")
# Callbacks options
p.add_argument(
'--enable_checkpoint_and_export', action='store_true', default=True, required=False, help="Enable checkpointing and export of the trained model.")
p.add_argument(
'--enable_tensorboard', action='store_true', default=False, required=False, help="Enable Tensorboard logging.")
p.add_argument(
'--write_model_weights', action='store_true', default=False, required=False, help="Whether to write model weights to visualize as images in TensorBoard.")
p.add_argument('--seed', type=int, default=None, required=False, help="Random seed.")
p.add_argument('--dtype', type=str, default='float32', required=False, help="Data type to use. Permitted values: `float32`, `bfloat16`, `float16`, `fp32`, `bf16`.")
p.add_argument('--run_eagerly', action='store_true', default=False, required=False, help="Run the model eagerly.")
FLAGS, unknown_args = p.parse_known_args()
if len(unknown_args) > 0:
for bad_arg in unknown_args:
print("ERROR: Unknown command line arg: %s" % bad_arg)
raise ValueError("Invalid command line arg(s)")
return FLAGS
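# Example invocation (sketch; the entry-point script name and flag values are illustrative):
# python main.py --mode train_and_eval --arch efficientnet-b0 --data_dir /data/tfrecords \
#     --train_batch_size 64 --use_amp --use_xla --max_epochs 300 --model_dir /results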

View file

@@ -0,0 +1,24 @@
#!/bin/bash
SRC_DIR=${1}
DST_DIR=${2}
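# Usage sketch (script name and paths are illustrative): ./make_index.sh <tfrecord_dir> <index_output_dir>
# tfrecord2idx is the index-creation tool shipped with NVIDIA DALI.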
echo "Creating training file indexes"
mkdir -p ${DST_DIR}
for file in ${SRC_DIR}/train-*; do
BASENAME=$(basename $file)
DST_NAME=$DST_DIR/$BASENAME
echo "Creating index $DST_NAME for $file"
tfrecord2idx $file $DST_NAME
done
echo "Creating validation file indexes"
for file in ${SRC_DIR}/validation-*; do
BASENAME=$(basename $file)
DST_NAME=$DST_DIR/$BASENAME
echo "Creating index $DST_NAME for $file"
tfrecord2idx $file $DST_NAME
done

View file

@@ -0,0 +1,387 @@
# Lint as: python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dataset utilities for vision tasks using TFDS and tf.data.Dataset."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
from typing import Any, List, Optional, Tuple, Mapping, Union
import functools
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from utils import augment, preprocessing, Dali
import horovod.tensorflow.keras as hvd
import nvidia.dali.plugin.tf as dali_tf
AUGMENTERS = {
'autoaugment': augment.AutoAugment,
'randaugment': augment.RandAugment,
}
class Dataset:
"""An object for building datasets.
Allows building various pipelines fetching examples, preprocessing, etc.
Maintains additional state information calculated from the dataset, i.e.,
training set split, batch size, and number of steps (batches).
"""
def __init__(self,
data_dir,
index_file_dir,
split='train',
num_classes=None,
image_size=224,
num_channels=3,
batch_size=128,
dtype='float32',
one_hot=False,
use_dali=False,
augmenter=None,
shuffle_buffer_size=10000,
file_shuffle_buffer_size=1024,
cache=False,
mean_subtract=False,
standardize=False,
augmenter_params=None,
mixup_alpha=0.0):
"""Initialize the builder from the config."""
if not os.path.exists(data_dir):
raise FileNotFoundError('Cannot find data dir: {}'.format(data_dir))
if one_hot and num_classes is None:
raise ValueError('Number of classes is required for one_hot')
self._data_dir = data_dir
self._split = split
self._image_size = image_size
self._num_classes = num_classes
self._num_channels = num_channels
self._batch_size = batch_size
self._dtype = dtype
self._one_hot = one_hot
self._augmenter_name = augmenter
self._shuffle_buffer_size = shuffle_buffer_size
self._file_shuffle_buffer_size = file_shuffle_buffer_size
self._cache = cache
self._mean_subtract = mean_subtract
self._standardize = standardize
self._index_file = index_file_dir
self._use_dali = use_dali
self.mixup_alpha = mixup_alpha
self._num_gpus = hvd.size()
if self._augmenter_name is not None:
augmenter = AUGMENTERS.get(self._augmenter_name, None)
params = augmenter_params or {}
self._augmenter = augmenter(**params) if augmenter is not None else None
else:
self._augmenter = None
def mixup(self, batch_size, alpha, images, labels):
"""Applies Mixup regularization to a batch of images and labels.
[1] Hongyi Zhang, Moustapha Cisse, Yann N. Dauphin, David Lopez-Paz
Mixup: Beyond Empirical Risk Minimization.
ICLR'18, https://arxiv.org/abs/1710.09412
Arguments:
batch_size: The input batch size for images and labels.
alpha: Float that controls the strength of Mixup regularization.
images: A batch of images of shape [batch_size, ...]
labels: A batch of labels of shape [batch_size, num_classes]
Returns:
A tuple of (images, labels) with the same dimensions as the input with
Mixup regularization applied.
"""
# Mixup of images will be performed on device later
if alpha == 0.0:
images_mix_weight = tf.ones([batch_size, 1, 1, 1])
return (images, images_mix_weight), labels
mix_weight = tf.compat.v1.distributions.Beta(alpha, alpha).sample([batch_size, 1])
mix_weight = tf.maximum(mix_weight, 1. - mix_weight)
images_mix_weight = tf.reshape(mix_weight, [batch_size, 1, 1, 1])
# Mixup on a single batch is implemented by taking a weighted sum with the
# same batch in reverse.
labels_mix = labels * mix_weight + labels[::-1] * (1. - mix_weight)
return (images, images_mix_weight), labels_mix
@property
def is_training(self) -> bool:
"""Whether this is the training set."""
return self._split == 'train'
@property
def global_batch_size(self) -> int:
"""The batch size, multiplied by the number of replicas (if configured)."""
return self._batch_size * self._num_gpus
@property
def local_batch_size(self):
"""The base unscaled batch size."""
return self._batch_size
@property
def dtype(self) -> tf.dtypes.DType:
"""Converts the config's dtype string to a tf dtype.
Returns:
A mapping from string representation of a dtype to the `tf.dtypes.DType`.
Raises:
ValueError if the config's dtype is not supported.
"""
dtype_map = {
'float32': tf.float32,
'bfloat16': tf.bfloat16,
'float16': tf.float16,
'fp32': tf.float32,
'bf16': tf.bfloat16,
}
try:
return dtype_map[self._dtype]
except KeyError:
raise ValueError('Invalid dtype provided: {}. Supported dtypes: {}'.format(
self._dtype, list(dtype_map.keys())))
@property
def image_size(self) -> int:
"""The size of each image (can be inferred from the dataset)."""
return int(self._image_size)
@property
def num_channels(self) -> int:
"""The number of image channels (can be inferred from the dataset)."""
return int(self._num_channels)
@property
def num_classes(self) -> int:
"""The number of classes (can be inferred from the dataset)."""
return int(self._num_classes)
@property
def num_steps(self) -> int:
"""The number of classes (can be inferred from the dataset)."""
return int(self._num_steps)
def build(self) -> tf.data.Dataset:
"""Construct a dataset end-to-end and return it.
Returns:
A TensorFlow dataset outputting batched images and labels.
"""
if self._use_dali:
print("Using dali for {train} dataloading".format(train = "training" if self.is_training else "validation"))
tfrec_filenames = sorted(tf.io.gfile.glob(os.path.join(self._data_dir, '%s-*' % self._split)))
tfrec_idx_filenames = sorted(tf.io.gfile.glob(os.path.join(self._index_file, '%s-*' % self._split)))
# # Create pipeline
dali_pipeline = Dali.DaliPipeline(tfrec_filenames=tfrec_filenames,
tfrec_idx_filenames=tfrec_idx_filenames,
height=self._image_size,
width=self._image_size,
batch_size=self.local_batch_size,
num_threads=1,
device_id=hvd.local_rank(),
shard_id=hvd.rank(),
num_gpus=hvd.size(),
num_classes=self.num_classes,
deterministic=False,
dali_cpu=False,
training=self.is_training)
# Define shapes and types of the outputs
shapes = (
(self.local_batch_size, self._image_size, self._image_size, 3),
(self.local_batch_size, self._num_classes))
dtypes = (
tf.float32,
tf.float32)
# Create dataset
dataset = dali_tf.DALIDataset(
pipeline=dali_pipeline,
batch_size=self.local_batch_size,
output_shapes=shapes,
output_dtypes=dtypes,
device_id=hvd.local_rank())
# if self.is_training and self._augmenter:
# print('Augmenting with {}'.format(self._augmenter))
# dataset.unbatch().map(self.augment_pipeline, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(self.local_batch_size)
return dataset
else:
print("Using tf native pipeline for {train} dataloading".format(train = "training" if self.is_training else "validation"))
dataset = self.load_records()
dataset = self.pipeline(dataset)
return dataset
# def augment_pipeline(self, image, label) -> Tuple[tf.Tensor, tf.Tensor]:
# image = self._augmenter.distort(image)
# return image, label
def load_records(self) -> tf.data.Dataset:
"""Return a dataset loading files with TFRecords."""
if self._data_dir is None:
raise ValueError('Dataset must specify a path for the data files.')
file_pattern = os.path.join(self._data_dir,
'{}*'.format(self._split))
dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
return dataset
def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
"""Build a pipeline fetching, shuffling, and preprocessing the dataset.
Args:
dataset: A `tf.data.Dataset` that loads raw files.
Returns:
A TensorFlow dataset outputting batched images and labels.
"""
if self._num_gpus > 1:
dataset = dataset.shard(self._num_gpus, hvd.rank())
if self.is_training:
# Shuffle the input files.
dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)
if self.is_training and not self._cache:
dataset = dataset.repeat()
# Read the data from disk in parallel
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=10,
block_length=1,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if self._cache:
dataset = dataset.cache()
if self.is_training:
dataset = dataset.shuffle(self._shuffle_buffer_size)
dataset = dataset.repeat()
# Parse, pre-process, and batch the data in parallel
preprocess = self.parse_record
dataset = dataset.map(preprocess,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if self._num_gpus > 1:
# The batch size of the dataset will be multiplied by the number of
# replicas automatically when strategy.distribute_datasets_from_function
# is called, so we use local batch size here.
dataset = dataset.batch(self.local_batch_size,
drop_remainder=self.is_training)
else:
dataset = dataset.batch(self.global_batch_size,
drop_remainder=self.is_training)
# Apply Mixup
mixup_alpha = self.mixup_alpha if self.is_training else 0.0
dataset = dataset.map(
functools.partial(self.mixup, self.local_batch_size, mixup_alpha),
num_parallel_calls=64)
# Prefetch overlaps in-feed with training
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
def parse_record(self, record: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""Parse an ImageNet record from a serialized string Tensor."""
keys_to_features = {
'image/encoded':
tf.io.FixedLenFeature((), tf.string, ''),
'image/format':
tf.io.FixedLenFeature((), tf.string, 'jpeg'),
'image/class/label':
tf.io.FixedLenFeature([], tf.int64, -1),
'image/class/text':
tf.io.FixedLenFeature([], tf.string, ''),
'image/object/bbox/xmin':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymin':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/bbox/xmax':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymax':
tf.io.VarLenFeature(dtype=tf.float32),
'image/object/class/label':
tf.io.VarLenFeature(dtype=tf.int64),
}
parsed = tf.io.parse_single_example(record, keys_to_features)
label = tf.reshape(parsed['image/class/label'], shape=[1])
label = tf.cast(label, dtype=tf.int32)
# Subtract one so that labels are in [0, 1000)
label -= 1
image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
image, label = self.preprocess(image_bytes, label)
return image, label
def preprocess(self, image: tf.Tensor, label: tf.Tensor
) -> Tuple[tf.Tensor, tf.Tensor]:
"""Apply image preprocessing and augmentation to the image and label."""
if self.is_training:
image = preprocessing.preprocess_for_train(
image,
image_size=self._image_size,
mean_subtract=self._mean_subtract,
standardize=self._standardize,
dtype=self.dtype,
augmenter=self._augmenter)
else:
image = preprocessing.preprocess_for_eval(
image,
image_size=self._image_size,
num_channels=self._num_channels,
mean_subtract=self._mean_subtract,
standardize=self._standardize,
dtype=self.dtype)
label = tf.cast(label, tf.int32)
if self._one_hot:
label = tf.one_hot(label, self.num_classes)
label = tf.reshape(label, [self.num_classes])
return image, label
@classmethod
def from_params(cls, *args, **kwargs):
"""Construct a dataset builder from a default config and any overrides."""
config = DatasetConfig.from_args(*args, **kwargs)
return cls(config)
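# Usage sketch (paths and sizes are illustrative):
# builder = Dataset(data_dir='/data/tfrecords', index_file_dir='/data/dali_index',
#                   split='train', num_classes=1000, image_size=224, batch_size=128,
#                   one_hot=True, augmenter='autoaugment')
# train_ds = builder.build()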

View file

@@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import horovod.tensorflow as hvd
__all__ = [
'is_using_hvd',
]
def is_using_hvd():
return hvd.size() > 1

View file

@@ -0,0 +1,130 @@
# Lint as: python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Learning rate utilities for vision tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from typing import Any, List, Mapping
import tensorflow as tf
BASE_LEARNING_RATE = 0.1
__all__ = [ 'WarmupDecaySchedule', 'PiecewiseConstantDecayWithWarmup' ]
@tf.keras.utils.register_keras_serializable(package='Custom')
class WarmupDecaySchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
"""A wrapper for LearningRateSchedule that includes warmup steps."""
def __init__(
self,
lr_schedule: tf.keras.optimizers.schedules.LearningRateSchedule,
warmup_steps: int,
**kwargs):
"""Add warmup decay to a learning rate schedule.
Args:
lr_schedule: base learning rate scheduler
warmup_steps: number of warmup steps
"""
super(WarmupDecaySchedule, self).__init__()
self._lr_schedule = lr_schedule
self._warmup_steps = warmup_steps
def __call__(self, step: int):
lr = self._lr_schedule(step)
if self._warmup_steps:
step_decay = step - self._warmup_steps
lr = self._lr_schedule(step_decay)
initial_learning_rate = tf.convert_to_tensor(
self._lr_schedule.initial_learning_rate, name="initial_learning_rate")
dtype = initial_learning_rate.dtype
global_step_recomp = tf.cast(step, dtype)
warmup_steps = tf.cast(self._warmup_steps, dtype)
warmup_lr = initial_learning_rate * global_step_recomp / warmup_steps
lr = tf.cond(global_step_recomp < warmup_steps,
lambda: warmup_lr,
lambda: lr)
return lr
def get_config(self) -> Mapping[str, Any]:
config = self._lr_schedule.get_config()
config.update({
"warmup_steps": self._warmup_steps,
})
config.update({
"lr_schedule": self._lr_schedule,
})
return config
# TODO(b/149030439) - refactor this with
# tf.keras.optimizers.schedules.PiecewiseConstantDecay + WarmupDecaySchedule.
class PiecewiseConstantDecayWithWarmup(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""Piecewise constant decay with warmup schedule."""
def __init__(self,
batch_size: int,
epoch_size: int,
warmup_epochs: int,
boundaries: List[int],
multipliers: List[float]):
"""Piecewise constant decay with warmup.
Args:
batch_size: The training batch size used in the experiment.
epoch_size: The size of an epoch, or the number of examples in an epoch.
warmup_epochs: The number of warmup epochs to apply.
boundaries: The list of epoch boundaries, with strictly increasing entries.
multipliers: The list of multipliers/learning rates to use for the
piecewise portion. The length must be one more than that of boundaries.
"""
super(PiecewiseConstantDecayWithWarmup, self).__init__()
if len(boundaries) != len(multipliers) - 1:
raise ValueError("The length of boundaries must be 1 less than the "
"length of multipliers")
base_lr_batch_size = 256
steps_per_epoch = epoch_size // batch_size
self._rescaled_lr = BASE_LEARNING_RATE * batch_size / base_lr_batch_size
self._step_boundaries = [float(steps_per_epoch) * x for x in boundaries]
self._lr_values = [self._rescaled_lr * m for m in multipliers]
self._warmup_steps = warmup_epochs * steps_per_epoch
def __call__(self, step: int):
"""Compute learning rate at given step."""
def warmup_lr():
return self._rescaled_lr * (
step / tf.cast(self._warmup_steps, tf.float32))
def piecewise_lr():
return tf.compat.v1.train.piecewise_constant(
tf.cast(step, tf.float32), self._step_boundaries, self._lr_values)
return tf.cond(step < self._warmup_steps, warmup_lr, piecewise_lr)
def get_config(self) -> Mapping[str, Any]:
return {
"rescaled_lr": self._rescaled_lr,
"step_boundaries": self._step_boundaries,
"lr_values": self._lr_values,
"warmup_steps": self._warmup_steps,
}
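# Usage sketch (values are illustrative): linear warmup for 5 epochs, then a
# piecewise-constant schedule; note len(multipliers) == len(boundaries) + 1.
# schedule = PiecewiseConstantDecayWithWarmup(batch_size=256, epoch_size=1281167,
#                                             warmup_epochs=5, boundaries=[30, 60, 80],
#                                             multipliers=[1.0, 0.1, 0.01, 0.001])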

View file

@@ -0,0 +1,377 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizer factory for vision tasks."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
import tensorflow_addons as tfa
from typing import Any, Dict, Text, List
from tensorflow import keras
# pylint: disable=protected-access
from utils import learning_rate
class MovingAverage(tf.keras.optimizers.Optimizer):
"""Optimizer that computes a moving average of the variables.
Empirically it has been found that using the moving average of the trained
parameters of a deep network is better than using its trained parameters
directly. This optimizer allows you to compute this moving average and swap
the variables at save time so that any code outside of the training loop
will use by default the average values instead of the original ones.
Example of usage for training:
```python
opt = tf.keras.optimizers.SGD(learning_rate)
opt = MovingAverage(opt)
opt.shadow_copy(model)
```
At test time, swap the shadow variables to evaluate on the averaged weights:
```python
opt.swap_weights()
# Test eval the model here
opt.swap_weights()
```
"""
def __init__(self,
optimizer: tf.keras.optimizers.Optimizer,
average_decay: float = 0.99,
start_step: int = 0,
dynamic_decay: bool = True,
name: Text = 'moving_average',
**kwargs):
"""Construct a new MovingAverage optimizer.
Args:
optimizer: `tf.keras.optimizers.Optimizer` that will be
used to compute and apply gradients.
average_decay: float. Decay to use to maintain the moving averages
of trained variables.
start_step: int. What step to start the moving average.
dynamic_decay: bool. Whether to change the decay based on the number
of optimizer updates. Decay will start at 0.1 and gradually increase
up to `average_decay` after each optimizer update. This behavior is
similar to `tf.train.ExponentialMovingAverage` in TF 1.x.
name: Optional name for the operations created when applying
gradients. Defaults to "moving_average".
**kwargs: keyword arguments. Allowed to be {`clipnorm`,
`clipvalue`, `lr`, `decay`}.
"""
super(MovingAverage, self).__init__(name, **kwargs)
self._optimizer = optimizer
self._average_decay = average_decay
self._start_step = tf.constant(start_step, tf.float32)
self._dynamic_decay = dynamic_decay
# Populated by shadow_copy(); initialized to None so has_shadow_copy is well defined
# before the shadow variables exist.
self._average_weights = None
self._model_weights = None
def shadow_copy(self, model: tf.keras.Model):
"""Creates shadow variables for the given model weights."""
for var in model.weights:
self.add_slot(var, 'average', initializer='zeros')
self._average_weights = [
self.get_slot(var, 'average') for var in model.weights
]
self._model_weights = model.weights
@property
def has_shadow_copy(self):
"""Whether this optimizer has created shadow variables."""
return self._model_weights is not None
def _create_slots(self, var_list):
self._optimizer._create_slots(var_list=var_list) # pylint: disable=protected-access
def apply_gradients(self, grads_and_vars, name: Text = None):
result = self._optimizer.apply_gradients(grads_and_vars, name)
self.update_average(self._optimizer.iterations)
return result
@tf.function
def update_average(self, step: tf.Tensor):
step = tf.cast(step, tf.float32)
if step < self._start_step:
decay = tf.constant(0., tf.float32)
elif self._dynamic_decay:
decay = step - self._start_step
decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay))
else:
decay = self._average_decay
def _apply_moving(v_moving, v_normal):
diff = v_moving - v_normal
v_moving.assign_sub(tf.cast(1. - decay, v_moving.dtype) * diff)
return v_moving
def _update(strategy, v_moving_and_v_normal):
for v_moving, v_normal in v_moving_and_v_normal:
strategy.extended.update(v_moving, _apply_moving, args=(v_normal,))
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(_update, args=(zip(self._average_weights,
self._model_weights),))
def swap_weights(self):
"""Swap the average and moving weights.
This is a convenience method to allow one to evaluate the averaged weights
at test time. Loads the weights stored in `self._average` into the model,
keeping a copy of the original model weights. Swapping twice will return
the original weights.
"""
if tf.distribute.in_cross_replica_context():
strategy = tf.distribute.get_strategy()
strategy.run(self._swap_weights, args=())
else:
raise ValueError('Swapping weights must occur under a '
'tf.distribute.Strategy')
@tf.function
def _swap_weights(self):
def fn_0(a, b):
a.assign_add(b)
return a
def fn_1(b, a):
b.assign(a - b)
return b
def fn_2(a, b):
a.assign_sub(b)
return a
def swap(strategy, a_and_b):
"""Swap `a` and `b` and mirror to all devices."""
for a, b in a_and_b:
strategy.extended.update(a, fn_0, args=(b,)) # a = a + b
strategy.extended.update(b, fn_1, args=(a,)) # b = a - b
strategy.extended.update(a, fn_2, args=(b,)) # a = a - b
ctx = tf.distribute.get_replica_context()
return ctx.merge_call(
swap, args=(zip(self._average_weights, self._model_weights),))
def assign_average_vars(self, var_list: List[tf.Variable]):
"""Assign variables in var_list with their respective averages.
Args:
var_list: List of model variables to be assigned to their average.
Returns:
assign_op: The op corresponding to the assignment operation of
variables to their average.
"""
assign_op = tf.group([
var.assign(self.get_slot(var, 'average')) for var in var_list
if var.trainable
])
return assign_op
def _create_hypers(self):
self._optimizer._create_hypers() # pylint: disable=protected-access
def _prepare(self, var_list):
return self._optimizer._prepare(var_list=var_list) # pylint: disable=protected-access
@property
def iterations(self):
return self._optimizer.iterations
@iterations.setter
def iterations(self, variable):
self._optimizer.iterations = variable
@property
def weights(self):
# return self._weights + self._optimizer.weights
return self._optimizer.weights
@property
def lr(self):
return self._optimizer._get_hyper('learning_rate')
@lr.setter
def lr(self, lr):
self._optimizer._set_hyper('learning_rate', lr)
@property
def learning_rate(self):
return self._optimizer._get_hyper('learning_rate')
@learning_rate.setter
def learning_rate(self, learning_rate): # pylint: disable=redefined-outer-name
self._optimizer._set_hyper('learning_rate', learning_rate)
def _resource_apply_dense(self, grad, var):
return self._optimizer._resource_apply_dense(grad, var)
def _resource_apply_sparse(self, grad, var, indices):
return self._optimizer._resource_apply_sparse(grad, var, indices)
def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
return self._optimizer._resource_apply_sparse_duplicate_indices(
grad, var, indices)
def get_config(self):
config = {
'optimizer': tf.keras.optimizers.serialize(self._optimizer),
'average_decay': self._average_decay,
'start_step': self._start_step,
'dynamic_decay': self._dynamic_decay,
}
base_config = super(MovingAverage, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@classmethod
def from_config(cls, config, custom_objects=None):
optimizer = tf.keras.optimizers.deserialize(
config.pop('optimizer'),
custom_objects=custom_objects,
)
return cls(optimizer, **config)
def build_optimizer(
optimizer_name: Text,
base_learning_rate: tf.keras.optimizers.schedules.LearningRateSchedule,
params: Dict[Text, Any]):
"""Build the optimizer based on name.
Args:
optimizer_name: String representation of the optimizer name. Examples:
sgd, momentum, rmsprop.
base_learning_rate: `tf.keras.optimizers.schedules.LearningRateSchedule`
base learning rate.
params: String -> Any dictionary representing the optimizer params.
      This should contain optimizer-specific parameters such as
      `momentum`, `decay`, `epsilon`, etc.
Returns:
A tf.keras.Optimizer.
Raises:
ValueError if the provided optimizer_name is not supported.
"""
optimizer_name = optimizer_name.lower()
if optimizer_name == 'sgd':
nesterov = params.get('nesterov', False)
optimizer = tf.keras.optimizers.SGD(learning_rate=base_learning_rate,
nesterov=nesterov)
elif optimizer_name == 'momentum':
nesterov = params.get('nesterov', False)
optimizer = tf.keras.optimizers.SGD(learning_rate=base_learning_rate,
momentum=params['momentum'],
nesterov=nesterov)
elif optimizer_name == 'rmsprop':
rho = params.get('decay', None) or params.get('rho', 0.9)
momentum = params.get('momentum', 0.9)
epsilon = params.get('epsilon', 1e-07)
optimizer = tf.keras.optimizers.RMSprop(learning_rate=base_learning_rate,
rho=rho,
momentum=momentum,
epsilon=epsilon)
elif optimizer_name == 'adam':
beta_1 = params.get('beta_1', 0.9)
beta_2 = params.get('beta_2', 0.999)
epsilon = params.get('epsilon', 1e-07)
optimizer = tf.keras.optimizers.Adam(learning_rate=base_learning_rate,
beta_1=beta_1,
beta_2=beta_2,
epsilon=epsilon)
elif optimizer_name == 'adamw':
weight_decay = params.get('weight_decay', 0.01)
beta_1 = params.get('beta_1', 0.9)
beta_2 = params.get('beta_2', 0.999)
epsilon = params.get('epsilon', 1e-07)
optimizer = tfa.optimizers.AdamW(weight_decay=weight_decay,
learning_rate=base_learning_rate,
beta_1=beta_1,
beta_2=beta_2,
epsilon=epsilon)
else:
raise ValueError('Unknown optimizer %s' % optimizer_name)
if params.get('lookahead', None):
optimizer = tfa.optimizers.Lookahead(optimizer)
# Moving average should be applied last, as it's applied at test time
moving_average_decay = params.get('moving_average_decay', 0.)
if moving_average_decay is not None and moving_average_decay > 0.:
optimizer = MovingAverage(
optimizer,
average_decay=moving_average_decay)
return optimizer
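# Usage sketch (illustrative values):
#
#   lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
#       initial_learning_rate=0.08, decay_steps=2400, decay_rate=0.97)
#   opt = build_optimizer(
#       'rmsprop', lr_schedule,
#       params={'decay': 0.9, 'momentum': 0.9, 'epsilon': 0.001,
#               'moving_average_decay': 0.9999, 'lookahead': False})
#
# With 'moving_average_decay' > 0 the returned optimizer is a MovingAverage
# wrapper around RMSprop; with 'lookahead': True it would additionally be
# wrapped in tfa.optimizers.Lookahead before the averaging is applied.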
def build_learning_rate(params: Dict[Text, Any],
batch_size: int = None,
train_steps: int = None,
max_epochs: int = None):
"""Build the learning rate given the provided configuration."""
decay_type = params['name']
base_lr = params['initial_lr']
decay_rate = params['decay_rate']
if params['decay_epochs'] is not None:
decay_steps = params['decay_epochs'] * train_steps
else:
decay_steps = 0
if params['warmup_epochs'] is not None:
warmup_steps = params['warmup_epochs'] * train_steps
else:
warmup_steps = 0
lr_multiplier = params['scale_by_batch_size']
if lr_multiplier and lr_multiplier > 0:
# Scale the learning rate based on the batch size and a multiplier
base_lr *= lr_multiplier * batch_size
if decay_type == 'exponential':
lr = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=base_lr,
decay_steps=decay_steps,
decay_rate=decay_rate,
staircase=params['staircase'])
elif decay_type == 'piecewise_constant_with_warmup':
lr = learning_rate.PiecewiseConstantDecayWithWarmup(
batch_size=batch_size,
epoch_size=params['examples_per_epoch'],
warmup_epochs=params['warmup_epochs'],
boundaries=params['boundaries'],
multipliers=params['multipliers'])
elif decay_type == 'cosine':
decay_steps = (max_epochs - params['warmup_epochs']) * train_steps
lr = tf.keras.experimental.CosineDecay(
initial_learning_rate=base_lr,
decay_steps=decay_steps,
alpha=0.0
)
elif decay_type == 'linearcosine':
decay_steps = (max_epochs - params['warmup_epochs']) * train_steps
lr = tf.keras.experimental.NoisyLinearCosineDecay(
initial_learning_rate=base_lr,
decay_steps=decay_steps,
initial_variance=0.5,
variance_decay=0.55,
num_periods=0.5, alpha=0.0, beta=0.001
    )
  else:
    raise ValueError('Unknown decay type %s' % decay_type)
if warmup_steps > 0:
if decay_type != 'piecewise_constant_with_warmup':
lr = learning_rate.WarmupDecaySchedule(lr, warmup_steps)
return lr
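# Usage sketch (illustrative values): a cosine schedule with linear warmup.
# Note that 'decay_rate', 'decay_epochs', 'warmup_epochs' and
# 'scale_by_batch_size' are read unconditionally and must be present (None/0
# disables them); 'staircase' is only needed for the 'exponential' schedule.
#
#   params = {'name': 'cosine', 'initial_lr': 0.005, 'decay_rate': None,
#             'decay_epochs': None, 'warmup_epochs': 5,
#             'scale_by_batch_size': 1.0 / 128.0}
#   lr = build_learning_rate(params, batch_size=256, train_steps=1251,
#                            max_epochs=300)
#
# Here the base rate is scaled to 0.005 * (1 / 128) * 256 = 0.01, the cosine
# decay runs for (300 - 5) * 1251 steps, and the schedule is wrapped in
# learning_rate.WarmupDecaySchedule for the first 5 * 1251 warmup steps.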

View file

@ -0,0 +1,404 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Preprocessing functions for images."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
from typing import List, Optional, Text, Tuple
from utils import augment
# Calculated from the ImageNet training set
MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
IMAGE_SIZE = 224
CROP_PADDING = 32
def mean_image_subtraction(
image_bytes: tf.Tensor,
means: Tuple[float, ...],
num_channels: int = 3,
dtype: tf.dtypes.DType = tf.float32,
) -> tf.Tensor:
"""Subtracts the given means from each image channel.
For example:
means = [123.68, 116.779, 103.939]
image_bytes = mean_image_subtraction(image_bytes, means)
Note that the rank of `image` must be known.
Args:
image_bytes: a tensor of size [height, width, C].
means: a C-vector of values to subtract from each channel.
num_channels: number of color channels in the image that will be distorted.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
Returns:
the centered image.
Raises:
ValueError: If the rank of `image` is unknown, if `image` has a rank other
than three or if the number of channels in `image` doesn't match the
number of values in `means`.
"""
if image_bytes.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
if len(means) != num_channels:
raise ValueError('len(means) must match the number of channels')
# We have a 1-D tensor of means; convert to 3-D.
# Note(b/130245863): we explicitly call `broadcast` instead of simply
# expanding dimensions for better performance.
means = tf.broadcast_to(means, tf.shape(image_bytes))
if dtype is not None:
means = tf.cast(means, dtype=dtype)
return image_bytes - means
def standardize_image(
image_bytes: tf.Tensor,
stddev: Tuple[float, ...],
num_channels: int = 3,
dtype: tf.dtypes.DType = tf.float32,
) -> tf.Tensor:
  """Divides each image channel by the given stddev.
  For example:
    stddev = [58.395, 57.12, 57.375]
    image_bytes = standardize_image(image_bytes, stddev)
Note that the rank of `image` must be known.
Args:
image_bytes: a tensor of size [height, width, C].
stddev: a C-vector of values to divide from each channel.
num_channels: number of color channels in the image that will be distorted.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
Returns:
    the standardized image.
Raises:
ValueError: If the rank of `image` is unknown, if `image` has a rank other
than three or if the number of channels in `image` doesn't match the
number of values in `stddev`.
"""
if image_bytes.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
if len(stddev) != num_channels:
raise ValueError('len(stddev) must match the number of channels')
# We have a 1-D tensor of stddev; convert to 3-D.
# Note(b/130245863): we explicitly call `broadcast` instead of simply
# expanding dimensions for better performance.
stddev = tf.broadcast_to(stddev, tf.shape(image_bytes))
if dtype is not None:
stddev = tf.cast(stddev, dtype=dtype)
return image_bytes / stddev
def normalize_images(features: tf.Tensor,
mean_rgb: Tuple[float, ...] = MEAN_RGB,
stddev_rgb: Tuple[float, ...] = STDDEV_RGB,
num_channels: int = 3,
dtype: tf.dtypes.DType = tf.float32,
data_format: Text = 'channels_last') -> tf.Tensor:
"""Normalizes the input image channels with the given mean and stddev.
Args:
features: `Tensor` representing decoded images in float format.
mean_rgb: the mean of the channels to subtract.
stddev_rgb: the stddev of the channels to divide.
num_channels: the number of channels in the input image tensor.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
data_format: the format of the input image tensor
['channels_first', 'channels_last'].
Returns:
A normalized image `Tensor`.
"""
# TODO(allencwang) - figure out how to use mean_image_subtraction and
# standardize_image on batches of images and replace the following.
if data_format == 'channels_first':
stats_shape = [num_channels, 1, 1]
else:
stats_shape = [1, 1, num_channels]
if dtype is not None:
features = tf.image.convert_image_dtype(features, dtype=dtype)
if mean_rgb is not None:
mean_rgb = tf.constant(mean_rgb,
shape=stats_shape,
dtype=features.dtype)
mean_rgb = tf.broadcast_to(mean_rgb, tf.shape(features))
features = features - mean_rgb
if stddev_rgb is not None:
stddev_rgb = tf.constant(stddev_rgb,
shape=stats_shape,
dtype=features.dtype)
stddev_rgb = tf.broadcast_to(stddev_rgb, tf.shape(features))
features = features / stddev_rgb
return features
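# Worked example (illustrative numbers): with the default ImageNet statistics a
# red-channel value of 150.0 becomes
#   (150.0 - 0.485 * 255) / (0.229 * 255) = 26.325 / 58.395 ≈ 0.45.
#
#   img = tf.random.uniform([224, 224, 3], maxval=255.0)   # channels_last float
#   normalized = normalize_images(img, dtype=None)          # skip dtype conversion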
def decode_and_center_crop(image_bytes: tf.Tensor,
image_size: int = IMAGE_SIZE,
                           crop_padding: int = CROP_PADDING) -> tf.Tensor:
  """Crops to the center of the image with padding, then resizes to image_size.
Args:
image_bytes: `Tensor` representing an image binary of arbitrary size.
image_size: image height/width dimension.
crop_padding: the padding size to use when centering the crop.
Returns:
A decoded and cropped image `Tensor`.
"""
decoded = image_bytes.dtype != tf.string
shape = (tf.shape(image_bytes) if decoded
else tf.image.extract_jpeg_shape(image_bytes))
image_height = shape[0]
image_width = shape[1]
padded_center_crop_size = tf.cast(
((image_size / (image_size + crop_padding)) *
tf.cast(tf.minimum(image_height, image_width), tf.float32)),
tf.int32)
offset_height = ((image_height - padded_center_crop_size) + 1) // 2
offset_width = ((image_width - padded_center_crop_size) + 1) // 2
crop_window = tf.stack([offset_height, offset_width,
padded_center_crop_size, padded_center_crop_size])
if decoded:
image = tf.image.crop_to_bounding_box(
image_bytes,
offset_height=offset_height,
offset_width=offset_width,
target_height=padded_center_crop_size,
target_width=padded_center_crop_size)
else:
image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
image = resize_image(image_bytes=image,
height=image_size,
width=image_size)
return image
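# Worked example of the crop arithmetic (illustrative numbers): with
# image_size=224 and crop_padding=32 the crop fraction is 224 / (224 + 32) =
# 0.875 of the shorter side, so a 400x600 JPEG yields a centered
# int(0.875 * 400) = 350 pixel square crop that is then resized bilinearly
# to 224x224.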
def decode_crop_and_flip(image_bytes: tf.Tensor) -> tf.Tensor:
  """Crops the image to a random sub-region, then randomly flips it left/right.
Args:
image_bytes: `Tensor` representing an image binary of arbitrary size.
Returns:
A decoded and cropped image `Tensor`.
"""
decoded = image_bytes.dtype != tf.string
bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
shape = (tf.shape(image_bytes) if decoded
else tf.image.extract_jpeg_shape(image_bytes))
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
shape,
bounding_boxes=bbox,
min_object_covered=0.1,
aspect_ratio_range=[0.75, 1.33],
area_range=[0.05, 1.0],
max_attempts=100,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, _ = sample_distorted_bounding_box
# Reassemble the bounding box in the format the crop op requires.
offset_height, offset_width, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
crop_window = tf.stack([offset_height, offset_width,
target_height, target_width])
if decoded:
cropped = tf.image.crop_to_bounding_box(
image_bytes,
offset_height=offset_height,
offset_width=offset_width,
target_height=target_height,
target_width=target_width)
else:
cropped = tf.image.decode_and_crop_jpeg(image_bytes,
crop_window,
channels=3)
# Flip to add a little more random distortion in.
cropped = tf.image.random_flip_left_right(cropped)
return cropped
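# Note: this is the Inception-style distorted crop -- the sampled window covers
# 5%-100% of the source area with aspect ratio in [0.75, 1.33], falling back to
# the whole image if no valid window is found in 100 attempts. Callers are
# expected to resize the result, e.g. (hypothetical path):
#
#   crop = decode_crop_and_flip(tf.io.read_file('/data/train/img0.JPEG'))
#   crop = resize_image(crop, height=IMAGE_SIZE, width=IMAGE_SIZE)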
def resize_image(image_bytes: tf.Tensor,
height: int = IMAGE_SIZE,
width: int = IMAGE_SIZE) -> tf.Tensor:
"""Resizes an image to a given height and width.
Args:
image_bytes: `Tensor` representing an image binary of arbitrary size.
height: image height dimension.
width: image width dimension.
Returns:
A tensor containing the resized image.
"""
return tf.compat.v1.image.resize(
image_bytes, [height, width], method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
def preprocess_for_predict(
images: tf.Tensor,
image_size: int = IMAGE_SIZE,
num_channels: int = 3,
dtype: tf.dtypes.DType = tf.float32
) -> tf.Tensor:
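  """Reshapes an already-decoded image to [image_size, image_size, num_channels]
  and optionally converts it to `dtype`."""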
images = tf.reshape(images, [image_size, image_size, num_channels])
if dtype is not None:
images = tf.image.convert_image_dtype(images, dtype=dtype)
return images
def preprocess_for_eval(
image_bytes: tf.Tensor,
image_size: int = IMAGE_SIZE,
num_channels: int = 3,
mean_subtract: bool = False,
standardize: bool = False,
dtype: tf.dtypes.DType = tf.float32
) -> tf.Tensor:
"""Preprocesses the given image for evaluation.
Args:
image_bytes: `Tensor` representing an image binary of arbitrary size.
image_size: image height/width dimension.
num_channels: number of image input channels.
mean_subtract: whether or not to apply mean subtraction.
standardize: whether or not to apply standardization.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
Returns:
A preprocessed and normalized image `Tensor`.
"""
images = decode_and_center_crop(image_bytes, image_size)
images = tf.reshape(images, [image_size, image_size, num_channels])
if mean_subtract:
images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB)
if standardize:
images = standardize_image(image_bytes=images, stddev=STDDEV_RGB)
if dtype is not None:
images = tf.image.convert_image_dtype(images, dtype=dtype)
return images
def load_eval_image(filename: Text, image_size: int = IMAGE_SIZE) -> tf.Tensor:
"""Reads an image from the filesystem and applies image preprocessing.
Args:
filename: a filename path of an image.
image_size: image height/width dimension.
Returns:
A preprocessed and normalized image `Tensor`.
"""
image_bytes = tf.io.read_file(filename)
image = preprocess_for_eval(image_bytes, image_size)
return image
def build_eval_dataset(filenames: List[Text],
labels: List[int] = None,
image_size: int = IMAGE_SIZE,
                       batch_size: int = 1) -> tf.data.Dataset:
"""Builds a tf.data.Dataset from a list of filenames and labels.
Args:
filenames: a list of filename paths of images.
labels: a list of labels corresponding to each image.
image_size: image height/width dimension.
    batch_size: the batch size used by the dataset.
  Returns:
    A `tf.data.Dataset` yielding (image, label) pairs.
"""
if labels is None:
labels = [0] * len(filenames)
filenames = tf.constant(filenames)
labels = tf.constant(labels)
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.map(
lambda filename, label: (load_eval_image(filename, image_size), label))
dataset = dataset.batch(batch_size)
return dataset
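# Usage sketch (hypothetical file names):
#
#   ds = build_eval_dataset(['val/img0.JPEG', 'val/img1.JPEG'],
#                           labels=[7, 42], image_size=224, batch_size=2)
#   # model.evaluate(ds)  # each element is an (images, labels) batch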
def preprocess_for_train(image_bytes: tf.Tensor,
image_size: int = IMAGE_SIZE,
augmenter: Optional[augment.ImageAugment] = None,
mean_subtract: bool = False,
standardize: bool = False,
dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
"""Preprocesses the given image for training.
Args:
    image_bytes: `Tensor` holding either an encoded image binary (tf.string) or
      an already-decoded image of dtype tf.uint8, of arbitrary size.
image_size: image height/width dimension.
augmenter: the image augmenter to apply.
mean_subtract: whether or not to apply mean subtraction.
standardize: whether or not to apply standardization.
dtype: the dtype to convert the images to. Set to `None` to skip conversion.
Returns:
A preprocessed and normalized image `Tensor`.
"""
images = decode_crop_and_flip(image_bytes=image_bytes)
images = resize_image(images, height=image_size, width=image_size)
if mean_subtract:
images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB)
if standardize:
images = standardize_image(image_bytes=images, stddev=STDDEV_RGB)
if augmenter is not None:
images = augmenter.distort(images)
if dtype is not None:
images = tf.image.convert_image_dtype(images, dtype)
return images
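# Usage sketch (illustrative; assumes `augment.AutoAugment` implements the
# `ImageAugment` interface imported above):
#
#   augmenter = augment.AutoAugment()
#   def _parse_train(image_bytes, label):
#     image = preprocess_for_train(image_bytes, image_size=224,
#                                  augmenter=augmenter,
#                                  mean_subtract=True, standardize=True)
#     return image, label
#   # dataset = dataset.map(_parse_train,
#   #                       num_parallel_calls=tf.data.experimental.AUTOTUNE)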

View file

@ -0,0 +1,61 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import tensorflow as tf
import horovod.tensorflow as hvd
def set_flags(params):
# os.environ['CUDA_CACHE_DISABLE'] = '1'
os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
# os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '0'
os.environ['TF_ADJUST_HUE_FUSED'] = '1'
os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
# os.environ['TF_SYNC_ON_FINISH'] = '0'
os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
os.environ['HOROVOD_CACHE_CAPACITY'] = "0"
os.environ['HOROVOD_CYCLE_TIME'] = "1.0"
if params.intraop_threads:
os.environ['TF_NUM_INTRAOP_THREADS'] = params.intraop_threads
if params.interop_threads:
os.environ['TF_NUM_INTEROP_THREADS'] = params.interop_threads
if params.use_xla:
os.environ['TF_XLA_FLAGS'] = "--tf_xla_enable_lazy_compilation=false --tf_xla_auto_jit=1 --tf_xla_async_io_level=1"
os.environ['TF_EXTRA_PTXAS_OPTIONS'] = "-sw200428197=true"
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(True)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
assert tf.config.experimental.get_memory_growth(gpu)
tf.config.experimental.set_visible_devices(gpus, 'GPU')
if gpus:
tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
np.random.seed(params.seed)
tf.random.set_seed(params.seed)
if params.use_amp:
policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16', loss_scale='dynamic')
tf.keras.mixed_precision.experimental.set_policy(policy)
else:
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '0'
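# Usage sketch (illustrative): call once per process, after hvd.init() and
# before building the model or datasets, with the parsed CLI/params object:
#
#   hvd.init()
#   set_flags(params)   # params carries use_xla, use_amp, seed, thread counts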