From 0b3f770b0a87a5bfc549c2ecaaf140f749f24ce0 Mon Sep 17 00:00:00 2001 From: Ahmad Kiswani Date: Tue, 9 Nov 2021 03:16:21 -0800 Subject: [PATCH] [SSD/PyT] New release vs 21.05 container --- PyTorch/Detection/SSD/.dockerignore | 5 + PyTorch/Detection/SSD/.gitignore | 6 + PyTorch/Detection/SSD/Dockerfile | 21 +- PyTorch/Detection/SSD/README.md | 216 ++++----- .../Detection/SSD/csrc/box_encoder_cuda.cu | 440 ----------------- PyTorch/Detection/SSD/csrc/interface.cpp | 81 ---- .../Detection/SSD/csrc/random_horiz_flip.cu | 165 ------- PyTorch/Detection/SSD/dle/inference.py | 1 + .../SSD/examples/SSD300_inference.py | 4 +- .../Detection/SSD/examples/inference.ipynb | 2 +- PyTorch/Detection/SSD/main.py | 29 +- PyTorch/Detection/SSD/requirements.txt | 5 +- PyTorch/Detection/SSD/setup.py | 89 ---- PyTorch/Detection/SSD/src/coco.py | 447 ------------------ PyTorch/Detection/SSD/src/distributed.py | 82 ---- .../Detection/SSD/{src => ssd}/__init__.py | 0 .../SSD/{src => ssd}/coco_pipeline.py | 121 +++-- PyTorch/Detection/SSD/{src => ssd}/data.py | 24 +- .../Detection/SSD/{src => ssd}/entrypoints.py | 0 .../Detection/SSD/{src => ssd}/evaluate.py | 12 +- PyTorch/Detection/SSD/{src => ssd}/logger.py | 0 PyTorch/Detection/SSD/{src => ssd}/model.py | 6 +- PyTorch/Detection/SSD/{src => ssd}/train.py | 111 ++--- PyTorch/Detection/SSD/{src => ssd}/utils.py | 0 hubconf.py | 2 +- 25 files changed, 286 insertions(+), 1583 deletions(-) create mode 100644 PyTorch/Detection/SSD/.dockerignore delete mode 100644 PyTorch/Detection/SSD/csrc/box_encoder_cuda.cu delete mode 100644 PyTorch/Detection/SSD/csrc/interface.cpp delete mode 100644 PyTorch/Detection/SSD/csrc/random_horiz_flip.cu delete mode 100644 PyTorch/Detection/SSD/setup.py delete mode 100644 PyTorch/Detection/SSD/src/coco.py delete mode 100644 PyTorch/Detection/SSD/src/distributed.py rename PyTorch/Detection/SSD/{src => ssd}/__init__.py (100%) rename PyTorch/Detection/SSD/{src => ssd}/coco_pipeline.py (66%) rename PyTorch/Detection/SSD/{src => ssd}/data.py (77%) rename PyTorch/Detection/SSD/{src => ssd}/entrypoints.py (100%) rename PyTorch/Detection/SSD/{src => ssd}/evaluate.py (94%) rename PyTorch/Detection/SSD/{src => ssd}/logger.py (100%) rename PyTorch/Detection/SSD/{src => ssd}/model.py (97%) rename PyTorch/Detection/SSD/{src => ssd}/train.py (66%) rename PyTorch/Detection/SSD/{src => ssd}/utils.py (100%) diff --git a/PyTorch/Detection/SSD/.dockerignore b/PyTorch/Detection/SSD/.dockerignore new file mode 100644 index 00000000..2fff3639 --- /dev/null +++ b/PyTorch/Detection/SSD/.dockerignore @@ -0,0 +1,5 @@ +resnet50-19c8e357.pth +nogit/ +models/ +scripts/ + diff --git a/PyTorch/Detection/SSD/.gitignore b/PyTorch/Detection/SSD/.gitignore index eeb8a6ec..d9e1c2b0 100644 --- a/PyTorch/Detection/SSD/.gitignore +++ b/PyTorch/Detection/SSD/.gitignore @@ -1 +1,7 @@ **/__pycache__ +resnet50-19c8e357.pth +nogit/ +pbr/ +models/ +scripts/ + diff --git a/PyTorch/Detection/SSD/Dockerfile b/PyTorch/Detection/SSD/Dockerfile index e94b34e3..408ccf9c 100755 --- a/PyTorch/Detection/SSD/Dockerfile +++ b/PyTorch/Detection/SSD/Dockerfile @@ -1,19 +1,20 @@ -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3 +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.05-py3 FROM ${FROM_IMAGE_NAME} # Set working directory -WORKDIR /workspace +WORKDIR /workspace/ssd -ENV PYTHONPATH "${PYTHONPATH}:/workspace" - -COPY requirements.txt . 
+# Install nv-cocoapi +ENV COCOAPI_VERSION=2.0+nv0.6.0 +RUN export COCOAPI_TAG=$(echo ${COCOAPI_VERSION} | sed 's/^.*+n//') \ + && pip install --no-cache-dir pybind11 \ + && pip install --no-cache-dir git+https://github.com/NVIDIA/cocoapi.git@${COCOAPI_TAG}#subdirectory=PythonAPI +# Install dllogger RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger + +# Install requirements +COPY requirements.txt . RUN pip install -r requirements.txt RUN python3 -m pip install pycocotools==2.0.0 -# Copy SSD code -COPY ./setup.py . -COPY ./csrc ./csrc -RUN pip install . - COPY . . diff --git a/PyTorch/Detection/SSD/README.md b/PyTorch/Detection/SSD/README.md index 98d0b8c6..15d1dc36 100644 --- a/PyTorch/Detection/SSD/README.md +++ b/PyTorch/Detection/SSD/README.md @@ -31,15 +31,15 @@ This repository provides a script and recipe to train the SSD300 v1.1 model to a * [Inference performance benchmark](#inference-performance-benchmark) * [Results](#results) * [Training accuracy results](#training-accuracy-results) - * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb) + * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb) * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb) * [Training loss plot](#training-loss-plot) * [Training stability test](#training-stability-test) * [Training performance results](#training-performance-results) - * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb) + * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb) * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16gb) * [Inference performance results](#inference-performance-results) - * [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb) + * [Inference performance: NVIDIA DGX A100 (1x A100 80GB)](#inference-performance-nvidia-dgx-a100-1x-a100-80gb) * [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb) - [Release notes](#release-notes) * [Changelog](#changelog) @@ -116,19 +116,19 @@ To enable warmup provide argument the `--warmup 300` by the number of GPUs and multiplied by the batch size divided by 32). ### Feature support matrix - -The following features are supported by this model. - + +The following features are supported by this model. + | **Feature** | **SSD300 v1.1 PyTorch** | |:---------:|:----------:| -|[APEX AMP](https://github.com/NVIDIA/apex) | Yes | +|[AMP](https://pytorch.org/docs/stable/amp.html) | Yes | |[APEX DDP](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) | Yes | |[NVIDIA DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html) | Yes | #### Features - -[APEX](https://github.com/NVIDIA/apex) is a PyTorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training, whereas [AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training. - + +[AMP](https://pytorch.org/docs/stable/amp.html) is an abbreviation used for automatic mixed precision training. + [DDP](https://nvidia.github.io/apex/parallel.html) stands for DistributedDataParallel and is used for multi-GPU training. 
[NVIDIA DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html) - DALI is a library accelerating data preparation pipeline. @@ -163,60 +163,25 @@ documentation. - Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog. -- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools -for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/). +- PyTorch AMP, see the [PyTorch Automatic Mixed Precision package](https://pytorch.org/docs/stable/amp.html). #### Enabling mixed precision Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP) -library from [APEX](https://github.com/NVIDIA/apex) which casts variables +autocast [torch.cuda.amp.autocast](https://pytorch.org/docs/stable/amp.html#autocasting) which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, -a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) -step must be included when applying gradients. In PyTorch, loss scaling -can be easily applied by using `scale_loss()` method provided by AMP. -The scaling value to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) -or fixed. +a [gradient scaling](https://pytorch.org/docs/stable/amp.html#gradient-scaling) +step must be included. For an in-depth walk through on AMP, check out sample usage -[here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). -[APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains -utility libraries, such as AMP, which require minimal network code changes -to leverage Tensor Cores performance. +[here](https://pytorch.org/docs/stable/amp.html). -To enable mixed precision, you can: -- Import AMP from APEX: - - ``` - from apex import amp - ``` -- Initialize an AMP handle: - - ``` - amp_handle = amp.init(enabled=True, verbose=True) - ``` -- Wrap your optimizer with the AMP handle: - - ``` - optimizer = amp_handle.wrap_optimizer(optimizer) - ``` -- Scale loss before backpropagation (assuming loss is stored in a variable called `losses`) - - Default backpropagate for FP32/TF32: - - ``` - losses.backward() - ``` - - Scale loss and backpropagate with AMP: - - ``` - with optimizer.scale_loss(losses) as scaled_losses: - scaled_losses.backward() - ``` #### Enabling TF32 -TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. 
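As a minimal sketch of the native-AMP pattern described above (the pattern this release moves to after dropping the APEX example), the snippet below shows `torch.cuda.amp.autocast` combined with a `GradScaler` in a training step. It is illustrative only and not part of this patch: `model`, `loss_func`, `optimizer`, `loader`, and the `(gloc, glabel)` targets are placeholder names, not the exact interfaces used in `ssd/train.py`.

```
import torch

def train_one_epoch(model, loss_func, optimizer, loader, scaler, use_amp=True):
    """Minimal native-AMP training step; all argument names are placeholders."""
    for images, gloc, glabel in loader:
        optimizer.zero_grad()
        # Run the forward pass under autocast so eligible ops execute in FP16.
        with torch.cuda.amp.autocast(enabled=use_amp):
            ploc, plabel = model(images)
            loss = loss_func(ploc, plabel, gloc, glabel)
        # Scale the loss before backward to preserve small gradient magnitudes;
        # the scaler unscales gradients before the optimizer step and skips the
        # step if an overflow is detected.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

# The scaler is typically created once, before the epoch loop:
# scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
```

When AMP is disabled (`enabled=False`), both `autocast` and `GradScaler` become no-ops, so the same loop serves FP32/TF32 training unchanged.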
@@ -253,11 +218,11 @@ The following section lists the requirements in order to start training the SSD3 ### Requirements -This repository contains `Dockerfile` which extends the PyTorch 20.06 NGC container +This repository contains `Dockerfile` which extends the PyTorch 21.05 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following software: * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) -* [PyTorch 20.06-py3+ NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) +* [PyTorch 21.05 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) * GPU-based architecture: * [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) * [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/) @@ -270,7 +235,7 @@ Documentation: * [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry) * [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running) -For those unable to use the [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch), +For those unable to use the [PyTorch 21.05 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch), to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). @@ -303,7 +268,7 @@ docker build . -t nvidia_ssd 4. Start an interactive session in the NGC container to run training/inference. ``` -nvidia-docker run --rm -it --ulimit memlock=-1 --ulimit stack=67108864 -v $COCO_DIR:/coco --ipc=host nvidia_ssd +docker run --rm -it --gpus=all --ipc=host -v $COCO_DIR:/coco nvidia_ssd ``` **Note**: the default mount point in the container is `/coco`. @@ -326,7 +291,7 @@ For example, if you want to run 8 GPU training with Tensor Core acceleration and save checkpoints after each epoch, run: ``` -bash ./examples/SSD300_FP16_8GPU.sh . /coco --save +bash ./examples/SSD300_FP16_8GPU.sh . /coco --save $SSD_CHECKPINT_PATH ``` 6. Start validation/evaluation. @@ -351,7 +316,7 @@ python ./main.py --backbone resnet50 --checkpoint ./models/epoch_*.pt --data /co You can check your trained model with a Jupyter notebook provided in the examples directory. Start with running a Docker container with a Jupyter notebook server: ``` -nvidia-docker run --rm -it --ulimit memlock=-1 --ulimit stack=67108864 -v $SSD_CHECKPINT_PATH:/checkpoints/SSD300v1.1.pt -v $COCO_PATH:/datasets/coco2017 --ipc=host -p 8888:8888 nvidia_ssd jupyter-notebook --ip 0.0.0.0 --allow-root +docker run --rm -it --gpus=all --ipc=host -v $SSD_CHECKPINT_PATH:/checkpoints/SSD300v1.1.pt -v $COCO_PATH:/datasets/coco2017 -p 8888:8888 nvidia_ssd jupyter-notebook --ip 0.0.0.0 --allow-root ``` ## Advanced @@ -367,7 +332,7 @@ In the root directory, the most important files are: - `requirements.txt`: a set of extra Python requirements for running SSD300 v1.1; - `download_dataset.py`: automatically downloads the COCO dataset for training. 
-The `src/` directory contains modules used to train and evaluate the SSD300 v1.1 model +The `ssd/` directory contains modules used to train and evaluate the SSD300 v1.1 model - `model.py`: the definition of SSD300 v1.1 model - `data.py`: definition of input pipelines used in training and evaluation - `train.py`: functions used to train the SSD300 v1.1 model @@ -469,7 +434,7 @@ Our model expects input data aligned in a way a COCO dataset is aligned by the ` `train2017` and `val2017` directories should contain images in JPEG format. Annotation format is described in [the COCO documentation](http://cocodataset.org/#format-data). -The preprocessing of the data is defined in the `src/coco_pipeline.py` module. +The preprocessing of the data is defined in the `ssd/coco_pipeline.py` module. ##### Data preprocessing @@ -538,12 +503,12 @@ The container prints Jupyter notebook logs like this: [I 16:17:59.769 NotebookApp] JupyterLab extension loaded from /opt/conda/lib/python3.6/site-packages/jupyterlab [I 16:17:59.769 NotebookApp] JupyterLab application directory is /opt/conda/share/jupyter/lab [I 16:17:59.770 NotebookApp] Serving notebooks from local directory: /workspace -[I 16:17:59.770 NotebookApp] The Jupyter Notebook is running at: +[I 16:17:59.770 NotebookApp] The Jupyter Notebook is running at: [I 16:17:59.770 NotebookApp] http://(65935d756c71 or 127.0.0.1):8888/?token=04c78049c67f45a4d759c8f6ddd0b2c28ac4eab60d81be4e [I 16:17:59.770 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). [W 16:17:59.774 NotebookApp] No web browser found: could not locate runnable browser. -[C 16:17:59.774 NotebookApp] - +[C 16:17:59.774 NotebookApp] + To access the notebook, open this file in a browser: file:///root/.local/share/jupyter/runtime/nbserver-1-open.html Or copy and paste one of these URLs: @@ -573,7 +538,7 @@ The following section shows how to run benchmarks measuring the model performanc #### Training performance benchmark -The training benchmark was run in various scenarios on A100 40GB and V100 16G GPUs. The benchmark does not require a checkpoint from a fully trained model. +The training benchmark was run in various scenarios on A100 80GB and V100 16G GPUs. The benchmark does not require a checkpoint from a fully trained model. To benchmark training, run: ``` @@ -595,7 +560,7 @@ Tensor Cores, and the `{data}` is the location of the COCO 2017 dataset. #### Inference performance benchmark -Inference benchmark was run on 1x A100 40GB GPU and 1x V100 16G GPU. To benchmark inference, run: +Inference benchmark was run on 1x A100 80GB GPU and 1x V100 16G GPU. To benchmark inference, run: ``` python main.py --eval-batch-size {bs} \ --mode benchmark-inference \ @@ -615,34 +580,40 @@ The following sections provide details on how we achieved our performance and ac #### Training accuracy results -##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB) +##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB) Our results were obtained by running the `./examples/SSD300_A100_{FP16,TF32}_{1,4,8}GPU.sh` -script in the `pytorch-20.06-py3` NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. +script in the `pytorch-21.05-py3` NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs. 
|GPUs |Batch size / GPU|Accuracy - TF32|Accuracy - mixed precision|Time to train - TF32|Time to train - mixed precision|Time to train speedup (TF32 to mixed precision)| |-----------|----------------|---------------|---------------------------|--------------------|--------------------------------|------------------------------------------------| -|1 |64 |0.251 |0.252 |16:00:00 |8:00:00 |200.00% | -|4 |64 |0.250 |0.251 |3:00:00 |1:36:00 |187.50% | -|8 |64 |0.252 |0.251 |1:40:00 |1:00:00 |167.00% | -|1 |128 |0.251 |0.251 |13:05:00 |7:00:00 |189.05% | -|4 |128 |0.252 |0.253 |2:45:00 |1:30:00 |183.33% | -|8 |128 |0.248 |0.249 |1:20:00 |0:43:00 |186.00% | +|1 |64 |0.26 |0.26 |07:45:00 |05:09:00 |150.49% | +|4 |64 |0.26 |0.26 |01:59:00 |01:19:00 |149.52% | +|8 |64 |0.25 |0.26 |01:02:00 |00:40:00 |155.64% | +|1 |128 |0.26 |0.26 |07:36:00 |04:57:00 |153.50% | +|4 |128 |0.26 |0.26 |01:55:00 |01:15:00 |152.92% | +|8 |128 |0.26 |0.25 |00:58:00 |00:38:00 |151.89% | +|1 |256 |0.26 |0.26 |07:34:00 |04:53:00 |154.80% | +|4 |256 |0.25 |0.26 |01:54:00 |01:14:00 |152.98% | +|8 |256 |0.248 |0.25 |00:57:00 |00:37:00 |151.46% | ##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB) Our results were obtained by running the `./examples/SSD300_FP{16,32}_{1,4,8}GPU.sh` -script in the `pytorch-20.06-py3` NGC container on NVIDIA DGX-1 with 8x +script in the `pytorch-21.05-py3` NGC container on NVIDIA DGX-1 with 8x V100 16GB GPUs. |GPUs |Batch size / GPU|Accuracy - FP32|Accuracy - mixed precision|Time to train - FP32|Time to train - mixed precision|Time to train speedup (FP32 to mixed precision)| |-----------|----------------|---------------|---------------------------|--------------------|--------------------------------|------------------------------------------------| -|1 |32 |0.250 |0.250 |20:20:13 |10:23:46 |195.62% | -|4 |32 |0.249 |0.250 |5:11:17 |2:39:28 |195.20% | -|8 |32 |0.250 |0.250 |2:37:00 |1:32:00 |170.60% | -|1 |64 | |0.252 | |9:27:33 |215.00% | -|4 |64 | |0.251 | |2:24:43 |215.10% | -|8 |64 | |0.252 | |1:31:00 |172.50% | +|1 |32 |0.26 |0.26 |20:14:00 |10:09:00 |199.30% | +|4 |32 |0.25 |0.25 |05:10:00 |02:40:00 |193.88% | +|8 |32 |0.26 |0.25 |02:35:00 |01:20:00 |192.24% | +|1 |64 | |0.26 |09:34:00 | | | +|4 |64 | |0.26 |02:27:00 | | | +|8 |64 | |0.26 |01:14:00 | | | + + + Due to smaller size, mixed precision models can be trained with bigger batches. In such cases mixed precision speedup is calculated versus FP32 training with maximum batch size for that precision @@ -655,52 +626,52 @@ Here are example graphs of FP32, TF32 and AMP training on 8 GPU configuration: ##### Training stability test The SSD300 v1.1 model was trained for 65 epochs, starting -from 15 different initial random seeds. The training was performed in the `pytorch-20.06-py3` NGC container on -NVIDIA DGX A100 8x A100 40GB GPUs with batch size per GPU = 128. +from 15 different initial random seeds. The training was performed in the `pytorch-21.05-py3` NGC container on +NVIDIA DGX A100 8x A100 80GB GPUs with batch size per GPU = 128. After training, the models were evaluated on the test dataset. The following table summarizes the final mAP on the test set. 
|**Precision**|**Average mAP**|**Standard deviation**|**Minimum**|**Maximum**|**Median**| |------------:|--------------:|---------------------:|----------:|----------:|---------:| -| AMP | 0.2491314286 | 0.001498316675 | 0.24456 | 0.25182 | 0.24907 | +| AMP | 0.2514314286 | 0.001498316675 | 0.24456 | 0.25182 | 0.24907 | | TF32 | 0.2489106667 | 0.001749463047 | 0.24487 | 0.25148 | 0.24848 | #### Training performance results -##### Training performance: NVIDIA DGX A100 (8x A100 40GB) +##### Training performance: NVIDIA DGX A100 (8x A100 80GB) Our results were obtained by running the `main.py` script with the `--mode -benchmark-training` flag in the `pytorch-20.06-py3` NGC container on NVIDIA -DGX A100 (8x A100 40GB) GPUs. Performance numbers (in items/images per second) +benchmark-training` flag in the `pytorch-21.05-py3` NGC container on NVIDIA +DGX A100 (8x A100 80GB) GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. |GPUs |Batch size / GPU|Throughput - TF32|Throughput - mixed precision|Throughput speedup (TF32 - mixed precision)|Weak scaling - TF32 |Weak scaling - mixed precision | |-----------|----------------|-----------------|-----------------------------|-------------------------------------------|--------------------------------|------------------------------------------------| -|1 |64 |201.43 |367.15 |182.27% |100.00% |100.00% | -|4 |64 |791.50 |1,444.00 |182.44% |392.94% |393.30% | -|8 |64 |1,582.72 |2,872.48 |181.49% |785.74% |782.37% | -|1 |128 |206.28 |387.95 |188.07% |100.00% |100.00% | -|4 |128 |822.39 |1,530.15 |186.06% |398.68% |397.73% | -|8 |128 |1,647.00 |3,092.00 |187.74% |798.43% |773.00% | +|1 |64 |279.85 |428.30 |153.04% |100% |100% | +|4 |64 |1095.17 |1660.59 |151.62% |391% |387% | +|8 |64 |2181.21 |3301.58 |151.36% |779% |770% | +|1 |128 |286.17 |440.74 |154.01% |100% |100% | +|4 |128 |1135.02 |1755.94 |154.70% |396% |398% | +|8 |128 |2264.92 |3510.29 |154.98% |791% |796% | To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above. ##### Training performance: NVIDIA DGX-1 (8x V100 16GB) Our results were obtained by running the `main.py` script with the `--mode -benchmark-training` flag in the `pytorch-20.06-py3` NGC container on NVIDIA +benchmark-training` flag in the `pytorch-21.05-py3` NGC container on NVIDIA DGX-1 with 8x V100 16GB GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch. |GPUs |Batch size / GPU|Throughput - FP32|Throughput - mixed precision|Throughput speedup (FP32 - mixed precision)|Weak scaling - FP32 |Weak scaling - mixed precision | |-----------|----------------|-----------------|-----------------------------|-------------------------------------------|--------------------------------|------------------------------------------------| -|1 |32 |133.67 |215.30 |161.07% |100.00% |100.00% | -|4 |32 |532.05 |828.63 |155.74% |398.04% |384.88% | -|8 |32 |820.70 |1,647.74 |200.77% |614.02% |802.00% | -|1 |64 | |232.22 |173.73% | |100.00% | -|4 |64 | |910.77 |171.18% | |392.20% | -|8 |64 | |1,728.00 |210.55% | |761.99% | +|1 |32 |108.27 |212.95 |196.68% |100% |100% | +|4 |32 |425.07 |826.38 |194.41% |392% |388% | +|8 |32 |846.58 |1610.82 |190.27% |781% |756% | +|1 |64 | |227.69 | | |100% | +|4 |64 | |891.27 | | |391% | +|8 |64 | |1770.09 | | |777% | Due to smaller size, mixed precision models can be trained with bigger batches. 
In such cases mixed precision speedup is calculated versus FP32 training with maximum batch size for that precision @@ -708,37 +679,38 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide #### Inference performance results -##### Inference performance: NVIDIA DGX A100 (1x A100 40GB) +##### Inference performance: NVIDIA DGX A100 (1x A100 80GB) Our results were obtained by running the `main.py` script with `--mode -benchmark-inference` flag in the pytorch-20.06-py3 NGC container on NVIDIA -DGX A100 (1x A100 40GB) GPU. +benchmark-inference` flag in the pytorch-21.05-py3 NGC container on NVIDIA +DGX A100 (1x A100 80GB) GPU. |Batch size |Throughput - TF32|Throughput - mixed precision|Throughput speedup (TF32 - mixed precision)|Weak scaling - TF32 |Weak scaling - mixed precision | |-----------|-----------------|-----------------------------|-------------------------------------------|--------------------|--------------------------------| -|1 |113.51 |109.93 | 96.85% |100.00% |100.00% | -|2 |203.07 |214.43 |105.59% |178.90% |195.06% | -|4 |338.76 |368.45 |108.76% |298.30% |335.17% | -|8 |485.65 |526.97 |108.51% |427.85% |479.37% | -|16 |493.64 |867.42 |175.72% |434.89% |789.07% | -|32 |548.75 |910.17 |165.86% |483.44% |827.95% +|1 |105.53 | 90.62 | 85% |100% | 100% | +|2 |197.77 | 168.41 | 85% |187% | 185% | +|4 |332.10 | 323.68 | 97% |314% | 357% | +|8 |526.12 | 523.96 | 99% |498% | 578% | +|16 |634.50 | 816.91 |128% |601% | 901% | +|32 |715.35 | 956.91 |133% |677% |1055% | +|64 |752.57 |1053.39 |139% |713% |1162% | To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above. ##### Inference performance: NVIDIA DGX-1 (1x V100 16GB) Our results were obtained by running the `main.py` script with `--mode -benchmark-inference` flag in the pytorch-20.06-py3 NGC container on NVIDIA +benchmark-inference` flag in the pytorch-21.05-py3 NGC container on NVIDIA DGX-1 with (1x V100 16GB) GPU. |Batch size |Throughput - FP32|Throughput - mixed precision|Throughput speedup (FP32 - mixed precision)|Weak scaling - FP32 |Weak scaling - mixed precision | |-----------|-----------------|-----------------------------|-------------------------------------------|--------------------|--------------------------------| -|1 |82.50 |80.50 | 97.58% |100.00% |100.00% | -|2 |124.05 |147.46 |118.87% |150.36% |183.18% | -|4 |155.51 |255.16 |164.08% |188.50% |316.97% | -|8 |182.37 |334.94 |183.66% |221.05% |416.07% | -|16 |222.83 |358.25 |160.77% |270.10% |445.03% | -|32 |271.73 |438.85 |161.50% |329.37% |545.16% | +|1 | 75.05 | 57.03 | 75% |100% |100% | +|2 |138.39 |117.12 | 84% |184% |205% | +|4 |190.74 |185.38 | 97% |254% |325% | +|8 |237.34 |368.48 |155% |316% |646% | +|16 |285.32 |504.77 |176% |380% |885% | +|32 |306.22 |548.87 |179% |408% |962% | To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above. 
@@ -746,6 +718,22 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide ### Changelog +May 2021 + * upgrade the PyTorch container to 21.05 + * replaced APEX AMP with native PyTorch AMP + * updated [nv-cocoapi](https://github.com/NVIDIA/cocoapi/) from 0.4.0 to 0.6.0 + * code updated to use DALI 1.2.0 + +April 2021 + * upgrade the PyTorch container to 21.04 + * changed python package naming + +March 2021 + * upgrade the PyTorch container to 21.03 + * code updated to use DALI 0.30.0 + * use DALI [BoxEncoder](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/supported_ops.html#nvidia.dali.ops.BoxEncoder) instead of a CUDA extension + * replaced [cocoapi](https://github.com/cocodataset/cocoapi) with [nv-cocoapi](https://github.com/NVIDIA/cocoapi/) + June 2020 * upgrade the PyTorch container to 20.06 * update performance tables to include A100 results diff --git a/PyTorch/Detection/SSD/csrc/box_encoder_cuda.cu b/PyTorch/Detection/SSD/csrc/box_encoder_cuda.cu deleted file mode 100644 index b740a18d..00000000 --- a/PyTorch/Detection/SSD/csrc/box_encoder_cuda.cu +++ /dev/null @@ -1,440 +0,0 @@ -/****************************************************************************** -* -* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* - - ******************************************************************************/ - -#include -#include -#include -#include - -#include - -//#define DEBUG - -// calculate the IoU of a single box against another box -__device__ -float calc_single_iou(const float4 b1, const float4 b2) { - // (lt), (rb) - float l = max(b1.x, b2.x); - float t = max(b1.y, b2.y); - float r = min(b1.z, b2.z); - float b = min(b1.w, b2.w); - - float first = (r - l); - first = (first < 0) ? 0 : first; - float second = (b - t); - second = (second < 0) ? 
0 : second; - - float intersection = first * second; - - float area1 = (b1.w - b1.y) * (b1.z - b1.x); - float area2 = (b2.w - b2.y) * (b2.z - b2.x); - - return intersection / (area1 + area2 - intersection); -} - -__global__ -// boxes1 : [N x 4] -// boxes2 : [M x 4] -// ious : [N x M] -void calc_ious_kernel(const int N_img, const float4 *box1, const int *box1_offsets, - const int M, const float4 *boxes2, float *ious) { - - // launch N_img blocks - const int img = blockIdx.x; - - // each block, i will run over the box1_N[i] source and M target boxes - // generating box1_N[i] x M outputs - - // alias to start of boxes for this image - const float4 *b1 = &box1[box1_offsets[img]]; - - if (threadIdx.x == 0) { - //printf("offset for img %d : %d\n", img, box1_offsets[img]); - } - - // number of boxes for this image from offsets - int N = box1_offsets[img+1] - box1_offsets[img]; - - for (int i = 0; i < N; ++i) { - // if (threadIdx.x == 0) printf("i : %d\n", i); - const float4 source = b1[i]; - // for each source, loop over targets - for (int j = threadIdx.x; j < M; j += blockDim.x) { - const float4 target = boxes2[j]; - - float iou = calc_single_iou(source, target); - - // store the calculated IoU in the correct spot - int out_idx = box1_offsets[img] * M + i * M + j; - ious[out_idx] = iou; - - } - } -} - -__device__ -void reduce_val_idx(int N, volatile float *vals, volatile int *idx) { - // naive: single thread for now - if (threadIdx.x == 0) { - float max_val = vals[0]; - int max_idx = idx[0]; - - for (int i = 1; i < N; ++i) { - if (vals[i] > max_val) { - max_val = vals[i]; - max_idx = idx[i]; - } - } - - vals[0] = max_val; - idx[0] = max_idx; - } -} - -/** - * perform remaining parts, storing temporary values in global workspace - * workspace needs N_img * M values, each of 8 bytes (float, int) - **/ -template -__global__ -void encode(const int N_img, const float4 *bbox_in, const long *labels_in, const int *offsets, - const int M, const float4 *dboxes, // const float *ious, - const float criteria, uint8_t *workspace, float4 *bbox_out, long *label_out) { - - // Each block will take a single image's IoU set - const int img = blockIdx.x; - - // shared memory for intermediate results - __shared__ volatile float best_bbox_iou_tmp[BLOCK_SIZE]; - __shared__ volatile int best_bbox_idx_tmp[BLOCK_SIZE]; - - // shared memory for final best_bbox_{iou, idx} values - __shared__ volatile float best_bbox_iou[MAX_BBOXES_PER_BLOCK]; - __shared__ volatile int best_bbox_idx[MAX_BBOXES_PER_BLOCK]; - - // index into the global workspace - each image needs (float + int) * M values - volatile float *best_dbox_iou = (float *)&workspace[img * M * 8]; - volatile int *best_dbox_idx = (int *)&workspace[img * M * 8 + M * 4]; - - // number of input bboxes for this image - const int N_rows = offsets[img+1] - offsets[img]; - - // Check for potential crash - assert(N_rows <= MAX_BBOXES_PER_BLOCK); -#ifdef DEBUG - if (threadIdx.x == 0) - printf("N rows: %d %d to %d (%p - %p)\n", N_rows, offsets[img], offsets[img+1], best_dbox_iou, best_dbox_idx); -#endif - - for (int i = threadIdx.x; i < MAX_BBOXES_PER_BLOCK; i += blockDim.x) { - best_bbox_iou[i] = -FLT_MAX; - best_bbox_idx[i] = -1; - } - __syncthreads(); - - // loop serially over the rows of the IoU set that correspond to this image - int row_num = 0; - for (int i = offsets[img]; i < offsets[img+1]; ++i) { - // reset shmem tallies - best_bbox_iou_tmp[threadIdx.x] = -FLT_MAX; - best_bbox_idx_tmp[threadIdx.x] = -1; - - // index into the input buffer - // const float *row = &ious[i 
* M]; - const float4 input_bbox = bbox_in[i]; -#ifdef DEBUG - if (threadIdx.x == 0) - printf("%d - %p\n", img, &input_bbox); -#endif - - // loop by threads over the columns - for (int j = threadIdx.x; j < M; j += blockDim.x) { - - // check and store new max if necessary - const float4 input_dbox = dboxes[j]; - // float new_val = row[j]; - float new_val = calc_single_iou(input_bbox, input_dbox); - - // handle per-row max in shared memory - if (new_val > best_bbox_iou_tmp[threadIdx.x]) { - best_bbox_iou_tmp[threadIdx.x] = new_val; - best_bbox_idx_tmp[threadIdx.x] = j; - } - - // handle per-col max in global workspace - if (new_val > best_dbox_iou[j]) { - best_dbox_iou[j] = new_val; - best_dbox_idx[j] = row_num; - -#ifdef DEBUG - assert(best_dbox_idx[j] >= 0); - assert(best_dbox_idx[j] < N_rows); -#endif - } - } - - // Now we have all the values for this row -- reduce - __syncthreads(); - - // reduce - output is in max_{val, idx}_row[0] - reduce_val_idx(blockDim.x, best_bbox_iou_tmp, best_bbox_idx_tmp); -#ifdef DEBUG - __syncthreads(); -#endif - - - // store output for row i - if (threadIdx.x == 0) { - best_bbox_iou[row_num] = best_bbox_iou_tmp[0]; - best_bbox_idx[row_num] = best_bbox_idx_tmp[0]; - -#ifdef DEBUG - assert(best_bbox_idx[row_num] >= 0); - assert(best_bbox_idx[row_num] < M); -#endif - } - __syncthreads(); - - // keep track of _local_ row - row_num++; - } - -#ifdef DEBUG - if (threadIdx.x == 0) { - for (int i = 0; i < N_rows; ++i) { - printf("%d - row : %d : best bbox_idx: %d\n", img, i, best_bbox_idx[i]); - } - } -#endif - -#ifdef DEBUG - // make sure all best_bbox_{iou, val} are seen by everyone - __syncthreads(); -#endif - // At this point we have the maximum values & indices for both bbox and dbox - /* - best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0) - - idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64) - best_dbox_idx[best_bbox_idx[idx]] = idx - */ - for (int i = threadIdx.x; i < N_rows; i += blockDim.x) { - int idx = best_bbox_idx[i]; - -#ifdef DEBUG - assert(idx < M); - assert(idx >= 0); -#endif - - best_dbox_iou[idx] = 2.; - best_dbox_idx[idx] = i; -#ifdef DEBUG - printf("%d - set best dbox_idx[%d] to %d\n", img, best_bbox_idx[i], i); -#endif - } - - /** - # filter IoU > 0.5 - masks = best_dbox_ious > criteria - labels_out = torch.zeros(self.nboxes, dtype=torch.long) - #print(maxloc.shape, labels_in.shape, labels_out.shape) - labels_out[masks] = labels_in[best_dbox_idx[masks]] - bboxes_out = self.dboxes.clone() - bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :] - # Transform format to xywh format - x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \ - 0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \ - -bboxes_out[:, 0] + bboxes_out[:, 2], \ - -bboxes_out[:, 1] + bboxes_out[:, 3] - bboxes_out[:, 0] = x - bboxes_out[:, 1] = y - bboxes_out[:, 2] = w - bboxes_out[:, 3] = h - return bboxes_out, labels_out - **/ - __syncthreads(); - for (int i = threadIdx.x; i < M; i += blockDim.x) { - // offset into output arrays: M values per image - // int output_idx = offsets[img] * M + i; - int output_idx = img * M + i; - - // reset output labels to background - // NOTE: bbox_out is already cloned from dbox outside of this kernel - label_out[output_idx] = 0; - - // Filter IoU > 0.5 - bool mask = best_dbox_iou[i] > criteria; - - float4 bbox = bbox_out[output_idx]; - // copy some labels and bboxes - if (mask) { - // copy label -#ifdef DEBUG - printf("%d : label: local input idx: %d, value: %d\n", i, best_dbox_idx[i], labels_in[offsets[img] + best_dbox_idx[i]]); - 
// printf("%d : label: local input idx: %d, value: %d\n", i, best_dbox_idx[i], labels_in[offsets[img] + i]); -#endif - label_out[output_idx] = labels_in[offsets[img] + best_dbox_idx[i]]; - - // grab original box - bbox = bbox_in[offsets[img] + best_dbox_idx[i]]; -#ifdef DEBUG - printf("mask %d : %d : %f %f %f %f\n", i, best_dbox_idx[i], bbox.x, bbox.y, bbox.z, bbox.w); -#endif - } - - // transfer to xywh - float4 bbox_tmp; - bbox_tmp.x = 0.5 * (bbox.x + bbox.z); - bbox_tmp.y = 0.5 * (bbox.y + bbox.w); - bbox_tmp.z = bbox.z - bbox.x; - bbox_tmp.w = bbox.w - bbox.y; - - // write out - bbox_out[output_idx] = bbox_tmp; - } -} - -/** - def encode(self, bboxes_in, labels_in, criteria = 0.5): - - ious = calc_iou_tensor(bboxes_in, self.dboxes) - best_dbox_ious, best_dbox_idx = ious.max(dim=0) - best_bbox_ious, best_bbox_idx = ious.max(dim=1) - - # set best ious 2.0 - best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0) - - idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64) - best_dbox_idx[best_bbox_idx[idx]] = idx - - # filter IoU > 0.5 - masks = best_dbox_ious > criteria - labels_out = torch.zeros(self.nboxes, dtype=torch.long) - #print(maxloc.shape, labels_in.shape, labels_out.shape) - labels_out[masks] = labels_in[best_dbox_idx[masks]] - bboxes_out = self.dboxes.clone() - bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :] - # Transform format to xywh format - x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \ - 0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \ - -bboxes_out[:, 0] + bboxes_out[:, 2], \ - -bboxes_out[:, 1] + bboxes_out[:, 3] - bboxes_out[:, 0] = x - bboxes_out[:, 1] = y - bboxes_out[:, 2] = w - bboxes_out[:, 3] = h - return bboxes_out, labels_out -**/ -std::vector box_encoder(const int N_img, - const at::Tensor& bbox_input, - const at::Tensor& bbox_offsets, - const at::Tensor& labels_input, - const at::Tensor& dbox, - float criteria) { - // Check everything is on the device - AT_ASSERTM(bbox_input.type().is_cuda(), "bboxes must be a CUDA tensor"); - AT_ASSERTM(bbox_offsets.type().is_cuda(), "bbox offsets must be a CUDA tensor"); - AT_ASSERTM(labels_input.type().is_cuda(), "labels must be a CUDA tensor"); - AT_ASSERTM(dbox.type().is_cuda(), "dboxes must be a CUDA tensor"); - - // Check at least offsets, bboxes and labels are consistent - // Note: offsets is N+1 vs. 
N for labels - AT_ASSERTM(N_img + 1 == bbox_offsets.numel(), "must have N_img+1 offsets"); - - - auto num_bbox_total = bbox_offsets[bbox_offsets.numel()-1].item(); -#ifdef DEBUG - printf("%d : bboxes: %d\n", (int)bbox_offsets.numel(), num_bbox_total); -#endif - AT_ASSERTM(num_bbox_total <= 2048, "total num bboxes must be <= 2048"); - - AT_ASSERTM(bbox_input.size(0) == labels_input.size(0), "bbox and labels must have same leading dimension"); - - const int N = bbox_input.size(0); - const int M = dbox.size(0); - - auto stream = at::cuda::getCurrentCUDAStream(); - - // allocate final outputs (known size) -#ifdef DEBUG - printf("%d x %d\n", N_img * M, 4); - // at::Tensor bbox_out = dbox.type().tensor({N_img * M, 4}); - printf("allocating %lu bytes for output labels\n", N_img*M*sizeof(long)); -#endif - at::Tensor labels_out = at::empty({N_img * M}, labels_input.options()); - THCudaCheck(cudaGetLastError()); - - // copy default boxes to outputs -#ifdef DEBUG - printf("allocating %lu bytes for output bboxes\n", N_img*M*4*sizeof(float)); -#endif - at::Tensor bbox_out = dbox.repeat({N_img, 1}); - THCudaCheck(cudaGetLastError()); - - // need to allocate some workspace -#ifdef DEBUG - printf("allocating %lu bytes for workspace\n", 8*M*N_img); -#endif - // at::Tensor workspace = at::CUDA(at::kByte).zeros({8 * M * N_img}); - at::Tensor workspace = at::zeros({8 * M * N_img}, at::CUDA(at::kByte)); - THCudaCheck(cudaGetLastError()); - - // Encode the inputs - const int THREADS_PER_BLOCK = 256; - encode<<>>(N_img, - (float4*)bbox_input.data(), - labels_input.data(), - bbox_offsets.data(), - M, - (float4*)dbox.data(), - criteria, - workspace.data(), - (float4*)bbox_out.data(), - labels_out.data()); - - THCudaCheck(cudaGetLastError()); - return {bbox_out, labels_out}; -} - -at::Tensor calc_ious(const int N_img, - const at::Tensor& boxes1, - const at::Tensor& boxes1_offsets, - const at::Tensor& boxes2) { - - const int N = boxes1.size(0); - const int M = boxes2.size(0); - - auto stream = at::cuda::getCurrentCUDAStream(); - - // at::Tensor ious = at::CUDA(at::kFloat).zeros({N, M}); - // at::Tensor ious = at::ones(at::CUDA(at::kFloat), {N, M}); - at::Tensor ious = at::empty({N, M}, boxes1.options()); - - // Get IoU of all source x default box pairs - calc_ious_kernel<<>>( - N_img, - (float4*)boxes1.data(), - boxes1_offsets.data(), - M, - (float4*)boxes2.data(), - ious.data()); - - THCudaCheck(cudaGetLastError()); - return ious; -} diff --git a/PyTorch/Detection/SSD/csrc/interface.cpp b/PyTorch/Detection/SSD/csrc/interface.cpp deleted file mode 100644 index a8dea4e4..00000000 --- a/PyTorch/Detection/SSD/csrc/interface.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/****************************************************************************** -* -* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* - - ******************************************************************************/ - -#include -#include -#include - -#include -#include - - -namespace py = pybind11; - -// Box encoder -std::vector box_encoder(const int N_img, - const at::Tensor& bbox_input, - const at::Tensor& bbox_offsets, - const at::Tensor& labels_input, - const at::Tensor& dbox, - const float criteria = 0.5); - -std::vector random_horiz_flip( - at::Tensor& img, - at::Tensor& bboxes, - const at::Tensor& bbox_offsets, - const float p, - const bool nhwc); - -// Fused color jitter application -// ctm [4,4], img [H, W, C] -py::array_t apply_transform(int H, int W, int C, py::array_t img, py::array_t ctm) { - auto img_buf = img.request(); - auto ctm_buf = ctm.request(); - - // printf("H: %d, W: %d, C: %d\n", H, W, C); - py::array_t result{img_buf.size}; - auto res_buf = result.request(); - - float *img_ptr = (float *)img_buf.ptr; - float *ctm_ptr = (float *)ctm_buf.ptr; - float *res_ptr = (float *)res_buf.ptr; - - for (int h = 0; h < H; ++h) { - for (int w = 0; w < W; ++w) { - float *ptr = &img_ptr[h * W * C + w * C]; - float *out_ptr = &res_ptr[h * W * C + w * C]; - // manually unroll over C - out_ptr[0] = ctm_ptr[0] * ptr[0] + ctm_ptr[1] * ptr[1] + ctm_ptr[2] * ptr[2] + ctm_ptr[3]; - out_ptr[1] = ctm_ptr[4] * ptr[0] + ctm_ptr[5] * ptr[1] + ctm_ptr[6] * ptr[2] + ctm_ptr[7]; - out_ptr[2] = ctm_ptr[8] * ptr[0] + ctm_ptr[9] * ptr[1] + ctm_ptr[10] * ptr[2] + ctm_ptr[11]; - } - } - - result.resize({H, W, C}); - - return result; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - // batched box encoder - m.def("box_encoder", &box_encoder, "box_encoder"); - m.def("random_horiz_flip", &random_horiz_flip, "random_horiz_flip"); - // Apply fused color jitter - m.def("apply_transform", &apply_transform, "apply_transform"); -} diff --git a/PyTorch/Detection/SSD/csrc/random_horiz_flip.cu b/PyTorch/Detection/SSD/csrc/random_horiz_flip.cu deleted file mode 100644 index de8a681e..00000000 --- a/PyTorch/Detection/SSD/csrc/random_horiz_flip.cu +++ /dev/null @@ -1,165 +0,0 @@ -/****************************************************************************** -* -* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* - - ******************************************************************************/ - -#include -#include -#include -#include - -#include - -/** - * Each block will handle one channel of each image - **/ -template -__global__ -void HorizFlipImagesAndBoxes( - const int N, - const int C, - const int H, - const int W, - const T* img_in, - float* bboxes, - const int* offsets, - const float p, - const float* flip, - T* img_out, - const bool nhwc) { - // early return if not flipping - if (flip[blockIdx.x] < p) return; - - // pointer offset into images - const int img_offset = blockIdx.x * C * H * W; - const T* img = &img_in[img_offset]; - T* img_o = &img_out[img_offset]; - - // flip bboxes - auto bbox_offset_begin = offsets[blockIdx.x]; - auto bbox_offset_end = offsets[blockIdx.x + 1]; - auto num_bboxes = bbox_offset_end - bbox_offset_begin; - - const int thread_idx = threadIdx.y * blockDim.x + threadIdx.x; - - // bboxes in ltrb format, scaled to [0, 1] - for (int i = thread_idx; i < num_bboxes; i += blockDim.x * blockDim.y) { - float *bbox = &bboxes[(bbox_offset_begin + thread_idx) * 4]; - // Could do this inplace, but not register constrained - auto bbox_0 = bbox[0]; - auto bbox_2 = bbox[2]; - bbox[0] = 1. - bbox_2; - bbox[2] = 1. - bbox_0; - } - - if (nhwc) { - // loop over float3 pixels, handle 3 values / thread - for (int h = threadIdx.y; h < H; h += blockDim.y) { - for (int w = threadIdx.x; w < W; w += blockDim.x) { - const T* img_hw = &img[h * W * C + w * C]; - T * img_out_hw = &img_o[h * W * C + (W - 1 - w) * C]; - - for (int c = 0; c < C; ++c) { - img_out_hw[c] = img_hw[c]; - } - } - } - } else { - // loop over channels - for (int c = 0; c < C; ++c) { - const T* img_c = &img[c * H * W]; - T *img_out_c = &img_o[c * H * W]; - - // handle tiles of (h, w) at a time - for (int h = threadIdx.y; h < H; h += blockDim.y) { - for (int w = threadIdx.x; w < W; w += blockDim.x) { - const int input_idx = h * W + w; - const int output_idx = h * W + (W - 1 - w); - - - img_out_c[output_idx] = img_c[input_idx]; - } - } - } - } -} - -/** - * Take images and their bboxes, randomly flip on horizontal axis - * In/Out: img: NCHW tensor of N, C-channel images of constant (H, W) - * In/Out: bboxes: [N_i, 4] tensor of original bboxes in ltrb format - * In: bbox_offsets: [N] offset values into bboxes - * In: p \in [0, 1): probability of flipping each (img, bbox) pair - * In: nhwc: Tensor in NHWC format - * ---- - * Note: allocate temp memory, but effectively do this inplace - */ -std::vector random_horiz_flip( - at::Tensor& img, - at::Tensor& bboxes, - const at::Tensor& bbox_offsets, - const float p, - const bool nhwc) { - // dimensions - const int N = img.size(0); - int C, H, W; - if (nhwc) { - C = img.size(3); - H = img.size(1); - W = img.size(2); - - } else { - C = img.size(1); - H = img.size(2); - W = img.size(3); - } - - assert(img.type().is_cuda()); - assert(bboxes.type().is_cuda()); - assert(bbox_offsets.type().is_cuda()); - - // printf("%d %d %d %d\n", N, C, H, W); - // Need temp storage of size img - at::Tensor tmp_img = img.clone(); - at::Tensor flip = at::zeros({N}, at::CUDA(at::kFloat)).uniform_(0., 1.); - - auto stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - img.type(), - "HorizFlipImagesAndBoxes", - [&] { - HorizFlipImagesAndBoxes<<>>( - N, - C, - H, - W, - img.data(), - bboxes.data(), - bbox_offsets.data(), - p, - flip.data(), - tmp_img.data(), - nhwc); - THCudaCheck(cudaGetLastError()); - }); - - // copy tmp_img -> img - // img = tmp_img; - - return 
{tmp_img, bboxes}; -} - diff --git a/PyTorch/Detection/SSD/dle/inference.py b/PyTorch/Detection/SSD/dle/inference.py index 50693f19..bb009331 100644 --- a/PyTorch/Detection/SSD/dle/inference.py +++ b/PyTorch/Detection/SSD/dle/inference.py @@ -17,6 +17,7 @@ import skimage def load_image(image_path): """Code from Loading_Pretrained_Models.ipynb - a Caffe2 tutorial""" + mean, std = 128, 128 img = skimage.img_as_float(skimage.io.imread(image_path)) if len(img.shape) == 2: img = np.array([img, img, img]).swapaxes(0,2) diff --git a/PyTorch/Detection/SSD/examples/SSD300_inference.py b/PyTorch/Detection/SSD/examples/SSD300_inference.py index 148454be..bc5b20d9 100644 --- a/PyTorch/Detection/SSD/examples/SSD300_inference.py +++ b/PyTorch/Detection/SSD/examples/SSD300_inference.py @@ -18,8 +18,8 @@ import numpy as np from apex.fp16_utils import network_to_half from dle.inference import prepare_input -from src.model import SSD300, ResNet -from src.utils import dboxes300_coco, Encoder +from ssd.model import SSD300, ResNet +from ssd.utils import dboxes300_coco, Encoder def load_checkpoint(model, model_file): diff --git a/PyTorch/Detection/SSD/examples/inference.ipynb b/PyTorch/Detection/SSD/examples/inference.ipynb index efdb5da8..cdc31879 100644 --- a/PyTorch/Detection/SSD/examples/inference.ipynb +++ b/PyTorch/Detection/SSD/examples/inference.ipynb @@ -338,7 +338,7 @@ "metadata": {}, "outputs": [], "source": [ - "from src.utils import dboxes300_coco, Encoder\n", + "from ssd.utils import dboxes300_coco, Encoder\n", "import matplotlib.patches as patches\n", "import json" ] diff --git a/PyTorch/Detection/SSD/main.py b/PyTorch/Detection/SSD/main.py index 7842ff13..c0c4db41 100644 --- a/PyTorch/Detection/SSD/main.py +++ b/PyTorch/Detection/SSD/main.py @@ -20,22 +20,18 @@ import numpy as np from torch.optim.lr_scheduler import MultiStepLR import torch.utils.data.distributed -from src.model import SSD300, ResNet, Loss -from src.utils import dboxes300_coco, Encoder -from src.logger import Logger, BenchLogger -from src.evaluate import evaluate -from src.train import train_loop, tencent_trick, load_checkpoint, benchmark_train_loop, benchmark_inference_loop -from src.data import get_train_loader, get_val_dataset, get_val_dataloader, get_coco_ground_truth +from ssd.model import SSD300, ResNet, Loss +from ssd.utils import dboxes300_coco, Encoder +from ssd.logger import Logger, BenchLogger +from ssd.evaluate import evaluate +from ssd.train import train_loop, tencent_trick, load_checkpoint, benchmark_train_loop, benchmark_inference_loop +from ssd.data import get_train_loader, get_val_dataset, get_val_dataloader, get_coco_ground_truth import dllogger as DLLogger - # Apex imports try: - from apex.parallel.LARC import LARC - from apex import amp from apex.parallel import DistributedDataParallel as DDP - from apex.fp16_utils import * except ImportError: raise ImportError("Please install APEX from https://github.com/nvidia/apex") @@ -51,10 +47,6 @@ def generate_mean_std(args): mean = mean.view(*view) std = std.view(*view) - if args.amp: - mean = mean.half() - std = std.half() - return mean, std @@ -171,8 +163,6 @@ def train(train_loop_func, logger, args): optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1) - if args.amp: - ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2') if args.distributed: ssd300 = DDP(ssd300) @@ -200,13 +190,16 @@ def 
train(train_loop_func, logger, args): print('Model precision {} mAP'.format(acc)) return + scaler = torch.cuda.amp.GradScaler(enabled=args.amp) mean, std = generate_mean_std(args) for epoch in range(start_epoch, args.epochs): start_epoch_time = time.time() - scheduler.step() - iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader, val_dataloader, encoder, iteration, + iteration = train_loop_func(ssd300, loss_func, scaler, + epoch, optimizer, train_loader, val_dataloader, encoder, iteration, logger, args, mean, std) + if args.mode in ["training", "benchmark-training"]: + scheduler.step() end_epoch_time = time.time() - start_epoch_time total_time += end_epoch_time diff --git a/PyTorch/Detection/SSD/requirements.txt b/PyTorch/Detection/SSD/requirements.txt index b1e46f49..db0e31df 100644 --- a/PyTorch/Detection/SSD/requirements.txt +++ b/PyTorch/Detection/SSD/requirements.txt @@ -1,2 +1,3 @@ -Cython==0.28.4 -scikit-image==0.15.0 +Cython>=0.28.4 +scikit-image>=0.15.0 +ujson>=4.0.2 diff --git a/PyTorch/Detection/SSD/setup.py b/PyTorch/Detection/SSD/setup.py deleted file mode 100644 index c96bde14..00000000 --- a/PyTorch/Detection/SSD/setup.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import glob -import os - -import torch -from torch.utils.cpp_extension import CUDA_HOME -from torch.utils.cpp_extension import CppExtension -from torch.utils.cpp_extension import CUDAExtension - -from setuptools import find_packages -from setuptools import setup - -requirements = ["torch", "torchvision"] - - -def get_extensions(): - this_dir = os.path.dirname(os.path.abspath(__file__)) - extensions_dir = os.path.join(this_dir, "csrc") - - source_cpu = glob.glob(os.path.join(extensions_dir, "*.cpp")) - source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu")) - - print('c++: ', source_cpu) - print('cuda: ', source_cuda) - sources = source_cpu - extension = CppExtension - - define_macros = [] - - if CUDA_HOME is not None: - extension = CUDAExtension - sources += source_cuda - define_macros += [("WITH_CUDA", None)] - - sources = [os.path.join(extensions_dir, s) for s in sources] - - include_dirs = [extensions_dir] - extra_compile_flags= {'cxx' : []} - extra_compile_flags['nvcc'] = ['-DCUDA_HAS_FP16=1','-D__CUDA_NO_HALF_OPERATORS__','-D__CUDA_NO_HALF_CONVERSIONS__','-D__CUDA_NO_HALF2_OPERATORS__'] - - gencodes = [ - '-gencode', 'arch=compute_52,code=sm_52', - '-gencode', 'arch=compute_60,code=sm_60', - '-gencode', 'arch=compute_61,code=sm_61', - '-gencode', 'arch=compute_70,code=sm_70', - '-gencode', 'arch=compute_75,code=sm_75', - '-gencode', 'arch=compute_75,code=compute_75',] - - extra_compile_flags['nvcc'] += gencodes - - ext_modules = [ - extension( - "SSD._C", - sources, - include_dirs=include_dirs, - define_macros=define_macros, - extra_compile_args=extra_compile_flags, - ) - ] - - return ext_modules - - -setup( - name="SSD", - version="0.1", - author="slayton", - url="", - description="SSD in pytorch", - packages=find_packages(exclude=("configs", "examples", "test",)), - # install_requires=requirements, - ext_modules=get_extensions(), - cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, -) diff --git a/PyTorch/Detection/SSD/src/coco.py b/PyTorch/Detection/SSD/src/coco.py deleted file mode 100644 index 60d7eede..00000000 --- a/PyTorch/Detection/SSD/src/coco.py +++ /dev/null @@ -1,447 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__author__ = 'tylin' -__version__ = '2.0' -# Interface for accessing the Microsoft COCO dataset. - -# Microsoft COCO is a large image dataset designed for object detection, -# segmentation, and caption generation. pycocotools is a Python API that -# assists in loading, parsing and visualizing the annotations in COCO. -# Please visit http://mscoco.org/ for more information on COCO, including -# for the data, paper, and tutorials. The exact format of the annotations -# is also described on the COCO website. For example usage of the pycocotools -# please see pycocotools_demo.ipynb. In addition to this API, please download both -# the COCO images and annotations in order to run the demo. 
- -# An alternative to using the API is to load the annotations directly -# into Python dictionary -# Using the API provides additional utility functions. Note that this API -# supports both *instance* and *caption* annotations. In the case of -# captions not all functions are defined (e.g. categories are undefined). - -# The following API functions are defined: -# COCO - COCO api class that loads COCO annotation file and prepare data structures. -# decodeMask - Decode binary mask M encoded via run-length encoding. -# encodeMask - Encode binary mask M using run-length encoding. -# getAnnIds - Get ann ids that satisfy given filter conditions. -# getCatIds - Get cat ids that satisfy given filter conditions. -# getImgIds - Get img ids that satisfy given filter conditions. -# loadAnns - Load anns with the specified ids. -# loadCats - Load cats with the specified ids. -# loadImgs - Load imgs with the specified ids. -# annToMask - Convert segmentation in an annotation to binary mask. -# showAnns - Display the specified annotations. -# loadRes - Load algorithm results and create API for accessing them. -# download - Download COCO images from mscoco.org server. -# Throughout the API "ann"=annotation, "cat"=category, and "img"=image. -# Help on each functions can be accessed by: "help COCO>function". - -# See also COCO>decodeMask, -# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, -# COCO>getImgIds, COCO>loadAnns, COCO>loadCats, -# COCO>loadImgs, COCO>annToMask, COCO>showAnns - -# Microsoft COCO Toolbox. version 2.0 -# Data, paper, and tutorials available at: http://mscoco.org/ -# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. -# Licensed under the Simplified BSD License [see bsd.txt] - -import json -import time -import matplotlib.pyplot as plt -from matplotlib.collections import PatchCollection -from matplotlib.patches import Polygon -import numpy as np -import copy -import itertools -from pycocotools import mask as maskUtils -import os -from collections import defaultdict -import sys -PYTHON_VERSION = sys.version_info[0] -if PYTHON_VERSION == 2: - from urllib import urlretrieve -elif PYTHON_VERSION == 3: - from urllib.request import urlretrieve - - -def _isArrayLike(obj): - return hasattr(obj, '__iter__') and hasattr(obj, '__len__') - - -class COCO: - def __init__(self, annotation_file=None): - """ - Constructor of Microsoft COCO helper class for reading and visualizing annotations. - :param annotation_file (str): location of annotation file - :param image_folder (str): location to the folder that hosts images. 
- :return: - """ - # load dataset - self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() - self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) - if not annotation_file == None: - print('loading annotations into memory...') - tic = time.time() - dataset = json.load(open(annotation_file, 'r')) - assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) - print('Done (t={:0.2f}s)'.format(time.time()- tic)) - self.dataset = dataset - self.createIndex() - - def createIndex(self): - # create index - print('creating index...') - anns, cats, imgs = {}, {}, {} - imgToAnns,catToImgs = defaultdict(list),defaultdict(list) - if 'annotations' in self.dataset: - for ann in self.dataset['annotations']: - imgToAnns[ann['image_id']].append(ann) - anns[ann['id']] = ann - - if 'images' in self.dataset: - for img in self.dataset['images']: - imgs[img['id']] = img - - if 'categories' in self.dataset: - for cat in self.dataset['categories']: - cats[cat['id']] = cat - - if 'annotations' in self.dataset and 'categories' in self.dataset: - for ann in self.dataset['annotations']: - catToImgs[ann['category_id']].append(ann['image_id']) - - print('index created!') - - # create class members - self.anns = anns - self.imgToAnns = imgToAnns - self.catToImgs = catToImgs - self.imgs = imgs - self.cats = cats - - def info(self): - """ - Print information about the annotation file. - :return: - """ - for key, value in self.dataset['info'].items(): - print('{}: {}'.format(key, value)) - - def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): - """ - Get ann ids that satisfy given filter conditions. default skips that filter - :param imgIds (int array) : get anns for given imgs - catIds (int array) : get anns for given cats - areaRng (float array) : get anns for given area range (e.g. [0 inf]) - iscrowd (boolean) : get anns for given crowd label (False or True) - :return: ids (int array) : integer array of ann ids - """ - imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] - catIds = catIds if _isArrayLike(catIds) else [catIds] - - if len(imgIds) == len(catIds) == len(areaRng) == 0: - anns = self.dataset['annotations'] - else: - if not len(imgIds) == 0: - lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] - anns = list(itertools.chain.from_iterable(lists)) - else: - anns = self.dataset['annotations'] - anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] - anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] - if not iscrowd == None: - ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] - else: - ids = [ann['id'] for ann in anns] - return ids - - def getCatIds(self, catNms=[], supNms=[], catIds=[]): - """ - filtering parameters. default skips that filter. 
- :param catNms (str array) : get cats for given cat names - :param supNms (str array) : get cats for given supercategory names - :param catIds (int array) : get cats for given cat ids - :return: ids (int array) : integer array of cat ids - """ - catNms = catNms if _isArrayLike(catNms) else [catNms] - supNms = supNms if _isArrayLike(supNms) else [supNms] - catIds = catIds if _isArrayLike(catIds) else [catIds] - - if len(catNms) == len(supNms) == len(catIds) == 0: - cats = self.dataset['categories'] - else: - cats = self.dataset['categories'] - cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] - cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] - cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] - ids = [cat['id'] for cat in cats] - return ids - - def getImgIds(self, imgIds=[], catIds=[]): - ''' - Get img ids that satisfy given filter conditions. - :param imgIds (int array) : get imgs for given ids - :param catIds (int array) : get imgs with all given cats - :return: ids (int array) : integer array of img ids - ''' - imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] - catIds = catIds if _isArrayLike(catIds) else [catIds] - - if len(imgIds) == len(catIds) == 0: - ids = self.imgs.keys() - else: - ids = set(imgIds) - for i, catId in enumerate(catIds): - if i == 0 and len(ids) == 0: - ids = set(self.catToImgs[catId]) - else: - ids &= set(self.catToImgs[catId]) - return list(ids) - - def loadAnns(self, ids=[]): - """ - Load anns with the specified ids. - :param ids (int array) : integer ids specifying anns - :return: anns (object array) : loaded ann objects - """ - if _isArrayLike(ids): - return [self.anns[id] for id in ids] - elif type(ids) == int: - return [self.anns[ids]] - - def loadCats(self, ids=[]): - """ - Load cats with the specified ids. - :param ids (int array) : integer ids specifying cats - :return: cats (object array) : loaded cat objects - """ - if _isArrayLike(ids): - return [self.cats[id] for id in ids] - elif type(ids) == int: - return [self.cats[ids]] - - def loadImgs(self, ids=[]): - """ - Load anns with the specified ids. - :param ids (int array) : integer ids specifying img - :return: imgs (object array) : loaded img objects - """ - if _isArrayLike(ids): - return [self.imgs[id] for id in ids] - elif type(ids) == int: - return [self.imgs[ids]] - - def showAnns(self, anns): - """ - Display the specified annotations. 
- :param anns (array of object): annotations to display - :return: None - """ - if len(anns) == 0: - return 0 - if 'segmentation' in anns[0] or 'keypoints' in anns[0]: - datasetType = 'instances' - elif 'caption' in anns[0]: - datasetType = 'captions' - else: - raise Exception('datasetType not supported') - if datasetType == 'instances': - ax = plt.gca() - ax.set_autoscale_on(False) - polygons = [] - color = [] - for ann in anns: - c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] - if 'segmentation' in ann: - if type(ann['segmentation']) == list: - # polygon - for seg in ann['segmentation']: - poly = np.array(seg).reshape((int(len(seg)/2), 2)) - polygons.append(Polygon(poly)) - color.append(c) - else: - # mask - t = self.imgs[ann['image_id']] - if type(ann['segmentation']['counts']) == list: - rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) - else: - rle = [ann['segmentation']] - m = maskUtils.decode(rle) - img = np.ones( (m.shape[0], m.shape[1], 3) ) - if ann['iscrowd'] == 1: - color_mask = np.array([2.0,166.0,101.0])/255 - if ann['iscrowd'] == 0: - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack( (img, m*0.5) )) - if 'keypoints' in ann and type(ann['keypoints']) == list: - # turn skeleton into zero-based index - sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 - kp = np.array(ann['keypoints']) - x = kp[0::3] - y = kp[1::3] - v = kp[2::3] - for sk in sks: - if np.all(v[sk]>0): - plt.plot(x[sk],y[sk], linewidth=3, color=c) - plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) - plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) - p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) - ax.add_collection(p) - p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) - ax.add_collection(p) - elif datasetType == 'captions': - for ann in anns: - print(ann['caption']) - - def loadRes(self, resFile): - """ - Load result file and return a result api object. 
- :param resFile (str) : file name of result file - :return: res (obj) : result api object - """ - res = COCO() - res.dataset['images'] = [img for img in self.dataset['images']] - - print('Loading and preparing results...') - tic = time.time() - if type(resFile) == str: #or type(resFile) == unicode: - anns = json.load(open(resFile)) - elif type(resFile) == np.ndarray: - anns = self.loadNumpyAnnotations(resFile) - else: - anns = resFile - assert type(anns) == list, 'results in not an array of objects' - annsImgIds = [ann['image_id'] for ann in anns] - assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ - 'Results do not correspond to current coco set' - if 'caption' in anns[0]: - imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) - res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] - for id, ann in enumerate(anns): - ann['id'] = id+1 - elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - for id, ann in enumerate(anns): - bb = ann['bbox'] - x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] - if not 'segmentation' in ann: - ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] - ann['area'] = bb[2]*bb[3] - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'segmentation' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - for id, ann in enumerate(anns): - # now only support compressed RLE format as segmentation results - ann['area'] = maskUtils.area(ann['segmentation']) - if not 'bbox' in ann: - ann['bbox'] = maskUtils.toBbox(ann['segmentation']) - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'keypoints' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - for id, ann in enumerate(anns): - s = ann['keypoints'] - x = s[0::3] - y = s[1::3] - x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) - ann['area'] = (x1-x0)*(y1-y0) - ann['id'] = id + 1 - ann['bbox'] = [x0,y0,x1-x0,y1-y0] - print('DONE (t={:0.2f}s)'.format(time.time()- tic)) - - res.dataset['annotations'] = anns - res.createIndex() - return res - - def download(self, tarDir = None, imgIds = [] ): - ''' - Download COCO images from mscoco.org server. 
- :param tarDir (str): COCO results directory name - imgIds (list): images to be downloaded - :return: - ''' - if tarDir is None: - print('Please specify target directory') - return -1 - if len(imgIds) == 0: - imgs = self.imgs.values() - else: - imgs = self.loadImgs(imgIds) - N = len(imgs) - if not os.path.exists(tarDir): - os.makedirs(tarDir) - for i, img in enumerate(imgs): - tic = time.time() - fname = os.path.join(tarDir, img['file_name']) - if not os.path.exists(fname): - urlretrieve(img['coco_url'], fname) - print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) - - def loadNumpyAnnotations(self, data): - """ - Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} - :param data (numpy.ndarray) - :return: annotations (python nested list) - """ - print('Converting ndarray to lists...') - assert(type(data) == np.ndarray) - print(data.shape) - assert(data.shape[1] == 7) - N = data.shape[0] - ann = [] - for i in range(N): - if i % 1000000 == 0: - print('{}/{}'.format(i,N)) - ann += [{ - 'image_id' : int(data[i, 0]), - 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], - 'score' : data[i, 5], - 'category_id': int(data[i, 6]), - }] - return ann - - def annToRLE(self, ann): - """ - Convert annotation which can be polygons, uncompressed RLE to RLE. - :return: binary mask (numpy 2D array) - """ - t = self.imgs[ann['image_id']] - h, w = t['height'], t['width'] - segm = ann['segmentation'] - if type(segm) == list: - # polygon -- a single object might consist of multiple parts - # we merge all parts into one mask rle code - rles = maskUtils.frPyObjects(segm, h, w) - rle = maskUtils.merge(rles) - elif type(segm['counts']) == list: - # uncompressed RLE - rle = maskUtils.frPyObjects(segm, h, w) - else: - # rle - rle = ann['segmentation'] - return rle - - def annToMask(self, ann): - """ - Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. - :return: binary mask (numpy 2D array) - """ - rle = self.annToRLE(ann) - m = maskUtils.decode(rle) - return m diff --git a/PyTorch/Detection/SSD/src/distributed.py b/PyTorch/Detection/SSD/src/distributed.py deleted file mode 100644 index 7776997f..00000000 --- a/PyTorch/Detection/SSD/src/distributed.py +++ /dev/null @@ -1,82 +0,0 @@ -import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -import torch.distributed as dist -from torch.nn.modules import Module - -''' -This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py -launcher included with this example. It assumes that your run is using multiprocess with 1 -GPU/process, that the model is on the correct device, and that torch.set_device has been -used to set the device. - -Parameters are broadcasted to the other processes on initialization of DistributedDataParallel, -and will be allreduced at the finish of the backward pass. -''' -class DistributedDataParallel(Module): - - def __init__(self, module): - super(DistributedDataParallel, self).__init__() - self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False - - self.module = module - - for p in self.module.state_dict().values(): - if not torch.is_tensor(p): - continue - if dist._backend == dist.dist_backend.NCCL: - assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU." 
- dist.broadcast(p, 0) - - def allreduce_params(): - if(self.needs_reduction): - self.needs_reduction = False - buckets = {} - for param in self.module.parameters(): - if param.requires_grad and param.grad is not None: - tp = param.data.type() - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(param) - if self.warn_on_half: - if torch.cuda.HalfTensor in buckets: - print("WARNING: gloo dist backend for half parameters may be extremely slow." + - " It is recommended to use the NCCL backend in this case.") - self.warn_on_half = False - - for tp in buckets: - bucket = buckets[tp] - grads = [param.grad.data for param in bucket] - coalesced = _flatten_dense_tensors(grads) - dist.all_reduce(coalesced) - coalesced /= dist.get_world_size() - for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): - buf.copy_(synced) - - for param in list(self.module.parameters()): - def allreduce_hook(*unused): - param._execution_engine.queue_callback(allreduce_params) - if param.requires_grad: - param.register_hook(allreduce_hook) - - def forward(self, *inputs, **kwargs): - self.needs_reduction = True - return self.module(*inputs, **kwargs) - - ''' - def _sync_buffers(self): - buffers = list(self.module._all_buffers()) - if len(buffers) > 0: - # cross-node buffer sync - flat_buffers = _flatten_dense_tensors(buffers) - dist.broadcast(flat_buffers, 0) - for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): - buf.copy_(synced) - def train(self, mode=True): - # Clear NCCL communicator and CUDA event cache of the default group ID, - # These cache will be recreated at the later call. This is currently a - # work-around for a potential NCCL deadlock. - if dist._backend == dist.dist_backend.NCCL: - dist._clear_group_cache() - super(DistributedDataParallel, self).train(mode) - self.module.train(mode) - ''' diff --git a/PyTorch/Detection/SSD/src/__init__.py b/PyTorch/Detection/SSD/ssd/__init__.py similarity index 100% rename from PyTorch/Detection/SSD/src/__init__.py rename to PyTorch/Detection/SSD/ssd/__init__.py diff --git a/PyTorch/Detection/SSD/src/coco_pipeline.py b/PyTorch/Detection/SSD/ssd/coco_pipeline.py similarity index 66% rename from PyTorch/Detection/SSD/src/coco_pipeline.py rename to PyTorch/Detection/SSD/ssd/coco_pipeline.py index 0dfbea20..3e2865b4 100644 --- a/PyTorch/Detection/SSD/src/coco_pipeline.py +++ b/PyTorch/Detection/SSD/ssd/coco_pipeline.py @@ -11,75 +11,114 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import torch import ctypes +import time import logging import numpy as np +import torch # DALI imports +import nvidia.dali as dali from nvidia.dali.pipeline import Pipeline -import nvidia.dali.ops as ops -import nvidia.dali.types as types - -import time class COCOPipeline(Pipeline): - def __init__(self, batch_size, device_id, file_root, annotations_file, num_gpus, - output_fp16=False, output_nhwc=False, pad_output=False, num_threads=1, seed=15): - super(COCOPipeline, self).__init__(batch_size=batch_size, device_id=device_id, - num_threads=num_threads, seed = seed) + def __init__(self, batch_size, file_root, annotations_file, default_boxes, + device_id, num_shards, + output_fp16=False, output_nhwc=False, pad_output=False, + num_threads=1, seed=15): + super(COCOPipeline, self).__init__(batch_size=batch_size, + device_id=device_id, + num_threads=num_threads, + seed=seed) if torch.distributed.is_initialized(): shard_id = torch.distributed.get_rank() else: shard_id = 0 - self.input = ops.COCOReader(file_root = file_root, annotations_file = annotations_file, - shard_id = shard_id, num_shards = num_gpus, ratio=True, ltrb=True, random_shuffle=True, - skip_empty=True) - self.decode = ops.ImageDecoder(device = "cpu", output_type = types.RGB) + # Data loader and image decoder + self.input = dali.ops.readers.COCO(file_root=file_root, + annotations_file=annotations_file, + shard_id=shard_id, + num_shards=num_shards, + ratio=True, + ltrb=True, + shuffle_after_epoch=True, + skip_empty=True) + self.decode_slice = dali.ops.decoders.ImageSlice(device="cpu", + output_type=dali.types.RGB) # Augumentation techniques - self.crop = ops.SSDRandomCrop(device="cpu", num_attempts=1) - self.twist = ops.ColorTwist(device="gpu") + ## Random crop + self.crop = dali.ops.RandomBBoxCrop(device="cpu", + aspect_ratio=[0.5, 2.0], + thresholds=[0, 0.1, 0.3, 0.5, 0.7, 0.9], + scaling=[0.3, 1.0], + bbox_layout="xyXY", + allow_no_crop=True, + num_attempts=1) + ## Color twist + self.hsv = dali.ops.Hsv(device="gpu", + dtype=dali.types.FLOAT) # use float to avoid clipping and quantizing the intermediate result + self.bc = dali.ops.BrightnessContrast(device="gpu", + contrast_center=128, # input is in the [0, 255] range + dtype=dali.types.UINT8) + ## Cropping and normalization + dtype = dali.types.FLOAT16 if output_fp16 else dali.types.FLOAT + output_layout = dali.types.NHWC if output_nhwc else dali.types.NCHW + self.normalize = dali.ops.CropMirrorNormalize( + device="gpu", + crop=(300, 300), + mean=[0.0, 0.0, 0.0], + std=[255.0, 255.0, 255.0], + mirror=0, + dtype=dtype, + output_layout=output_layout, + pad_output=pad_output) + ## Flipping + self.flip = dali.ops.Flip(device="cpu") + self.bbflip = dali.ops.BbFlip(device="cpu", ltrb=True) - self.resize = ops.Resize(device = "gpu", resize_x = 300, resize_y = 300) - - output_dtype = types.FLOAT16 if output_fp16 else types.FLOAT - output_layout = types.NHWC if output_nhwc else types.NCHW - - self.normalize = ops.CropMirrorNormalize(device="gpu", crop=(300, 300), - mean=[0.0, 0.0, 0.0], - std=[255.0, 255.0, 255.0], - mirror=0, - output_dtype=output_dtype, - output_layout=output_layout, - pad_output=pad_output) + # Resize + self.resize = dali.ops.Resize(device="cpu", + resize_x=300, + resize_y=300) # Random variables - self.rng1 = ops.Uniform(range=[0.5, 1.5]) - self.rng2 = ops.Uniform(range=[0.875, 1.125]) - self.rng3 = ops.Uniform(range=[-0.5, 0.5]) + self.rng1 = 
dali.ops.random.Uniform(range=[0.5, 1.5]) + self.rng2 = dali.ops.random.Uniform(range=[0.875, 1.125]) + self.rng3 = dali.ops.random.Uniform(range=[-0.5, 0.5]) + self.flip_coin = dali.ops.random.CoinFlip(probability=0.5) + + # bbox encoder + self.anchors = default_boxes(order='ltrb').cpu().numpy().flatten().tolist() + self.box_encoder = dali.ops.BoxEncoder(device="cpu", + criteria=0.5, + anchors=self.anchors) def define_graph(self): saturation = self.rng1() contrast = self.rng1() brightness = self.rng2() hue = self.rng3() + coin_rnd = self.flip_coin() - inputs, bboxes, labels = self.input() - images = self.decode(inputs) + inputs, bboxes, labels = self.input(name="Reader") + crop_begin, crop_size, bboxes, labels = self.crop(bboxes, labels) + images = self.decode_slice(inputs, crop_begin, crop_size) + + images = self.flip(images, horizontal=coin_rnd) + bboxes = self.bbflip(bboxes, horizontal=coin_rnd) + images = self.resize(images) + images = images.gpu() + + images = self.hsv(images, hue=hue, saturation=saturation) + images = self.bc(images, brightness=brightness, contrast=contrast) - images, bboxes, labels = self.crop(images, bboxes, labels) - images = self.resize(images.gpu()) - images = self.twist(images.gpu(), saturation=saturation, contrast=contrast, brightness=brightness, hue=hue) images = self.normalize(images) + bboxes, labels = self.box_encoder(bboxes, labels) # bboxes and images and labels on GPU return (images, bboxes.gpu(), labels.gpu()) @@ -131,7 +170,7 @@ class DALICOCOIterator(object): self._num_gpus = len(pipelines) assert pipelines is not None, "Number of provided pipelines has to be at least 1" - self.batch_size = pipelines[0].batch_size + self.batch_size = pipelines[0].max_batch_size self._size = size self._pipes = pipelines @@ -231,7 +270,7 @@ class DALICOCOIterator(object): for k in range(len(l_list)): if (pyt_labels[j][k].shape[0] != 0): feed_ndarray(l_list[k], pyt_labels[j][k]) - pyt_labels[j] = torch.cat(pyt_labels[j]).squeeze(dim=1) + pyt_labels[j] = torch.cat(pyt_labels[j]) for j in range(len(pyt_offsets)): pyt_offsets[j] = torch.IntTensor(bbox_offsets[j]) diff --git a/PyTorch/Detection/SSD/src/data.py b/PyTorch/Detection/SSD/ssd/data.py similarity index 77% rename from PyTorch/Detection/SSD/src/data.py rename to PyTorch/Detection/SSD/ssd/data.py index 10b87dd1..22fb9973 100644 --- a/PyTorch/Detection/SSD/src/data.py +++ b/PyTorch/Detection/SSD/ssd/data.py @@ -17,20 +17,26 @@ import os import torch from torch.utils.data import DataLoader -from src.utils import dboxes300_coco, COCODetection -from src.utils import SSDTransformer -from src.coco import COCO +from ssd.utils import dboxes300_coco, COCODetection +from ssd.utils import SSDTransformer +from pycocotools.coco import COCO #DALI import -from src.coco_pipeline import COCOPipeline, DALICOCOIterator +from ssd.coco_pipeline import COCOPipeline, DALICOCOIterator def get_train_loader(args, local_seed): train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") - train_pipe = COCOPipeline(args.batch_size, args.local_rank, train_coco_root, - train_annotate, args.N_gpu, num_threads=args.num_workers, - output_fp16=args.amp, output_nhwc=False, - pad_output=False, seed=local_seed) + train_pipe = COCOPipeline(batch_size=args.batch_size, + file_root=train_coco_root, + annotations_file=train_annotate, + default_boxes=dboxes300_coco(), + device_id=args.local_rank, + num_shards=args.N_gpu, + output_fp16=args.amp, + output_nhwc=False, + 
pad_output=False, + num_threads=args.num_workers, seed=local_seed) train_pipe.build() test_run = train_pipe.schedule_run(), train_pipe.share_outputs(), train_pipe.release_outputs() train_loader = DALICOCOIterator(train_pipe, 118287 / args.N_gpu) @@ -64,5 +70,5 @@ def get_val_dataloader(dataset, args): def get_coco_ground_truth(args): val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") - cocoGt = COCO(annotation_file=val_annotate) + cocoGt = COCO(annotation_file=val_annotate, use_ext=True) return cocoGt diff --git a/PyTorch/Detection/SSD/src/entrypoints.py b/PyTorch/Detection/SSD/ssd/entrypoints.py similarity index 100% rename from PyTorch/Detection/SSD/src/entrypoints.py rename to PyTorch/Detection/SSD/ssd/entrypoints.py diff --git a/PyTorch/Detection/SSD/src/evaluate.py b/PyTorch/Detection/SSD/ssd/evaluate.py similarity index 94% rename from PyTorch/Detection/SSD/src/evaluate.py rename to PyTorch/Detection/SSD/ssd/evaluate.py index 923faa43..20ede884 100644 --- a/PyTorch/Detection/SSD/src/evaluate.py +++ b/PyTorch/Detection/SSD/ssd/evaluate.py @@ -38,11 +38,9 @@ def evaluate(model, coco, cocoGt, encoder, inv_map, args): print("Parsing batch: {}/{}".format(nbatch, len(coco)), end='\r') with torch.no_grad(): inp = img.cuda() - if args.amp: - inp = inp.half() - - # Get predictions - ploc, plabel = model(inp) + with torch.cuda.amp.autocast(enabled=args.amp): + # Get predictions + ploc, plabel = model(inp) ploc, plabel = ploc.float(), plabel.float() # Handle the batch of predictions produced @@ -118,9 +116,9 @@ def evaluate(model, coco, cocoGt, encoder, inv_map, args): print("") print("Predicting Ended, total time: {:.2f} s".format(time.time() - start)) - cocoDt = cocoGt.loadRes(final_results) + cocoDt = cocoGt.loadRes(final_results, use_ext=True) - E = COCOeval(cocoGt, cocoDt, iouType='bbox') + E = COCOeval(cocoGt, cocoDt, iouType='bbox', use_ext=True) E.evaluate() E.accumulate() if args.local_rank == 0: diff --git a/PyTorch/Detection/SSD/src/logger.py b/PyTorch/Detection/SSD/ssd/logger.py similarity index 100% rename from PyTorch/Detection/SSD/src/logger.py rename to PyTorch/Detection/SSD/ssd/logger.py diff --git a/PyTorch/Detection/SSD/src/model.py b/PyTorch/Detection/SSD/ssd/model.py similarity index 97% rename from PyTorch/Detection/SSD/src/model.py rename to PyTorch/Detection/SSD/ssd/model.py index 91fdb9d4..3da96f48 100644 --- a/PyTorch/Detection/SSD/src/model.py +++ b/PyTorch/Detection/SSD/ssd/model.py @@ -141,12 +141,12 @@ class Loss(nn.Module): self.scale_xy = 1.0/dboxes.scale_xy self.scale_wh = 1.0/dboxes.scale_wh - self.sl1_loss = nn.SmoothL1Loss(reduce=False) + self.sl1_loss = nn.SmoothL1Loss(reduction='none') self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim = 0), requires_grad=False) # Two factor are from following links # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html - self.con_loss = nn.CrossEntropyLoss(reduce=False) + self.con_loss = nn.CrossEntropyLoss(reduction='none') def _loc_vec(self, loc): """ @@ -187,7 +187,7 @@ class Loss(nn.Module): neg_mask = con_rank < neg_num #print(con.shape, mask.shape, neg_mask.shape) - closs = (con*(mask.float() + neg_mask.float())).sum(dim=1) + closs = (con*((mask + neg_mask).float())).sum(dim=1) # avoid no object detected total_loss = sl1 + closs diff --git a/PyTorch/Detection/SSD/src/train.py b/PyTorch/Detection/SSD/ssd/train.py similarity index 66% rename from PyTorch/Detection/SSD/src/train.py rename to PyTorch/Detection/SSD/ssd/train.py index 
68e1a13b..011f8210 100644 --- a/PyTorch/Detection/SSD/src/train.py +++ b/PyTorch/Detection/SSD/ssd/train.py @@ -15,20 +15,17 @@ from torch.autograd import Variable import torch import time -from SSD import _C as C from apex import amp -def train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std): +def train_loop(model, loss_func, scaler, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std): for nbatch, data in enumerate(train_dataloader): img = data[0][0][0] bbox = data[0][1][0] label = data[0][2][0] label = label.type(torch.cuda.LongTensor) bbox_offsets = data[0][3][0] - # handle random flipping outside of DALI for now bbox_offsets = bbox_offsets.cuda() - img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False) img.sub_(mean).div_(std) if not args.no_cuda: img = img.cuda() @@ -40,44 +37,38 @@ def train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, if bbox_offsets[-1].item() == 0: print("No labels in batch") continue - bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5) + # output is ([N*8732, 4], [N*8732], need [N, 8732, 4], [N, 8732] respectively M = bbox.shape[0] // N bbox = bbox.view(N, M, 4) label = label.view(N, M) - ploc, plabel = model(img) - ploc, plabel = ploc.float(), plabel.float() + with torch.cuda.amp.autocast(enabled=args.amp): + ploc, plabel = model(img) - trans_bbox = bbox.transpose(1, 2).contiguous().cuda() + ploc, plabel = ploc.float(), plabel.float() + trans_bbox = bbox.transpose(1, 2).contiguous().cuda() + gloc = Variable(trans_bbox, requires_grad=False) + glabel = Variable(label, requires_grad=False) - if not args.no_cuda: - label = label.cuda() - gloc = Variable(trans_bbox, requires_grad=False) - glabel = Variable(label, requires_grad=False) - - loss = loss_func(ploc, plabel, gloc, glabel) - - if args.local_rank == 0: - logger.update_iter(epoch, iteration, loss.item()) - - if args.amp: - with amp.scale_loss(loss, optim) as scale_loss: - scale_loss.backward() - else: - loss.backward() + loss = loss_func(ploc, plabel, gloc, glabel) if args.warmup is not None: warmup(optim, args.warmup, iteration, args.learning_rate) - optim.step() + scaler.scale(loss).backward() + scaler.step(optim) + scaler.update() optim.zero_grad() + + if args.local_rank == 0: + logger.update_iter(epoch, iteration, loss.item()) iteration += 1 return iteration -def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std): +def benchmark_train_loop(model, loss_func, scaler, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std): start_time = None # tensor for results result = torch.zeros((1,)).cuda() @@ -91,54 +82,40 @@ def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_d label = data[0][2][0] label = label.type(torch.cuda.LongTensor) bbox_offsets = data[0][3][0] - # handle random flipping outside of DALI for now bbox_offsets = bbox_offsets.cuda() - img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False) - + img.sub_(mean).div_(std) if not args.no_cuda: img = img.cuda() bbox = bbox.cuda() label = label.cuda() bbox_offsets = bbox_offsets.cuda() - img.sub_(mean).div_(std) N = img.shape[0] if bbox_offsets[-1].item() == 0: print("No labels in batch") continue - bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5) # output is ([N*8732, 4], [N*8732], 
need [N, 8732, 4], [N, 8732] respectively M = bbox.shape[0] // N bbox = bbox.view(N, M, 4) label = label.view(N, M) + with torch.cuda.amp.autocast(enabled=args.amp): + ploc, plabel = model(img) + ploc, plabel = ploc.float(), plabel.float() + trans_bbox = bbox.transpose(1, 2).contiguous().cuda() + gloc = Variable(trans_bbox, requires_grad=False) + glabel = Variable(label, requires_grad=False) + loss = loss_func(ploc, plabel, gloc, glabel) + if args.warmup is not None: + warmup(optim, args.warmup, iteration, args.learning_rate) - ploc, plabel = model(img) - ploc, plabel = ploc.float(), plabel.float() - - trans_bbox = bbox.transpose(1, 2).contiguous().cuda() - - if not args.no_cuda: - label = label.cuda() - gloc = Variable(trans_bbox, requires_grad=False) - glabel = Variable(label, requires_grad=False) - - loss = loss_func(ploc, plabel, gloc, glabel) - - - - # loss scaling - if args.amp: - with amp.scale_loss(loss, optim) as scale_loss: - scale_loss.backward() - else: - loss.backward() - - optim.step() + scaler.scale(loss).backward() + scaler.step(optim) + scaler.update() optim.zero_grad() if nbatch >= args.benchmark_warmup + args.benchmark_iterations: @@ -162,38 +139,30 @@ def loop(dataloader, reset=True): if reset: dataloader.reset() -def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std): +def benchmark_inference_loop(model, loss_func, scaler, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std): assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu' - start_time = None model.eval() - - i = -1 val_datas = loop(val_dataloader, False) - while True: - i += 1 + for i in range(args.benchmark_warmup + args.benchmark_iterations): torch.cuda.synchronize() - if i >= args.benchmark_warmup: - start_time = time.time() + start_time = time.time() data = next(val_datas) - + img = data[0] with torch.no_grad(): - img = data[0] if not args.no_cuda: img = img.cuda() - if args.amp: - img = img.half() img.sub_(mean).div_(std) - img = Variable(img, requires_grad=False) - _ = model(img) - torch.cuda.synchronize() + with torch.cuda.amp.autocast(enabled=args.amp): + _ = model(img) - if i >= args.benchmark_warmup + args.benchmark_iterations: - break + torch.cuda.synchronize() + end_time = time.time() - if i >= args.benchmark_warmup: - logger.update(args.eval_batch_size, time.time() - start_time) + + if i >= args.benchmark_warmup: + logger.update(args.eval_batch_size, end_time - start_time) logger.print_result() diff --git a/PyTorch/Detection/SSD/src/utils.py b/PyTorch/Detection/SSD/ssd/utils.py similarity index 100% rename from PyTorch/Detection/SSD/src/utils.py rename to PyTorch/Detection/SSD/ssd/utils.py diff --git a/hubconf.py b/hubconf.py index bca7e2ae..aa0d1076 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,7 +1,7 @@ import os import sys -from PyTorch.Detection.SSD.src import nvidia_ssd, nvidia_ssd_processing_utils +from PyTorch.Detection.SSD.ssd import nvidia_ssd, nvidia_ssd_processing_utils sys.path.append(os.path.join(sys.path[0], 'PyTorch/Detection/SSD')) from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import nvidia_tacotron2
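
Note on the mixed-precision change in ssd/train.py above: the patch drops apex.amp loss scaling in favor of PyTorch-native AMP, wrapping the forward pass in torch.cuda.amp.autocast and driving the backward pass and optimizer step through a torch.cuda.amp.GradScaler. Below is a minimal, hedged sketch of that pattern only; the tiny linear model, optimizer, and random tensors are placeholders for illustration and are not code from this repository.

    import torch

    # Placeholder model, optimizer, and loss standing in for SSD300 and its Loss module.
    model = torch.nn.Linear(300, 4).cuda()
    optim = torch.optim.SGD(model.parameters(), lr=1e-3)
    loss_func = torch.nn.MSELoss()

    use_amp = True                              # corresponds to args.amp in the patched loop
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    for _ in range(10):
        img = torch.randn(8, 300, device="cuda")      # stand-in for a DALI batch
        target = torch.randn(8, 4, device="cuda")     # stand-in for encoded boxes/labels

        # Forward pass under autocast; outputs are cast back to float32 before the loss,
        # the same way the updated loop handles ploc/plabel.
        with torch.cuda.amp.autocast(enabled=use_amp):
            out = model(img)
        loss = loss_func(out.float(), target)

        # Scaled backward pass and optimizer step, replacing apex amp.scale_loss.
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
        optim.zero_grad()

The same scaler object is created once per training run and passed into train_loop / benchmark_train_loop in the patched code, which is why their signatures gain a scaler argument.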