Merge pull request #12 from NVIDIA/nvpstr/master

Adding 9 new models (6 in TensorFlow and 3 in PyTorch)
Author: nvpstr, 2019-03-18 20:57:20 +01:00 (committed by GitHub)
Commit: b1ae8dd47c
1201 changed files with 358857 additions and 48804 deletions

.gitmodules (vendored): 3 lines changed

@@ -1,3 +0,0 @@
[submodule "TensorFlow/OpenSeq2Seq"]
path = TensorFlow/OpenSeq2Seq
url = https://github.com/NVIDIA/OpenSeq2Seq


@@ -1,19 +0,0 @@
#!/usr/bin/env sh
# This script downloads the CIFAR-10 (binary version) data and unzips it.
set -e
cd "$( cd "$(dirname "$0")" ; pwd -P )"
echo "Downloading..."
wget --no-check-certificate http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
echo "Unzipping..."
tar -xf cifar-10-binary.tar.gz && rm -f cifar-10-binary.tar.gz
mv cifar-10-batches-bin/* . && rm -rf cifar-10-batches-bin
# Creation is split out because leveldb sometimes causes a segfault
# and needs to be re-created.
echo "Done."


@@ -1,7 +0,0 @@
#!/bin/bash
set -e
cd "$( cd "$(dirname "$0")" ; pwd -P )"
# Create CIFAR10 train + test databases
make_cifar_db --db lmdb --input_folder "$(pwd)" --output_train_db_name cifar10_train_lmdb --output_test_db_name cifar10_test_lmdb


@@ -1,208 +0,0 @@
#!/usr/bin/env python
"""Example: train a model on CIFAR10."""
from __future__ import division, print_function
import argparse
import functools
import logging
import os.path
from caffe2.python import brew, core, data_parallel_model, optimizer, workspace
from caffe2.python.core import DataType
from caffe2.python.model_helper import ModelHelper
from caffe2.python.modeling.initializers import Initializer, pFP16Initializer
logging.basicConfig()
TRAIN_ENTRIES = 50000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 10
DISPLAY = 100
ACCURACY_MIN = 0.7
ACCURACY_MAX = 0.8
def AddInputOps(model, reader, batch_size, dtype):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=32, crop=32, mirror=1, color=True, mean=128.0,
output_type='float16' if dtype == DataType.FLOAT16 else 'float',
is_test=False)
data = model.StopGradient(data, data)
def AddForwardPassOps(model, loss_scale, dtype):
"""Add forward pass ops and return a list of losses."""
initializer = (pFP16Initializer if dtype == DataType.FLOAT16
else Initializer)
with brew.arg_scope([brew.conv, brew.fc],
WeightInitializer=initializer,
BiasInitializer=initializer):
conv1 = brew.conv(model, 'data', 'conv1', 3, 32, 5, pad=2,
weight_init=('GaussianFill',
{'std': 0.0001, 'mean': 0.0}))
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=3, stride=2)
relu1 = brew.relu(model, pool1, 'relu1')
conv2 = brew.conv(model, relu1, 'conv2', 32, 32, 5, pad=2,
weight_init=('GaussianFill', {'std': 0.01}))
conv2 = brew.relu(model, conv2, conv2)
pool2 = brew.average_pool(model, conv2, 'pool2', kernel=3, stride=2)
conv3 = brew.conv(model, pool2, 'conv3', 32, 64, 5, pad=2,
weight_init=('GaussianFill', {'std': 0.01}))
conv3 = brew.relu(model, conv3, conv3)
pool3 = brew.average_pool(model, conv3, 'pool3', kernel=3, stride=2)
fc1 = brew.fc(model, pool3, 'fc1', 64 * 3 * 3, 64,
weight_init=('GaussianFill', {'std': 0.1}))
fc2 = brew.fc(model, fc1, 'fc2', 64, 10,
weight_init=('GaussianFill', {'std': 0.1}))
if dtype == DataType.FLOAT16:
fc2 = model.net.HalfToFloat(fc2, fc2 + '_fp32')
softmax, loss = model.SoftmaxWithLoss([fc2, 'label'], ['softmax', 'loss'])
loss = model.Scale(loss, loss, scale=loss_scale)
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.add_weight_decay(model, 0.004)
stepsize = TRAIN_ENTRIES * EPOCHS // BATCH_SIZE
optimizer.build_sgd(
model, 0.001,
policy='step', stepsize=stepsize, gamma=0.1,
momentum=0.9, nesterov=False)
def AddPostSyncOps(model):
"""Add ops which run after the initial parameter sync."""
for param_info in model.GetOptimizationParamInfo(model.GetParams()):
if param_info.blob_copy is not None:
# Ensure copies are in sync after initial broadcast
model.param_init_net.HalfToFloat(
param_info.blob,
param_info.blob_copy[core.DataType.FLOAT]
)
def createTrainModel(lmdb_path, devices, dtype):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices)), dtype=dtype),
forward_pass_builder_fun=functools.partial(
AddForwardPassOps, dtype=dtype),
optimizer_builder_fun=AddOptimizerOps,
post_sync_builder_fun=AddPostSyncOps,
devices=devices, use_nccl=True)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path, devices, dtype):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices)), dtype=dtype),
forward_pass_builder_fun=functools.partial(
AddForwardPassOps, dtype=dtype),
param_update_builder_fun=None,
devices=devices)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'cifar10_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'cifar10_test_lmdb'))
parser.add_argument('--dtype', choices=['float', 'float16'],
default='float', help='Data type used for training')
parser.add_argument('--gpus',
help='Comma separated list of GPU devices to use')
parser.add_argument('--num_gpus', type=int, default=1,
help='Number of GPU devices (instead of --gpus)')
parser.add_argument('--all-gpus', action='store_true',
help='Use all GPUs in the system')
args = parser.parse_args()
args.dtype = (DataType.FLOAT16 if args.dtype == 'float16'
else DataType.FLOAT)
if args.all_gpus:
args.num_gpus = workspace.NumCudaDevices()
args.gpus = range(args.num_gpus)
else:
if args.gpus is not None:
args.gpus = [int(x) for x in args.gpus.split(',')]
args.num_gpus = len(args.gpus)
else:
args.gpus = range(args.num_gpus)
args.num_gpus = args.num_gpus
return args
def main(args):
"""Train and test."""
train_model = createTrainModel(args.train_lmdb, args.gpus, args.dtype)
test_model = createTestModel(args.test_lmdb, args.gpus, args.dtype)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
scope_prefix = 'gpu_%d/' % args.gpus[0]
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob(scope_prefix + 'loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
# Take average values across all GPUs
losses.append(sum(
workspace.FetchBlob('gpu_%d/loss' % g) for g in args.gpus
) / len(args.gpus))
accuracies.append(sum(
workspace.FetchBlob('gpu_%d/accuracy' % g) for g in args.gpus
) / len(args.gpus))
loss = sum(losses) / len(losses)
accuracy = sum(accuracies) / len(accuracies)
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())


@@ -1,2 +0,0 @@
*.mdb
*-ubyte


@@ -1,14 +0,0 @@
#!/usr/bin/env sh
# This script downloads the MNIST data and unzips it.
cd "$( cd "$(dirname "$0")" ; pwd -P )"
echo "Downloading..."
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e $fname ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done


@@ -1,7 +0,0 @@
#!/bin/bash
cd "$( cd "$(dirname "$0")" ; pwd -P )"
# Create MNIST databases from previously downloaded data
make_mnist_db --db lmdb --image_file train-images-idx3-ubyte --label_file train-labels-idx1-ubyte --output_file mnist_train_lmdb
make_mnist_db --db lmdb --image_file t10k-images-idx3-ubyte --label_file t10k-labels-idx1-ubyte --output_file mnist_test_lmdb


@@ -1,133 +0,0 @@
#!/usr/bin/env python
"""Example: train LeNet on MNIST."""
from __future__ import division, print_function
import argparse
import os.path
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, optimizer, workspace
from caffe2.python.model_helper import ModelHelper
TRAIN_ENTRIES = 60000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
def AddInputOps(model, reader, batch_size):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
is_test=False)
data = model.StopGradient(data, data)
def AddForwardPassOps(model):
"""Add forward pass ops and return a list of losses."""
conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
fc3 = brew.relu(model, fc3, fc3)
pred = brew.fc(model, fc3, 'pred', 500, 10)
softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.build_sgd(model, 0.01,
policy='step', stepsize=1, gamma=0.999,
momentum=0.9, nesterov=False)
def createTrainModel(lmdb_path):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
losses = AddForwardPassOps(model)
model.AddGradientOperators(losses)
AddOptimizerOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
AddForwardPassOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'mnist_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'mnist_test_lmdb'))
args = parser.parse_args()
return args
def main(args):
"""Train and test."""
device = 0
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, device)):
train_model = createTrainModel(args.train_lmdb)
test_model = createTestModel(args.test_lmdb)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob('loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
losses.append(workspace.FetchBlob('loss'))
accuracies.append(workspace.FetchBlob('accuracy'))
loss = np.array(losses).mean()
accuracy = np.array(accuracies).mean()
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())


@@ -1,139 +0,0 @@
#!/usr/bin/env python
"""Example: train LeNet on MNIST (with fp16)."""
from __future__ import division, print_function
import argparse
import os.path
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, optimizer, workspace
from caffe2.python.model_helper import ModelHelper
from caffe2.python.modeling.initializers import pFP16Initializer
TRAIN_ENTRIES = 60000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
def AddInputOps(model, reader, batch_size):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
output_type='float16', is_test=True)
data = model.StopGradient(data, data)
def AddForwardPassOps(model):
"""Add forward pass ops and return a list of losses."""
with brew.arg_scope([brew.conv, brew.fc],
WeightInitializer=pFP16Initializer,
BiasInitializer=pFP16Initializer):
conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
fc3 = brew.relu(model, fc3, fc3)
pred = brew.fc(model, fc3, 'pred', 500, 10)
# Cast back to fp32 for remaining ops
pred = model.net.HalfToFloat(pred, pred + '_fp32')
softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.build_sgd(model, 0.01,
policy='step', stepsize=1, gamma=0.999,
momentum=0.9, nesterov=False)
def createTrainModel(lmdb_path):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
losses = AddForwardPassOps(model)
model.AddGradientOperators(losses)
AddOptimizerOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
AddForwardPassOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'mnist_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'mnist_test_lmdb'))
args = parser.parse_args()
return args
def main(args):
"""Train and test."""
device = 0
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, device)):
train_model = createTrainModel(args.train_lmdb)
test_model = createTestModel(args.test_lmdb)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob('loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
losses.append(workspace.FetchBlob('loss'))
accuracies.append(workspace.FetchBlob('accuracy'))
loss = np.array(losses).mean()
accuracy = np.array(accuracies).mean()
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())


@@ -1,159 +0,0 @@
#!/usr/bin/env python
"""Example: train LeNet on MNIST (with multi-GPU)."""
from __future__ import division, print_function
import argparse
import functools
import logging
import os.path
from caffe2.python import brew, core, data_parallel_model, optimizer, workspace
from caffe2.python.model_helper import ModelHelper
logging.basicConfig()
TRAIN_ENTRIES = 60000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
def AddInputOps(model, reader, batch_size):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
is_test=True)
data = model.StopGradient(data, data)
def AddForwardPassOps(model, loss_scale):
"""Add forward pass ops and return a list of losses."""
conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
fc3 = brew.relu(model, fc3, fc3)
pred = brew.fc(model, fc3, 'pred', 500, 10)
softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
loss = model.Scale(loss, loss, scale=loss_scale)
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.build_sgd(model, 0.01,
policy='step', stepsize=1, gamma=0.999,
momentum=0.9, nesterov=False)
def createTrainModel(lmdb_path, devices):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices))),
forward_pass_builder_fun=AddForwardPassOps,
optimizer_builder_fun=AddOptimizerOps,
devices=devices, use_nccl=True)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path, devices):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices))),
forward_pass_builder_fun=AddForwardPassOps,
param_update_builder_fun=None,
devices=devices)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'mnist_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'mnist_test_lmdb'))
parser.add_argument('--gpus',
help='Comma separated list of GPU devices to use')
parser.add_argument('--num_gpus', type=int, default=1,
help='Number of GPU devices (instead of --gpus)')
args = parser.parse_args()
if args.gpus is not None:
args.gpus = [int(x) for x in args.gpus.split(',')]
args.num_gpus = len(args.gpus)
else:
args.gpus = range(args.num_gpus)
args.num_gpus = args.num_gpus
return args
def main(args):
"""Train and test."""
train_model = createTrainModel(args.train_lmdb, args.gpus)
test_model = createTestModel(args.test_lmdb, args.gpus)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
scope_prefix = 'gpu_%d/' % args.gpus[0]
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob(scope_prefix + 'loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
# Take average values across all GPUs
losses.append(sum(
workspace.FetchBlob('gpu_%d/loss' % g) for g in args.gpus
) / len(args.gpus))
accuracies.append(sum(
workspace.FetchBlob('gpu_%d/accuracy' % g) for g in args.gpus
) / len(args.gpus))
loss = sum(losses) / len(losses)
accuracy = sum(accuracies) / len(accuracies)
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())


@@ -0,0 +1,20 @@
FROM gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.03-py3-stage
# Set working directory
WORKDIR /mlperf
RUN apt-get update && apt-get install -y python3-tk python-pip git tmux htop tree
# Necessary pip packages
RUN pip install --upgrade pip
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN python3 -m pip install pycocotools==2.0.0
# Copy SSD code
COPY ./setup.py .
COPY ./csrc ./csrc
RUN pip install .
COPY . .


@@ -0,0 +1,203 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2018 NVIDIA Corporation
Copyright 2018 The MLPerf Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -0,0 +1,344 @@
# SSD300 v1.1 For PyTorch
## Table Of Contents
* [The model](#the-model)
* [Default configuration](#default-configuration)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick start guide](#quick-start-guide)
* [Details](#details)
* [Command line arguments](#command-line-arguments)
* [Getting the data](#getting-the-data)
* [Training process](#training-process)
* [Data preprocessing](#data-preprocessing)
* [Data augmentation](#data-augmentation)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training performance results](#training-performance-results)
* [Inference performance results](#inference-performance-results)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## The model
The SSD300 v1.1 model is based on the
[SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325) paper, which
describes SSD as "a method for detecting objects in images using a single deep neural network".
The input size is fixed to 300x300.
The main difference between this model and the one described in the paper is in the backbone.
Specifically, the obsolete VGG backbone is replaced by a ResNet-50 model.
From the
[Speed/accuracy trade-offs for modern convolutional object detectors](https://arxiv.org/abs/1611.10012)
paper, the following enhancements were made to the backbone:
* The conv5_x, avgpool, fc and softmax layers were removed from the original classification model.
* All strides in conv4_x are set to 1x1.
The backbone is followed by 5 additional convolutional layers.
In addition to the convolutional layers, we attached 6 detection heads:
* The first detection head is attached to the last conv4_x layer.
* The other five detection heads are attached to the corresponding 5 additional layers.
The detection heads are similar to the ones referenced in the paper; however,
they are enhanced by additional BatchNorm layers after each convolution (an illustrative sketch of the backbone changes is given after this description).
Additionally, we removed weight decay on every bias parameter and
all the BatchNorm layer parameters as described in the
[Highly Scalable Deep Learning Training System with Mixed-Precision:
Training ImageNet in Four Minutes](https://arxiv.org/abs/1807.11205) paper.
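As an illustration of the backbone changes listed above, the following sketch truncates a torchvision ResNet-50 and resets the conv4_x strides. It is a hedged approximation only (the 2019-era `pretrained=True` API is assumed), not the exact model code in this repository.
```
# Illustrative sketch only -- approximates the backbone changes described above.
import torch.nn as nn
import torchvision.models as models

def build_truncated_resnet50_backbone():
    resnet = models.resnet50(pretrained=True)  # 2019-era torchvision API
    # Drop conv5_x (layer4), avgpool, fc and softmax; keep everything up to
    # and including conv4_x (layer3).
    backbone = nn.Sequential(
        resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool,
        resnet.layer1, resnet.layer2, resnet.layer3,
    )
    # Set all strides in conv4_x to 1x1 so the feature map resolution is
    # preserved for the first detection head.
    for module in resnet.layer3.modules():
        if isinstance(module, nn.Conv2d):
            module.stride = (1, 1)
    return backbone
```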
This model trains with mixed precision using Tensor Cores on Volta, so you can get results much faster than training without Tensor Cores.
This model is tested against each NGC monthly container release to ensure
consistent accuracy and performance over time.
Because of these enhancements, the SSD300 v1.1 model achieves higher accuracy.
Training SSD requires computationally costly augmentations; to fully utilize GPUs during training, we use the [NVIDIA DALI](https://github.com/NVIDIA/DALI) library to accelerate the data preparation pipeline.
### Default configuration
We trained the model for 65 epochs with the following setup:
* SGD with momentum (0.9)
* Learning rate = 2.6e-3 * number of GPUs * (batch_size / 32)
* Learning rate decay: multiplied by 0.1 before epochs 43 and 54
* We use linear warmup of the learning rate during the first epoch. For more information, see the
[Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677) paper.
To enable warmup, provide the `--warmup 300` argument.
* Weight decay:
* 0 for BatchNorms and biases
* 5e-4 for other layers
**Note**: The learning rate is automatically scaled (in other words, multiplied by the number of GPUs and by the batch size divided by 32). A sketch of the resulting schedule is shown below.
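The following is a small, hedged sketch of the schedule described above (linear scaling, linear warmup, and step decay); the constants mirror the text, and the function is illustrative rather than the actual `main.py` implementation.
```
# Hedged sketch of the learning-rate schedule described above; not main.py code.
def learning_rate(epoch, iteration, num_gpus, batch_size,
                  base_lr=2.6e-3, warmup_iters=300, decay_epochs=(43, 54)):
    # Linear scaling rule: multiply by the number of GPUs and by batch_size/32.
    lr = base_lr * num_gpus * (batch_size / 32)
    # Linear warmup during the first epoch (enabled with --warmup 300).
    if epoch == 0 and iteration < warmup_iters:
        lr *= (iteration + 1) / warmup_iters
    # Step decay: multiply by 0.1 before epochs 43 and 54.
    for decay_epoch in decay_epochs:
        if epoch >= decay_epoch:
            lr *= 0.1
    return lr

# Example: 8 GPUs with a per-GPU batch size of 64, well past warmup.
print(learning_rate(epoch=50, iteration=0, num_gpus=8, batch_size=64))
```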
## Setup
The following section lists the requirements needed to start training the SSD300 v1.1 model.
### Requirements
This repository contains a `Dockerfile` which extends the PyTorch 19.03 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following software:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.03-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* [NVIDIA Volta based GPU](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
## Quick Start Guide
To train your model using mixed precision with Tensor Cores, perform the
following steps using the default parameters of the SSD v1.1 model on the
[COCO 2017](http://cocodataset.org/#download) dataset.
### 1. Download and preprocess the dataset.
Extract the COCO 2017 dataset with `download_dataset.sh $COCO_DIR`.
Data will be downloaded to the `$COCO_DIR` directory (on the host).
### 2. Build the SSD300 v1.1 PyTorch NGC container.
`docker build . -t nvidia_ssd`
### 3. Launch the NGC container to run training/inference.
`nvidia-docker run --rm -it --ulimit memlock=-1 --ulimit stack=67108864 -v $COCO_DIR:/coco --ipc=host nvidia_ssd`
**Note**: the default mount point in the container is `/coco`.
### 4. Start training.
The `./examples` directory provides several sample scripts for various GPU settings,
which act as wrappers around the `main.py` script.
The example scripts need two arguments:
- A path to the root SSD directory.
- A path to the COCO 2017 dataset.
The remaining arguments are passed to the `main.py` script.
The `--save` flag saves the model after each epoch.
The checkpoints are stored as `./models/epoch_*.pt`.
Use `python main.py -h` to obtain the list of available options in the `main.py` script.
For example, if you want to run 8-GPU training with Tensor Core acceleration and
save checkpoints after each epoch, run:
`bash ./examples/SSD300_FP16_8GPU.sh . /coco --save`
For information about how to train using mixed precision, see the [Mixed Precision Training paper](https://arxiv.org/abs/1710.03740) and [Training With Mixed Precision documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).
For PyTorch, mixed-precision support is easily added with NVIDIA's [APEX](https://github.com/NVIDIA/apex), a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Core performance.
### 5. Start validation/evaluation.
The `main.py` training script automatically runs validation during training.
The results from the validation are printed to stdout.
The open-source pycocotools scripts provide a consistent way to evaluate models on the COCO dataset. We use these scripts during validation to measure model performance with the AP metric. The metrics below are evaluated using the pycocotools methodology and reported in the following format:
```
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.250
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.423
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.257
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.076
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.269
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.399
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.237
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.342
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.358
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.118
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.394
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.548
```
The metric reported in our results is the one in the first row; a minimal example of producing this summary with pycocotools follows.
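The summary above is the standard output of pycocotools' `COCOeval.summarize()`. The sketch below shows one hedged way to produce it from a detections file; the paths and `predictions.json` name are placeholders, not artifacts written by this repository.
```
# Minimal pycocotools evaluation sketch; paths and file names are placeholders.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO('/coco/annotations/instances_val2017.json')  # ground truth
coco_dt = coco_gt.loadRes('predictions.json')               # model detections

coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()   # prints the AP/AR table shown above
# coco_eval.stats[0] is the AP @[IoU=0.50:0.95] value reported in our results.
```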
To evaluate a model checkpoint saved in the previous step, run:
`python ./main.py --backbone resnet50 --mode evaluation --checkpoint ./models/epoch_*.pt --data /coco`
### 6. Optionally, resume training from a checkpointed model.
`python ./main.py --backbone resnet50 --checkpoint ./models/epoch_*.pt --data /coco`
## Details
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Command line arguments
All these parameters can be controlled by passing command line arguments to the `main.py` script. To get a complete list of all command line arguments with descriptions and default values, run:
`python main.py --help`
### Getting the data
The SSD model was trained on the COCO 2017 dataset. The val2017 validation set was used as a validation dataset. PyTorch can work directly on JPEGs; therefore, no ahead-of-time preprocessing/augmentation of the dataset is needed.
This repository contains the `download_dataset.sh` download script which will automatically
download and preprocess the training, validation and test datasets. By default,
data will be downloaded to the `/coco` directory.
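As a quick illustration (not the data loader used by this repository), the COCO 2017 layout produced by `download_dataset.sh` can be read with torchvision's `CocoDetection` dataset; the paths below assume the default `/coco` mount point.
```
# Illustrative only: reading the downloaded COCO 2017 layout with torchvision.
from torchvision.datasets import CocoDetection

train_dataset = CocoDetection(
    root='/coco/train2017',
    annFile='/coco/annotations/instances_train2017.json')
image, targets = train_dataset[0]   # PIL image and a list of annotation dicts
print(len(train_dataset), len(targets))
```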
### Training process
Training the SSD model is implemented in the `main.py` script.
By default, training runs for 65 epochs. Because evaluation is relatively time consuming,
it does not run every epoch. With default settings, evaluation is executed after epochs
21, 31, 37, 42, 48, 53, 59 and 64. The model is evaluated using pycocotools distributed with
the COCO dataset.
The epochs at which evaluation runs can be reconfigured with the `--evaluation` argument.
To run training with Tensor Cores, use the `--fp16` flag when running the `main.py` script.
The `--save` flag enables storing checkpoints after each epoch under `./models/epoch_*.pt`.
#### Data preprocessing
Before we feed data to the model, both during training and inference, we perform:
* Normalization
* Encoding bounding boxes
* Resize to 300x300
#### Data augmentation
During training, we apply the following augmentation techniques (an illustrative pipeline is sketched after this list):
* Random crop
* Random horizontal flip
* Color jitter
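For illustration, a comparable pipeline can be expressed with torchvision transforms. This sketch is not the DALI pipeline used by the repository, the jitter/crop parameters are assumed values, and bounding-box encoding against the default boxes happens separately.
```
# Illustrative torchvision pipeline approximating the steps above; the
# repository uses NVIDIA DALI and its own box-aware crop/flip instead.
import torchvision.transforms as T

train_transform = T.Compose([
    T.RandomResizedCrop(300, scale=(0.3, 1.0)),   # random crop, resized to 300x300
    T.RandomHorizontalFlip(p=0.5),                # random horizontal flip
    T.ColorJitter(brightness=0.125, contrast=0.5,
                  saturation=0.5, hue=0.05),      # color jitter
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],       # normalization
                std=[0.229, 0.224, 0.225]),
])
```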
### Enabling mixed precision
[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) training previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP) library from [APEX](https://github.com/NVIDIA/apex), which casts variables to half-precision upon retrieval
while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients.
In PyTorch, loss scaling can be easily applied by using the `scale_loss()` method provided by AMP. The scaling value can be either dynamic or fixed.
For an in-depth walk through on AMP, check out sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage tensor cores performance.
To enable mixed precision, you can:
- Import AMP from APEX, for example:
`from apex import amp`
- Initialize an AMP handle, for example:
`amp_handle = amp.init(enabled=True, verbose=True)`
- Wrap your optimizer with the AMP handle, for example:
`optimizer = amp_handle.wrap_optimizer(optimizer)`
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
- Default backpropagate for FP32:
`losses.backward()`
- Scale loss and backpropagate with AMP:
```
with optimizer.scale_loss(losses) as scaled_losses:
    scaled_losses.backward()
```
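Putting the steps above together, here is a hedged, self-contained sketch using the `amp` handle API referenced in this section; the model, optimizer, and loss are placeholders rather than this repository's actual training loop.
```
# Hedged sketch combining the AMP steps above; model, optimizer and loss
# computation are placeholders, not the repository's training code.
import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=2.6e-3, momentum=0.9)

amp_handle = amp.init(enabled=True, verbose=True)      # initialize AMP
optimizer = amp_handle.wrap_optimizer(optimizer)       # wrap the optimizer

inputs = torch.randn(32, 10).cuda()
losses = model(inputs).mean()                          # placeholder loss

with optimizer.scale_loss(losses) as scaled_losses:    # scale the loss
    scaled_losses.backward()                           # backpropagate
optimizer.step()
```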
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
## Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
### Training performance benchmark
The training benchmark was run in various scenarios on V100 16G GPUs. For each scenario, the batch size was set to 32. The benchmark does not require a checkpoint from a fully trained model.
To benchmark training, run:
```
python -m torch.distributed.launch --nproc_per_node={NGPU} \
main.py --batch-size {bs} \
--mode benchmark-training \
--benchmark-warmup 100 \
--benchmark-iterations 200 \
{fp16} \
--data {data}
```
where `{NGPU}` selects the number of GPUs used in the benchmark, `{bs}` is the desired batch size, `{fp16}` is set to `--fp16` if you want to benchmark training with Tensor Cores, and `{data}` is the location of the COCO 2017 dataset.
The benchmark warmup is specified to omit the first iterations of the first epoch, and the benchmark iterations value is the number of iterations used to measure performance; a sketch of this logic is shown below.
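The following hedged sketch illustrates the warmup/measurement split described above; it is illustrative only, since `main.py` implements its own benchmarking modes.
```
# Illustrative warmup/measurement split; not the actual benchmark code.
import time

def benchmark(step_fn, batch_size, warmup=100, iterations=200):
    for _ in range(warmup):          # warmup iterations are not measured
        step_fn()
    start = time.time()
    for _ in range(iterations):      # measured iterations
        step_fn()
    elapsed = time.time() - start
    return iterations * batch_size / elapsed   # throughput in images/second
```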
### Inference performance benchmark
The inference benchmark was run on a single V100 16G GPU. To benchmark inference, run:
```
python main.py --eval-batch-size {bs} \
--mode benchmark-inference \
--benchmark-warmup 100 \
--benchmark-iterations 200 \
{fp16} \
--data {data}
```
where `{bs}` is the desired batch size, `{fp16}` is set to `--fp16` if you want to benchmark inference with Tensor Cores, and `{data}` is the location of the COCO 2017 dataset.
The benchmark warmup is specified to omit the first iterations of the first epoch, and the benchmark iterations value is the number of iterations used to measure performance.
## Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
### Training accuracy results
Our results were obtained by running the `./examples/SSD300_FP{16,32}_{1,4,8}GPU.sh`
scripts in the pytorch-19.03-py3 NGC container on an NVIDIA DGX-1 with 8x V100 16G GPUs. The batch size was chosen to best utilize GPU memory: 32 for FP32 precision and 64 for mixed precision.
| **Number of GPUs** | **Mixed precision mAP** | **Training time with mixed precision** | **FP32 mAP** | **Training time with FP32** |
|:------------------:|:------------------------:|:-------------------------------------:|:------------:|:---------------------------:|
| 1 | 0.2494 | 10h 39min | 0.2483 | 21h 40min |
| 4 | 0.2495 | 2h 53min | 0.2478 | 5h 52min |
| 8 | 0.2489 | 1h 31min | 0.2475 | 2h 54min |
Here are example graphs of FP32 and FP16 training on an 8-GPU configuration:
![TrainingLoss](./img/training_loss.png)
![ValidationAccuracy](./img/validation_accuracy.png)
### Training performance results
Our results were obtained by running the `main.py` script with the
`--mode benchmark-training` flag in the pytorch-19.03-py3 NGC container on NVIDIA DGX-1 with V100 16G GPUs.
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision img/s (median)** | **FP32 img/s (median)** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with mixed precision** | **Multi-gpu weak scaling with FP32** |
|:------------------:|:----------------------:|:----------------------------------:|:-----------------------:|:---------------------------------:|:-----------------------------------------------:|:------------------------------------:|
| 1 | 32 | 217.052 | 102.495 | 2.12 | 1.00 | 1.00 |
| 4 | 32 | 838.457 | 397.797 | 2.11 | 3.86 | 3.88 |
| 8 | 32 | 1639.843 | 789.695 | 2.08 | 7.56 | 7.70 |
To achieve the same results, follow the [Quick start guide](#quick-start-guide) outlined above.
### Inference performance results
Our results were obtained by running the `main.py` script with the `--mode benchmark-inference` flag in the pytorch-19.03-py3 NGC container on an NVIDIA DGX-1 with a single V100 16G GPU.
| **Batch size** | **Mixed precision img/s (median)** | **FP32 img/s (median)** |
|:--------------:|:----------------------------------:|:-----------------------:|
| 2 | 163.12 | 147.91 |
| 4 | 296.60 | 201.62 |
| 8 | 412.52 | 228.16 |
| 16 | 470.10 | 280.57 |
| 32 | 520.54 | 302.43 |
To achieve the same results, follow the [Quick start guide](#quick-start-guide) outlined above.
## Changelog
March 2019
* Initial release
## Known issues
There are no known issues with this model.


@@ -0,0 +1,440 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include <THC/THC.h>
#include <cuda.h>
//#define DEBUG
// calculate the IoU of a single box against another box
__device__
float calc_single_iou(const float4 b1, const float4 b2) {
// (lt), (rb)
float l = max(b1.x, b2.x);
float t = max(b1.y, b2.y);
float r = min(b1.z, b2.z);
float b = min(b1.w, b2.w);
float first = (r - l);
first = (first < 0) ? 0 : first;
float second = (b - t);
second = (second < 0) ? 0 : second;
float intersection = first * second;
float area1 = (b1.w - b1.y) * (b1.z - b1.x);
float area2 = (b2.w - b2.y) * (b2.z - b2.x);
return intersection / (area1 + area2 - intersection);
}
__global__
// boxes1 : [N x 4]
// boxes2 : [M x 4]
// ious : [N x M]
void calc_ious_kernel(const int N_img, const float4 *box1, const int *box1_offsets,
const int M, const float4 *boxes2, float *ious) {
// launch N_img blocks
const int img = blockIdx.x;
// each block, i will run over the box1_N[i] source and M target boxes
// generating box1_N[i] x M outputs
// alias to start of boxes for this image
const float4 *b1 = &box1[box1_offsets[img]];
if (threadIdx.x == 0) {
//printf("offset for img %d : %d\n", img, box1_offsets[img]);
}
// number of boxes for this image from offsets
int N = box1_offsets[img+1] - box1_offsets[img];
for (int i = 0; i < N; ++i) {
// if (threadIdx.x == 0) printf("i : %d\n", i);
const float4 source = b1[i];
// for each source, loop over targets
for (int j = threadIdx.x; j < M; j += blockDim.x) {
const float4 target = boxes2[j];
float iou = calc_single_iou(source, target);
// store the calculated IoU in the correct spot
int out_idx = box1_offsets[img] * M + i * M + j;
ious[out_idx] = iou;
}
}
}
__device__
void reduce_val_idx(int N, volatile float *vals, volatile int *idx) {
// naive: single thread for now
if (threadIdx.x == 0) {
float max_val = vals[0];
int max_idx = idx[0];
for (int i = 1; i < N; ++i) {
if (vals[i] > max_val) {
max_val = vals[i];
max_idx = idx[i];
}
}
vals[0] = max_val;
idx[0] = max_idx;
}
}
/**
* perform remaining parts, storing temporary values in global workspace
* workspace needs N_img * M values, each of 8 bytes (float, int)
**/
template <int BLOCK_SIZE, int MAX_BBOXES_PER_BLOCK>
__global__
void encode(const int N_img, const float4 *bbox_in, const long *labels_in, const int *offsets,
const int M, const float4 *dboxes, // const float *ious,
const float criteria, uint8_t *workspace, float4 *bbox_out, long *label_out) {
// Each block will take a single image's IoU set
const int img = blockIdx.x;
// shared memory for intermediate results
__shared__ volatile float best_bbox_iou_tmp[BLOCK_SIZE];
__shared__ volatile int best_bbox_idx_tmp[BLOCK_SIZE];
// shared memory for final best_bbox_{iou, idx} values
__shared__ volatile float best_bbox_iou[MAX_BBOXES_PER_BLOCK];
__shared__ volatile int best_bbox_idx[MAX_BBOXES_PER_BLOCK];
// index into the global workspace - each image needs (float + int) * M values
volatile float *best_dbox_iou = (float *)&workspace[img * M * 8];
volatile int *best_dbox_idx = (int *)&workspace[img * M * 8 + M * 4];
// number of input bboxes for this image
const int N_rows = offsets[img+1] - offsets[img];
// Check for potential crash
assert(N_rows <= MAX_BBOXES_PER_BLOCK);
#ifdef DEBUG
if (threadIdx.x == 0)
printf("N rows: %d %d to %d (%p - %p)\n", N_rows, offsets[img], offsets[img+1], best_dbox_iou, best_dbox_idx);
#endif
for (int i = threadIdx.x; i < MAX_BBOXES_PER_BLOCK; i += blockDim.x) {
best_bbox_iou[i] = -FLT_MAX;
best_bbox_idx[i] = -1;
}
__syncthreads();
// loop serially over the rows of the IoU set that correspond to this image
int row_num = 0;
for (int i = offsets[img]; i < offsets[img+1]; ++i) {
// reset shmem tallies
best_bbox_iou_tmp[threadIdx.x] = -FLT_MAX;
best_bbox_idx_tmp[threadIdx.x] = -1;
// index into the input buffer
// const float *row = &ious[i * M];
const float4 input_bbox = bbox_in[i];
#ifdef DEBUG
if (threadIdx.x == 0)
printf("%d - %p\n", img, &input_bbox);
#endif
// loop by threads over the columns
for (int j = threadIdx.x; j < M; j += blockDim.x) {
// check and store new max if necessary
const float4 input_dbox = dboxes[j];
// float new_val = row[j];
float new_val = calc_single_iou(input_bbox, input_dbox);
// handle per-row max in shared memory
if (new_val > best_bbox_iou_tmp[threadIdx.x]) {
best_bbox_iou_tmp[threadIdx.x] = new_val;
best_bbox_idx_tmp[threadIdx.x] = j;
}
// handle per-col max in global workspace
if (new_val > best_dbox_iou[j]) {
best_dbox_iou[j] = new_val;
best_dbox_idx[j] = row_num;
#ifdef DEBUG
assert(best_dbox_idx[j] >= 0);
assert(best_dbox_idx[j] < N_rows);
#endif
}
}
// Now we have all the values for this row -- reduce
__syncthreads();
// reduce - output is in max_{val, idx}_row[0]
reduce_val_idx(blockDim.x, best_bbox_iou_tmp, best_bbox_idx_tmp);
#ifdef DEBUG
__syncthreads();
#endif
// store output for row i
if (threadIdx.x == 0) {
best_bbox_iou[row_num] = best_bbox_iou_tmp[0];
best_bbox_idx[row_num] = best_bbox_idx_tmp[0];
#ifdef DEBUG
assert(best_bbox_idx[row_num] >= 0);
assert(best_bbox_idx[row_num] < M);
#endif
}
__syncthreads();
// keep track of _local_ row
row_num++;
}
#ifdef DEBUG
if (threadIdx.x == 0) {
for (int i = 0; i < N_rows; ++i) {
printf("%d - row : %d : best bbox_idx: %d\n", img, i, best_bbox_idx[i]);
}
}
#endif
#ifdef DEBUG
// make sure all best_bbox_{iou, val} are seen by everyone
__syncthreads();
#endif
// At this point we have the maximum values & indices for both bbox and dbox
/*
best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)
idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
best_dbox_idx[best_bbox_idx[idx]] = idx
*/
for (int i = threadIdx.x; i < N_rows; i += blockDim.x) {
int idx = best_bbox_idx[i];
#ifdef DEBUG
assert(idx < M);
assert(idx >= 0);
#endif
best_dbox_iou[idx] = 2.;
best_dbox_idx[idx] = i;
#ifdef DEBUG
printf("%d - set best dbox_idx[%d] to %d\n", img, best_bbox_idx[i], i);
#endif
}
/**
# filter IoU > 0.5
masks = best_dbox_ious > criteria
labels_out = torch.zeros(self.nboxes, dtype=torch.long)
#print(maxloc.shape, labels_in.shape, labels_out.shape)
labels_out[masks] = labels_in[best_dbox_idx[masks]]
bboxes_out = self.dboxes.clone()
bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]
# Transform format to xywh format
x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \
0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \
-bboxes_out[:, 0] + bboxes_out[:, 2], \
-bboxes_out[:, 1] + bboxes_out[:, 3]
bboxes_out[:, 0] = x
bboxes_out[:, 1] = y
bboxes_out[:, 2] = w
bboxes_out[:, 3] = h
return bboxes_out, labels_out
**/
__syncthreads();
for (int i = threadIdx.x; i < M; i += blockDim.x) {
// offset into output arrays: M values per image
// int output_idx = offsets[img] * M + i;
int output_idx = img * M + i;
// reset output labels to background
// NOTE: bbox_out is already cloned from dbox outside of this kernel
label_out[output_idx] = 0;
// Filter IoU > 0.5
bool mask = best_dbox_iou[i] > criteria;
float4 bbox = bbox_out[output_idx];
// copy some labels and bboxes
if (mask) {
// copy label
#ifdef DEBUG
printf("%d : label: local input idx: %d, value: %d\n", i, best_dbox_idx[i], labels_in[offsets[img] + best_dbox_idx[i]]);
// printf("%d : label: local input idx: %d, value: %d\n", i, best_dbox_idx[i], labels_in[offsets[img] + i]);
#endif
label_out[output_idx] = labels_in[offsets[img] + best_dbox_idx[i]];
// grab original box
bbox = bbox_in[offsets[img] + best_dbox_idx[i]];
#ifdef DEBUG
printf("mask %d : %d : %f %f %f %f\n", i, best_dbox_idx[i], bbox.x, bbox.y, bbox.z, bbox.w);
#endif
}
// transfer to xywh
float4 bbox_tmp;
bbox_tmp.x = 0.5 * (bbox.x + bbox.z);
bbox_tmp.y = 0.5 * (bbox.y + bbox.w);
bbox_tmp.z = bbox.z - bbox.x;
bbox_tmp.w = bbox.w - bbox.y;
// write out
bbox_out[output_idx] = bbox_tmp;
}
}
/**
def encode(self, bboxes_in, labels_in, criteria = 0.5):
ious = calc_iou_tensor(bboxes_in, self.dboxes)
best_dbox_ious, best_dbox_idx = ious.max(dim=0)
best_bbox_ious, best_bbox_idx = ious.max(dim=1)
# set best ious 2.0
best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)
idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
best_dbox_idx[best_bbox_idx[idx]] = idx
# filter IoU > 0.5
masks = best_dbox_ious > criteria
labels_out = torch.zeros(self.nboxes, dtype=torch.long)
#print(maxloc.shape, labels_in.shape, labels_out.shape)
labels_out[masks] = labels_in[best_dbox_idx[masks]]
bboxes_out = self.dboxes.clone()
bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]
# Transform format to xywh format
x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \
0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \
-bboxes_out[:, 0] + bboxes_out[:, 2], \
-bboxes_out[:, 1] + bboxes_out[:, 3]
bboxes_out[:, 0] = x
bboxes_out[:, 1] = y
bboxes_out[:, 2] = w
bboxes_out[:, 3] = h
return bboxes_out, labels_out
**/
std::vector<at::Tensor> box_encoder(const int N_img,
const at::Tensor& bbox_input,
const at::Tensor& bbox_offsets,
const at::Tensor& labels_input,
const at::Tensor& dbox,
float criteria) {
// Check everything is on the device
AT_ASSERTM(bbox_input.type().is_cuda(), "bboxes must be a CUDA tensor");
AT_ASSERTM(bbox_offsets.type().is_cuda(), "bbox offsets must be a CUDA tensor");
AT_ASSERTM(labels_input.type().is_cuda(), "labels must be a CUDA tensor");
AT_ASSERTM(dbox.type().is_cuda(), "dboxes must be a CUDA tensor");
// Check at least offsets, bboxes and labels are consistent
// Note: offsets is N+1 vs. N for labels
AT_ASSERTM(N_img + 1 == bbox_offsets.numel(), "must have N_img+1 offsets");
auto num_bbox_total = bbox_offsets[bbox_offsets.numel()-1].item<int>();
#ifdef DEBUG
printf("%d : bboxes: %d\n", (int)bbox_offsets.numel(), num_bbox_total);
#endif
AT_ASSERTM(num_bbox_total <= 2048, "total num bboxes must be <= 2048");
AT_ASSERTM(bbox_input.size(0) == labels_input.size(0), "bbox and labels must have same leading dimension");
const int N = bbox_input.size(0);
const int M = dbox.size(0);
auto stream = at::cuda::getCurrentCUDAStream();
// allocate final outputs (known size)
#ifdef DEBUG
printf("%d x %d\n", N_img * M, 4);
// at::Tensor bbox_out = dbox.type().tensor({N_img * M, 4});
printf("allocating %lu bytes for output labels\n", N_img*M*sizeof(long));
#endif
at::Tensor labels_out = at::empty({N_img * M}, labels_input.options());
THCudaCheck(cudaGetLastError());
// copy default boxes to outputs
#ifdef DEBUG
printf("allocating %lu bytes for output bboxes\n", N_img*M*4*sizeof(float));
#endif
at::Tensor bbox_out = dbox.repeat({N_img, 1});
THCudaCheck(cudaGetLastError());
// need to allocate some workspace
#ifdef DEBUG
printf("allocating %lu bytes for workspace\n", 8*M*N_img);
#endif
// at::Tensor workspace = at::CUDA(at::kByte).zeros({8 * M * N_img});
at::Tensor workspace = at::zeros({8 * M * N_img}, at::CUDA(at::kByte));
THCudaCheck(cudaGetLastError());
// Encode the inputs
const int THREADS_PER_BLOCK = 256;
encode<THREADS_PER_BLOCK, 256><<<N_img, THREADS_PER_BLOCK, 0, stream.stream()>>>(N_img,
(float4*)bbox_input.data<float>(),
labels_input.data<long>(),
bbox_offsets.data<int>(),
M,
(float4*)dbox.data<float>(),
criteria,
workspace.data<uint8_t>(),
(float4*)bbox_out.data<float>(),
labels_out.data<long>());
THCudaCheck(cudaGetLastError());
return {bbox_out, labels_out};
}
at::Tensor calc_ious(const int N_img,
const at::Tensor& boxes1,
const at::Tensor& boxes1_offsets,
const at::Tensor& boxes2) {
const int N = boxes1.size(0);
const int M = boxes2.size(0);
auto stream = at::cuda::getCurrentCUDAStream();
// at::Tensor ious = at::CUDA(at::kFloat).zeros({N, M});
// at::Tensor ious = at::ones(at::CUDA(at::kFloat), {N, M});
at::Tensor ious = at::empty({N, M}, boxes1.options());
// Get IoU of all source x default box pairs
calc_ious_kernel<<<N_img, 256, 0, stream.stream()>>>(
N_img,
(float4*)boxes1.data<float>(),
boxes1_offsets.data<int>(),
M,
(float4*)boxes2.data<float>(),
ious.data<float>());
THCudaCheck(cudaGetLastError());
return ious;
}
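For orientation, here is a minimal Python-side sketch of how these kernels are driven once the extension is built. The module name (`SSD._C`) and the call pattern match the benchmark code added later in this PR; the tensor shapes and values below are illustrative placeholders, not valid boxes.

import torch
from SSD import _C as C

N_img = 2                                                      # images in the batch
dboxes = torch.rand(8732, 4).cuda()                            # [M, 4] default boxes, ltrb scaled to [0, 1]
bboxes = torch.rand(5, 4).cuda()                               # [N_total, 4] ground-truth boxes, ltrb scaled to [0, 1]
offsets = torch.tensor([0, 2, 5], dtype=torch.int32).cuda()    # [N_img + 1] per-image offsets into bboxes
labels = torch.randint(1, 81, (5,), dtype=torch.long).cuda()   # [N_total] class labels

# box_encoder returns xywh boxes of shape [N_img * M, 4] and labels of shape [N_img * M] (0 = background)
bbox_out, label_out = C.box_encoder(N_img, bboxes, offsets, labels, dboxes, 0.5)
# calc_ious returns the [N_total, M] IoU matrix between ground-truth boxes and default boxes
ious = C.calc_ious(N_img, bboxes, offsets, dboxes)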

View file

@ -0,0 +1,81 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
#include <ATen/ATen.h>
namespace py = pybind11;
// Box encoder
std::vector<at::Tensor> box_encoder(const int N_img,
const at::Tensor& bbox_input,
const at::Tensor& bbox_offsets,
const at::Tensor& labels_input,
const at::Tensor& dbox,
const float criteria = 0.5);
std::vector<at::Tensor> random_horiz_flip(
at::Tensor& img,
at::Tensor& bboxes,
const at::Tensor& bbox_offsets,
const float p,
const bool nhwc);
// Fused color jitter application
// ctm [4,4], img [H, W, C]
py::array_t<float> apply_transform(int H, int W, int C, py::array_t<float> img, py::array_t<float> ctm) {
auto img_buf = img.request();
auto ctm_buf = ctm.request();
// printf("H: %d, W: %d, C: %d\n", H, W, C);
py::array_t<float> result{img_buf.size};
auto res_buf = result.request();
float *img_ptr = (float *)img_buf.ptr;
float *ctm_ptr = (float *)ctm_buf.ptr;
float *res_ptr = (float *)res_buf.ptr;
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
float *ptr = &img_ptr[h * W * C + w * C];
float *out_ptr = &res_ptr[h * W * C + w * C];
// manually unroll over C
out_ptr[0] = ctm_ptr[0] * ptr[0] + ctm_ptr[1] * ptr[1] + ctm_ptr[2] * ptr[2] + ctm_ptr[3];
out_ptr[1] = ctm_ptr[4] * ptr[0] + ctm_ptr[5] * ptr[1] + ctm_ptr[6] * ptr[2] + ctm_ptr[7];
out_ptr[2] = ctm_ptr[8] * ptr[0] + ctm_ptr[9] * ptr[1] + ctm_ptr[10] * ptr[2] + ctm_ptr[11];
}
}
result.resize({H, W, C});
return result;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// batched box encoder
m.def("box_encoder", &box_encoder, "box_encoder");
m.def("random_horiz_flip", &random_horiz_flip, "random_horiz_flip");
// Apply fused color jitter
m.def("apply_transform", &apply_transform, "apply_transform");
}
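A small usage sketch for the fused color-jitter entry point above, assuming the extension has been built and is importable as `SSD._C` (the identity matrix here is just a placeholder color transform):

import numpy as np
from SSD import _C as C

H, W, Ch = 300, 300, 3
img = np.random.rand(H, W, Ch).astype(np.float32)    # HWC float image
ctm = np.eye(4, dtype=np.float32)                     # 4x4 color transform matrix; identity = no-op
out = C.apply_transform(H, W, Ch, img, ctm)           # returns an (H, W, C) float array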

View file

@ -0,0 +1,165 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include <THC/THC.h>
#include <cuda.h>
/**
* Each block handles one image (looping over all of its channels and pixels)
**/
template <typename T>
__global__
void HorizFlipImagesAndBoxes(
const int N,
const int C,
const int H,
const int W,
const T* img_in,
float* bboxes,
const int* offsets,
const float p,
const float* flip,
T* img_out,
const bool nhwc) {
// early return if not flipping
if (flip[blockIdx.x] < p) return;
// pointer offset into images
const int img_offset = blockIdx.x * C * H * W;
const T* img = &img_in[img_offset];
T* img_o = &img_out[img_offset];
// flip bboxes
auto bbox_offset_begin = offsets[blockIdx.x];
auto bbox_offset_end = offsets[blockIdx.x + 1];
auto num_bboxes = bbox_offset_end - bbox_offset_begin;
const int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
// bboxes in ltrb format, scaled to [0, 1]
for (int i = thread_idx; i < num_bboxes; i += blockDim.x * blockDim.y) {
float *bbox = &bboxes[(bbox_offset_begin + thread_idx) * 4];
// Could do this inplace, but not register constrained
auto bbox_0 = bbox[0];
auto bbox_2 = bbox[2];
bbox[0] = 1. - bbox_2;
bbox[2] = 1. - bbox_0;
}
if (nhwc) {
// loop over float3 pixels, handle 3 values / thread
for (int h = threadIdx.y; h < H; h += blockDim.y) {
for (int w = threadIdx.x; w < W; w += blockDim.x) {
const T* img_hw = &img[h * W * C + w * C];
T * img_out_hw = &img_o[h * W * C + (W - 1 - w) * C];
for (int c = 0; c < C; ++c) {
img_out_hw[c] = img_hw[c];
}
}
}
} else {
// loop over channels
for (int c = 0; c < C; ++c) {
const T* img_c = &img[c * H * W];
T *img_out_c = &img_o[c * H * W];
// handle tiles of (h, w) at a time
for (int h = threadIdx.y; h < H; h += blockDim.y) {
for (int w = threadIdx.x; w < W; w += blockDim.x) {
const int input_idx = h * W + w;
const int output_idx = h * W + (W - 1 - w);
img_out_c[output_idx] = img_c[input_idx];
}
}
}
}
}
/**
* Take images and their bboxes, randomly flip on horizontal axis
* In/Out: img: NCHW (or NHWC, see the nhwc flag) tensor of N C-channel images of constant (H, W)
* In/Out: bboxes: [N_i, 4] tensor of original bboxes in ltrb format
* In: bbox_offsets: [N] offset values into bboxes
* In: p \in [0, 1): probability of flipping each (img, bbox) pair
* In: nhwc: Tensor in NHWC format
* ----
* Note: allocates a temporary image buffer; bboxes are modified in place and the flipped image copy is returned
*/
std::vector<at::Tensor> random_horiz_flip(
at::Tensor& img,
at::Tensor& bboxes,
const at::Tensor& bbox_offsets,
const float p,
const bool nhwc) {
// dimensions
const int N = img.size(0);
int C, H, W;
if (nhwc) {
C = img.size(3);
H = img.size(1);
W = img.size(2);
} else {
C = img.size(1);
H = img.size(2);
W = img.size(3);
}
assert(img.type().is_cuda());
assert(bboxes.type().is_cuda());
assert(bbox_offsets.type().is_cuda());
// printf("%d %d %d %d\n", N, C, H, W);
// Need temp storage of size img
at::Tensor tmp_img = img.clone();
at::Tensor flip = at::zeros({N}, at::CUDA(at::kFloat)).uniform_(0., 1.);
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
img.type(),
"HorizFlipImagesAndBoxes",
[&] {
HorizFlipImagesAndBoxes<scalar_t><<<N, dim3(16, 16), 0, stream.stream()>>>(
N,
C,
H,
W,
img.data<scalar_t>(),
bboxes.data<float>(),
bbox_offsets.data<int>(),
p,
flip.data<float>(),
tmp_img.data<scalar_t>(),
nhwc);
THCudaCheck(cudaGetLastError());
});
// copy tmp_img -> img
// img = tmp_img;
return {tmp_img, bboxes};
}
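And a matching Python-side sketch for this op, mirroring the call used in the benchmark loop later in this PR (the tensors are placeholders; the last argument selects NHWC layout):

import torch
from SSD import _C as C

img = torch.rand(2, 3, 300, 300).cuda()                        # NCHW batch of 2 images
bboxes = torch.rand(5, 4).cuda()                               # ltrb boxes scaled to [0, 1]
offsets = torch.tensor([0, 2, 5], dtype=torch.int32).cuda()    # per-image offsets into bboxes
flipped_img, flipped_bboxes = C.random_horiz_flip(img, bboxes, offsets, 0.5, False)  # nhwc=False for NCHW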

View file

@ -0,0 +1,8 @@
# Get COCO 2017 data sets
COCO_DIR=${1:-"/coco"}
dir=$(pwd)
mkdir -p "$COCO_DIR"; cd "$COCO_DIR"
curl -O http://images.cocodataset.org/zips/train2017.zip; unzip train2017.zip
curl -O http://images.cocodataset.org/zips/val2017.zip; unzip val2017.zip
curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip; unzip annotations_trainval2017.zip
cd $dir

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP16 on 1 GPU with a batch size of 64
# Usage bash SSD300_FP16_1GPU.sh <path to this repository> <path to dataset> <additional flags>
python $1/main.py --backbone resnet50 --warmup 300 --bs 64 --fp16 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP16 on 4 GPUs using 256 batch size (64 per GPU)
# Usage ./SSD300_FP16_4GPU.sh <path to this repository> <path to dataset> <additional flags>
python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --fp16 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP16 on 8 GPUs using 512 batch size (64 per GPU)
# Usage ./SSD300_FP16_8GPU.sh <path to this repository> <path to dataset> <additional flags>
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --fp16 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script evaluates SSD300 model in FP16 using 32 batch size on 1 GPU
# Usage: ./SSD300_FP16_EVAL.sh <path to this repository> <path to dataset> <path to checkpoint> <additional flags>
python $1/main.py --backbone resnet50 --fp16 --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 inference benchmark in FP16 on 1 GPU with 64 batch size
# Usage bash SSD300_FP16_INFERENCE_BENCHMARK.sh <path to this repository> <path to dataset> <additional flags>
python $1/main.py --backbone resnet50 --mode benchmark-inference --bs 64 --fp16 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP32 on 1 GPU with a batch size of 32
# Usage ./SSD300_FP32_1GPU.sh <path to this repository> <path to dataset> <additional flags>
python $1/main.py --backbone resnet50 --bs 32 --warmup 300 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP32 on 4 GPUs using 128 batch size (32 per GPU)
# Usage ./SSD300_FP32_4GPU.sh <path to this repository> <path to dataset> <additional flags>
python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP32 on 8 GPUs using 256 batch size (32 per GPU)
# Usage ./SSD300_FP32_8GPU.sh <path to this repository> <path to dataset> <additional flags>
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script evaluates SSD300 model in FP32 using 32 batch size on 1 GPU
# Usage: ./SSD300_FP32_EVAL.sh <path to this repository> <path to dataset> <path to checkpoint> <additional flags>
python $1/main.py --backbone resnet50 --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 inference benchmark in FP32 on 1 GPU with a batch size of 32
# Usage bash SSD300_FP32_INFERENCE_BENCHMARK.sh <path to this repository> <path to dataset> <additional flags>
python $1/main.py --backbone resnet50 --warmup 300 --mode benchmark-inference --bs 32 --data $2 ${@:3}

Binary file not shown.


Binary file not shown.


View file

@ -0,0 +1,240 @@
import os
import time
from argparse import ArgumentParser
import torch
import numpy as np
from torch.optim.lr_scheduler import MultiStepLR
import torch.utils.data.distributed
from src.model import SSD300, Loss
from src.utils import dboxes300_coco, Encoder
from src.logger import Logger, BenchLogger
from src.evaluate import evaluate
from src.train import train_loop, tencent_trick, load_checkpoint, benchmark_train_loop, benchmark_inference_loop
from src.data import get_train_loader, get_val_dataset, get_val_dataloader, get_coco_ground_truth
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex import amp
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
def generate_mean_std(args):
mean_val = [0.485, 0.456, 0.406]
std_val = [0.229, 0.224, 0.225]
mean = torch.tensor(mean_val).cuda()
std = torch.tensor(std_val).cuda()
view = [1, len(mean_val), 1, 1]
mean = mean.view(*view)
std = std.view(*view)
if args.fp16:
mean = mean.half()
std = std.half()
return mean, std
def make_parser():
parser = ArgumentParser(description="Train Single Shot MultiBox Detector"
" on COCO")
parser.add_argument('--data', '-d', type=str, default='/coco', required=True,
help='path to test and training data files')
parser.add_argument('--epochs', '-e', type=int, default=65,
help='number of epochs for training')
parser.add_argument('--batch-size', '--bs', type=int, default=32,
help='number of examples for each iteration')
parser.add_argument('--eval-batch-size', '--ebs', type=int, default=32,
help='number of examples for each evaluation iteration')
parser.add_argument('--no-cuda', action='store_true',
help='do not use CUDA even if GPUs are available')
parser.add_argument('--seed', '-s', type=int,
help='manually set random seed for torch')
parser.add_argument('--checkpoint', type=str, default=None,
help='path to model checkpoint file')
parser.add_argument('--save', action='store_true',
help='save model checkpoints')
parser.add_argument('--mode', type=str, default='training',
choices=['training', 'evaluation', 'benchmark-training', 'benchmark-inference'])
parser.add_argument('--evaluation', nargs='*', type=int, default=[21, 31, 37, 42, 48, 53, 59, 64],
help='epochs at which to evaluate')
parser.add_argument('--multistep', nargs='*', type=int, default=[43, 54],
help='epochs at which to decay learning rate')
# Hyperparameters
parser.add_argument('--learning-rate', '--lr', type=float, default=2.6e-3,
help='learning rate')
parser.add_argument('--momentum', '-m', type=float, default=0.9,
help='momentum argument for SGD optimizer')
parser.add_argument('--weight-decay', '--wd', type=float, default=0.0005,
help='weight decay argument for SGD optimizer')
parser.add_argument('--profile', type=int, default=None)
parser.add_argument('--warmup', type=int, default=None)
parser.add_argument('--benchmark-iterations', type=int, default=20, metavar='N',
help='Run N iterations while benchmarking (ignored in training and evaluation modes)')
parser.add_argument('--benchmark-warmup', type=int, default=20, metavar='N',
help='Number of warmup iterations for benchmarking')
parser.add_argument('--backbone', type=str, default='resnet50',
choices=['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'])
parser.add_argument('--num-workers', type=int, default=4)
parser.add_argument('--fp16', action='store_true')
parser.add_argument('--amp', action='store_true')
# Distributed
parser.add_argument('--local_rank', default=0, type=int,
help='Used for multi-process training. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
return parser
def train(train_loop_func, logger, args):
if args.amp:
amp_handle = amp.init(enabled=args.fp16)
# Check that GPUs are actually available
use_cuda = not args.no_cuda
# Setup multi-GPU if necessary
args.distributed = False
if 'WORLD_SIZE' in os.environ:
args.distributed = int(os.environ['WORLD_SIZE']) > 1
if args.distributed:
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.N_gpu = torch.distributed.get_world_size()
else:
args.N_gpu = 1
if args.seed is None:
args.seed = np.random.randint(1e4)
if args.distributed:
args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
# Setup data, defaults
dboxes = dboxes300_coco()
encoder = Encoder(dboxes)
cocoGt = get_coco_ground_truth(args)
train_loader = get_train_loader(args, args.seed - 2**31)
val_dataset = get_val_dataset(args)
val_dataloader = get_val_dataloader(val_dataset, args)
ssd300 = SSD300(backbone=args.backbone)
args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
start_epoch = 0
iteration = 0
loss_func = Loss(dboxes)
if use_cuda:
ssd300.cuda()
loss_func.cuda()
if args.fp16 and not args.amp:
ssd300 = network_to_half(ssd300)
if args.distributed:
ssd300 = DDP(ssd300)
optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
momentum=args.momentum, weight_decay=args.weight_decay)
scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
if args.fp16:
if args.amp:
optimizer = amp_handle.wrap_optimizer(optimizer)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)
if args.checkpoint is not None:
if os.path.isfile(args.checkpoint):
load_checkpoint(ssd300, args.checkpoint)
checkpoint = torch.load(args.checkpoint,
map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
start_epoch = checkpoint['epoch']
iteration = checkpoint['iteration']
scheduler.load_state_dict(checkpoint['scheduler'])
ssd300.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
else:
print('Provided checkpoint is not path to a file')
return
inv_map = {v: k for k, v in val_dataset.label_map.items()}
total_time = 0
if args.mode == 'evaluation':
acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
if args.local_rank == 0:
print('Model precision {} mAP'.format(acc))
return
mean, std = generate_mean_std(args)
for epoch in range(start_epoch, args.epochs):
start_epoch_time = time.time()
scheduler.step()
iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader, val_dataloader, encoder, iteration,
logger, args, mean, std)
end_epoch_time = time.time() - start_epoch_time
total_time += end_epoch_time
if args.local_rank == 0:
logger.update_epoch_time(epoch, end_epoch_time)
if epoch in args.evaluation:
acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
if args.local_rank == 0:
logger.update_epoch(epoch, acc)
if args.save and args.local_rank == 0:
print("saving model...")
obj = {'epoch': epoch + 1,
'iteration': iteration,
'optimizer': optimizer.state_dict(),
'scheduler': scheduler.state_dict(),
'label_map': val_dataset.label_info}
if args.distributed:
obj['model'] = ssd300.module.state_dict()
else:
obj['model'] = ssd300.state_dict()
torch.save(obj, './models/epoch_{}.pt'.format(epoch))
train_loader.reset()
print('total training time: {}'.format(total_time))
if __name__ == "__main__":
parser = make_parser()
args = parser.parse_args()
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
torch.backends.cudnn.benchmark = True
if args.mode == 'benchmark-training':
train_loop_func = benchmark_train_loop
logger = BenchLogger('Training benchmark')
args.epochs = 1
elif args.mode == 'benchmark-inference':
train_loop_func = benchmark_inference_loop
logger = BenchLogger('Inference benchmark')
args.epochs = 1
else:
train_loop_func = train_loop
logger = Logger('Training logger', print_freq=1)
train(train_loop_func, logger, args)

View file

@ -0,0 +1,31 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64, 128],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 191.25867003414876
},
"4": {
"images_per_second": 340.9537905548054
},
"8": {
"images_per_second": 517.2612062140391
},
"16": {
"images_per_second": 711.5516679788083
},
"32": {
"images_per_second": 812.9203401838566
},
"64": {
"images_per_second": 951.7432815456556
},
"128": {
"images_per_second": 876.1868813828711
}
}
}
}

View file

@ -0,0 +1,31 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64, 128],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 174.58768325581374
},
"4": {
"images_per_second": 254.24180710755593
},
"8": {
"images_per_second": 308.95847419165545
},
"16": {
"images_per_second": 419.60746029488445
},
"32": {
"images_per_second": 453.81433823995565
},
"64": {
"images_per_second": 592.6385687558369
},
"128": {
"images_per_second": 603.8453409148115
}
}
}
}

View file

@ -0,0 +1,59 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 40.71944999694824
},
"4": {
"images_per_second": 68.22257804870605
},
"8": {
"images_per_second": 121.42024612426758
},
"16": {
"images_per_second": 159.56442260742188
},
"32": {
"images_per_second": 185.69010543823242
}
},
"4": {
"2": {
"images_per_second": 40.75998783111572
},
"4": {
"images_per_second": 75.58991050720215
},
"8": {
"images_per_second": 142.64888381958008
},
"16": {
"images_per_second": 256.07005310058594
},
"32": {
"images_per_second": 300.8989944458008
}
},
"8": {
"2": {
"images_per_second": 61.28578186035156
},
"4": {
"images_per_second": 119.46021270751953
},
"8": {
"images_per_second": 231.7295379638672
},
"16": {
"images_per_second": 430.5494079589844
},
"32": {
"images_per_second": 454.2975769042969
}
}
}
}

View file

@ -0,0 +1,59 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 48.635780334472656
},
"4": {
"images_per_second": 66.06407419840494
},
"8": {
"images_per_second": 83.91736857096353
},
"16": {
"images_per_second": 102.67040761311848
},
"32": {
"images_per_second": 110.02347819010416
}
},
"4": {
"2": {
"images_per_second": 41.199180603027344
},
"4": {
"images_per_second": 79.85076141357422
},
"8": {
"images_per_second": 145.39981587727863
},
"16": {
"images_per_second": 247.95855712890625
},
"32": {
"images_per_second": 341.29132080078125
}
},
"8": {
"2": {
"images_per_second": 63.07561111450195
},
"4": {
"images_per_second": 123.25757344563802
},
"8": {
"images_per_second": 237.3413340250651
},
"16": {
"images_per_second": 376.59598795572913
},
"32": {
"images_per_second": 507.9451497395833
}
}
}
}

View file

@ -0,0 +1,34 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 470.099200788709
},
"2" : {
"images_per_second" : 163.117099093173
},
"32" : {
"images_per_second" : 520.538879400471
},
"4" : {
"images_per_second" : 296.604178917743
},
"8" : {
"images_per_second" : 412.522394180558
}
}
},
"ngpus" : [
1
]
}

View file

@ -0,0 +1,34 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 280.570005994299
},
"2" : {
"images_per_second" : 147.914221468741
},
"32" : {
"images_per_second" : 302.430594818483
},
"4" : {
"images_per_second" : 201.622430560779
},
"8" : {
"images_per_second" : 228.159516872363
}
}
},
"ngpus" : [
1
]
}

View file

@ -0,0 +1,52 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 192.623916625977
},
"2" : {
"images_per_second" : 48.7488899230957
},
"32" : {
"images_per_second" : 204.250648498535
},
"4" : {
"images_per_second" : 95.4697418212891
},
"8" : {
"images_per_second" : 164.66495513916
}
},
"4" : {
"16" : {
"images_per_second" : 701.366027832031
},
"2" : {
"images_per_second" : 154.449935913086
},
"32" : {
"images_per_second" : 771.171325683594
},
"4" : {
"images_per_second" : 300.332641601562
},
"8" : {
"images_per_second" : 550.924163818359
}
}
},
"ngpus" : [
1,
4
]
}

View file

@ -0,0 +1,45 @@
{
"bs" : [
2,
4,
8,
16
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 121.772495269775
},
"2" : {
"images_per_second" : 60.2171878814697
},
"4" : {
"images_per_second" : 90.5315437316895
},
"8" : {
"images_per_second" : 103.113033294678
}
},
"4" : {
"16" : {
"images_per_second" : 472.226806640625
},
"2" : {
"images_per_second" : 184.061141967773
},
"4" : {
"images_per_second" : 324.639801025391
},
"8" : {
"images_per_second" : 391.055908203125
}
}
},
"ngpus" : [
1,
4
]
}

View file

@ -0,0 +1,81 @@
import argparse
import subprocess
from qa.qa_utils import compare_benchmarks, load_json, save_json, OKBLUE, ENDC, FAIL
# parsing
def parse_testscript_args():
parser = argparse.ArgumentParser(description='PyTorch Benchmark Tests')
parser.add_argument('--bs', default=[1], type=int, nargs='+')
parser.add_argument('--ngpus', default=[1], type=int, nargs='+')
parser.add_argument('--benchmark-mode', default='training', choices=['training', 'inference'],
help='benchmark training or inference', required=True)
parser.add_argument('--bench-iterations', type=int, default=20, metavar='N',
help='Run N iterations while benchmarking (ignored in training and evaluation modes)')
parser.add_argument('--bench-warmup', type=int, default=10, metavar='N',
help='Number of warmup iterations for benchmarking')
parser.add_argument('--fp16', action='store_true', help='Run model in mixed precision.')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers')
parser.add_argument('--data', type=str, metavar='<PATH>', required=True,
help='path to the dataset')
parser.add_argument('--results-file', default='experiment_raport.json', type=str,
help='file in which to store the JSON experiment report')
parser.add_argument('--benchmark-file', type=str, metavar='FILE', required=True,
help='path to the file with baselines')
return parser.parse_args()
# job command
command_template = 'python3 {launcher} qa/qa_perf_main.py --bs {bs} --ebs {bs} ' \
'--benchmark-mode {mode} --benchmark-warmup {bw} --benchmark-iterations {bi} {fp16} ' \
'--backbone resnet50 --seed 1 --data {data} --results-file {results_file} --benchmark-file {benchmark_file}'
if __name__ == '__main__':
args = parse_testscript_args()
fp16 = '--fp16' if args.fp16 else ''
# create results json file
# todo: maybe some template json file?
results = {'ngpus': args.ngpus,
'bs': args.bs,
'metric_keys': ['images_per_second'],
'metrics': {}}
for gpu in args.ngpus:
results['metrics'][str(gpu)] = {}
for bs in args.bs:
results['metrics'][str(gpu)][str(bs)] = {'images_per_second': None}
save_json(args.results_file, results)
# run qa_perf_main.py tests one by one
for gpu in args.ngpus:
launcher = '' if gpu == 1 else '-m torch.distributed.launch --nproc_per_node={}'.format(gpu)
for bs in args.bs:
print('#' * 80)
command = command_template.format(launcher=launcher, bs=bs, workers=args.workers, mode=args.benchmark_mode,
bw=args.bench_warmup, bi=args.bench_iterations, fp16=fp16,
data=args.data, results_file=args.results_file,
benchmark_file=args.benchmark_file)
print('Running "{}"'.format(command))
process = subprocess.Popen(command, shell=True)
output, error = process.communicate()
if error is not None:
print(FAIL + 'Program exited with status {}. Data has not been collected'.format(error) + ENDC)
# elif results['metrics'][str(gpu)][str(bs)]['images_per_second'] is None:
# print(WARNING + 'Program did not end successfully. Data has not been collected.' + ENDC)
else:
print(OKBLUE + 'Program ended successfully. Data has been collected.' + ENDC)
results_data = load_json(args.results_file)
benchmark_data = load_json(args.benchmark_file)
exit_code = compare_benchmarks(results_data, benchmark_data, args, 0.16 if args.benchmark_mode == 'inference' else 0.1)
print(exit_code)
exit(exit_code)

View file

@ -0,0 +1 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [8.812795396454991, 5.914838795058071, 6, 5.092440919584583, 4.887887316499735, 4.744666463422983, 4.694560192557922, 4.567333741479565, 4.492525351620137, 6, 4.408311570055099, 4.334232046614567, 6, 4.263646488106407, 4.2514614595596445, 4.2171871953656055, 4.206751160226014, 4.1795772798196715, 4.156515416099515, 6, 4.108870625495911, 4.0985876759066855, 4.075221928967139, 4.080158276849438, 6, 4.033980131669857, 4.037739227952915, 6, 3.99941903534935, 6, 3.9875937877263565, 3.971811039999583, 3.980771179282509, 3.953947089124455, 3.9305202960968018, 3.9366443781873546, 3.9252991879350754, 3.8827156307395367, 3.9388060424005102, 3.88922161618695, 3.8874285418914396, 6, 3.8936942113018453, 3.537499847891029, 3.4058184228089177, 6, 6, 3.3219671837627627, 3.295458280363458, 3.262115957955606, 6, 6, 6, 3.2190717260910433, 3.213117691627236, 3.1739242191397987, 3.1791626058811704, 3.2088054501854177, 3.1719801842385507, 3.187761370792139, 3.1809213312432236, 3.1823803410259397, 3.1752594631311677, 3.1709555600928425, 3.1823559530957817], "val.acc": [0.025120322205631106, 0.06065902615325462, 0.08224594352985645, 0.09868630608427395, 0.11402055039858493, 0.11779455253460233, 0.1232203941357061, 0.13708232144631768, 0.13614397127135028, 0.13289094380937685, 0.14004009449749777, 0.1369843423424096, 0.13877603069457692, 0.15418866425831707, 0.1500001994042602, 0.1542573219664272, 0.14771151227315413, 0.15896497766306272, 0.1600724682809656, 0.15881491661088476, 0.16213217020726906, 0.16466781280171408, 0.15738430149539484, 0.16634155547369375, 0.1623110334880526, 0.16394517553182106, 0.1494171026560053, 0.16762167601953265, 0.16063595691096758, 0.16982898253523193, 0.17321918229909394, 0.17242960413896102, 0.1625123530546557, 0.18330429802960516, 0.16333127233412115, 0.17973452067250242, 0.16699022570278652, 0.17183956548028687, 0.17168756775917593, 0.17547718325478198, 0.1750019046551496, 0.18416070771679066, 0.1711460087987496, 0.231325087097653, 0.23716038401167305, 0.23886896590018106, 0.2403412383214709, 0.24380227870861898, 0.24383605475007317, 0.2449733300818802, 0.24508423152154857, 0.24252172333110344, 0.24566254540226004, 0.24661345705692578, 0.25123807624083877, 0.25184439401895475, 0.2519010236397111, 0.25191664071239706, 0.2522156441636805, 0.25215053241008767, 0.2525434296889651, 0.2524917808636186, 0.2527410425201369, 0.2534121449798447, 0.25279479287831214]}, "bs": [64], "model": "", "ngpus": [8]}

View file

@ -0,0 +1 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [9.887425426832973, 6.30290542835752, 5.566619733535567, 5.192713968618468, 4.943981836976963, 4.777146058311629, 4.682364774062644, 4.566371860462505, 4.479279315107254, 5, 4.398730874582149, 4.31779890601812, 4.293896813580043, 4.250142149529603, 4.219812418175577, 4.21572122303159, 4.187492328960302, 4.147948342119242, 4.134799897931028, 4.131298205737984, 4.071315974647822, 4.074750597299968, 4.0595350983882055, 4.042616275720722, 4.029284068070124, 4.02082926113012, 3.9983501902834298, 4.00984974094874, 3.9730074155799167, 5, 3.9646901324326294, 3.952598022061144, 3.944574903713043, 3.9182081201711596, 3.9252539055836775, 3.907297405092997, 3.8867245969813986, 3.87151758639573, 3.8793927009449254, 3.8687505586699107, 3.8750464156204956, 5, 3.8645522469516402, 3.504709825765618, 3.3920036476251862, 3.318732707260998, 5, 3.295415750237011, 3.2602547589347872, 5, 5, 5, 5, 3.199645553613854, 3.1623374312205086, 5, 3.147109237820821, 3.158245995575684, 3.1465386938319977, 3.1480963979746055, 3.151234711101482, 3.146022343739672, 3.1410668343956294, 3.142435818259893, 3.123337645718104], "val.acc": [0.01106397969239677, 0.04958324872172423, 0.07470961174804201, 0.08412781056028416, 0.1052591997157941, 0.11592629309116805, 0.1275672396324061, 0.12472585915140484, 0.13138377072048255, 0.1262696666605193, 0.13354663690485083, 0.14424123617821044, 0.14059169419863984, 0.14768715602101368, 0.15450788443085858, 0.14792122925940135, 0.1508861356435794, 0.157419558440425, 0.15279118544884585, 0.16075469826863828, 0.14747077091644412, 0.16340857637480236, 0.14427366437395484, 0.15709914018423293, 0.16324391683493303, 0.16440443232887508, 0.16479726175439752, 0.17508843799046686, 0.16142292492169025, 0.1643848499786872, 0.16912610131976924, 0.16376330941842296, 0.16894551721633602, 0.17771765128166106, 0.1749561896689298, 0.1695538322677119, 0.16778561571905298, 0.16380194923909086, 0.16994188486879763, 0.1716953661397215, 0.17755697810460197, 0.17187995479426885, 0.1742018462295355, 0.23426649845846764, 0.23613136034024038, 0.24175797706337981, 0.2425279583355936, 0.24352550398110506, 0.24411115979837528, 0.24656561042490024, 0.24383524308920906, 0.24686666489675338, 0.24814559219197632, 0.24840393696219026, 0.251965847689631, 0.25254138256097747, 0.2523565615073023, 0.2529904738785998, 0.253555154014026, 0.2530651493203877, 0.25358174010109197, 0.2537683728256746, 0.2539384684886946, 0.2540280117408162, 0.2534652864501853]}, "bs": [32], "model": "", "ngpus": [8]}

View file

@ -0,0 +1,20 @@
{
"metrics" : {
"val.acc" : [
0.0100971670737651
],
"train.loss" : [
9.85026645043801
]
},
"ngpus" : [
8
],
"metric_keys" : [
"train.loss",
"val.acc"
],
"bs" : [
64
]
}

View file

@ -0,0 +1,20 @@
{
"bs" : [
32
],
"metrics" : {
"train.loss" : [
8.79916159380589
],
"val.acc" : [
0.0238952010105531
]
},
"metric_keys" : [
"train.loss",
"val.acc"
],
"ngpus" : [
8
]
}

View file

@ -0,0 +1,73 @@
# core imports
import os
import numpy as np
# pytorch imports
import torch
import torch.utils.data.distributed
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
# project imports
from src.train import train_loop
from main import train, make_parser
from src.logger import Logger
from qa.qa_utils import load_json, create_json_file, compare_acc, save_json
RESULT = None
def add_benchmark_args(parser):
parser.add_argument('--benchmark-mode', type=str, default='epoch-accuracy',
choices=['full-accuracy', 'epoch-accuracy'], required=True)
parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
help='path to the file with baselines', required=True)
return parser
def main(args):
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
torch.backends.cudnn.benchmark = True
if args.benchmark_mode == 'epoch-accuracy':
args.epochs = 1
train_loop_func = train_loop
logger = Logger('Accuracy test', print_freq=10)
args.evaluation = list(range(90))
train(train_loop_func, logger, args)
exit_code = 0
if args.local_rank == 0:
train_loss_results, val_acc_results, train_time_results = logger.print_results()
print(train_time_results)
print(train_loss_results)
print(val_acc_results)
measured_results = create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=args.batch_size)
save_json('/results/results.json', measured_results)
print(measured_results)
benchmark_results = load_json(args.benchmark_file)
exit_code = compare_acc(measured_results, benchmark_results, args)
exit(exit_code)
if __name__ == "__main__":
parser = make_parser()
parser = add_benchmark_args(parser)
args = parser.parse_args()
print(args)
main(args)

View file

@ -0,0 +1,199 @@
# core imports
import os
import numpy as np
import json
from pprint import pprint
import time
# pytorch imports
import torch
import torch.utils.data.distributed
from torch.autograd import Variable
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
# project imports
from main import train, make_parser
from src.logger import BenchLogger
# from src.train import benchmark_inference_loop, benchmark_train_loop
from SSD import _C as C
RESULT = None
def add_benchmark_args(parser):
parser.add_argument('--benchmark-mode', type=str, choices=['training', 'inference'],
default='inference', required=True)
parser.add_argument('--results-file', default='experiment_raport.json', type=str,
help='file in which to store the JSON experiment report')
parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
help='path to the file with baselines')
return parser
def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
start_time = None
# tensor for results
result = torch.zeros((1,)).cuda()
for i, data in enumerate(loop(train_dataloader)):
if i >= args.benchmark_warmup:
start_time = time.time()
img = data[0][0][0]
bbox = data[0][1][0]
label = data[0][2][0]
label = label.type(torch.cuda.LongTensor)
bbox_offsets = data[0][3][0]
# handle random flipping outside of DALI for now
bbox_offsets = bbox_offsets.cuda()
img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)
if not args.no_cuda:
img = img.cuda()
bbox = bbox.cuda()
label = label.cuda()
bbox_offsets = bbox_offsets.cuda()
img.sub_(mean).div_(std)
N = img.shape[0]
if bbox_offsets[-1].item() == 0:
print("No labels in batch")
continue
bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)
M = bbox.shape[0] // N
bbox = bbox.view(N, M, 4)
label = label.view(N, M)
ploc, plabel = model(img)
ploc, plabel = ploc.float(), plabel.float()
trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
if not args.no_cuda:
label = label.cuda()
gloc = Variable(trans_bbox, requires_grad=False)
glabel = Variable(label, requires_grad=False)
loss = loss_func(ploc, plabel, gloc, glabel)
# loss scaling
if args.fp16:
if args.amp:
with optim.scale_loss(loss) as scale_loss:
scale_loss.backward()
else:
optim.backward(loss)
else:
loss.backward()
optim.step()
optim.zero_grad()
iteration += 1
# reduce all results from every gpu
if i >= args.benchmark_warmup + args.benchmark_iterations:
result.data[0] = logger.print_result()
if args.N_gpu > 1:
torch.distributed.reduce(result, 0)
if args.local_rank == 0:
global RESULT
RESULT = float(result.data[0])
return
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
def loop(dataloader):
while True:
for data in dataloader:
yield data
def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu'
start_time = None
model.eval()
i=-1
dataloader = loop(val_dataloader)
while True:
i+=1
with torch.no_grad():
torch.cuda.synchronize()
if i >= args.benchmark_warmup:
start_time = time.time()
data = next(dataloader)
img = data[0]
if not args.no_cuda:
img = img.cuda()
if args.fp16:
img = img.half()
img.sub_(mean).div_(std)
img = Variable(img, requires_grad=False)
_ = model(img)
torch.cuda.synchronize()
if i >= args.benchmark_warmup + args.benchmark_iterations:
global RESULT
RESULT = logger.print_result()
return
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
def main(args):
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
torch.backends.cudnn.benchmark = True
if args.benchmark_mode == 'training':
train_loop_func = benchmark_train_loop
logger = BenchLogger('Training benchmark')
else:
train_loop_func = benchmark_inference_loop
logger = BenchLogger('Inference benchmark')
args.epochs = 1
train(train_loop_func, logger, args)
if args.local_rank == 0:
global RESULT
with open(args.results_file) as f:
results = json.load(f)
results['metrics'][str(args.N_gpu)][str(args.batch_size)] = {'images_per_second': RESULT}
pprint(results)
with open(args.results_file, 'w') as f:
json.dump(results, f)
if __name__ == "__main__":
parser = make_parser()
parser = add_benchmark_args(parser)
args = parser.parse_args()
print(args)
main(args)

View file

@ -0,0 +1,115 @@
import json
# terminal stdout colors
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
# load results and benchmark
def load_json(filepath):
with open(filepath) as f:
data = json.load(f)
return data
def save_json(filepath, data):
with open(filepath, 'w') as f:
json.dump(data, f)
# compare func
def compare(measured_value, true_value, pmargin=0.1):
assert 0 < pmargin < 1, 'Margin should be in range (0, 1)'
return (1 - pmargin) * true_value < measured_value
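# Worked example of the margin check above (illustrative numbers, not real baselines):
# with pmargin=0.1, a measured throughput passes as long as it exceeds 90% of the baseline.
#   compare(95.0, 100.0, pmargin=0.1)  # True:  95.0 > 0.9 * 100.0
#   compare(85.0, 100.0, pmargin=0.1)  # False: 85.0 < 0.9 * 100.0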
# compare 2 benchmark json files
def compare_benchmarks(results, benchmark, args, pmargin=0.1):
# sanity check
for metric in results['metric_keys']:
if metric not in benchmark['metric_keys']:
assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)
assert len(args.bs) <= len(benchmark['bs']), 'len(args.bs) <= len(benchmark["bs"]) ({} <= {})'.format(len(args.bs), len(benchmark['bs']))
assert len(args.bs) == len(results['bs']), 'len(args.bs) == len(results["bs"]) ({} == {})'.format(len(args.bs), len(results['bs']))
for bs in results['bs']:
if bs not in benchmark['bs']:
assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)
assert len(args.ngpus) <= len(benchmark['ngpus']), 'len(args.ngpus) <= len(benchmark["ngpus"]) ({} <= {})'.format(len(args.ngpus), len(benchmark['ngpus']))
assert len(args.ngpus) == len(results['ngpus']), 'len(args.ngpus) == len(results["ngpus"]) ({} == {})'.format(len(args.ngpus), len(results['ngpus']))
for gpu in results['ngpus']:
if gpu not in benchmark['ngpus']:
assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)
# compare measured numbers with benchmark
exit = 0
for metric in results['metric_keys']:
for gpu in results['ngpus']:
for bs in results['bs']:
measured_metric = results['metrics'][str(gpu)][str(bs)][metric]
ground_truth_metric = benchmark['metrics'][str(gpu)][str(bs)][metric]
ok = compare(measured_metric, ground_truth_metric, pmargin)
if ok:
print(OKGREEN + 'BENCHMARK PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
else:
print(FAIL + 'BENCHMARK NOT PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
exit = 1
return exit
# compare 2 benchmark json files
def compare_acc(results, benchmark, args):
# sanity check
for metric in results['metric_keys']:
if metric not in benchmark['metric_keys']:
assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)
for bs in results['bs']:
if bs not in benchmark['bs']:
assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)
for gpu in results['ngpus']:
if gpu not in benchmark['ngpus']:
assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)
# compare measured numbers with benchmark
for i, (result, ground_truth) in enumerate(zip(results['metrics']['val.acc'], benchmark['metrics']['val.acc'])):
if i > 43: # before the first LR decay accuracy tends to vary by more than 15% (around the ~30th epoch), so only later epochs are compared
if ground_truth * 0.9 > result:
print(FAIL + 'ACCURACY TEST NOT PASSED' + ENDC)
return 1
# compare measured numbers with benchmark
for i, (result, ground_truth) in enumerate(zip(results['metrics']['train.loss'], benchmark['metrics']['train.loss'])):
if i > 43:
if ground_truth * 1.1 < result:
print(FAIL + 'LOSS TEST NOT PASSED' + ENDC)
return 1
print(OKGREEN + 'ACCURACY TEST PASSED' + ENDC)
return 0
def create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=32):
results = {"ngpus": [ngpus],
"bs": [bs],
"metric_keys": ["train.loss", "val.acc"],
"metrics": {
"train.loss": [],
"val.acc": []
}
}
for i, ((epoch1, acc), (epoch2, loss)) in enumerate(zip(val_acc_results, train_loss_results)):
assert i == epoch1 == epoch2
results['metrics']['train.loss'].append(loss)
results['metrics']['val.acc'].append(acc)
return results
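For reference, a minimal sketch of how these helpers fit together in the accuracy check (the numbers are made up and the baseline path is only an example taken from the QA scripts; note that compare_acc does not actually use its args parameter):

val_acc = [(0, 0.10), (1, 0.12)]          # (epoch, value) pairs as returned by the logger
train_loss = [(0, 5.2), (1, 4.8)]
measured = create_json_file(val_acc, train_loss, ngpus=8, bs=32)
save_json('/results/results.json', measured)
baseline = load_json('qa/curve_baselines/SSD300_pytorch_19.01_fp32_1epoch_run_acc_baseline.json')
exit_code = compare_acc(measured, baseline, args=None)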

View file

@ -0,0 +1,4 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp16_1epoch_run_acc_baseline.json --data $1

View file

@ -0,0 +1,4 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp16_full_run_acc_baseline.json --data $1

View file

@ -0,0 +1,4 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp32_1epoch_run_acc_baseline.json --data $1

View file

@ -0,0 +1,4 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp32_full_run_acc_baseline.json --data $1

View file

@ -0,0 +1,3 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_inference_fp16.json --data $1

View file

@ -0,0 +1,3 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_inference_fp32.json --data $1

View file

@ -0,0 +1,3 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp16.json --data $1

View file

@ -0,0 +1,3 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp32.json --data $1

View file

@ -0,0 +1 @@
Cython==0.28.4

View file

@ -0,0 +1,89 @@
#!/usr/bin/env python
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
requirements = ["torch", "torchvision"]
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "csrc")
source_cpu = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"))
print('c++: ', source_cpu)
print('cuda: ', source_cuda)
sources = source_cpu
extension = CppExtension
define_macros = []
if CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
extra_compile_flags= {'cxx' : []}
extra_compile_flags['nvcc'] = ['-DCUDA_HAS_FP16=1','-D__CUDA_NO_HALF_OPERATORS__','-D__CUDA_NO_HALF_CONVERSIONS__','-D__CUDA_NO_HALF2_OPERATORS__']
gencodes = [
#'-gencode', 'arch=compute_50,code=sm_50',
#'-gencode', 'arch=compute_52,code=sm_52',
#'-gencode', 'arch=compute_60,code=sm_60',
#'-gencode', 'arch=compute_61,code=sm_61',
'-gencode', 'arch=compute_70,code=sm_70',
'-gencode', 'arch=compute_70,code=compute_70',]
extra_compile_flags['nvcc'] += gencodes
ext_modules = [
extension(
"SSD._C",
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_flags,
)
]
return ext_modules
setup(
name="SSD",
version="0.1",
author="slayton",
url="",
description="SSD in pytorch",
packages=find_packages(exclude=("configs", "examples", "test",)),
# install_requires=requirements,
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
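For a quick smoke test of the extension defined above (standard setuptools/pip usage, not a script from this repository), something along these lines should work once the build succeeds:

# Build from the directory containing setup.py, e.g. `pip install .` or `python setup.py install`,
# then verify the compiled module exposes the bindings added in this PR:
from SSD import _C
print(sorted(n for n in dir(_C) if not n.startswith('_')))   # expect apply_transform, box_encoder, random_horiz_flip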

View file

433
PyTorch/Detection/SSD/src/coco.py Executable file
View file

@ -0,0 +1,433 @@
__author__ = 'tylin'
__version__ = '2.0'
# Interface for accessing the Microsoft COCO dataset.
# Microsoft COCO is a large image dataset designed for object detection,
# segmentation, and caption generation. pycocotools is a Python API that
# assists in loading, parsing and visualizing the annotations in COCO.
# Please visit http://mscoco.org/ for more information on COCO, including
# for the data, paper, and tutorials. The exact format of the annotations
# is also described on the COCO website. For example usage of the pycocotools
# please see pycocotools_demo.ipynb. In addition to this API, please download both
# the COCO images and annotations in order to run the demo.
# An alternative to using the API is to load the annotations directly
# into a Python dictionary.
# Using the API provides additional utility functions. Note that this API
# supports both *instance* and *caption* annotations. In the case of
# captions not all functions are defined (e.g. categories are undefined).
# The following API functions are defined:
# COCO - COCO api class that loads COCO annotation file and prepare data structures.
# decodeMask - Decode binary mask M encoded via run-length encoding.
# encodeMask - Encode binary mask M using run-length encoding.
# getAnnIds - Get ann ids that satisfy given filter conditions.
# getCatIds - Get cat ids that satisfy given filter conditions.
# getImgIds - Get img ids that satisfy given filter conditions.
# loadAnns - Load anns with the specified ids.
# loadCats - Load cats with the specified ids.
# loadImgs - Load imgs with the specified ids.
# annToMask - Convert segmentation in an annotation to binary mask.
# showAnns - Display the specified annotations.
# loadRes - Load algorithm results and create API for accessing them.
# download - Download COCO images from mscoco.org server.
# Throughout the API "ann"=annotation, "cat"=category, and "img"=image.
# Help on each function can be accessed by: "help COCO>function".
# See also COCO>decodeMask,
# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds,
# COCO>getImgIds, COCO>loadAnns, COCO>loadCats,
# COCO>loadImgs, COCO>annToMask, COCO>showAnns
# Microsoft COCO Toolbox. version 2.0
# Data, paper, and tutorials available at: http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
# Licensed under the Simplified BSD License [see bsd.txt]
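# A minimal usage sketch, following the API summary above (the annotation path is a placeholder):
#   coco = COCO('annotations/instances_val2017.json')
#   img_ids = coco.getImgIds()
#   ann_ids = coco.getAnnIds(imgIds=img_ids[:1], iscrowd=None)
#   anns = coco.loadAnns(ann_ids)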
import json
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon
import numpy as np
import copy
import itertools
from pycocotools import mask as maskUtils
import os
from collections import defaultdict
import sys
PYTHON_VERSION = sys.version_info[0]
if PYTHON_VERSION == 2:
from urllib import urlretrieve
elif PYTHON_VERSION == 3:
from urllib.request import urlretrieve
def _isArrayLike(obj):
return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
class COCO:
def __init__(self, annotation_file=None):
"""
Constructor of Microsoft COCO helper class for reading and visualizing annotations.
:param annotation_file (str): location of annotation file
:param image_folder (str): location to the folder that hosts images.
:return:
"""
# load dataset
self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict()
self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
if not annotation_file == None:
print('loading annotations into memory...')
tic = time.time()
dataset = json.load(open(annotation_file, 'r'))
assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
print('Done (t={:0.2f}s)'.format(time.time()- tic))
self.dataset = dataset
self.createIndex()
def createIndex(self):
# create index
print('creating index...')
anns, cats, imgs = {}, {}, {}
imgToAnns,catToImgs = defaultdict(list),defaultdict(list)
if 'annotations' in self.dataset:
for ann in self.dataset['annotations']:
imgToAnns[ann['image_id']].append(ann)
anns[ann['id']] = ann
if 'images' in self.dataset:
for img in self.dataset['images']:
imgs[img['id']] = img
if 'categories' in self.dataset:
for cat in self.dataset['categories']:
cats[cat['id']] = cat
if 'annotations' in self.dataset and 'categories' in self.dataset:
for ann in self.dataset['annotations']:
catToImgs[ann['category_id']].append(ann['image_id'])
print('index created!')
# create class members
self.anns = anns
self.imgToAnns = imgToAnns
self.catToImgs = catToImgs
self.imgs = imgs
self.cats = cats
def info(self):
"""
Print information about the annotation file.
:return:
"""
for key, value in self.dataset['info'].items():
print('{}: {}'.format(key, value))
def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
"""
Get ann ids that satisfy given filter conditions. default skips that filter
:param imgIds (int array) : get anns for given imgs
catIds (int array) : get anns for given cats
areaRng (float array) : get anns for given area range (e.g. [0 inf])
iscrowd (boolean) : get anns for given crowd label (False or True)
:return: ids (int array) : integer array of ann ids
"""
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
catIds = catIds if _isArrayLike(catIds) else [catIds]
if len(imgIds) == len(catIds) == len(areaRng) == 0:
anns = self.dataset['annotations']
else:
if not len(imgIds) == 0:
lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns]
anns = list(itertools.chain.from_iterable(lists))
else:
anns = self.dataset['annotations']
anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds]
anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]]
if not iscrowd == None:
ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
else:
ids = [ann['id'] for ann in anns]
return ids
def getCatIds(self, catNms=[], supNms=[], catIds=[]):
"""
filtering parameters. default skips that filter.
:param catNms (str array) : get cats for given cat names
:param supNms (str array) : get cats for given supercategory names
:param catIds (int array) : get cats for given cat ids
:return: ids (int array) : integer array of cat ids
"""
catNms = catNms if _isArrayLike(catNms) else [catNms]
supNms = supNms if _isArrayLike(supNms) else [supNms]
catIds = catIds if _isArrayLike(catIds) else [catIds]
if len(catNms) == len(supNms) == len(catIds) == 0:
cats = self.dataset['categories']
else:
cats = self.dataset['categories']
cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms]
cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds]
ids = [cat['id'] for cat in cats]
return ids
def getImgIds(self, imgIds=[], catIds=[]):
'''
Get img ids that satisfy given filter conditions.
:param imgIds (int array) : get imgs for given ids
:param catIds (int array) : get imgs with all given cats
:return: ids (int array) : integer array of img ids
'''
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
catIds = catIds if _isArrayLike(catIds) else [catIds]
if len(imgIds) == len(catIds) == 0:
ids = self.imgs.keys()
else:
ids = set(imgIds)
for i, catId in enumerate(catIds):
if i == 0 and len(ids) == 0:
ids = set(self.catToImgs[catId])
else:
ids &= set(self.catToImgs[catId])
return list(ids)
def loadAnns(self, ids=[]):
"""
Load anns with the specified ids.
:param ids (int array) : integer ids specifying anns
:return: anns (object array) : loaded ann objects
"""
if _isArrayLike(ids):
return [self.anns[id] for id in ids]
elif type(ids) == int:
return [self.anns[ids]]
def loadCats(self, ids=[]):
"""
Load cats with the specified ids.
:param ids (int array) : integer ids specifying cats
:return: cats (object array) : loaded cat objects
"""
if _isArrayLike(ids):
return [self.cats[id] for id in ids]
elif type(ids) == int:
return [self.cats[ids]]
def loadImgs(self, ids=[]):
"""
Load imgs with the specified ids.
:param ids (int array) : integer ids specifying img
:return: imgs (object array) : loaded img objects
"""
if _isArrayLike(ids):
return [self.imgs[id] for id in ids]
elif type(ids) == int:
return [self.imgs[ids]]
def showAnns(self, anns):
"""
Display the specified annotations.
:param anns (array of object): annotations to display
:return: None
"""
if len(anns) == 0:
return 0
if 'segmentation' in anns[0] or 'keypoints' in anns[0]:
datasetType = 'instances'
elif 'caption' in anns[0]:
datasetType = 'captions'
else:
raise Exception('datasetType not supported')
if datasetType == 'instances':
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in anns:
c = (np.random.random((1, 3))*0.6+0.4).tolist()[0]
if 'segmentation' in ann:
if type(ann['segmentation']) == list:
# polygon
for seg in ann['segmentation']:
poly = np.array(seg).reshape((int(len(seg)/2), 2))
polygons.append(Polygon(poly))
color.append(c)
else:
# mask
t = self.imgs[ann['image_id']]
if type(ann['segmentation']['counts']) == list:
rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width'])
else:
rle = [ann['segmentation']]
m = maskUtils.decode(rle)
img = np.ones( (m.shape[0], m.shape[1], 3) )
if ann['iscrowd'] == 1:
color_mask = np.array([2.0,166.0,101.0])/255
if ann['iscrowd'] == 0:
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack( (img, m*0.5) ))
if 'keypoints' in ann and type(ann['keypoints']) == list:
# turn skeleton into zero-based index
sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1
kp = np.array(ann['keypoints'])
x = kp[0::3]
y = kp[1::3]
v = kp[2::3]
for sk in sks:
if np.all(v[sk]>0):
plt.plot(x[sk],y[sk], linewidth=3, color=c)
plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2)
plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2)
p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4)
ax.add_collection(p)
p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2)
ax.add_collection(p)
elif datasetType == 'captions':
for ann in anns:
print(ann['caption'])
def loadRes(self, resFile):
"""
Load result file and return a result api object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = COCO()
res.dataset['images'] = [img for img in self.dataset['images']]
print('Loading and preparing results...')
tic = time.time()
if type(resFile) == str: #or type(resFile) == unicode:
anns = json.load(open(resFile))
elif type(resFile) == np.ndarray:
anns = self.loadNumpyAnnotations(resFile)
else:
anns = resFile
assert type(anns) == list, 'results is not an array of objects'
annsImgIds = [ann['image_id'] for ann in anns]
assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
'Results do not correspond to current coco set'
if 'caption' in anns[0]:
imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
for id, ann in enumerate(anns):
ann['id'] = id+1
elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
bb = ann['bbox']
x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]]
if not 'segmentation' in ann:
ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
ann['area'] = bb[2]*bb[3]
ann['id'] = id+1
ann['iscrowd'] = 0
elif 'segmentation' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
# now only support compressed RLE format as segmentation results
ann['area'] = maskUtils.area(ann['segmentation'])
if not 'bbox' in ann:
ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
ann['id'] = id+1
ann['iscrowd'] = 0
elif 'keypoints' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
s = ann['keypoints']
x = s[0::3]
y = s[1::3]
x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y)
ann['area'] = (x1-x0)*(y1-y0)
ann['id'] = id + 1
ann['bbox'] = [x0,y0,x1-x0,y1-y0]
print('DONE (t={:0.2f}s)'.format(time.time()- tic))
res.dataset['annotations'] = anns
res.createIndex()
return res
def download(self, tarDir = None, imgIds = [] ):
'''
Download COCO images from mscoco.org server.
:param tarDir (str): COCO results directory name
imgIds (list): images to be downloaded
:return:
'''
if tarDir is None:
print('Please specify target directory')
return -1
if len(imgIds) == 0:
imgs = self.imgs.values()
else:
imgs = self.loadImgs(imgIds)
N = len(imgs)
if not os.path.exists(tarDir):
os.makedirs(tarDir)
for i, img in enumerate(imgs):
tic = time.time()
fname = os.path.join(tarDir, img['file_name'])
if not os.path.exists(fname):
urlretrieve(img['coco_url'], fname)
print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
def loadNumpyAnnotations(self, data):
"""
Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class}
:param data (numpy.ndarray)
:return: annotations (python nested list)
"""
print('Converting ndarray to lists...')
assert(type(data) == np.ndarray)
print(data.shape)
assert(data.shape[1] == 7)
N = data.shape[0]
ann = []
for i in range(N):
if i % 1000000 == 0:
print('{}/{}'.format(i,N))
ann += [{
'image_id' : int(data[i, 0]),
'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
'score' : data[i, 5],
'category_id': int(data[i, 6]),
}]
return ann
def annToRLE(self, ann):
"""
Convert annotation which can be polygons or uncompressed RLE to RLE.
:return: RLE (run-length-encoded mask)
"""
t = self.imgs[ann['image_id']]
h, w = t['height'], t['width']
segm = ann['segmentation']
if type(segm) == list:
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = maskUtils.frPyObjects(segm, h, w)
rle = maskUtils.merge(rles)
elif type(segm['counts']) == list:
# uncompressed RLE
rle = maskUtils.frPyObjects(segm, h, w)
else:
# rle
rle = ann['segmentation']
return rle
def annToMask(self, ann):
"""
Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
:return: binary mask (numpy 2D array)
"""
rle = self.annToRLE(ann)
m = maskUtils.decode(rle)
return m
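
For reference, a minimal usage sketch of the COCO helper defined above; the annotation path is a placeholder and must point to a real COCO-format JSON file.

```
from src.coco import COCO   # the module shown above; pycocotools.coco.COCO behaves the same way

coco = COCO("annotations/instances_val2017.json")    # placeholder path
cat_ids = coco.getCatIds(catNms=["person"])          # ids of the "person" category
img_ids = coco.getImgIds(catIds=cat_ids)             # images containing that category
ann_ids = coco.getAnnIds(imgIds=img_ids[:1], catIds=cat_ids, iscrowd=None)
anns = coco.loadAnns(ann_ids)
print("annotations in first image:", len(anns))
mask = coco.annToMask(anns[0])                       # binary numpy mask for the first annotation
```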

View file

@ -0,0 +1,267 @@
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import ctypes
import logging
import numpy as np
# DALI imports
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
import time
class COCOPipeline(Pipeline):
def __init__(self, batch_size, device_id, file_root, annotations_file, num_gpus,
output_fp16=False, output_nhwc=False, pad_output=False, num_threads=1, seed=15):
super(COCOPipeline, self).__init__(batch_size=batch_size, device_id=device_id,
num_threads=num_threads, seed = seed)
try:
shard_id = torch.distributed.get_rank()
except RuntimeError:
shard_id = 0
self.input = ops.COCOReader(file_root = file_root, annotations_file = annotations_file,
shard_id = shard_id, num_shards = num_gpus, ratio=True, ltrb=True, random_shuffle=True,
skip_empty=True)
self.decode = ops.HostDecoder(device = "cpu", output_type = types.RGB)
# Augmentation techniques
self.crop = ops.SSDRandomCrop(device="cpu", num_attempts=1)
self.twist = ops.ColorTwist(device="gpu")
self.resize = ops.Resize(device = "gpu", resize_x = 300, resize_y = 300)
output_dtype = types.FLOAT16 if output_fp16 else types.FLOAT
output_layout = types.NHWC if output_nhwc else types.NCHW
self.normalize = ops.CropMirrorNormalize(device="gpu", crop=(300, 300),
mean=[0.0, 0.0, 0.0],
std=[255.0, 255.0, 255.0],
mirror=0,
output_dtype=output_dtype,
output_layout=output_layout,
pad_output=pad_output)
# Random variables
self.rng1 = ops.Uniform(range=[0.5, 1.5])
self.rng2 = ops.Uniform(range=[0.875, 1.125])
self.rng3 = ops.Uniform(range=[-0.5, 0.5])
def define_graph(self):
saturation = self.rng1()
contrast = self.rng1()
brightness = self.rng2()
hue = self.rng3()
inputs, bboxes, labels = self.input()
images = self.decode(inputs)
images, bboxes, labels = self.crop(images, bboxes, labels)
images = self.resize(images.gpu())
images = self.twist(images.gpu(), saturation=saturation, contrast=contrast, brightness=brightness, hue=hue)
images = self.normalize(images)
# bboxes and images and labels on GPU
return (images, bboxes.gpu(), labels.gpu())
to_torch_type = {
np.dtype(np.float32) : torch.float32,
np.dtype(np.float64) : torch.float64,
np.dtype(np.float16) : torch.float16,
np.dtype(np.uint8) : torch.uint8,
np.dtype(np.int8) : torch.int8,
np.dtype(np.int16) : torch.int16,
np.dtype(np.int32) : torch.int32,
np.dtype(np.int64) : torch.int64
}
def feed_ndarray(dali_tensor, arr):
"""
Copy contents of DALI tensor to pyTorch's Tensor.
Parameters
----------
`dali_tensor` : nvidia.dali.backend.TensorCPU or nvidia.dali.backend.TensorGPU
Tensor from which to copy
`arr` : torch.Tensor
Destination of the copy
"""
assert dali_tensor.shape() == list(arr.size()), \
("Shapes do not match: DALI tensor has size {0}"
", but PyTorch Tensor has size {1}".format(dali_tensor.shape(), list(arr.size())))
#turn raw int to a c void pointer
c_type_pointer = ctypes.c_void_p(arr.data_ptr())
dali_tensor.copy_to_external(c_type_pointer)
return arr
class DALICOCOIterator(object):
"""
COCO DALI iterator for pyTorch.
Parameters
----------
pipelines : list of nvidia.dali.pipeline.Pipeline
List of pipelines to use
size : int
Epoch size.
"""
def __init__(self, pipelines, size):
if not isinstance(pipelines, list):
pipelines = [pipelines]
self._num_gpus = len(pipelines)
assert pipelines is not None, "Number of provided pipelines has to be at least 1"
self.batch_size = pipelines[0].batch_size
self._size = size
self._pipes = pipelines
# Build all pipelines
for p in self._pipes:
p.build()
# Use double-buffering of data batches
self._data_batches = [[None, None, None, None] for i in range(self._num_gpus)]
self._counter = 0
self._current_data_batch = 0
self.output_map = ["image", "bboxes", "labels"]
# We need data about the batches (like shape information),
# so we need to run a single batch as part of setup to get that info
self._first_batch = None
self._first_batch = self.next()
def __next__(self):
if self._first_batch is not None:
batch = self._first_batch
self._first_batch = None
return batch
if self._counter > self._size:
raise StopIteration
# Gather outputs
outputs = []
for p in self._pipes:
p._prefetch()
for p in self._pipes:
outputs.append(p._share_outputs())
for i in range(self._num_gpus):
dev_id = self._pipes[i].device_id
out_images = []
bboxes = []
labels = []
# segregate outputs into image/labels/bboxes entries
for j, out in enumerate(outputs[i]):
if self.output_map[j] == "image":
out_images.append(out)
elif self.output_map[j] == "bboxes":
bboxes.append(out)
elif self.output_map[j] == "labels":
labels.append(out)
# Change DALI TensorLists into Tensors
images = [x.as_tensor() for x in out_images]
images_shape = [x.shape() for x in images]
# Prepare bboxes shapes
bboxes_shape = []
for j in range(len(bboxes)):
bboxes_shape.append([])
for k in range(len(bboxes[j])):
bboxes_shape[j].append(bboxes[j].at(k).shape())
# Prepare labels shapes and offsets
labels_shape = []
bbox_offsets = []
torch.cuda.synchronize()
for j in range(len(labels)):
labels_shape.append([])
bbox_offsets.append([0])
for k in range(len(labels[j])):
lshape = labels[j].at(k).shape()
bbox_offsets[j].append(bbox_offsets[j][k] + lshape[0])
labels_shape[j].append(lshape)
# We always need to allocate new memory as bboxes and labels vary in shape
images_torch_type = to_torch_type[np.dtype(images[0].dtype())]
bboxes_torch_type = to_torch_type[np.dtype(bboxes[0].at(0).dtype())]
labels_torch_type = to_torch_type[np.dtype(labels[0].at(0).dtype())]
torch_gpu_device = torch.device('cuda', dev_id)
torch_cpu_device = torch.device('cpu')
pyt_images = [torch.zeros(shape, dtype=images_torch_type, device=torch_gpu_device) for shape in images_shape]
pyt_bboxes = [[torch.zeros(shape, dtype=bboxes_torch_type, device=torch_gpu_device) for shape in shape_list] for shape_list in bboxes_shape]
pyt_labels = [[torch.zeros(shape, dtype=labels_torch_type, device=torch_gpu_device) for shape in shape_list] for shape_list in labels_shape]
pyt_offsets = [torch.zeros(len(offset), dtype=torch.int32, device=torch_cpu_device) for offset in bbox_offsets]
self._data_batches[i][self._current_data_batch] = (pyt_images, pyt_bboxes, pyt_labels, pyt_offsets)
# Copy data from DALI Tensors to torch tensors
for j, i_arr in enumerate(images):
feed_ndarray(i_arr, pyt_images[j])
for j, b_list in enumerate(bboxes):
for k in range(len(b_list)):
if (pyt_bboxes[j][k].shape[0] != 0):
feed_ndarray(b_list.at(k), pyt_bboxes[j][k])
pyt_bboxes[j] = torch.cat(pyt_bboxes[j])
for j, l_list in enumerate(labels):
for k in range(len(l_list)):
if (pyt_labels[j][k].shape[0] != 0):
feed_ndarray(l_list.at(k), pyt_labels[j][k])
pyt_labels[j] = torch.cat(pyt_labels[j]).squeeze(dim=1)
for j in range(len(pyt_offsets)):
pyt_offsets[j] = torch.IntTensor(bbox_offsets[j])
for p in self._pipes:
p._release_outputs()
p._start_run()
copy_db_index = self._current_data_batch
# Change index for double buffering
self._current_data_batch = (self._current_data_batch + 1) % 2
self._counter += self._num_gpus * self.batch_size
return [db[copy_db_index] for db in self._data_batches]
def next(self):
"""
Returns the next batch of data.
"""
return self.__next__()
def __iter__(self):
return self
def reset(self):
"""
Resets the iterator after the full epoch.
DALI iterators do not support resetting before the end of the epoch
and will ignore such request.
"""
if self._counter > self._size:
self._counter = self._counter % self._size
else:
logging.warning("DALI iterator does not support resetting while epoch is not finished. Ignoring...")
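
A single-GPU sketch of wiring the pipeline and iterator above together; the paths, batch size and epoch size are placeholders, and the installed DALI version must provide the ops used above.

```
pipe = COCOPipeline(batch_size=32, device_id=0,
                    file_root="/coco/train2017",                                   # placeholder path
                    annotations_file="/coco/annotations/instances_train2017.json", # placeholder path
                    num_gpus=1, num_threads=4, seed=42)
train_loader = DALICOCOIterator(pipe, size=118287)   # builds the pipeline internally

for data in train_loader:
    images = data[0][0][0]        # batch of images on GPU 0
    bboxes = data[0][1][0]        # concatenated boxes for the whole batch
    labels = data[0][2][0]        # concatenated labels
    bbox_offsets = data[0][3][0]  # per-image offsets into bboxes/labels
    break
```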

View file

@ -0,0 +1,54 @@
import os
import torch
from torch.utils.data import DataLoader
from src.utils import dboxes300_coco, COCODetection
from src.utils import SSDTransformer
from src.coco import COCO
#DALI import
from src.coco_pipeline import COCOPipeline, DALICOCOIterator
def get_train_loader(args, local_seed):
train_annotate = os.path.join(args.data, "annotations/instances_train2017.json")
train_coco_root = os.path.join(args.data, "train2017")
train_pipe = COCOPipeline(args.batch_size, args.local_rank, train_coco_root,
train_annotate, args.N_gpu, num_threads=args.num_workers,
output_fp16=args.fp16, output_nhwc=False,
pad_output=False, seed=local_seed)
train_pipe.build()
test_run = train_pipe.run()
train_loader = DALICOCOIterator(train_pipe, 118287 / args.N_gpu)
return train_loader
def get_val_dataset(args):
dboxes = dboxes300_coco()
val_trans = SSDTransformer(dboxes, (300, 300), val=True)
val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
val_coco_root = os.path.join(args.data, "val2017")
val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
return val_coco
def get_val_dataloader(dataset, args):
if args.distributed:
val_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
else:
val_sampler = None
val_dataloader = DataLoader(dataset,
batch_size=args.eval_batch_size,
shuffle=False, # Note: distributed sampler is shuffled :(
sampler=val_sampler,
num_workers=args.num_workers)
return val_dataloader
def get_coco_ground_truth(args):
val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
cocoGt = COCO(annotation_file=val_annotate)
return cocoGt
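
A hedged sketch of driving the helpers above with a hand-built argument object; in the full training script these fields come from the command-line parser, and the values below are placeholders.

```
from types import SimpleNamespace

args = SimpleNamespace(data="/coco", batch_size=32, eval_batch_size=32,
                       num_workers=4, local_rank=0, N_gpu=1, fp16=False,
                       distributed=False)

train_loader = get_train_loader(args, local_seed=42)
val_dataset = get_val_dataset(args)
val_loader = get_val_dataloader(val_dataset, args)
cocoGt = get_coco_ground_truth(args)
```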

View file

@ -0,0 +1,82 @@
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
import torch.distributed as dist
from torch.nn.modules import Module
'''
This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
launcher included with this example. It assumes that your run is using multiprocess with 1
GPU/process, that the model is on the correct device, and that torch.set_device has been
used to set the device.
Parameters are broadcast to the other processes on initialization of DistributedDataParallel,
and their gradients are all-reduced at the end of the backward pass.
'''
class DistributedDataParallel(Module):
def __init__(self, module):
super(DistributedDataParallel, self).__init__()
self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
self.module = module
for p in self.module.state_dict().values():
if not torch.is_tensor(p):
continue
if dist._backend == dist.dist_backend.NCCL:
assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU."
dist.broadcast(p, 0)
def allreduce_params():
if(self.needs_reduction):
self.needs_reduction = False
buckets = {}
for param in self.module.parameters():
if param.requires_grad and param.grad is not None:
tp = param.data.type()
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(param)
if self.warn_on_half:
if torch.cuda.HalfTensor in buckets:
print("WARNING: gloo dist backend for half parameters may be extremely slow." +
" It is recommended to use the NCCL backend in this case.")
self.warn_on_half = False
for tp in buckets:
bucket = buckets[tp]
grads = [param.grad.data for param in bucket]
coalesced = _flatten_dense_tensors(grads)
dist.all_reduce(coalesced)
coalesced /= dist.get_world_size()
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
buf.copy_(synced)
for param in list(self.module.parameters()):
def allreduce_hook(*unused):
param._execution_engine.queue_callback(allreduce_params)
if param.requires_grad:
param.register_hook(allreduce_hook)
def forward(self, *inputs, **kwargs):
self.needs_reduction = True
return self.module(*inputs, **kwargs)
'''
def _sync_buffers(self):
buffers = list(self.module._all_buffers())
if len(buffers) > 0:
# cross-node buffer sync
flat_buffers = _flatten_dense_tensors(buffers)
dist.broadcast(flat_buffers, 0)
for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
buf.copy_(synced)
def train(self, mode=True):
# Clear NCCL communicator and CUDA event cache of the default group ID,
# These caches will be recreated on a later call. This is currently a
# work-around for a potential NCCL deadlock.
if dist._backend == dist.dist_backend.NCCL:
dist._clear_group_cache()
super(DistributedDataParallel, self).train(mode)
self.module.train(mode)
'''
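
A hedged sketch of using the wrapper above with one process per GPU; it assumes the launcher sets LOCAL_RANK (or an equivalent) and that the process group can be initialized from environment variables.

```
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ.get("LOCAL_RANK", 0))        # assumed to be set by the launcher
dist.init_process_group(backend="nccl", init_method="env://")
torch.cuda.set_device(local_rank)

model = torch.nn.Linear(128, 10).cuda()                  # any CUDA model works here
model = DistributedDataParallel(model)                   # broadcasts params, all-reduces grads in backward()
```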

View file

@ -0,0 +1,124 @@
import torch
import time
import numpy as np
from contextlib import redirect_stdout
import io
from pycocotools.cocoeval import COCOeval
def evaluate(model, coco, cocoGt, encoder, inv_map, args):
if args.distributed:
N_gpu = torch.distributed.get_world_size()
else:
N_gpu = 1
model.eval()
if not args.no_cuda:
model.cuda()
ret = []
start = time.time()
# for idx, image_id in enumerate(coco.img_keys):
for nbatch, (img, img_id, img_size, _, _) in enumerate(coco):
print("Parsing batch: {}/{}".format(nbatch, len(coco)), end='\r')
with torch.no_grad():
inp = img.cuda()
if args.fp16:
inp = inp.half()
# Get predictions
ploc, plabel = model(inp)
ploc, plabel = ploc.float(), plabel.float()
# Handle the batch of predictions produced
# This is slow, but consistent with old implementation.
for idx in range(ploc.shape[0]):
# ease-of-use for specific predictions
ploc_i = ploc[idx, :, :].unsqueeze(0)
plabel_i = plabel[idx, :, :].unsqueeze(0)
try:
result = encoder.decode_batch(ploc_i, plabel_i, 0.50, 200)[0]
except:
# raise
print("")
print("No object detected in idx: {}".format(idx))
continue
htot, wtot = img_size[0][idx].item(), img_size[1][idx].item()
loc, label, prob = [r.cpu().numpy() for r in result]
for loc_, label_, prob_ in zip(loc, label, prob):
ret.append([img_id[idx], loc_[0] * wtot, \
loc_[1] * htot,
(loc_[2] - loc_[0]) * wtot,
(loc_[3] - loc_[1]) * htot,
prob_,
inv_map[label_]])
# Now we have all predictions from this rank, gather them all together
# if necessary
ret = np.array(ret).astype(np.float32)
# Multi-GPU eval
if args.distributed:
# NCCL backend means we can only operate on GPU tensors
ret_copy = torch.tensor(ret).cuda()
# Everyone exchanges the size of their results
ret_sizes = [torch.tensor(0).cuda() for _ in range(N_gpu)]
torch.cuda.synchronize()
torch.distributed.all_gather(ret_sizes, torch.tensor(ret_copy.shape[0]).cuda())
torch.cuda.synchronize()
# Get the maximum results size, as all tensors must be the same shape for
# the all_gather call we need to make
max_size = 0
sizes = []
for s in ret_sizes:
max_size = max(max_size, s.item())
sizes.append(s.item())
# Need to pad my output to max_size in order to use in all_gather
ret_pad = torch.cat([ret_copy, torch.zeros(max_size - ret_copy.shape[0], 7, dtype=torch.float32).cuda()])
# allocate storage for results from all other processes
other_ret = [torch.zeros(max_size, 7, dtype=torch.float32).cuda() for i in range(N_gpu)]
# Everyone exchanges (padded) results
torch.cuda.synchronize()
torch.distributed.all_gather(other_ret, ret_pad)
torch.cuda.synchronize()
# Now need to reconstruct the _actual_ results from the padded set using slices.
cat_tensors = []
for i in range(N_gpu):
cat_tensors.append(other_ret[i][:sizes[i]][:])
final_results = torch.cat(cat_tensors).cpu().numpy()
else:
# Otherwise full results are just our results
final_results = ret
if args.local_rank == 0:
print("")
print("Predicting Ended, total time: {:.2f} s".format(time.time() - start))
cocoDt = cocoGt.loadRes(final_results)
E = COCOeval(cocoGt, cocoDt, iouType='bbox')
E.evaluate()
E.accumulate()
if args.local_rank == 0:
E.summarize()
print("Current AP: {:.5f}".format(E.stats[0]))
else:
# fix for cocoeval indiscriminate prints
with redirect_stdout(io.StringIO()):
E.summarize()
# put the model back in training mode
model.train()
return E.stats[0] # Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]
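
A standalone sketch of the padded all_gather pattern used above: NCCL requires equal tensor shapes across ranks, so each rank pads its variable-length results to the largest size and slices the padding off after gathering. The function and variable names here are illustrative, and an initialized process group is assumed.

```
import torch
import torch.distributed as dist

def gather_variable_rows(local, world_size):
    """local: [n_i, 7] float tensor on this rank's GPU; returns all rows from all ranks."""
    n = torch.tensor(local.shape[0], device=local.device)
    sizes = [torch.zeros_like(n) for _ in range(world_size)]
    dist.all_gather(sizes, n)                                    # exchange row counts
    max_n = max(int(s.item()) for s in sizes)
    padded = torch.cat([local, local.new_zeros(max_n - local.shape[0], local.shape[1])])
    gathered = [local.new_zeros(max_n, local.shape[1]) for _ in range(world_size)]
    dist.all_gather(gathered, padded)                            # exchange padded tensors
    return torch.cat([g[:int(s.item())] for g, s in zip(gathered, sizes)])
```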

View file

@ -0,0 +1,103 @@
import math
import numpy as np
class EpochMeter:
def __init__(self, name):
self.name = name
self.data = []
def update(self, epoch, val):
self.data.append((epoch, val))
class IterationMeter:
def __init__(self, name):
self.name = name
self.data = []
def update(self, epoch, iteration, val):
self.data.append((epoch, iteration, val))
class IterationAverageMeter:
def __init__(self, name):
self.name = name
self.data = []
self.n = 0
self.sum = 0
def update_iter(self, val):
if math.isfinite(val): # sometimes loss == inf
self.n += 1
self.sum += 0 if math.isinf(val) else val
def update_epoch(self, epoch):
self.data.append((epoch, self.sum / self.n))
self.n = 0
self.sum = 0
class Logger:
def __init__(self, name, print_freq=20):
self.name = name
self.train_loss_logger = IterationAverageMeter("Training loss")
self.train_epoch_time_logger = EpochMeter("Training 1 epoch time")
self.val_acc_logger = EpochMeter("Validation accuracy")
self.print_freq = print_freq
def update_iter(self, epoch, iteration, loss):
self.train_loss_logger.update_iter(loss)
if iteration % self.print_freq == 0:
print('epoch: {}\titeration: {}\tloss: {}'.format(epoch, iteration, loss))
def update_epoch(self, epoch, acc):
self.train_loss_logger.update_epoch(epoch)
self.val_acc_logger.update(epoch, acc)
print('epoch: {}\tmAP accuracy: {}'.format(epoch, acc))
def update_epoch_time(self, epoch, time):
self.train_epoch_time_logger.update(epoch, time)
print('epoch: {}\ttime: {}'.format(epoch, time))
def print_results(self):
return self.train_loss_logger.data, self.val_acc_logger.data, self.train_epoch_time_logger
class BenchmarkMeter:
def __init__(self, name):
self.name = name
self.data = []
self.total_images = 0
self.total_time = 0
self.avr_images_per_second = 0
def update(self, bs, time):
self.total_images += bs
self.total_time += time
self.avr_images_per_second = self.total_images / self.total_time
self.data.append(bs / time)
class BenchLogger(Logger):
def __init__(self, name):
super().__init__(name)
self.name = name
self.images_per_ses = BenchmarkMeter(self.name)
def update(self, bs, time):
self.images_per_ses.update(bs, time)
def print_result(self):
total_bs = self.images_per_ses.total_images
total_time = self.images_per_ses.total_time
avr = self.images_per_ses.avr_images_per_second
med = np.median(self.images_per_ses.data)
print("Done benchmarking. Total images: {}\ttotal time: {:.3f}\tAverage images/sec: {:.3f}\tMedian images/sec: {:.3f}".format(
total_bs,
total_time,
avr,
med
))
return med
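
A small sketch of how the training script is expected to drive the meters above; the loss and accuracy numbers are made up for illustration.

```
logger = Logger("SSD300 training", print_freq=2)
for epoch in range(2):
    for iteration, loss in enumerate([0.9, 0.8, 0.7, 0.6]):
        logger.update_iter(epoch, iteration, loss)       # running average + periodic print
    logger.update_epoch(epoch, acc=0.20 + 0.01 * epoch)  # store per-epoch mAP
    logger.update_epoch_time(epoch, time=123.4)

bench = BenchLogger("Training benchmark")
bench.update(bs=32, time=0.05)                           # 32 images in 50 ms -> 640 img/s
median_fps = bench.print_result()
```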

View file

@ -0,0 +1,181 @@
import torch
import torch.nn as nn
from torchvision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152
class ResNet(nn.Module):
def __init__(self, backbone='resnet50'):
super().__init__()
if backbone == 'resnet18':
backbone = resnet18(pretrained=True)
self.out_channels = [256, 512, 512, 256, 256, 128]
elif backbone == 'resnet34':
backbone = resnet34(pretrained=True)
self.out_channels = [256, 512, 512, 256, 256, 256]
elif backbone == 'resnet50':
backbone = resnet50(pretrained=True)
self.out_channels = [1024, 512, 512, 256, 256, 256]
elif backbone == 'resnet101':
backbone = resnet101(pretrained=True)
self.out_channels = [1024, 512, 512, 256, 256, 256]
else: # backbone == 'resnet152':
backbone = resnet152(pretrained=True)
self.out_channels = [1024, 512, 512, 256, 256, 256]
self.feature_extractor = nn.Sequential(*list(backbone.children())[:7])
conv4_block1 = self.feature_extractor[-1][0]
conv4_block1.conv1.stride = (1, 1)
conv4_block1.conv2.stride = (1, 1)
conv4_block1.downsample[0].stride = (1, 1)
def forward(self, x):
x = self.feature_extractor(x)
return x
class SSD300(nn.Module):
def __init__(self, backbone='resnet50'):
super().__init__()
self.feature_extractor = ResNet(backbone=backbone)
self.label_num = 81 # number of COCO classes
self._build_additional_features(self.feature_extractor.out_channels)
self.num_defaults = [4, 6, 6, 6, 4, 4]
self.loc = []
self.conf = []
for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels):
self.loc.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
self.conf.append(nn.Conv2d(oc, nd * self.label_num, kernel_size=3, padding=1))
self.loc = nn.ModuleList(self.loc)
self.conf = nn.ModuleList(self.conf)
self._init_weights()
def _build_additional_features(self, input_size):
self.additional_blocks = []
for i, (input_size, output_size, channels) in enumerate(zip(input_size[:-1], input_size[1:], [256, 256, 128, 128, 128])):
if i < 3:
layer = nn.Sequential(
nn.Conv2d(input_size, channels, kernel_size=1, bias=False),
nn.BatchNorm2d(channels),
nn.ReLU(inplace=True),
nn.Conv2d(channels, output_size, kernel_size=3, padding=1, stride=2, bias=False),
nn.BatchNorm2d(output_size),
nn.ReLU(inplace=True),
)
else:
layer = nn.Sequential(
nn.Conv2d(input_size, channels, kernel_size=1, bias=False),
nn.BatchNorm2d(channels),
nn.ReLU(inplace=True),
nn.Conv2d(channels, output_size, kernel_size=3, bias=False),
nn.BatchNorm2d(output_size),
nn.ReLU(inplace=True),
)
self.additional_blocks.append(layer)
self.additional_blocks = nn.ModuleList(self.additional_blocks)
def _init_weights(self):
layers = [*self.additional_blocks, *self.loc, *self.conf]
for layer in layers:
for param in layer.parameters():
if param.dim() > 1: nn.init.xavier_uniform_(param)
# Shape the classifier to the view of bboxes
def bbox_view(self, src, loc, conf):
ret = []
for s, l, c in zip(src, loc, conf):
ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1)))
locs, confs = list(zip(*ret))
locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous()
return locs, confs
def forward(self, x):
x = self.feature_extractor(x)
detection_feed = [x]
for l in self.additional_blocks:
x = l(x)
detection_feed.append(x)
# Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
locs, confs = self.bbox_view(detection_feed, self.loc, self.conf)
# For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results
return locs, confs
class Loss(nn.Module):
"""
Implements the loss as the sum of the followings:
1. Confidence Loss: All labels, with hard negative mining
2. Localization Loss: Only on positive labels
Suppose input dboxes has the shape 8732x4
"""
def __init__(self, dboxes):
super(Loss, self).__init__()
self.scale_xy = 1.0/dboxes.scale_xy
self.scale_wh = 1.0/dboxes.scale_wh
self.sl1_loss = nn.SmoothL1Loss(reduce=False)
self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim = 0),
requires_grad=False)
# The two scale factors are from the following link:
# http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html
self.con_loss = nn.CrossEntropyLoss(reduce=False)
def _loc_vec(self, loc):
"""
Generate Location Vectors
"""
gxy = self.scale_xy*(loc[:, :2, :] - self.dboxes[:, :2, :])/self.dboxes[:, 2:, ]
gwh = self.scale_wh*(loc[:, 2:, :]/self.dboxes[:, 2:, :]).log()
return torch.cat((gxy, gwh), dim=1).contiguous()
def forward(self, ploc, plabel, gloc, glabel):
"""
ploc, plabel: Nx4x8732, Nxlabel_numx8732
predicted location and labels
gloc, glabel: Nx4x8732, Nx8732
ground truth location and labels
"""
mask = glabel > 0
pos_num = mask.sum(dim=1)
vec_gd = self._loc_vec(gloc)
# sum on four coordinates, and mask
sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1)
sl1 = (mask.float()*sl1).sum(dim=1)
# hard negative mining
con = self.con_loss(plabel, glabel)
# positive mask will never be selected
con_neg = con.clone()
con_neg[mask] = 0
_, con_idx = con_neg.sort(dim=1, descending=True)
_, con_rank = con_idx.sort(dim=1)
# number of negative three times positive
neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1)
neg_mask = con_rank < neg_num
#print(con.shape, mask.shape, neg_mask.shape)
closs = (con*(mask.float() + neg_mask.float())).sum(dim=1)
# avoid no object detected
total_loss = sl1 + closs
num_mask = (pos_num > 0).float()
pos_num = pos_num.float().clamp(min=1e-6)
ret = (total_loss*num_mask/pos_num).mean(dim=0)
return ret
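
A quick shape check of the network above on CPU; constructing it downloads the pretrained ResNet-50 backbone from torchvision, and the batch size of 2 is arbitrary.

```
import torch

model = SSD300(backbone="resnet50").eval()
with torch.no_grad():
    locs, confs = model(torch.randn(2, 3, 300, 300))
print(locs.shape)    # torch.Size([2, 4, 8732])
print(confs.shape)   # torch.Size([2, 81, 8732])
```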

View file

@ -0,0 +1,223 @@
from torch.autograd import Variable
import torch
import time
from SSD import _C as C
def train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
# for nbatch, (img, _, img_size, bbox, label) in enumerate(train_dataloader):
for nbatch, data in enumerate(train_dataloader):
img = data[0][0][0]
bbox = data[0][1][0]
label = data[0][2][0]
label = label.type(torch.cuda.LongTensor)
bbox_offsets = data[0][3][0]
# handle random flipping outside of DALI for now
bbox_offsets = bbox_offsets.cuda()
img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)
img.sub_(mean).div_(std)
if not args.no_cuda:
img = img.cuda()
bbox = bbox.cuda()
label = label.cuda()
bbox_offsets = bbox_offsets.cuda()
N = img.shape[0]
if bbox_offsets[-1].item() == 0:
print("No labels in batch")
continue
bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)
# output is ([N*8732, 4], [N*8732]); need [N, 8732, 4], [N, 8732] respectively
M = bbox.shape[0] // N
bbox = bbox.view(N, M, 4)
label = label.view(N, M)
ploc, plabel = model(img)
ploc, plabel = ploc.float(), plabel.float()
trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
if not args.no_cuda:
label = label.cuda()
gloc = Variable(trans_bbox, requires_grad=False)
glabel = Variable(label, requires_grad=False)
loss = loss_func(ploc, plabel, gloc, glabel)
if args.local_rank == 0:
logger.update_iter(epoch, iteration, loss.item())
if args.fp16:
if args.amp:
with optim.scale_loss(loss) as scale_loss:
scale_loss.backward()
else:
optim.backward(loss)
else:
loss.backward()
if args.warmup is not None:
warmup(optim, args.warmup, iteration, args.learning_rate)
optim.step()
optim.zero_grad()
iteration += 1
return iteration
def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
start_time = None
# tensor for results
result = torch.zeros((1,)).cuda()
for i, data in enumerate(loop(train_dataloader)):
if i >= args.benchmark_warmup:
start_time = time.time()
img = data[0][0][0]
bbox = data[0][1][0]
label = data[0][2][0]
label = label.type(torch.cuda.LongTensor)
bbox_offsets = data[0][3][0]
# handle random flipping outside of DALI for now
bbox_offsets = bbox_offsets.cuda()
img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)
if not args.no_cuda:
img = img.cuda()
bbox = bbox.cuda()
label = label.cuda()
bbox_offsets = bbox_offsets.cuda()
img.sub_(mean).div_(std)
N = img.shape[0]
if bbox_offsets[-1].item() == 0:
print("No labels in batch")
continue
bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)
M = bbox.shape[0] // N
bbox = bbox.view(N, M, 4)
label = label.view(N, M)
ploc, plabel = model(img)
ploc, plabel = ploc.float(), plabel.float()
trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
if not args.no_cuda:
label = label.cuda()
gloc = Variable(trans_bbox, requires_grad=False)
glabel = Variable(label, requires_grad=False)
loss = loss_func(ploc, plabel, gloc, glabel)
# loss scaling
if args.fp16:
if args.amp:
with optim.scale_loss(loss) as scale_loss:
scale_loss.backward()
else:
optim.backward(loss)
else:
loss.backward()
optim.step()
optim.zero_grad()
if i >= args.benchmark_warmup + args.benchmark_iterations:
break
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
result.data[0] = logger.print_result()
if args.N_gpu > 1:
torch.distributed.reduce(result, 0)
if args.local_rank == 0:
print('Training performance = {} FPS'.format(float(result.data[0])))
def loop(dataloader):
while True:
for data in dataloader:
yield data
def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu'
start_time = None
model.eval()
i = -1
val_datas = loop(val_dataloader)
while True:
i += 1
torch.cuda.synchronize()
if i >= args.benchmark_warmup:
start_time = time.time()
data = next(val_datas)
with torch.no_grad():
img = data[0]
if not args.no_cuda:
img = img.cuda()
if args.fp16:
img = img.half()
img.sub_(mean).div_(std)
img = Variable(img, requires_grad=False)
_ = model(img)
torch.cuda.synchronize()
if i >= args.benchmark_warmup + args.benchmark_iterations:
break
if i >= args.benchmark_warmup:
logger.update(args.eval_batch_size, time.time() - start_time)
logger.print_result()
def warmup(optim, warmup_iters, iteration, base_lr):
if iteration < warmup_iters:
new_lr = 1. * base_lr / warmup_iters * iteration
for param_group in optim.param_groups:
param_group['lr'] = new_lr
def load_checkpoint(model, checkpoint):
"""
Load model from checkpoint.
"""
print("loading model checkpoint", checkpoint)
od = torch.load(checkpoint)
# remove preceding 'N.' from checkpoint keys that come from the DDP wrapper
saved_model = od["model"]
model.load_state_dict(saved_model)
def tencent_trick(model):
"""
Divide parameters into 2 groups.
First group is BNs and all biases.
Second group is the remaining model's parameters.
Weight decay will be disabled in the first group (aka the tencent trick).
"""
decay, no_decay = [], []
for name, param in model.named_parameters():
if not param.requires_grad:
continue # frozen weights
if len(param.shape) == 1 or name.endswith(".bias"):
no_decay.append(param)
else:
decay.append(param)
return [{'params': no_decay, 'weight_decay': 0.0},
{'params': decay}]
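
A hedged sketch of feeding the two parameter groups above to an optimizer; the SSD300 import path is assumed from this commit's layout, and the hyperparameter values are placeholders.

```
import torch
from src.model import SSD300      # assumed module path for the SSD300 class shown earlier

model = SSD300()
optim = torch.optim.SGD(tencent_trick(model), lr=2.6e-3,
                        momentum=0.9, weight_decay=0.0005)
# the no_decay group keeps weight_decay=0.0; the decay group inherits 0.0005
```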

View file

@ -0,0 +1,578 @@
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
from PIL import Image
import os
import numpy as np
import random
import itertools
import torch.nn.functional as F
import json
import time
import bz2
import pickle
from math import sqrt
# This function is from https://github.com/kuangliu/pytorch-ssd.
def calc_iou_tensor(box1, box2):
""" Calculation of IoU based on two boxes tensor,
Reference to https://github.com/kuangliu/pytorch-src
input:
box1 (N, 4)
box2 (M, 4)
output:
IoU (N, M)
"""
N = box1.size(0)
M = box2.size(0)
be1 = box1.unsqueeze(1).expand(-1, M, -1)
be2 = box2.unsqueeze(0).expand(N, -1, -1)
# Left Top & Right Bottom
lt = torch.max(be1[:,:,:2], be2[:,:,:2])
#mask1 = (be1[:,:, 0] < be2[:,:, 0]) ^ (be1[:,:, 1] < be2[:,:, 1])
#mask1 = ~mask1
rb = torch.min(be1[:,:,2:], be2[:,:,2:])
#mask2 = (be1[:,:, 2] < be2[:,:, 2]) ^ (be1[:,:, 3] < be2[:,:, 3])
#mask2 = ~mask2
delta = rb - lt
delta[delta < 0] = 0
intersect = delta[:,:,0]*delta[:,:,1]
#*mask1.float()*mask2.float()
delta1 = be1[:,:,2:] - be1[:,:,:2]
area1 = delta1[:,:,0]*delta1[:,:,1]
delta2 = be2[:,:,2:] - be2[:,:,:2]
area2 = delta2[:,:,0]*delta2[:,:,1]
iou = intersect/(area1 + area2 - intersect)
return iou
# This function is from https://github.com/kuangliu/pytorch-ssd.
class Encoder(object):
"""
Inspired by https://github.com/kuangliu/pytorch-src
Transform between (bboxes, labels) <-> SSD output
dboxes: default boxes in size 8732 x 4,
encoder: input ltrb format, output xywh format
decoder: input xywh format, output ltrb format
encode:
input : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
criteria : IoU threshold of bboxes
decode:
input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
criteria : IoU threshold of bboxes
max_output : maximum number of output bboxes
"""
def __init__(self, dboxes):
self.dboxes = dboxes(order="ltrb")
self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0)
self.nboxes = self.dboxes.size(0)
self.scale_xy = dboxes.scale_xy
self.scale_wh = dboxes.scale_wh
def encode(self, bboxes_in, labels_in, criteria = 0.5):
ious = calc_iou_tensor(bboxes_in, self.dboxes)
best_dbox_ious, best_dbox_idx = ious.max(dim=0)
best_bbox_ious, best_bbox_idx = ious.max(dim=1)
# set best ious 2.0
best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)
idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
best_dbox_idx[best_bbox_idx[idx]] = idx
# filter IoU > 0.5
masks = best_dbox_ious > criteria
labels_out = torch.zeros(self.nboxes, dtype=torch.long)
labels_out[masks] = labels_in[best_dbox_idx[masks]]
bboxes_out = self.dboxes.clone()
bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]
# Transform format to xywh format
x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \
0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \
-bboxes_out[:, 0] + bboxes_out[:, 2], \
-bboxes_out[:, 1] + bboxes_out[:, 3]
bboxes_out[:, 0] = x
bboxes_out[:, 1] = y
bboxes_out[:, 2] = w
bboxes_out[:, 3] = h
return bboxes_out, labels_out
def scale_back_batch(self, bboxes_in, scores_in):
"""
Do scale and transform from xywh to ltrb
suppose input Nx4xnum_bbox Nxlabel_numxnum_bbox
"""
if bboxes_in.device == torch.device("cpu"):
self.dboxes = self.dboxes.cpu()
self.dboxes_xywh = self.dboxes_xywh.cpu()
else:
self.dboxes = self.dboxes.cuda()
self.dboxes_xywh = self.dboxes_xywh.cuda()
bboxes_in = bboxes_in.permute(0, 2, 1)
scores_in = scores_in.permute(0, 2, 1)
bboxes_in[:, :, :2] = self.scale_xy*bboxes_in[:, :, :2]
bboxes_in[:, :, 2:] = self.scale_wh*bboxes_in[:, :, 2:]
bboxes_in[:, :, :2] = bboxes_in[:, :, :2]*self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp()*self.dboxes_xywh[:, :, 2:]
# Transform format to ltrb
l, t, r, b = bboxes_in[:, :, 0] - 0.5*bboxes_in[:, :, 2],\
bboxes_in[:, :, 1] - 0.5*bboxes_in[:, :, 3],\
bboxes_in[:, :, 0] + 0.5*bboxes_in[:, :, 2],\
bboxes_in[:, :, 1] + 0.5*bboxes_in[:, :, 3]
bboxes_in[:, :, 0] = l
bboxes_in[:, :, 1] = t
bboxes_in[:, :, 2] = r
bboxes_in[:, :, 3] = b
return bboxes_in, F.softmax(scores_in, dim=-1)
def decode_batch(self, bboxes_in, scores_in, criteria = 0.45, max_output=200):
bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)
output = []
for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
bbox = bbox.squeeze(0)
prob = prob.squeeze(0)
output.append(self.decode_single(bbox, prob, criteria, max_output))
return output
# perform non-maximum suppression
def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
# Reference to https://github.com/amdegroot/ssd.pytorch
bboxes_out = []
scores_out = []
labels_out = []
for i, score in enumerate(scores_in.split(1, 1)):
# skip background
# print(score[score>0.90])
if i == 0: continue
# print(i)
score = score.squeeze(1)
mask = score > 0.05
bboxes, score = bboxes_in[mask, :], score[mask]
if score.size(0) == 0: continue
score_sorted, score_idx_sorted = score.sort(dim=0)
# select max_output indices
score_idx_sorted = score_idx_sorted[-max_num:]
candidates = []
#maxdata, maxloc = scores_in.sort()
while score_idx_sorted.numel() > 0:
idx = score_idx_sorted[-1].item()
bboxes_sorted = bboxes[score_idx_sorted, :]
bboxes_idx = bboxes[idx, :].unsqueeze(dim=0)
iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze()
# we only need iou < criteria
score_idx_sorted = score_idx_sorted[iou_sorted < criteria]
candidates.append(idx)
bboxes_out.append(bboxes[candidates, :])
scores_out.append(score[candidates])
labels_out.extend([i]*len(candidates))
bboxes_out, labels_out, scores_out = torch.cat(bboxes_out, dim=0), \
torch.tensor(labels_out, dtype=torch.long), \
torch.cat(scores_out, dim=0)
_, max_ids = scores_out.sort(dim=0)
max_ids = max_ids[-max_output:]
return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
class DefaultBoxes(object):
def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, \
scale_xy=0.1, scale_wh=0.2):
self.feat_size = feat_size
self.fig_size = fig_size
self.scale_xy_ = scale_xy
self.scale_wh_ = scale_wh
# According to https://github.com/weiliu89/caffe
# Calculation method slightly different from paper
self.steps = steps
self.scales = scales
fk = fig_size/np.array(steps)
self.aspect_ratios = aspect_ratios
self.default_boxes = []
# size of feature and number of feature
for idx, sfeat in enumerate(self.feat_size):
sk1 = scales[idx]/fig_size
sk2 = scales[idx+1]/fig_size
sk3 = sqrt(sk1*sk2)
all_sizes = [(sk1, sk1), (sk3, sk3)]
for alpha in aspect_ratios[idx]:
w, h = sk1*sqrt(alpha), sk1/sqrt(alpha)
all_sizes.append((w, h))
all_sizes.append((h, w))
for w, h in all_sizes:
for i, j in itertools.product(range(sfeat), repeat=2):
cx, cy = (j+0.5)/fk[idx], (i+0.5)/fk[idx]
self.default_boxes.append((cx, cy, w, h))
self.dboxes = torch.tensor(self.default_boxes)
self.dboxes.clamp_(min=0, max=1)
# For IoU calculation
self.dboxes_ltrb = self.dboxes.clone()
self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2]
self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3]
self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2]
self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3]
@property
def scale_xy(self):
return self.scale_xy_
@property
def scale_wh(self):
return self.scale_wh_
def __call__(self, order="ltrb"):
if order == "ltrb": return self.dboxes_ltrb
if order == "xywh": return self.dboxes
def dboxes300_coco():
figsize = 300
feat_size = [38, 19, 10, 5, 3, 1]
steps = [8, 16, 32, 64, 100, 300]
# use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
scales = [21, 45, 99, 153, 207, 261, 315]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios)
return dboxes
# This function is from https://github.com/chauhan-utk/ssd.DomainAdaptation.
class SSDCropping(object):
""" Cropping for SSD, according to original paper
Choose between following 3 conditions:
1. Preserve the original image
2. Random crop minimum IoU is among 0.1, 0.3, 0.5, 0.7, 0.9
3. Random crop
Reference to https://github.com/chauhan-utk/src.DomainAdaptation
"""
def __init__(self):
self.sample_options = (
# Do nothing
None,
# min IoU, max IoU
(0.1, None),
(0.3, None),
(0.5, None),
(0.7, None),
(0.9, None),
# no IoU requirements
(None, None),
)
def __call__(self, img, img_size, bboxes, labels):
# Ensure always return cropped image
while True:
mode = random.choice(self.sample_options)
if mode is None:
return img, img_size, bboxes, labels
htot, wtot = img_size
min_iou, max_iou = mode
min_iou = float("-inf") if min_iou is None else min_iou
max_iou = float("+inf") if max_iou is None else max_iou
# The reference implementation uses 50 iterations to find a candidate; a single attempt is made here
for _ in range(1):
# size of each sampled patch in [0.1, 1]; 0.3*0.3 approx. 0.1
w = random.uniform(0.3 , 1.0)
h = random.uniform(0.3 , 1.0)
if w/h < 0.5 or w/h > 2:
continue
# left 0 ~ wtot - w, top 0 ~ htot - h
left = random.uniform(0, 1.0 - w)
top = random.uniform(0, 1.0 - h)
right = left + w
bottom = top + h
ious = calc_iou_tensor(bboxes, torch.tensor([[left, top, right, bottom]]))
# tailor all the bboxes and return
if not ((ious > min_iou) & (ious < max_iou)).all():
continue
# discard any bboxes whose center not in the cropped image
xc = 0.5*(bboxes[:, 0] + bboxes[:, 2])
yc = 0.5*(bboxes[:, 1] + bboxes[:, 3])
masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom)
# if no such boxes, continue searching again
if not masks.any():
continue
bboxes[bboxes[:, 0] < left, 0] = left
bboxes[bboxes[:, 1] < top, 1] = top
bboxes[bboxes[:, 2] > right, 2] = right
bboxes[bboxes[:, 3] > bottom, 3] = bottom
bboxes = bboxes[masks, :]
labels = labels[masks]
left_idx = int(left*wtot)
top_idx = int(top*htot)
right_idx = int(right*wtot)
bottom_idx = int(bottom*htot)
img = img.crop((left_idx, top_idx, right_idx, bottom_idx))
bboxes[:, 0] = (bboxes[:, 0] - left)/w
bboxes[:, 1] = (bboxes[:, 1] - top)/h
bboxes[:, 2] = (bboxes[:, 2] - left)/w
bboxes[:, 3] = (bboxes[:, 3] - top)/h
htot = bottom_idx - top_idx
wtot = right_idx - left_idx
return img, (htot, wtot), bboxes, labels
class RandomHorizontalFlip(object):
def __init__(self, p=0.5):
self.p = p
def __call__(self, image, bboxes):
if random.random() < self.p:
bboxes[:, 0], bboxes[:, 2] = 1.0 - bboxes[:, 2], 1.0 - bboxes[:, 0]
return image.transpose(Image.FLIP_LEFT_RIGHT), bboxes
return image, bboxes
# Do data augmentation
class SSDTransformer(object):
""" SSD Data Augmentation, according to the original paper
Composed by several steps:
Cropping
Resize
Flipping
Jittering
"""
def __init__(self, dboxes, size = (300, 300), val=False):
# define vgg16 mean
self.size = size
self.val = val
self.dboxes_ = dboxes #DefaultBoxes300()
self.encoder = Encoder(self.dboxes_)
self.crop = SSDCropping()
self.img_trans = transforms.Compose([
transforms.Resize(self.size),
transforms.ColorJitter(brightness=0.125, contrast=0.5,
saturation=0.5, hue=0.05
),
transforms.ToTensor()
])
self.hflip = RandomHorizontalFlip()
# All Pytorch Tensor will be normalized
# https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683
self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
self.trans_val = transforms.Compose([
transforms.Resize(self.size),
transforms.ToTensor(),
#ToTensor(),
self.normalize,])
@property
def dboxes(self):
return self.dboxes_
def __call__(self, img, img_size, bbox=None, label=None, max_num=200):
#img = torch.tensor(img)
if self.val:
bbox_out = torch.zeros(max_num, 4)
label_out = torch.zeros(max_num, dtype=torch.long)
bbox_out[:bbox.size(0), :] = bbox
label_out[:label.size(0)] = label
return self.trans_val(img), img_size, bbox_out, label_out
img, img_size, bbox, label = self.crop(img, img_size, bbox, label)
img, bbox = self.hflip(img, bbox)
img = self.img_trans(img).contiguous()
img = self.normalize(img)
bbox, label = self.encoder.encode(bbox, label)
return img, img_size, bbox, label
# Implement a datareader for COCO dataset
class COCODetection(data.Dataset):
def __init__(self, img_folder, annotate_file, transform=None):
self.img_folder = img_folder
self.annotate_file = annotate_file
# Start processing annotation
with open(annotate_file) as fin:
self.data = json.load(fin)
self.images = {}
self.label_map = {}
self.label_info = {}
start_time = time.time()
# 0 stands for the background
cnt = 0
self.label_info[cnt] = "background"
for cat in self.data["categories"]:
cnt += 1
self.label_map[cat["id"]] = cnt
self.label_info[cnt] = cat["name"]
# build an index of images
for img in self.data["images"]:
img_id = img["id"]
img_name = img["file_name"]
img_size = (img["height"],img["width"])
if img_id in self.images: raise Exception("dulpicated image record")
self.images[img_id] = (img_name, img_size, [])
# read bboxes
for bboxes in self.data["annotations"]:
img_id = bboxes["image_id"]
category_id = bboxes["category_id"]
bbox = bboxes["bbox"]
bbox_label = self.label_map[bboxes["category_id"]]
self.images[img_id][2].append((bbox, bbox_label))
for k, v in list(self.images.items()):
if len(v[2]) == 0:
self.images.pop(k)
self.img_keys = list(self.images.keys())
self.transform = transform
@property
def labelnum(self):
return len(self.label_info)
@staticmethod
def load(pklfile):
with bz2.open(pklfile, "rb") as fin:
ret = pickle.load(fin)
return ret
def save(self, pklfile):
with bz2.open(pklfile, "wb") as fout:
pickle.dump(self, fout)
def __len__(self):
return len(self.images)
def __getitem__(self, idx):
img_id = self.img_keys[idx]
img_data = self.images[img_id]
fn = img_data[0]
img_path = os.path.join(self.img_folder, fn)
img = Image.open(img_path).convert("RGB")
htot, wtot = img_data[1]
bbox_sizes = []
bbox_labels = []
#for (xc, yc, w, h), bbox_label in img_data[2]:
for (l,t,w,h), bbox_label in img_data[2]:
r = l + w
b = t + h
#l, t, r, b = xc - 0.5*w, yc - 0.5*h, xc + 0.5*w, yc + 0.5*h
bbox_size = (l/wtot, t/htot, r/wtot, b/htot)
bbox_sizes.append(bbox_size)
bbox_labels.append(bbox_label)
bbox_sizes = torch.tensor(bbox_sizes)
bbox_labels = torch.tensor(bbox_labels)
if self.transform != None:
img, (htot, wtot), bbox_sizes, bbox_labels = \
self.transform(img, (htot, wtot), bbox_sizes, bbox_labels)
else:
pass
return img, img_id, (htot, wtot), bbox_sizes, bbox_labels
def draw_patches(img, bboxes, labels, order="xywh", label_map={}):
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Suppose bboxes in fractional coordinate:
# cx, cy, w, h
# img = img.numpy()
img = np.array(img)
labels = np.array(labels)
bboxes = bboxes.numpy()
if label_map:
labels = [label_map.get(l) for l in labels]
if order == "ltrb":
xmin, ymin, xmax, ymax = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
cx, cy, w, h = (xmin + xmax)/2, (ymin + ymax)/2, xmax - xmin, ymax - ymin
else:
cx, cy, w, h = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
htot, wtot,_ = img.shape
cx *= wtot
cy *= htot
w *= wtot
h *= htot
bboxes = zip(cx, cy, w, h)
plt.imshow(img)
ax = plt.gca()
for (cx, cy, w, h), label in zip(bboxes, labels):
if label == "background": continue
ax.add_patch(patches.Rectangle((cx-0.5*w, cy-0.5*h),
w, h, fill=False, color="r"))
bbox_props = dict(boxstyle="round", fc="y", ec="0.5", alpha=0.3)
ax.text(cx-0.5*w, cy-0.5*h, label, ha="center", va="center", size=15, bbox=bbox_props)
plt.show()
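
A short sketch of the default-box and encoder utilities above; the two ground-truth boxes and labels are invented, in fractional ltrb coordinates.

```
import torch

dboxes = dboxes300_coco()
print(dboxes(order="ltrb").shape)            # torch.Size([8732, 4])

encoder = Encoder(dboxes)
gt_boxes = torch.tensor([[0.10, 0.10, 0.40, 0.50],
                         [0.55, 0.20, 0.90, 0.80]])   # ltrb, fractional coordinates
gt_labels = torch.tensor([1, 18])
bboxes_out, labels_out = encoder.encode(gt_boxes, gt_labels, criteria=0.5)
print(bboxes_out.shape, labels_out.shape)    # torch.Size([8732, 4]) torch.Size([8732])
```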

View file

@ -1,25 +0,0 @@
# Basic Multiprocess Example based on the MNIST example
This example is based on [PyTorch's MNIST Example](https://github.com/pytorch/examples/tree/master/mnist).
This example demonstrates how to modify a network to use a basic but effective distributed data parallel module. This parallel method is designed to make multi-GPU runs on a single node easy. It was created because the parallel methods currently integrated into PyTorch can induce significant overhead due to the Python GIL. This method reduces the influence of those overheads and can provide a performance benefit, especially for networks with a significant number of fast-running operations.
## Getting started
Prior to running please run
```pip install -r requirements.txt```
and start a single-process run to allow the dataset to be downloaded (this will not work properly in multi-GPU; you can stop this job as soon as it starts iterating).
```python main.py```
You can now the code multi-gpu with
```python -m multiproc main.py ...```
adding any normal option you'd like.
## Converting your own model
To understand how to convert your own model to use the distributed module included, please see all sections of main.py within ```#=====START: ADDED FOR DISTRIBUTED======``` and ```#=====END: ADDED FOR DISTRIBUTED======``` flags.
Copy the distributed.py and multiproc.py files from here to your local workspace.
## Requirements
PyTorch master branch built from source. This is required in order to use NCCL as the distributed backend.
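For reference, a condensed sketch of the conversion described above (it mirrors the flagged sections in `main.py`; `args`, `Net`, and `train_dataset` stand in for your own script's objects):
```
import torch
import torch.utils.data
import torch.utils.data.distributed
import torch.distributed as dist
from distributed import DistributedDataParallel as DDP  # distributed.py from this example

# One process per GPU: `python -m multiproc main.py` fills in --rank and --world-size.
torch.cuda.set_device(args.rank % torch.cuda.device_count())
dist.init_process_group(args.dist_backend, init_method=args.dist_url,
                        world_size=args.world_size)

model = Net().cuda()   # move the model to its GPU first...
model = DDP(model)     # ...then wrap it; gradients are allreduced during backward()

train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(
    train_dataset, sampler=train_sampler, batch_size=args.batch_size,
    shuffle=(train_sampler is None))
```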

View file

@ -1,183 +0,0 @@
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
import torch.distributed as dist
from torch.nn.modules import Module
from torch.autograd import Variable
'''
This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
launcher included with this example. It assumes that your run is using multiprocess with 1
GPU per process, that the model is on the correct device, and that torch.cuda.set_device has been
used to set the device.
Parameters are broadcast to the other processes on initialization of DistributedDataParallel,
and will be allreduced at the end of the backward pass.
'''
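# Typical usage (sketch): initialize torch.distributed, move the model to its GPU, then wrap it
# once and train as usual; gradients are allreduced automatically at the end of backward():
#     model = DistributedDataParallel(model)
#     output = model(data)
#     loss.backward()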
def flat_dist_call(tensors, call, extra_args=None):
flat_dist_call.warn_on_half = True
buckets = {}
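# Group the tensors into buckets by type so each flatten/collective call only touches tensors of a single dtype.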
for tensor in tensors:
tp = tensor.type()
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(tensor)
if flat_dist_call.warn_on_half:
if torch.cuda.HalfTensor in buckets:
print("WARNING: gloo dist backend for half parameters may be extremely slow." +
" It is recommended to use the NCCL backend in this case.")
flat_dist_call.warn_on_half = False
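# Flatten each bucket, run the collective once per bucket, scale by 1/world_size, and copy the results back into the original tensors.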
for tp in buckets:
bucket = buckets[tp]
coalesced = _flatten_dense_tensors(bucket)
if extra_args is not None:
call(coalesced, *extra_args)
else:
call(coalesced)
coalesced /= dist.get_world_size()
for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
buf.copy_(synced)
class DistributedDataParallel(Module):
def __init__(self, module, message_size=10000000):
super(DistributedDataParallel, self).__init__()
self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
self.message_size = message_size
#reference to last iterations parameters to see if anything has changed
self.param_refs = []
self.reduction_stream = torch.cuda.Stream()
self.module = module
self.param_list = list(self.module.parameters())
if dist._backend == dist.dist_backend.NCCL:
for param in self.param_list:
assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."
self.record = []
self.create_hooks()
flat_dist_call([param.data for param in self.module.parameters()], dist.broadcast, (0,) )
def create_hooks(self):
#all reduce gradient hook
def allreduce_params():
if(self.needs_reduction):
self.needs_reduction = False
self.needs_refresh = False
else:
return
grads = [param.grad.data for param in self.module.parameters() if param.grad is not None]
flat_dist_call(grads, dist.all_reduce)
t_record = torch.cuda.IntTensor(self.record)
dist.broadcast(t_record, 0)
self.record = [int(entry) for entry in t_record]
def flush_buckets():
if not self.needs_reduction:
return
self.needs_reduction = False
ready = []
for i in range(len(self.param_state)):
if self.param_state[i] == 1:
param = self.param_list[self.record[i]]
if param.grad is not None:
ready.append(param.grad.data)
if(len(ready)>0):
orig_stream = torch.cuda.current_stream()
with torch.cuda.stream(self.reduction_stream):
self.reduction_stream.wait_stream(orig_stream)
flat_dist_call(ready, dist.all_reduce)
torch.cuda.current_stream().wait_stream(self.reduction_stream)
for param_i, param in enumerate(list(self.module.parameters())):
def wrapper(param_i):
def allreduce_hook(*unused):
if self.needs_refresh:
self.record.append(param_i)
Variable._execution_engine.queue_callback(allreduce_params)
else:
Variable._execution_engine.queue_callback(flush_buckets)
self.param_state[self.record.index(param_i)] = 1
self.comm_ready_buckets()
if param.requires_grad:
param.register_hook(allreduce_hook)
wrapper(param_i)
def comm_ready_buckets(self):
ready = []
counter = 0
while counter < len(self.param_state) and self.param_state[counter] == 2:
counter += 1
while counter < len(self.param_state) and self.param_state[counter] == 1:
ready.append(counter)
counter += 1
if not ready:
return
grads = []
for ind in ready:
param_ind = self.record[ind]
if self.param_list[param_ind].grad is not None:
grads.append(self.param_list[param_ind].grad.data)
bucket = []
bucket_inds = []
while grads:
bucket.append(grads.pop(0))
bucket_inds.append(ready.pop(0))
cumm_size = 0
for ten in bucket:
cumm_size += ten.numel()
if cumm_size < self.message_size:
continue
evt = torch.cuda.Event()
evt.record(torch.cuda.current_stream())
evt.wait(stream=self.reduction_stream)
with torch.cuda.stream(self.reduction_stream):
flat_dist_call(bucket, dist.all_reduce)
for ind in bucket_inds:
self.param_state[ind] = 2
def forward(self, *inputs, **kwargs):
param_list = [param for param in list(self.module.parameters()) if param.requires_grad]
self.needs_refresh = True if not self.param_refs else any(
[param1 is not param2 for param1, param2 in zip(param_list, self.param_refs)]
)
if self.needs_refresh:
self.record = []
self.param_state = [0 for i in range(len(param_list))]
self.param_refs = param_list
self.needs_reduction = True
return self.module(*inputs, **kwargs)

View file

@ -1,196 +0,0 @@
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
#=====START: ADDED FOR DISTRIBUTED======
'''Add custom module for distributed'''
from distributed import DistributedDataParallel as DDP
'''Import distributed data loader'''
import torch.utils.data
import torch.utils.data.distributed
'''Import torch.distributed'''
import torch.distributed as dist
#=====END: ADDED FOR DISTRIBUTED======
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
help='how many batches to wait before logging training status')
#======START: ADDED FOR DISTRIBUTED======
'''
Add some distributed options. For explanation of dist-url and dist-backend please see
http://pytorch.org/tutorials/intermediate/dist_tuto.html
--world-size and --rank are required parameters as they will be used by the multiproc.py launcher
but do not have to be set explicitly.
'''
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--world-size', default=1, type=int,
help='Number of GPUs to use. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
parser.add_argument('--rank', default=0, type=int,
help='Used for multi-process training. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
#=====END: ADDED FOR DISTRIBUTED======
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
#======START: ADDED FOR DISTRIBUTED======
'''Add a convenience flag to see if we are running distributed'''
args.distributed = args.world_size > 1
'''Check that we are running with cuda, as distributed is only supported for cuda.'''
if args.distributed:
assert args.cuda, "Distributed mode requires running with CUDA."
if args.distributed:
'''
Set cuda device so everything is done on the right GPU.
THIS MUST BE DONE AS SOON AS POSSIBLE.
'''
torch.cuda.set_device(args.rank % torch.cuda.device_count())
'''Initialize distributed communication'''
dist.init_process_group(args.dist_backend, init_method=args.dist_url,
world_size=args.world_size)
#=====END: ADDED FOR DISTRIBUTED======
torch.manual_seed(args.seed)
if args.cuda:
torch.cuda.manual_seed(args.seed)
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
#=====START: ADDED FOR DISTRIBUTED======
'''
Change sampler to distributed if running distributed.
Shuffle data loader only if distributed.
'''
train_dataset = datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_dataset, sampler=train_sampler,
batch_size=args.batch_size, shuffle=(train_sampler is None), **kwargs
)
#=====END: ADDED FOR DISTRIBUTED======
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.conv2_drop = nn.Dropout2d()
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x)
model = Net()
if args.cuda:
model.cuda()
#=====START: ADDED FOR DISTRIBUTED======
'''
Wrap model in our version of DistributedDataParallel.
This must be done AFTER the model is converted to cuda.
'''
if args.distributed:
model = DDP(model)
#=====END: ADDED FOR DISTRIBUTED======
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
def train(epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data), Variable(target)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.data[0]))
def test():
model.eval()
test_loss = 0
correct = 0
for data, target in test_loader:
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data, volatile=True), Variable(target)
output = model(data)
test_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
correct += pred.eq(target.data.view_as(pred)).cpu().sum()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
for epoch in range(1, args.epochs + 1):
train(epoch)
test()

View file

@ -1,28 +0,0 @@
import torch
import sys
import subprocess
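# Launch one copy of the given script per visible GPU, filling in --world-size and --rank for each
# child process; stdout of every worker except rank 0 is redirected to a per-GPU log file.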
argslist = list(sys.argv)[1:]
world_size = torch.cuda.device_count()
if '--world-size' in argslist:
argslist[argslist.index('--world-size')+1] = str(world_size)
else:
argslist.append('--world-size')
argslist.append(str(world_size))
workers = []
for i in range(world_size):
if '--rank' in argslist:
argslist[argslist.index('--rank')+1] = str(i)
else:
argslist.append('--rank')
argslist.append(str(i))
stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w")
print(argslist)
p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
workers.append(p)
for p in workers:
p.wait()

View file

@ -1,2 +0,0 @@
torch
torchvision

View file

@ -1 +0,0 @@
python -m multiproc main.py

View file

@ -1,58 +0,0 @@
# Word-level language modeling RNN
This example is based on [PyTorch's Word-level language modeling RNN Example](https://github.com/pytorch/examples/tree/master/word_language_model).
This example trains a multi-layer RNN (Elman, GRU, or LSTM) on a language modeling task.
By default, the training script uses the Wikitext-2 dataset, which is provided with the example.
The trained model can then be used by the generate script to generate new text.
```bash
python main.py --cuda --epochs 6 # Train a LSTM on Wikitext-2 with CUDA, reaching perplexity of 117.61
python main.py --cuda --epochs 6 --tied # Train a tied LSTM on Wikitext-2 with CUDA, reaching perplexity of 110.44
python main.py --cuda --tied # Train a tied LSTM on Wikitext-2 with CUDA for 40 epochs, reaching perplexity of 87.17
python generate.py # Generate samples from the trained LSTM model.
```
The model uses the `nn.RNN` module (and its sister modules `nn.GRU` and `nn.LSTM`)
which will automatically use the cuDNN backend if run on CUDA with cuDNN installed.
During training, if a keyboard interrupt (Ctrl-C) is received,
training is stopped and the current model is evaluated against the test dataset.
The `main.py` script accepts the following arguments:
```bash
optional arguments:
-h, --help show this help message and exit
--data DATA location of the data corpus
--model MODEL type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)
--emsize EMSIZE size of word embeddings
--nhid NHID number of hidden units per layer
--nlayers NLAYERS number of layers
--lr LR initial learning rate
--clip CLIP gradient clipping
--epochs EPOCHS upper epoch limit
--batch-size N batch size
--bptt BPTT sequence length
--dropout DROPOUT dropout applied to layers (0 = no dropout)
--decay DECAY learning rate decay per epoch
--tied tie the word embedding and softmax weights
--seed SEED random seed
--cuda use CUDA
--log-interval N report interval
--save SAVE path to save the final model
```
With these arguments, a variety of models can be tested.
As an example, the following arguments produce slower but better models:
```bash
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 # Test perplexity of 80.97
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 --tied # Test perplexity of 75.96
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 # Test perplexity of 77.42
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied # Test perplexity of 72.30
```
Perplexities on PTB are equal to or better than
[Recurrent Neural Network Regularization (Zaremba et al. 2014)](https://arxiv.org/pdf/1409.2329.pdf)
and are similar to [Using the Output Embedding to Improve Language Models (Press & Wolf 2016)](https://arxiv.org/abs/1608.05859) and [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling (Inan et al. 2016)](https://arxiv.org/pdf/1611.01462.pdf), though both of these papers have improved perplexities by using a form of recurrent dropout ([variational dropout](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks)).
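A saved checkpoint can also be loaded directly in Python, the same way `generate.py` does. A minimal sketch, assuming the default `model.pt` filename produced by `--save`:
```python
import torch

# main.py saves the full model object with torch.save, so model.py must be importable here.
with open('model.pt', 'rb') as f:
    model = torch.load(f)
model.eval()  # disable dropout before evaluation or generation
```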

View file

@ -1,49 +0,0 @@
import os
import torch
class Dictionary(object):
def __init__(self):
self.word2idx = {}
self.idx2word = []
def add_word(self, word):
if word not in self.word2idx:
self.idx2word.append(word)
self.word2idx[word] = len(self.idx2word) - 1
return self.word2idx[word]
def __len__(self):
return len(self.idx2word)
class Corpus(object):
def __init__(self, path):
self.dictionary = Dictionary()
self.train = self.tokenize(os.path.join(path, 'train.txt'))
self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
self.test = self.tokenize(os.path.join(path, 'test.txt'))
def tokenize(self, path):
"""Tokenizes a text file."""
assert os.path.exists(path)
# Add words to the dictionary
with open(path, 'r') as f:
tokens = 0
for line in f:
words = line.split() + ['<eos>']
tokens += len(words)
for word in words:
self.dictionary.add_word(word)
# Tokenize file content
with open(path, 'r') as f:
ids = torch.LongTensor(tokens)
token = 0
for line in f:
words = line.split() + ['<eos>']
for word in words:
ids[token] = self.dictionary.word2idx[word]
token += 1
return ids

View file

@ -1,3 +0,0 @@
This is raw data from the wikitext-2 dataset.
See https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,36 +0,0 @@
import torch
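# Helpers for pseudo-fp16 training (see main.py): the model stores fp16 parameters while an fp32
# "master" copy is kept for the optimizer update; gradients are cast to fp32 before the update and
# the updated weights are copied back into the fp16 model.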
def params_to_type(params, totype):
new_params = []
for param in params:
new_params.append(param.type(totype))
return new_params
def params_to_16(params):
return params_to_type(params, torch.cuda.HalfTensor)
def params_to_32(params):
return params_to_type(params, torch.cuda.FloatTensor)
def clone_params(net):
new_params = []
for param in list(net.parameters()):
new_params.append(param.data.clone())
return new_params
def clone_grads(net):
new_params = []
for param in list(net.parameters()):
new_params.append(param.grad.data.clone())
return new_params
def copy_in_params(net, params):
net_params = list(net.parameters())
for i in range(len(params)):
net_params[i].data.copy_(params[i])

View file

@ -1,74 +0,0 @@
###############################################################################
# Language Modeling on Penn Tree Bank
#
# This file generates new sentences sampled from the language model
#
###############################################################################
import argparse
import torch
from torch.autograd import Variable
import data
parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Language Model')
# Model parameters.
parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--checkpoint', type=str, default='./model.pt',
help='model checkpoint to use')
parser.add_argument('--outf', type=str, default='generated.txt',
help='output file for generated text')
parser.add_argument('--words', type=int, default='1000',
help='number of words to generate')
parser.add_argument('--seed', type=int, default=1111,
help='random seed')
parser.add_argument('--cuda', action='store_true',
help='use CUDA')
parser.add_argument('--temperature', type=float, default=1.0,
help='temperature - higher will increase diversity')
parser.add_argument('--log-interval', type=int, default=100,
help='reporting interval')
args = parser.parse_args()
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
if not args.cuda:
print("WARNING: You have a CUDA device, so you should probably run with --cuda")
else:
torch.cuda.manual_seed(args.seed)
if args.temperature < 1e-3:
parser.error("--temperature has to be greater or equal 1e-3")
with open(args.checkpoint, 'rb') as f:
model = torch.load(f)
model.eval()
if args.cuda:
model.cuda()
else:
model.cpu()
corpus = data.Corpus(args.data)
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
if args.cuda:
input.data = input.data.cuda()
with open(args.outf, 'w') as outf:
for i in range(args.words):
output, hidden = model(input, hidden)
word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
word_idx = torch.multinomial(word_weights, 1)[0]
input.data.fill_(word_idx)
word = corpus.dictionary.idx2word[word_idx]
outf.write(word + ('\n' if i % 20 == 19 else ' '))
if i % args.log_interval == 0:
print('| Generated {}/{} words'.format(i, args.words))

View file

@ -1,236 +0,0 @@
# coding: utf-8
import argparse
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from fp16util import *
import data
import model
parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM Language Model')
parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
help='number of layers')
parser.add_argument('--lr', type=float, default=20,
help='initial learning rate')
parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
help='batch size')
parser.add_argument('--bptt', type=int, default=35,
help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
help='tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
help='random seed')
parser.add_argument('--cuda', action='store_true',
help='use CUDA')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
help='report interval')
parser.add_argument('--save', type=str, default='model.pt',
help='path to save the final model')
parser.add_argument('--fp16', action='store_true',
help='Run model in pseudo-fp16 mode (fp16 storage fp32 math).')
parser.add_argument('--loss_scale', type=float, default=1,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
args = parser.parse_args()
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
if not args.cuda:
print("WARNING: You have a CUDA device, so you should probably run with --cuda")
else:
torch.cuda.manual_seed(args.seed)
if args.fp16 and not args.cuda:
print("WARNING: --fp16 requires --cuda, ignoring --fp16 option")
###############################################################################
# Load data
###############################################################################
corpus = data.Corpus(args.data)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.
def batchify(data, bsz):
# Work out how cleanly we can divide the dataset into bsz parts.
nbatch = data.size(0) // bsz
# Trim off any extra elements that wouldn't cleanly fit (remainders).
data = data.narrow(0, 0, nbatch * bsz)
# Evenly divide the data across the bsz batches.
data = data.view(bsz, -1).t().contiguous()
if args.cuda:
data = data.cuda()
return data
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
# Build the model
###############################################################################
ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
model.type(torch.cuda.HalfTensor)
param_copy = params_to_32(clone_params(model))
elif args.cuda:
model.cuda()
criterion = nn.CrossEntropyLoss()
###############################################################################
# Training code
###############################################################################
def repackage_hidden(h):
"""Wraps hidden states in new Variables, to detach them from their history."""
if type(h) == Variable:
return Variable(h.data)
else:
return tuple(repackage_hidden(v) for v in h)
# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.
def get_batch(source, i, evaluation=False):
seq_len = min(args.bptt, len(source) - 1 - i)
data = Variable(source[i:i+seq_len], volatile=evaluation)
target = Variable(source[i+1:i+1+seq_len].view(-1))
return data, target
def evaluate(data_source):
# Turn on evaluation mode which disables dropout.
model.eval()
total_loss = 0
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(eval_batch_size)
for i in range(0, data_source.size(0) - 1, args.bptt):
data, targets = get_batch(data_source, i, evaluation=True)
output, hidden = model(data, hidden)
output_flat = output.view(-1, ntokens)
#total loss can overflow if accumulated in fp16.
total_loss += len(data) * criterion(output_flat, targets).data.float()
hidden = repackage_hidden(hidden)
return total_loss[0] / len(data_source)
def train():
# Turn on training mode which enables dropout.
model.train()
total_loss = 0
start_time = time.time()
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(args.batch_size)
for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
data, targets = get_batch(train_data, i)
# Starting each batch, we detach the hidden state from how it was previously produced.
# If we didn't, the model would try backpropagating all the way to start of the dataset.
hidden = repackage_hidden(hidden)
model.zero_grad()
output, hidden = model(data, hidden)
loss = criterion(output.view(-1, ntokens), targets)
loss = loss * args.loss_scale
loss.backward()
loss = loss / args.loss_scale
# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
if args.fp16 and args.cuda:
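# fp16 path: cast the (loss-scaled) fp16 gradients to fp32, apply the SGD update to the fp32
# master weights with the loss scale divided out, then copy the updated weights back into the
# fp16 model.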
grad = params_to_32(clone_grads(model))
for i, _ in enumerate(param_copy):
param_copy[i] = param_copy[i] - grad[i] * (lr/args.loss_scale)
copy_in_params(model, params_to_16(param_copy))
else:
for p in model.parameters():
p.data.add_(-lr/args.loss_scale, p.grad.data)
total_loss += loss.data
if batch % args.log_interval == 0 and batch > 0:
cur_loss = total_loss[0] / args.log_interval
elapsed = time.time() - start_time
print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
'loss {:5.2f} | ppl {:8.2f}'.format(
epoch, batch, len(train_data) // args.bptt, lr,
elapsed * 1000 / args.log_interval, cur_loss, math.exp(min(cur_loss, 20))))
total_loss = 0
start_time = time.time()
# Loop over epochs.
lr = args.lr
best_val_loss = None
# At any point you can hit Ctrl + C to break out of training early.
try:
for epoch in range(1, args.epochs+1):
epoch_start_time = time.time()
train()
val_loss = evaluate(val_data)
print('-' * 89)
print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
val_loss, math.exp(min(val_loss, 20))))
print('-' * 89)
# Save the model if the validation loss is the best we've seen so far.
if not best_val_loss or val_loss < best_val_loss:
with open(args.save, 'wb') as f:
torch.save(model, f)
best_val_loss = val_loss
else:
# Anneal the learning rate if no improvement has been seen in the validation dataset.
lr /= 4.0
except KeyboardInterrupt:
print('-' * 89)
print('Exiting from training early')
# Load the best saved model.
with open(args.save, 'rb') as f:
model = torch.load(f)
# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
test_loss, math.exp(test_loss)))
print('=' * 89)

View file

@ -1,59 +0,0 @@
import torch.nn as nn
from torch.autograd import Variable
class RNNModel(nn.Module):
"""Container module with an encoder, a recurrent module, and a decoder."""
def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
super(RNNModel, self).__init__()
self.drop = nn.Dropout(dropout)
self.encoder = nn.Embedding(ntoken, ninp)
if rnn_type in ['LSTM', 'GRU']:
self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
else:
try:
nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
except KeyError:
raise ValueError("""An invalid option for `--model` was supplied,
options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
self.decoder = nn.Linear(nhid, ntoken)
# Optionally tie weights as in:
# "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
# https://arxiv.org/abs/1608.05859
# and
# "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
# https://arxiv.org/abs/1611.01462
if tie_weights:
if nhid != ninp:
raise ValueError('When using the tied flag, nhid must be equal to emsize')
self.decoder.weight = self.encoder.weight
self.init_weights()
self.rnn_type = rnn_type
self.nhid = nhid
self.nlayers = nlayers
def init_weights(self):
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
self.decoder.bias.data.fill_(0)
self.decoder.weight.data.uniform_(-initrange, initrange)
def forward(self, input, hidden):
emb = self.drop(self.encoder(input))
output, hidden = self.rnn(emb, hidden)
output = self.drop(output)
decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
def init_hidden(self, bsz):
weight = next(self.parameters()).data
if self.rnn_type == 'LSTM':
return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
else:
return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())

View file

@ -1 +0,0 @@
torch

View file

@ -0,0 +1,4 @@
MaskRCNN PyTorch
This repository includes software from https://github.com/facebookresearch/maskrcnn-benchmark
licensed under the MIT License.

View file

@ -0,0 +1,480 @@
# Mask R-CNN For PyTorch
This repository provides a script and recipe to train and run inference on Mask R-CNN to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [The model](#the-model)
* [Default configuration](#default-configuration)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick start guide](#quick-start-guide)
* [Details](#details)
* [Command line arguments](#command-line-arguments)
* [Getting the data](#getting-the-data)
* [Training process](#training-process)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Benchmarking](#benchmarking)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
* [Inference performance results](#inference-performance-results)
* [NVIDIA DGX-1 16G (1x V100 16G)](#nvidia-dgx-1-16g-1x-v100-16g)
* [NVIDIA DGX-1 32G (1x V100 32G)](#nvidia-dgx-1-32g-1x-v100-32g)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## The model
Mask R-CNN is a convolution-based neural network for the task of object instance segmentation. The paper describing the model can be found [here](https://arxiv.org/abs/1703.06870). NVIDIA's Mask R-CNN 19.2 is an optimized version of [Facebook's implementation](https://github.com/facebookresearch/maskrcnn-benchmark), leveraging mixed precision arithmetic and Tensor Cores on V100 GPUs for 1.3x faster training times while maintaining target accuracy. Because this model trains with mixed precision Tensor Cores on Volta, researchers can get results much faster than training without Tensor Cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
The repository also contains scripts to interactively launch training, benchmarking and inference routines in a Docker container.
The major differences between the official implementation of the paper and our version of Mask R-CNN are as follows:
- Mixed precision support with [PyTorch AMP](https://github.com/NVIDIA/apex).
- Gradient accumulation to simulate larger batches.
- Custom fused CUDA kernels for faster computations.
These techniques/optimizations improve model performance and reduce training time by a factor of 1.3x, allowing you to perform more efficient instance segmentation with no additional effort.
Other publicly available implementations of Mask R-CNN include:
- [Matterport](https://github.com/matterport/Mask_RCNN)
- [Tensorpack](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN)
- [Google's TensorFlow model](https://github.com/tensorflow/models/tree/master/research/object_detection)
### Default Configuration
The default configuration of this model can be found at `pytorch/maskrcnn_benchmark/config/defaults.py`. The default hyper-parameters are as follows:
- General:
- Base Learning Rate set to 0.001
- Global batch size set to 16 images
- Steps set to 30000
- Images are resized with aspect ratio maintained and the shorter side length between 800 and 1333 pixels
- Global train batch size - 16
- Global test batch size - 8
- Feature extractor:
- Backbone network set to Resnet50_conv4
- Backbone network weights are frozen after second epoch
- Region Proposal Network (RPN):
- Anchor stride set to 16
- Anchor sizes set to (32, 64, 128, 256, 512)
- Foreground IOU Threshold set to 0.7, Background IOU Threshold set to 0.5
- RPN target fraction of positive proposals set to 0.5
- Train Pre-NMS Top proposals set to 12000
- Train Post-NMS Top proposals set to 2000
- Test Pre-NMS Top proposals set to 6000
- Test Post-NMS Top proposals set to 1000
- RPN NMS Threshold set to 0.7
- RoI heads:
- Foreground threshold set to 0.5
- Batch size per image set to 512
- Positive fraction of batch set to 0.25
This repository implements multi-GPU training and gradient accumulation to support larger batches, as well as mixed precision training. This implementation also includes the following optimizations:
- Target generation - Optimized GPU implementation for generating binary mask ground truths from the list of polygon coordinates that exist in the dataset.
- Custom CUDA kernels for:
- Box Intersection over Union (IoU) computation
- Proposal matcher
- Generate anchor boxes
- Pre NMS box selection - Selection of RoIs based on objectness score before NMS is applied.
The source files can be found under `maskrcnn_benchmark/csrc/cuda`.
## Setup
The following sections list the requirements in order to start training the Mask R-CNN model.
### Requirements
This repository contains a `Dockerfile` which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [PyTorch 19.02-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
- [NVIDIA Volta based GPU](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
- [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
## Quick Start Guide
To train your model using mixed precision with tensor cores or using FP32, perform the following steps using the default parameters of the Mask R-CNN model on the COCO 2014 dataset.
### 1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/PyTorch/Segmentation/MaskRCNN
```
### 2. Download and preprocess the dataset.
This repository provides scripts to download and extract the COCO 2014 dataset. Data will be downloaded to the current working directory on the host and extracted to a user-defined directory.
To download, verify, and extract the COCO dataset, use the following scripts:
```
cd Detectron_PyT
./download_dataset.sh <data/dir>
```
By default, the data is organized into the following structure:
```
<data/dir>
annotations/
instances_train2014.json
instances_val2014.json
train2014/
COCO_train2014_*.jpg
val2014/
COCO_val2014_*.jpg
```
### 3. Build the Mask R-CNN PyTorch NGC container.
```
bash scripts/docker/build.sh
```
### 4. Start an interactive session in the NGC container to run training/inference.
After you build the container image, you can start an interactive CLI session with
```
bash scripts/docker/interactive.sh <path/to/dataset/>
```
The `interactive.sh` script requires that the location on the dataset is specified. For example, `/home/<USER>/Detectron_PyT/detectron/lib/datasets/data/coco`
### 5. Start training.
```
bash scripts/train.sh
```
The `train.sh` script trains a model and performs evaluation on the COCO 2014 dataset. By default, the training script:
- Uses 8 GPUs.
- Saves a checkpoint every 2500 iterations and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
- Mixed precision training with Tensor Cores is invoked by adding `DTYPE \"float16\"` to the end of the above command as shown in the train script. This will override the default `DTYPE` configuration which is float32.
The `scripts/train.sh` script runs the following Python command:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml"
```
### 6. Start validation/evaluation.
```
bash scripts/eval.sh
```
Model evaluation on a checkpoint can be launched by running the `pytorch/scripts/eval.sh` script. The script requires:
- the location of the checkpoint folder to be specified and present within/mounted to the container.
- a text file named `last_checkpoint` which contains the path to the latest checkpoint. This mechanism is required in order to resume training from the latest checkpoint.
- The `last_checkpoint` file is automatically created at the end of the training process.
By default, evaluation is performed on the test dataset once training is complete. To skip evaluation at the end of training, issue the `--skip-test` flag.
Additionally, to perform evaluation after every epoch and terminate training on reaching a minimum required mAP score, set
- `PER_EPOCH_EVAL = True`
- `MIN_BBOX_MAP = <required value>`
- `MIN_MASK_MAP = <required value>`
### 7. Start inference/predictions.
Model predictions can be obtained on a test dataset and a model checkpoint by running the `scripts/inference.sh <config/file/path>` script. The script requires:
- the location of the checkpoint folder and dataset to be specified and present within/mounted to the container.
- a text file named `last_checkpoint` which contains the path to the checkpoint.
For example:
```
bash scripts/inference.sh configs/e2e_mask_rcnn_R_50_FPN_1x.yaml
```
Model predictions get saved in the `<OUTPUT_DIR>/inference` directory.
To perform inference and skip computation of mAP scores, issue the `--skip-eval` flag. Performance is reported in seconds per iteration per GPU. The benchmarking scripts can be used to extract frames per second on training and inference.
## Details
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Command line arguments
You can modify the training behaviour through the various flags in both the `train_net.py` script and through overriding specific parameters in the YAML config files. Flags in the `train_net.py` script are as follows:
`--config-file` - path to the config file containing model parameters
`--skip-test` - skips model testing after training
`--skip-eval` - skips computation of mAP scores
`--opts` - allows you to override specific parameters in the config file
For example:
```
python -m torch.distributed.launch --nproc_per_node=2 tools/train_net.py \
--config-file configs/e2e_faster_rcnn_R_50_FPN_1x.yaml \
--skip-eval \
DTYPE "float16" \
OUTPUT_DIR RESULTS \
SOLVER.BASE_LR 0.002 \
SOLVER.STEPS "(360000, 480000)"
```
### Getting the data
The Mask R-CNN model was trained on the [COCO 2014](http://cocodataset.org/#download) dataset. This dataset comes with a training and validation set.
This repository contains the `./download_dataset.sh`,`./verify_dataset.sh`, and `./extract_dataset.sh` scripts which automatically download and preprocess the training and validation sets.
In order to run on your own dataset, ensure your dataset is present/mounted to the Docker container with the following hierarchy:
```
my_dataset/
images_train/
images_val/
instances_train.json
instances_val.json
```
and add it to the `DATASETS` dictionary in `maskrcnn_benchmark/config/paths_catalog.py`:
```
DATASETS = {
"my_dataset_train": {
"img_dir": "data/images_train",
"ann_file": "data/instances_train.json"
},
"my_dataset_val": {
"img_dir": "data/images_val",
"ann_file": "data/instances_val.json"
},
}
```
Training on the custom dataset can then be launched with a command such as:
```
python -m torch.distributed.launch --nproc_per_node=<NUM_GPUS> tools/train_net.py \
--config-file <CONFIG> \
DATASETS.TRAIN "(\"my_dataset_train\")"\
DATASETS.TEST "(\"my_dataset_val\")"\
DTYPE "float16" \
OUTPUT_DIR <RESULTS> \
| tee <LOGFILE>
```
### Training Process
Training is performed using the `tools/train_net.py` script along with parameters defined in the config file. The default config files can be found in the `pytorch/configs/` directory.
The `e2e_mask_rcnn_R_50_FPN_1x.yaml` file was used to gather accuracy and performance metrics. This configuration sets the following parameters:
- Backbone weights to ResNet-50
- Feature extractor set to ResNet-50 with Feature Pyramid Networks (FPN)
- RPN uses FPN
- RoI Heads use FPN
- Dataset - COCO 2014
- Base Learning Rate - 0.02
- Global train batch size - 16
- Global test batch size - 8
- RPN batch size - 256
- ROI batch size - 512
- Solver steps - (60000, 80000)
- Max iterations - 90000
- Warmup iterations - 500
- Warmup factor = 0.33
- Initial learning rate = Base Learning Rate x Warmup factor
The default feature extractor can be changed by setting `CONV_BODY` parameter in `yaml` file to any of the following:
- R-50-C4
- R-50-C5
- R-101-C4
- R-101-C5
- R-101-FPN
The default backbone can be changed to a flavor of Resnet-50 or ResNet-101 by setting `WEIGHT` parameter in `yaml` file to any of the following:
- "catalog://ImageNetPretrained/MSRA/R-50-GN"
- "catalog://ImageNetPretrained/MSRA/R-101"
- "catalog://ImageNetPretrained/MSRA/R-101-GN"
This script outputs results to the current working directory by default. However, this can be changed by adding `OUTPUT_DIR <DIR_NAME>` to the end of the default command. Logs produced during training are also stored in the `OUTPUT_DIR` specified. The training log will contain information about:
- Loss, time per iteration, learning rate and memory metrics
- performance values such as time per step
- test accuracy and test performance values after evaluation
The training logs are located in the `<OUTPUT_DIR>/log` directory. The summary after each training epoch is printed in the following format:
```
INFO:maskrcnn_benchmark.trainer:eta: 4:42:15 iter: 20 loss: 1.8236 (2.7274) loss_box_reg: 0.0249 (0.0620) loss_classifier: 0.6086 (1.2918) loss_mask: 0.6996 (0.8026) loss_objectness: 0.5373 (0.4787) loss_rpn_box_reg: 0.0870 (0.0924) time: 0.2002 (0.3765) data: 0.0099 (0.1242) lr: 0.014347 max mem: 3508
```
The mean and median training losses are reported every 20 steps.
Multi-GPU and multi-node training are enabled with the PyTorch distributed launch module. The following example runs training on 8 GPUs:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file \"configs/e2e_mask_rcnn_R_50_FPN_1x.yaml\"
```
We have tested batch sizes of up to 4 on a 16 GB V100 and up to 16 on a 32 GB V100 with mixed precision. The repository also implements gradient accumulation functionality to simulate bigger batches. The following command can be used to run a batch of 64:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file \"configs/e2e_mask_rcnn_R_50_FPN_1x.yaml\" SOLVER.ACCUMULATE_GRAD True SOLVER.ACCUMULATE_STEPS 4
```
By default, training is performed using FP32, however training time can be reduced using tensor cores and mixed precision. This can be done by adding `DTYPE \"float16\"` to override the respective parameter in the config file.
__Note__: When training a global batch size >= 32, it is recommended to additionally set the following parameters:
- `SOLVER.WARMUP_ITERS 625`
- `SOLVER.WARMUP_FACTOR 0.01`
When experimenting with different global batch sizes for training and inference, make sure `SOLVER.IMS_PER_BATCH` and `TEST.IMS_PER_BATCH` are divisible by the number of GPUs.
#### Other training options
A sample single GPU config is provided under `configs/e2e_mask_rcnn_R_50_FPN_1x_1GPU.yaml`
For multi-gpu runs, `-m torch.distributed.launch --nproc_per_node num_gpus` is added prior to `tools/train_net.py`. For example, for an 8 GPU run:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml"
```
Training is terminated when either the required accuracies specified on the command line are reached or if the number of training iterations specified is reached.
To terminate training on reaching target accuracy on 8 GPUs, run:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" PER_EPOCH_EVAL True MIN_BBOX_MAP 0.377 MIN_MASK_MAP 0.342
```
__Note__: The score is always the Average Precision (AP) at
- IoU = 0.50:0.95
- Area = all (includes small, medium and large)
- maxDets = 100
## Enabling mixed precision
[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP) library from [APEX](https://github.com/NVIDIA/apex), which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied easily by using the `scale_loss()` method provided by AMP. The scaling value can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
For an in-depth walk-through of AMP, check out sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Core performance.
To enable mixed precision, you can:
- Import AMP from APEX, for example:
```
from apex import amp
```
- Initialize an AMP handle, for example:
```
amp_handle = amp.init(enabled=True, verbose=True)
```
- Wrap your optimizer with the AMP handle, for example:
```
optimizer = amp_handle.wrap_optimizer(optimizer)
```
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
- Default backpropagate for FP32:
```
losses.backward()
```
- Scale loss and backpropagate with AMP:
```
with optimizer.scale_loss(losses) as scaled_losses:
scaled_losses.backward()
```
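Putting these pieces together, a minimal training-step sketch might look like the following. The `model`, `optimizer`, and `data_loader` names are placeholders, the per-head losses are summed as described earlier in this README, and the AMP calls are the ones shown in the snippets above.
```
from apex import amp

amp_handle = amp.init(enabled=True, verbose=True)
optimizer = amp_handle.wrap_optimizer(optimizer)

for images, targets in data_loader:
    loss_dict = model(images, targets)                 # per-head losses (box, class, mask, ...)
    losses = sum(loss for loss in loss_dict.values())
    optimizer.zero_grad()
    with optimizer.scale_loss(losses) as scaled_losses:
        scaled_losses.backward()                       # gradients are scaled to preserve small values
    optimizer.step()
```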
For information about:
- how to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training) blog.
## Benchmarking
Benchmarking can be performed for both training and inference. Both scripts run the Mask R-CNN model using the parameters defined in `configs/e2e_mask_rcnn_R_50_FPN_1x.yaml`. You can specify whether benchmarking is performed in FP16 or FP32 by specifying it as an argument to the benchmarking scripts.
Training benchmarking can be performed by running the script:
```
scripts/train_benchmark.sh <float16/float32>
```
Inference benchmarking can be performed by running the script:
```
scripts/inference_benchmark.sh <float16/float32>
```
## Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
### Training Accuracy Results
Our results were obtained by running the `tools/train_net.py` training script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
| **number of GPUs** | **batch size/GPU** | **Training time with FP16(hours)** | **Training time with FP32(hours)** |
| --- | --- | ----- | ----- |
| 8 | 4 | 4.81 | 6.35 |
LOSS CURVE:
![Loss Curve](./img/loss_curve.png)
Here, multihead loss is simply the summation of losses on the mask head and the bounding box head.
ACCURACY CURVE:
![Accuracy Curve](./img/accuracy_curve.png)
#### Training Stability Test
The following tables compare mAP scores across 5 different training runs with different seeds, for both FP16 and FP32 respectively. The runs showcase consistent convergence on all 5 seeds with very little deviation.
| **Config** | **Seed #1** | **Seed #2** | **Seed #3** | **Seed #4** | **Seed #5** | **mean** | **std** |
| --- | --- | ----- | ----- | --- | --- | ----- | ----- |
| 8 GPUs, fp16, final AP BBox | 0.377 | 0.376 | 0.376 | 0.378 | 0.377 | 0.377 | 0.001 |
| 8 GPUs, fp16, final AP Segm | 0.343 | 0.342 | 0.341 | 0.343 | 0.343 | 0.342 | 0.001 |
| **Config** | **Seed #1** | **Seed #2** | **Seed #3** | **Seed #4** | **Seed #5** | **mean** | **std** |
| --- | --- | ----- | ----- | --- | --- | ----- | ----- |
| 8 GPUs, fp32, final AP BBox | 0.377 | 0.377 | 0.376 | 0.378 | 0.378 | 0.377 | 0.001 |
| 8 GPUs, fp32, final AP Segm | 0.344 | 0.342 | 0.343 | 0.343 | 0.343 | 0.342 | 0.001 |
### Training Performance Results
#### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch.
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 2 | 8.47 | 10.77 | 1.27 | 1 | 1 |
| 4 | 2 | 30.23 | 36.88 | 1.22 | 3.67 | 3.53 |
| 8 | 2 | 56.35 | 70.45 | 1.25 | 6.96 | 6.51 |
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 4 | 9.29 | 12.73 | 1.37 | 1 | 1 |
| 4 | 4 | 34.07 | 44.95 | 1.32 | 3.67 | 3.53 |
| 8 | 4 | 62.7 | 82.9 | 1.32 | 6.75 | 6.51 |
To achieve these same results, follow the [Quick start guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch.
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 4 | 9.06 | 13.14 | 1.45 | 1 | 1 |
| 4 | 4 | 32.87 | 50.70 | 1.54 | 3.86 | 3.63 |
| 8 | 4 | 62.93 | 82.30 | 1.31 | 6.94 | 6.26 |
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 8 | 9.35 | 13.05 | 1.40 | 1 | 1 |
| 4 | 8 | 33.38 | 46.69 | 1.40 | 3.57 | 3.57 |
| 8 | 8 | 71.85 | 87.10 | 1.21 | 7.68 | 7.68 |
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 16 | NA | 13.82 | NA | NA | 1 |
| 4 | 16 | NA | 48.41 | NA | NA | 3.50 |
| 8 | 16 | NA | 89.33 | NA | NA | 6.46 |
Note that values for FP32 runs with a batch size of 16 are not available due to out-of-memory errors; a batch size of 16 is only possible with FP16.
To achieve these same results, follow the [Quick start guide](#quick-start-guide) outlined above.
### Inference performance results
#### NVIDIA DGX-1 16G (1x V100 16G)
Our results were obtained by running the `scripts/inference.sh` inference script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 1x V100 16G GPU. Performance numbers (in items/images per second) were averaged over an entire inference run.
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speedup** |
| --- | --- | ----- | ----- | ----- |
| 1 | 8 | 15.3 | 16.94 | 1.107 |
To achieve these same results, follow the [Quick start guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-1 32G (1x V100 32G)
Our results were obtained by running the `scripts/inference.sh <config/file/path>` inference script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 1x V100 32G GPU. Performance numbers (in items/images per second) were averaged over an entire inference run.
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speedup** |
| --- | --- | ----- | ----- | ----- |
| 1 | 8 | 14.43 | 16.33 | 1.13 |
To achieve these same results, follow the [Quick start guide](#quick-start-guide) outlined above.
## Changelog
March 2019
- Initial release
## Known Issues
There are no known issues with this model.

View file

@ -0,0 +1,28 @@
DATA_DIR=$1
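# Download the COCO 2014 images and annotations, verify them against hashes.md5 (expected in the
# current directory), then move and extract them into $1. Note: the target directory must already
# exist, since the mkdir below is commented out.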
wget https://dl.fbaipublicfiles.com/detectron/coco/coco_annotations_minival.tgz
wget http://images.cocodataset.org/zips/train2014.zip
wget http://images.cocodataset.org/zips/val2014.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
if md5sum -c hashes.md5
then
echo "DOWNLOAD PASSED"
# mkdir $DATA_DIR
mv coco_annotations_minival.tgz $DATA_DIR
mv train2014.zip $DATA_DIR
mv val2014.zip $DATA_DIR
mv annotations_trainval2014.zip $DATA_DIR
cd $DATA_DIR
dtrx --one=here coco_annotations_minival.tgz
dtrx --one=here annotations_trainval2014.zip
mv annotations.1/* annotations/
dtrx train2014.zip
dtrx val2014.zip
echo "EXTRACTION COMPLETE"
else
echo "DOWNLOAD FAILED HASHCHECK"
fi

View file

@ -0,0 +1 @@
wget https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl

View file

@ -0,0 +1,4 @@
2d2b9d2283adb5e3b8d25eec88e65064 coco_annotations_minival.tgz
0da8c0bd3d6becc4dcb32757491aca88 train2014.zip
a3d79f5ed8d289b7a7554ce06a5782b3 val2014.zip
0a379cfc70b0e71301e0f377548639bd annotations_trainval2014.zip

Binary file not shown (image added: 55 KiB)

Binary file not shown (image added: 40 KiB)

View file

@ -0,0 +1,65 @@
## Abstractions
The main abstractions introduced by `maskrcnn_benchmark` that are useful to
have in mind are the following:
### ImageList
In PyTorch, the first dimension of the input to the network generally represents
the batch dimension, and thus all elements of the same batch have the same
height / width.
In order to support images with different sizes and aspect ratios in the same
batch, we created the `ImageList` class, which holds internally a batch of
images (of possibly different sizes). The images are padded with zeros such that
they have the same final size and batched over the first dimension. The original
sizes of the images before padding are stored in the `image_sizes` attribute,
and the batched tensor in `tensors`.
We provide a convenience function `to_image_list` that accepts a few different
input types, including a list of tensors, and returns an `ImageList` object.
```python
import torch

from maskrcnn_benchmark.structures.image_list import to_image_list

images = [torch.rand(3, 100, 200), torch.rand(3, 150, 170)]
batched_images = to_image_list(images)
# it is also possible to make the final batched image be a multiple of a number
batched_images_32 = to_image_list(images, size_divisible=32)
```
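As a quick illustration of the attributes mentioned above (a sketch; the exact padded shapes depend on the inputs), the padded batch is stored in `tensors` and the pre-padding sizes in `image_sizes`:

```python
# Continuing the example above (illustrative).
print(batched_images.tensors.shape)     # padded to the largest image in the batch, e.g. (2, 3, 150, 200)
print(batched_images.image_sizes)       # original, pre-padding sizes of the two images
print(batched_images_32.tensors.shape)  # spatial dims rounded up to a multiple of 32
```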
### BoxList
The `BoxList` class holds a set of bounding boxes (represented as a `Nx4` tensor) for
a specific image, as well as the size of the image as a `(width, height)` tuple.
It also contains a set of methods that allow performing geometric
transformations on the bounding boxes (such as cropping, scaling and flipping).
The class accepts bounding boxes from two different input formats:
- `xyxy`, where each box is encoded by its `x1`, `y1`, `x2` and `y2` coordinates, and
- `xywh`, where each box is encoded as `x1`, `y1`, `w` and `h`.
Each `BoxList` instance can also hold arbitrary additional information
for each bounding box, such as labels, visibility, probability scores, etc.
Here is an example of how to create a `BoxList` from a list of coordinates:
```python
import torch

from maskrcnn_benchmark.structures.bounding_box import BoxList, FLIP_LEFT_RIGHT
width = 100
height = 200
boxes = [
[0, 10, 50, 50],
[50, 20, 90, 60],
[10, 10, 50, 50]
]
# create a BoxList with 3 boxes
bbox = BoxList(boxes, image_size=(width, height), mode='xyxy')
# perform some box transformations; the API is similar to PIL.Image
bbox_scaled = bbox.resize((width * 2, height * 3))
bbox_flipped = bbox.transpose(FLIP_LEFT_RIGHT)
# add labels for each bbox
labels = torch.tensor([0, 10, 1])
bbox.add_field('labels', labels)
# BoxList also supports a few operations, like indexing
# here, we select boxes 0 and 2
bbox_subset = bbox[[0, 2]]
```
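Since boxes can be given in either `xyxy` or `xywh`, a natural follow-up is converting between the two encodings and reading fields back. A short sketch, assuming the `convert`, `fields` and `get_field` helpers behave as in the upstream maskrcnn-benchmark:

```python
# Continuing the example above (sketch; assumes upstream BoxList helpers).
bbox_xywh = bbox.convert('xywh')   # same boxes, re-encoded as x1, y1, w, h
print(bbox_xywh.bbox)              # underlying Nx4 tensor of coordinates
print(bbox.fields())               # names of the extra per-box fields, e.g. ['labels']
print(bbox.get_field('labels'))    # the labels tensor added above
```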

View file

@ -0,0 +1,5 @@
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
Please read the [full text](https://code.fb.com/codeofconduct/)
so that you can understand what actions will and will not be tolerated.

View file

@ -0,0 +1,39 @@
# Contributing to Mask-RCNN Benchmark
We want to make contributing to this project as easy and transparent as
possible.
## Our Development Process
Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `master`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 spaces for indentation rather than tabs
* 80 character line length
* PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/)
## License
By contributing to Mask-RCNN Benchmark, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.

View file

@ -0,0 +1,37 @@
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.02-py3
FROM ${FROM_IMAGE_NAME}
# Install Python dependencies
RUN pip install --upgrade --no-cache-dir pip \
&& pip install --no-cache-dir \
mlperf-compliance==0.0.10 \
opencv-python==3.4.1.15 \
yacs
WORKDIR /opt
RUN git clone -b v0.1 https://github.com/NVIDIA/cocoapi.git \
&& cd cocoapi/PythonAPI \
&& pip install -e .
# Copy detectron code and build
WORKDIR /workspace/object_detection
RUN mkdir -p /datasets/coco
RUN mkdir /results
COPY . .
RUN pip install -e .
ENV OMP_NUM_THREADS=1

View file

@ -0,0 +1,78 @@
## Installation
### Requirements:
- PyTorch 1.0 from a nightly release. Installation instructions can be found at https://pytorch.org/get-started/locally/
- torchvision from master
- cocoapi
- yacs
- matplotlib
- GCC >= 4.9
- (optional) OpenCV for the webcam demo
### Option 1: Step-by-step installation
```bash
# first, make sure that your conda is set up properly with the right environment
# for that, check that `which conda`, `which pip` and `which python` point to the
# right paths. From a clean conda env, this is what you need to do
conda create --name maskrcnn_benchmark
source activate maskrcnn_benchmark
# this installs the right pip and dependencies for the fresh python
conda install ipython
# maskrcnn_benchmark and coco api dependencies
pip install ninja yacs cython matplotlib
# follow PyTorch installation in https://pytorch.org/get-started/locally/
# we give the instructions for CUDA 9.0
conda install pytorch-nightly -c pytorch
# install torchvision
cd ~/github
git clone https://github.com/pytorch/vision.git
cd vision
python setup.py install
# install pycocotools
cd ~/github
git clone https://github.com/cocodataset/cocoapi.git
cd cocoapi/PythonAPI
python setup.py build_ext install
# install apex
cd ~/github
git clone https://github.com/NVIDIA/apex.git
cd apex
python setup.py install --cuda_ext --cpp_ext
# install PyTorch Detection
cd ~/github
git clone https://github.com/facebookresearch/maskrcnn-benchmark.git
cd maskrcnn-benchmark
# the following will install the lib with
# symbolic links, so that you can modify
# the files if you want and won't need to
# re-build it
python setup.py build develop
# or if you are on macOS
# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop
```
### Option 2: Docker Image (Requires CUDA, Linux only)
Build image with defaults (`CUDA=9.0`, `CUDNN=7`):
nvidia-docker build -t maskrcnn-benchmark docker/
Build image with other CUDA and CUDNN versions:
nvidia-docker build -t maskrcnn-benchmark --build-arg CUDA=9.2 --build-arg CUDNN=7 docker/
Build and run the image with a built-in Jupyter notebook (note that the password is used to log in to the notebook):
nvidia-docker build -t maskrcnn-benchmark-jupyter docker/docker-jupyter/
nvidia-docker run -td -p 8888:8888 -e PASSWORD=<password> -v <host-dir>:<container-dir> maskrcnn-benchmark-jupyter

View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 Facebook
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,82 @@
## Model Zoo and Baselines
### Hardware
- 8 NVIDIA V100 GPUs
### Software
- PyTorch version: 1.0.0a0+dd2c487
- CUDA 9.2
- CUDNN 7.1
- NCCL 2.2.13-1
### End-to-end Faster and Mask R-CNN baselines
All the baselines were trained using the exact same experimental setup as in Detectron.
We initialize the detection models with ImageNet weights from Caffe2, the same as used by Detectron.
The pre-trained models are available through the links in the model id column; a short loading sketch follows the table below.
backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | mask AP | model id
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
R-50-C4 | Fast | 1x | 1 | 5.8 | 0.4036 | 20.2 | 0.17130 | 34.8 | - | [6358800](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_C4_1x.pth)
R-50-FPN | Fast | 1x | 2 | 4.4 | 0.3530 | 8.8 | 0.12580 | 36.8 | - | [6358793](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_FPN_1x.pth)
R-101-FPN | Fast | 1x | 2 | 7.1 | 0.4591 | 11.5 | 0.143149 | 39.1 | - | [6358804](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_101_FPN_1x.pth)
X-101-32x8d-FPN | Fast | 1x | 1 | 7.6 | 0.7007 | 35.0 | 0.209965 | 41.2 | - | [6358717](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_X_101_32x8d_FPN_1x.pth)
R-50-C4 | Mask | 1x | 1 | 5.8 | 0.4520 | 22.6 | 0.17796 + 0.028 | 35.6 | 31.5 | [6358801](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_C4_1x.pth)
R-50-FPN | Mask | 1x | 2 | 5.2 | 0.4536 | 11.3 | 0.12966 + 0.034 | 37.8 | 34.2 | [6358792](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_FPN_1x.pth)
R-101-FPN | Mask | 1x | 2 | 7.9 | 0.5665 | 14.2 | 0.15384 + 0.034 | 40.1 | 36.1 | [6358805](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_101_FPN_1x.pth)
X-101-32x8d-FPN | Mask | 1x | 1 | 7.8 | 0.7562 | 37.8 | 0.21739 + 0.034 | 42.2 | 37.8 | [6358718](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_X_101_32x8d_FPN_1x.pth)
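The linked checkpoints are regular PyTorch `.pth` files. As a minimal sketch (the exact layout of each checkpoint, e.g. whether the weights sit under a `model` key, is an assumption), one of them can be fetched and inspected like this:

```python
from torch.utils.model_zoo import load_url

# Sketch: download the R-50-FPN Faster R-CNN checkpoint from the table above and inspect it.
url = "https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_FPN_1x.pth"
checkpoint = load_url(url, map_location="cpu")
# Some checkpoints wrap the weights under a "model" key; otherwise fall back to the dict itself.
state_dict = checkpoint.get("model", checkpoint)
print(len(state_dict), "entries in the checkpoint")
```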
## Comparison with Detectron and mmdetection
In the following section, we compare our implementation with [Detectron](https://github.com/facebookresearch/Detectron)
and [mmdetection](https://github.com/open-mmlab/mmdetection).
The same remarks from [mmdetection](https://github.com/open-mmlab/mmdetection/blob/master/MODEL_ZOO.md#training-speed)
about different hardware apply here.
### Training speed
The numbers here are in seconds / iteration. The lower, the better.
type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100)
-- | -- | -- | --
Faster R-CNN R-50 C4 | 0.566 | - | 0.4036
Faster R-CNN R-50 FPN | 0.544 | 0.554 | 0.3530
Faster R-CNN R-101 FPN | 0.647 | - | 0.4591
Faster R-CNN X-101-32x8d FPN | 0.799 | - | 0.7007
Mask R-CNN R-50 C4 | 0.620 | - | 0.4520
Mask R-CNN R-50 FPN | 0.889 | 0.690 | 0.4536
Mask R-CNN R-101 FPN | 1.008 | - | 0.5665
Mask R-CNN X-101-32x8d FPN | 0.961 | - | 0.7562
### Training memory
The numbers here are training memory in GB. The lower, the better.
type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100)
-- | -- | -- | --
Faster R-CNN R-50 C4 | 6.3 | - | 5.8
Faster R-CNN R-50 FPN | 7.2 | 4.9 | 4.4
Faster R-CNN R-101 FPN | 8.9 | - | 7.1
Faster R-CNN X-101-32x8d FPN | 7.0 | - | 7.6
Mask R-CNN R-50 C4 | 6.6 | - | 5.8
Mask R-CNN R-50 FPN | 8.6 | 5.9 | 5.2
Mask R-CNN R-101 FPN | 10.2 | - | 7.9
Mask R-CNN X-101-32x8d FPN | 7.7 | - | 7.8
### Accuracy
The numbers are box AP (and, for Mask R-CNN rows, box AP & mask AP). The higher, the better.
type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100)
-- | -- | -- | --
Faster R-CNN R-50 C4 | 34.8 | - | 34.8
Faster R-CNN R-50 FPN | 36.7 | 36.7 | 36.8
Faster R-CNN R-101 FPN | 39.4 | - | 39.1
Faster R-CNN X-101-32x8d FPN | 41.3 | - | 41.2
Mask R-CNN R-50 C4 | 35.8 & 31.4 | - | 35.6 & 31.5
Mask R-CNN R-50 FPN | 37.7 & 33.9 | 37.5 & 34.4 | 37.8 & 34.2
Mask R-CNN R-101 FPN | 40.0 & 35.9 | - | 40.1 & 36.1
Mask R-CNN X-101-32x8d FPN | 42.1 & 37.3 | - | 42.2 & 37.8

View file

@ -0,0 +1,67 @@
# Troubleshooting
Here is a compilation of common issues that you might face
while compiling / running this code:
## Compilation errors when compiling the library
If you encounter build errors like the following:
```
/usr/include/c++/6/type_traits:1558:8: note: provided for template<class _From, class _To> struct std::is_convertible
struct is_convertible
^~~~~~~~~~~~~~
/usr/include/c++/6/tuple:502:1: error: body of constexpr function static constexpr bool std::_TC<<anonymous>, _Elements>::_NonNestedTuple() [with _SrcTuple = std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>&&; bool <anonymous> = true; _Elements = {at::Tensor, at::Tensor, at::Tensor, at::Tensor}] not a return-statement
}
^
error: command '/usr/local/cuda/bin/nvcc' failed with exit status 1
```
check your CUDA version and your `gcc` version:
```
nvcc --version
gcc --version
```
If you are using CUDA 9.0 and gcc 6.4.0, then refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/25,
which has a summary of the solution. Basically, CUDA 9.0 is not compatible with gcc 6.4.0.
## ImportError: No module named maskrcnn_benchmark.config when running webcam.py
This means that `maskrcnn-benchmark` has not been properly installed.
Refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/22 for a few possible issues.
Note that we now support Python 2 as well.
## ImportError: Undefined symbol: __cudaPopCallConfiguration error when importing _C
This probably means that the NVCC version used to compile the extension and the CUDAToolKit version of your conda package are inconsistent. This was first mentioned in https://github.com/facebookresearch/maskrcnn-benchmark/issues/45. All you need to do is:
```
# Check the NVCC compiler version (e.g.)
/usr/cuda-9.2/bin/nvcc --version
# Check the CUDAToolKit version (e.g.)
~/anaconda3/bin/conda list | grep cuda
# If you need to update your CUDAToolKit
~/anaconda3/bin/conda install -c anaconda cudatoolkit==9.2
```
Both of them should have the **same** version. For example, NVCC==9.2 with CUDAToolKit==9.2 is fine, while NVCC==9.2 with CUDAToolKit==9 fails.
## Segmentation fault (core dumped) when running the library
This probably means that you have compiled the library using GCC < 4.9, which is ABI incompatible with PyTorch.
Indeed, during installation, you probably saw a message like
```
Your compiler (g++ 4.8) may be ABI-incompatible with PyTorch!
Please use a compiler that is ABI-compatible with GCC 4.9 and above.
See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html.
See https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6
for instructions on how to install GCC 4.9 or higher.
```
Follow the instructions on https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6
to install GCC 4.9 or higher, and try recompiling `maskrcnn-benchmark` again, after cleaning the
`build` folder with
```
rm -rf build
```

Some files were not shown because too many files have changed in this diff Show more