Merge pull request #12 from NVIDIA/nvpstr/master

Adding 9 new models (6 in TensorFlow and 3 in PyTorch)
Author: nvpstr, 2019-03-18 20:57:20 +01:00 (committed by GitHub)
Commit: b1ae8dd47c
1201 changed files with 358857 additions and 48804 deletions

.gitmodules (vendored): 3 lines changed

@@ -1,3 +0,0 @@
[submodule "TensorFlow/OpenSeq2Seq"]
path = TensorFlow/OpenSeq2Seq
url = https://github.com/NVIDIA/OpenSeq2Seq


@@ -1,19 +0,0 @@
#!/usr/bin/env sh
# This script downloads the CIFAR-10 (binary version) data and unzips it.
set -e
cd "$( cd "$(dirname "$0")" ; pwd -P )"
echo "Downloading..."
wget --no-check-certificate http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
echo "Unzipping..."
tar -xf cifar-10-binary.tar.gz && rm -f cifar-10-binary.tar.gz
mv cifar-10-batches-bin/* . && rm -rf cifar-10-batches-bin
# Creation is split out because leveldb sometimes causes a segfault
# and needs to be re-created.
echo "Done."


@@ -1,7 +0,0 @@
#!/bin/bash
set -e
cd "$( cd "$(dirname "$0")" ; pwd -P )"
# Create CIFAR10 train + test databases
make_cifar_db --db lmdb --input_folder "$(pwd)" --output_train_db_name cifar10_train_lmdb --output_test_db_name cifar10_test_lmdb


@@ -1,208 +0,0 @@
#!/usr/bin/env python
"""Example: train a model on CIFAR10."""
from __future__ import division, print_function
import argparse
import functools
import logging
import os.path
from caffe2.python import brew, core, data_parallel_model, optimizer, workspace
from caffe2.python.core import DataType
from caffe2.python.model_helper import ModelHelper
from caffe2.python.modeling.initializers import Initializer, pFP16Initializer
logging.basicConfig()
TRAIN_ENTRIES = 50000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 10
DISPLAY = 100
ACCURACY_MIN = 0.7
ACCURACY_MAX = 0.8
def AddInputOps(model, reader, batch_size, dtype):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=32, crop=32, mirror=1, color=True, mean=128.0,
output_type='float16' if dtype == DataType.FLOAT16 else 'float',
is_test=False)
data = model.StopGradient(data, data)
def AddForwardPassOps(model, loss_scale, dtype):
"""Add forward pass ops and return a list of losses."""
initializer = (pFP16Initializer if dtype == DataType.FLOAT16
else Initializer)
with brew.arg_scope([brew.conv, brew.fc],
WeightInitializer=initializer,
BiasInitializer=initializer):
conv1 = brew.conv(model, 'data', 'conv1', 3, 32, 5, pad=2,
weight_init=('GaussianFill',
{'std': 0.0001, 'mean': 0.0}))
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=3, stride=2)
relu1 = brew.relu(model, pool1, 'relu1')
conv2 = brew.conv(model, relu1, 'conv2', 32, 32, 5, pad=2,
weight_init=('GaussianFill', {'std': 0.01}))
conv2 = brew.relu(model, conv2, conv2)
pool2 = brew.average_pool(model, conv2, 'pool2', kernel=3, stride=2)
conv3 = brew.conv(model, pool2, 'conv3', 32, 64, 5, pad=2,
weight_init=('GaussianFill', {'std': 0.01}))
conv3 = brew.relu(model, conv3, conv3)
pool3 = brew.average_pool(model, conv3, 'pool3', kernel=3, stride=2)
fc1 = brew.fc(model, pool3, 'fc1', 64 * 3 * 3, 64,
weight_init=('GaussianFill', {'std': 0.1}))
fc2 = brew.fc(model, fc1, 'fc2', 64, 10,
weight_init=('GaussianFill', {'std': 0.1}))
if dtype == DataType.FLOAT16:
fc2 = model.net.HalfToFloat(fc2, fc2 + '_fp32')
softmax, loss = model.SoftmaxWithLoss([fc2, 'label'], ['softmax', 'loss'])
loss = model.Scale(loss, loss, scale=loss_scale)
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.add_weight_decay(model, 0.004)
stepsize = TRAIN_ENTRIES * EPOCHS // BATCH_SIZE
optimizer.build_sgd(
model, 0.001,
policy='step', stepsize=stepsize, gamma=0.1,
momentum=0.9, nesterov=False)
def AddPostSyncOps(model):
"""Add ops which run after the initial parameter sync."""
for param_info in model.GetOptimizationParamInfo(model.GetParams()):
if param_info.blob_copy is not None:
# Ensure copies are in sync after initial broadcast
model.param_init_net.HalfToFloat(
param_info.blob,
param_info.blob_copy[core.DataType.FLOAT]
)
def createTrainModel(lmdb_path, devices, dtype):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices)), dtype=dtype),
forward_pass_builder_fun=functools.partial(
AddForwardPassOps, dtype=dtype),
optimizer_builder_fun=AddOptimizerOps,
post_sync_builder_fun=AddPostSyncOps,
devices=devices, use_nccl=True)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path, devices, dtype):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices)), dtype=dtype),
forward_pass_builder_fun=functools.partial(
AddForwardPassOps, dtype=dtype),
param_update_builder_fun=None,
devices=devices)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'cifar10_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'cifar10_test_lmdb'))
parser.add_argument('--dtype', choices=['float', 'float16'],
default='float', help='Data type used for training')
parser.add_argument('--gpus',
help='Comma separated list of GPU devices to use')
parser.add_argument('--num_gpus', type=int, default=1,
help='Number of GPU devices (instead of --gpus)')
parser.add_argument('--all-gpus', action='store_true',
help='Use all GPUs in the system')
args = parser.parse_args()
args.dtype = (DataType.FLOAT16 if args.dtype == 'float16'
else DataType.FLOAT)
if args.all_gpus:
args.num_gpus = workspace.NumCudaDevices()
args.gpus = range(args.num_gpus)
else:
if args.gpus is not None:
args.gpus = [int(x) for x in args.gpus.split(',')]
args.num_gpus = len(args.gpus)
else:
args.gpus = range(args.num_gpus)
args.num_gpus = args.num_gpus
return args
def main(args):
"""Train and test."""
train_model = createTrainModel(args.train_lmdb, args.gpus, args.dtype)
test_model = createTestModel(args.test_lmdb, args.gpus, args.dtype)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
scope_prefix = 'gpu_%d/' % args.gpus[0]
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob(scope_prefix + 'loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
# Take average values across all GPUs
losses.append(sum(
workspace.FetchBlob('gpu_%d/loss' % g) for g in args.gpus
) / len(args.gpus))
accuracies.append(sum(
workspace.FetchBlob('gpu_%d/accuracy' % g) for g in args.gpus
) / len(args.gpus))
loss = sum(losses) / len(losses)
accuracy = sum(accuracies) / len(accuracies)
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())


@@ -1,2 +0,0 @@
*.mdb
*-ubyte


@@ -1,14 +0,0 @@
#!/usr/bin/env sh
# This script downloads the MNIST data and unzips it.
cd "$( cd "$(dirname "$0")" ; pwd -P )"
echo "Downloading..."
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e $fname ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done


@@ -1,7 +0,0 @@
#!/bin/bash
cd "$( cd "$(dirname "$0")" ; pwd -P )"
# Create MNIST databases from previously downloaded data
make_mnist_db --db lmdb --image_file train-images-idx3-ubyte --label_file train-labels-idx1-ubyte --output_file mnist_train_lmdb
make_mnist_db --db lmdb --image_file t10k-images-idx3-ubyte --label_file t10k-labels-idx1-ubyte --output_file mnist_test_lmdb


@@ -1,133 +0,0 @@
#!/usr/bin/env python
"""Example: train LeNet on MNIST."""
from __future__ import division, print_function
import argparse
import os.path
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, optimizer, workspace
from caffe2.python.model_helper import ModelHelper
TRAIN_ENTRIES = 60000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
def AddInputOps(model, reader, batch_size):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
is_test=False)
data = model.StopGradient(data, data)
def AddForwardPassOps(model):
"""Add forward pass ops and return a list of losses."""
conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
fc3 = brew.relu(model, fc3, fc3)
pred = brew.fc(model, fc3, 'pred', 500, 10)
softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.build_sgd(model, 0.01,
policy='step', stepsize=1, gamma=0.999,
momentum=0.9, nesterov=False)
def createTrainModel(lmdb_path):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
losses = AddForwardPassOps(model)
model.AddGradientOperators(losses)
AddOptimizerOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
AddForwardPassOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'mnist_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'mnist_test_lmdb'))
args = parser.parse_args()
return args
def main(args):
"""Train and test."""
device = 0
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, device)):
train_model = createTrainModel(args.train_lmdb)
test_model = createTestModel(args.test_lmdb)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob('loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
losses.append(workspace.FetchBlob('loss'))
accuracies.append(workspace.FetchBlob('accuracy'))
loss = np.array(losses).mean()
accuracy = np.array(accuracies).mean()
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())


@@ -1,139 +0,0 @@
#!/usr/bin/env python
"""Example: train LeNet on MNIST (with fp16)."""
from __future__ import division, print_function
import argparse
import os.path
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, optimizer, workspace
from caffe2.python.model_helper import ModelHelper
from caffe2.python.modeling.initializers import pFP16Initializer
TRAIN_ENTRIES = 60000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
def AddInputOps(model, reader, batch_size):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
output_type='float16', is_test=True)
data = model.StopGradient(data, data)
def AddForwardPassOps(model):
"""Add forward pass ops and return a list of losses."""
with brew.arg_scope([brew.conv, brew.fc],
WeightInitializer=pFP16Initializer,
BiasInitializer=pFP16Initializer):
conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
fc3 = brew.relu(model, fc3, fc3)
pred = brew.fc(model, fc3, 'pred', 500, 10)
# Cast back to fp32 for remaining ops
pred = model.net.HalfToFloat(pred, pred + '_fp32')
softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.build_sgd(model, 0.01,
policy='step', stepsize=1, gamma=0.999,
momentum=0.9, nesterov=False)
def createTrainModel(lmdb_path):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
losses = AddForwardPassOps(model)
model.AddGradientOperators(losses)
AddOptimizerOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
AddForwardPassOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'mnist_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'mnist_test_lmdb'))
args = parser.parse_args()
return args
def main(args):
"""Train and test."""
device = 0
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, device)):
train_model = createTrainModel(args.train_lmdb)
test_model = createTestModel(args.test_lmdb)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob('loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
losses.append(workspace.FetchBlob('loss'))
accuracies.append(workspace.FetchBlob('accuracy'))
loss = np.array(losses).mean()
accuracy = np.array(accuracies).mean()
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())


@@ -1,159 +0,0 @@
#!/usr/bin/env python
"""Example: train LeNet on MNIST (with multi-GPU)."""
from __future__ import division, print_function
import argparse
import functools
import logging
import os.path
from caffe2.python import brew, core, data_parallel_model, optimizer, workspace
from caffe2.python.model_helper import ModelHelper
logging.basicConfig()
TRAIN_ENTRIES = 60000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
def AddInputOps(model, reader, batch_size):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
is_test=True)
data = model.StopGradient(data, data)
def AddForwardPassOps(model, loss_scale):
"""Add forward pass ops and return a list of losses."""
conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
fc3 = brew.relu(model, fc3, fc3)
pred = brew.fc(model, fc3, 'pred', 500, 10)
softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
loss = model.Scale(loss, loss, scale=loss_scale)
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.build_sgd(model, 0.01,
policy='step', stepsize=1, gamma=0.999,
momentum=0.9, nesterov=False)
def createTrainModel(lmdb_path, devices):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices))),
forward_pass_builder_fun=AddForwardPassOps,
optimizer_builder_fun=AddOptimizerOps,
devices=devices, use_nccl=True)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path, devices):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices))),
forward_pass_builder_fun=AddForwardPassOps,
param_update_builder_fun=None,
devices=devices)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'mnist_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'mnist_test_lmdb'))
parser.add_argument('--gpus',
help='Comma separated list of GPU devices to use')
parser.add_argument('--num_gpus', type=int, default=1,
help='Number of GPU devices (instead of --gpus)')
args = parser.parse_args()
if args.gpus is not None:
args.gpus = [int(x) for x in args.gpus.split(',')]
args.num_gpus = len(args.gpus)
else:
args.gpus = range(args.num_gpus)
args.num_gpus = args.num_gpus
return args
def main(args):
"""Train and test."""
train_model = createTrainModel(args.train_lmdb, args.gpus)
test_model = createTestModel(args.test_lmdb, args.gpus)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
scope_prefix = 'gpu_%d/' % args.gpus[0]
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob(scope_prefix + 'loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
# Take average values across all GPUs
losses.append(sum(
workspace.FetchBlob('gpu_%d/loss' % g) for g in args.gpus
) / len(args.gpus))
accuracies.append(sum(
workspace.FetchBlob('gpu_%d/accuracy' % g) for g in args.gpus
) / len(args.gpus))
loss = sum(losses) / len(losses)
accuracy = sum(accuracies) / len(accuracies)
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())


@@ -0,0 +1,20 @@
FROM gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.03-py3-stage
# Set working directory
WORKDIR /mlperf
RUN apt-get update && apt-get install -y python3-tk python-pip git tmux htop tree
# Necessary pip packages
RUN pip install --upgrade pip
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN python3 -m pip install pycocotools==2.0.0
# Copy SSD code
COPY ./setup.py .
COPY ./csrc ./csrc
RUN pip install .
COPY . .


@@ -0,0 +1,203 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2018 NVIDIA Corporation
Copyright 2018 The MLPerf Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -0,0 +1,344 @@
# SSD300 v1.1 For PyTorch
## Table Of Contents
* [The model](#the-model)
* [Default configuration](#default-configuration)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick start guide](#quick-start-guide)
* [Details](#details)
* [Command line arguments](#command-line-arguments)
* [Getting the data](#getting-the-data)
* [Training process](#training-process)
* [Data preprocessing](#data-preprocessing)
* [Data augmentation](#data-augmentation)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training performance results](#training-performance-results)
* [Inference performance results](#inference-performance-results)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## The model
The SSD300 v1.1 model is based on the
[SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325) paper, which
describes SSD as "a method for detecting objects in images using a single deep neural network".
The input size is fixed to 300x300.
The main difference between this model and the one described in the paper is in the backbone.
Specifically, the obsolete VGG backbone is replaced by a ResNet-50 model.
From the
[Speed/accuracy trade-offs for modern convolutional object detectors](https://arxiv.org/abs/1611.10012)
paper, the following enhancements were made to the backbone:
* The conv5_x, avgpool, fc and softmax layers were removed from the original classification model.
* All strides in conv4_x are set to 1x1.
The backbone is followed by 5 additional convolutional layers.
In addition to the convolutional layers, we attached 6 detection heads:
* The first detection head is attached to the last conv4_x layer.
* The other five detection heads are attached to the corresponding 5 additional layers.
The detection heads are similar to the ones referenced in the paper; however,
they are enhanced by additional BatchNorm layers after each convolution (an illustrative sketch of the backbone changes is given after this description).
Additionally, we removed weight decay on every bias parameter and
all the BatchNorm layer parameters as described in the
[Highly Scalable Deep Learning Training System with Mixed-Precision:
Training ImageNet in Four Minutes](https://arxiv.org/abs/1807.11205) paper.
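As an illustration of the backbone changes listed above, the following sketch truncates a torchvision ResNet-50 and resets the conv4_x strides. It is a hedged approximation only (the 2019-era `pretrained=True` API is assumed), not the exact model code in this repository.
```
# Illustrative sketch only -- approximates the backbone changes described above.
import torch.nn as nn
import torchvision.models as models

def build_truncated_resnet50_backbone():
    resnet = models.resnet50(pretrained=True)  # 2019-era torchvision API
    # Drop conv5_x (layer4), avgpool, fc and softmax; keep everything up to
    # and including conv4_x (layer3).
    backbone = nn.Sequential(
        resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool,
        resnet.layer1, resnet.layer2, resnet.layer3,
    )
    # Set all strides in conv4_x to 1x1 so the feature map resolution is
    # preserved for the first detection head.
    for module in resnet.layer3.modules():
        if isinstance(module, nn.Conv2d):
            module.stride = (1, 1)
    return backbone
```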
This model trains with mixed precision using Tensor Cores on Volta, so you can get results much faster than training without Tensor Cores.
This model is tested against each NGC monthly container release to ensure
consistent accuracy and performance over time.
Because of these enhancements, the SSD300 v1.1 model achieves higher accuracy.
Training SSD requires computationally costly augmentations; to fully utilize GPUs during training, we use the [NVIDIA DALI](https://github.com/NVIDIA/DALI) library to accelerate the data preparation pipeline.
### Default configuration
We trained the model for 65 epochs with the following setup:
* SGD with momentum (0.9)
* Learning rate = 2.6e-3 * number of GPUs * (batch_size / 32)
* Learning rate decay: multiplied by 0.1 before epochs 43 and 54
* We use linear warmup of the learning rate during the first epoch. For more information, see the
[Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677) paper.
To enable warmup, provide the `--warmup 300` argument.
* Weight decay:
* 0 for BatchNorms and biases
* 5e-4 for other layers
**Note**: The learning rate is automatically scaled (in other words, multiplied by the number of GPUs and by the batch size divided by 32). A sketch of the resulting schedule is shown below.
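The following is a small, hedged sketch of the schedule described above (linear scaling, linear warmup, and step decay); the constants mirror the text, and the function is illustrative rather than the actual `main.py` implementation.
```
# Hedged sketch of the learning-rate schedule described above; not main.py code.
def learning_rate(epoch, iteration, num_gpus, batch_size,
                  base_lr=2.6e-3, warmup_iters=300, decay_epochs=(43, 54)):
    # Linear scaling rule: multiply by the number of GPUs and by batch_size/32.
    lr = base_lr * num_gpus * (batch_size / 32)
    # Linear warmup during the first epoch (enabled with --warmup 300).
    if epoch == 0 and iteration < warmup_iters:
        lr *= (iteration + 1) / warmup_iters
    # Step decay: multiply by 0.1 before epochs 43 and 54.
    for decay_epoch in decay_epochs:
        if epoch >= decay_epoch:
            lr *= 0.1
    return lr

# Example: 8 GPUs with a per-GPU batch size of 64, well past warmup.
print(learning_rate(epoch=50, iteration=0, num_gpus=8, batch_size=64))
```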
## Setup
The following section lists the requirements needed to start training the SSD300 v1.1 model.
### Requirements
This repository contains a `Dockerfile` which extends the PyTorch 19.03 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following software:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.03-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* [NVIDIA Volta based GPU](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
## Quick Start Guide
To train your model using mixed precision with Tensor Cores, perform the
following steps using the default parameters of the SSD v1.1 model on the
[COCO 2017](http://cocodataset.org/#download) dataset.
### 1. Download and preprocess the dataset.
Extract the COCO 2017 dataset with `download_dataset.sh $COCO_DIR`.
Data will be downloaded to the `$COCO_DIR` directory (on the host).
### 2. Build the SSD300 v1.1 PyTorch NGC container.
`docker build . -t nvidia_ssd`
### 3. Launch the NGC container to run training/inference.
`nvidia-docker run --rm -it --ulimit memlock=-1 --ulimit stack=67108864 -v $COCO_DIR:/coco --ipc=host nvidia_ssd`
**Note**: the default mount point in the container is `/coco`.
### 4. Start training.
The `./examples` directory provides several sample scripts for various GPU settings,
which act as wrappers around the `main.py` script.
The example scripts need two arguments:
- A path to the root SSD directory.
- A path to the COCO 2017 dataset.
The remaining arguments are passed to the `main.py` script.
The `--save` flag saves the model after each epoch.
The checkpoints are stored as `./models/epoch_*.pt`.
Use `python main.py -h` to obtain the list of available options in the `main.py` script.
For example, if you want to run 8-GPU training with Tensor Core acceleration and
save checkpoints after each epoch, run:
`bash ./examples/SSD300_FP16_8GPU.sh . /coco --save`
For information about how to train using mixed precision, see the [Mixed Precision Training paper](https://arxiv.org/abs/1710.03740) and [Training With Mixed Precision documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).
For PyTorch, mixed-precision support is easily added with NVIDIA's [APEX](https://github.com/NVIDIA/apex), a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Core performance.
### 5. Start validation/evaluation.
The `main.py` training script automatically runs validation during training.
The results from the validation are printed to stdout.
The open-source pycocotools scripts provide a consistent way to evaluate models on the COCO dataset. We use these scripts during validation to measure model performance with the AP metric. The metrics below are evaluated using the pycocotools methodology and reported in the following format:
```
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.250
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.423
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.257
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.076
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.269
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.399
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.237
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.342
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.358
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.118
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.394
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.548
```
The metric reported in our results is the one in the first row; a minimal example of producing this summary with pycocotools follows.
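The summary above is the standard output of pycocotools' `COCOeval.summarize()`. The sketch below shows one hedged way to produce it from a detections file; the paths and `predictions.json` name are placeholders, not artifacts written by this repository.
```
# Minimal pycocotools evaluation sketch; paths and file names are placeholders.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO('/coco/annotations/instances_val2017.json')  # ground truth
coco_dt = coco_gt.loadRes('predictions.json')               # model detections

coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()   # prints the AP/AR table shown above
# coco_eval.stats[0] is the AP @[IoU=0.50:0.95] value reported in our results.
```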
To evaluate a model checkpoint saved in the previous step, run:
`python ./main.py --backbone resnet50 --mode evaluation --checkpoint ./models/epoch_*.pt --data /coco`
### 6. Optionally, resume training from a checkpointed model.
`python ./main.py --backbone resnet50 --checkpoint ./models/epoch_*.pt --data /coco`
## Details
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Command line arguments
All these parameters can be controlled by passing command line arguments to the `main.py` script. To get a complete list of all command line arguments with descriptions and default values, run:
`python main.py --help`
### Getting the data
The SSD model was trained on the COCO 2017 dataset. The val2017 validation set was used as a validation dataset. PyTorch can work directly on JPEGs; therefore, no ahead-of-time preprocessing/augmentation of the dataset is needed.
This repository contains the `download_dataset.sh` download script which will automatically
download and preprocess the training, validation and test datasets. By default,
data will be downloaded to the `/coco` directory.
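As a quick illustration (not the data loader used by this repository), the COCO 2017 layout produced by `download_dataset.sh` can be read with torchvision's `CocoDetection` dataset; the paths below assume the default `/coco` mount point.
```
# Illustrative only: reading the downloaded COCO 2017 layout with torchvision.
from torchvision.datasets import CocoDetection

train_dataset = CocoDetection(
    root='/coco/train2017',
    annFile='/coco/annotations/instances_train2017.json')
image, targets = train_dataset[0]   # PIL image and a list of annotation dicts
print(len(train_dataset), len(targets))
```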
### Training process
Training the SSD model is implemented in the `main.py` script.
By default, training runs for 65 epochs. Because evaluation is relatively time consuming,
it does not run every epoch. With default settings, evaluation is executed after epochs
21, 31, 37, 42, 48, 53, 59 and 64. The model is evaluated using pycocotools distributed with
the COCO dataset.
The epochs at which evaluation runs can be reconfigured with the `--evaluation` argument.
To run training with Tensor Cores, use the `--fp16` flag when running the `main.py` script.
The `--save` flag enables storing checkpoints after each epoch under `./models/epoch_*.pt`.
#### Data preprocessing
Before we feed data to the model, both during training and inference, we perform:
* Normalization
* Encoding bounding boxes
* Resize to 300x300
#### Data augmentation
During training, we apply the following augmentation techniques (an illustrative pipeline is sketched after this list):
* Random crop
* Random horizontal flip
* Color jitter
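For illustration, a comparable pipeline can be expressed with torchvision transforms. This sketch is not the DALI pipeline used by the repository, the jitter/crop parameters are assumed values, and bounding-box encoding against the default boxes happens separately.
```
# Illustrative torchvision pipeline approximating the steps above; the
# repository uses NVIDIA DALI and its own box-aware crop/flip instead.
import torchvision.transforms as T

train_transform = T.Compose([
    T.RandomResizedCrop(300, scale=(0.3, 1.0)),   # random crop, resized to 300x300
    T.RandomHorizontalFlip(p=0.5),                # random horizontal flip
    T.ColorJitter(brightness=0.125, contrast=0.5,
                  saturation=0.5, hue=0.05),      # color jitter
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],       # normalization
                std=[0.229, 0.224, 0.225]),
])
```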
### Enabling mixed precision
[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) training previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP) library from [APEX](https://github.com/NVIDIA/apex), which casts variables to half-precision upon retrieval
while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients.
In PyTorch, loss scaling can be easily applied by using the `scale_loss()` method provided by AMP. The scaling value can be either dynamic or fixed.
For an in-depth walk through on AMP, check out sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage tensor cores performance.
To enable mixed precision, you can:
- Import AMP from APEX, for example:
`from apex import amp`
- Initialize an AMP handle, for example:
`amp_handle = amp.init(enabled=True, verbose=True)`
- Wrap your optimizer with the AMP handle, for example:
`optimizer = amp_handle.wrap_optimizer(optimizer)`
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
- Default backpropagate for FP32:
`losses.backward()`
- Scale loss and backpropagate with AMP:
```
with optimizer.scale_loss(losses) as scaled_losses:
    scaled_losses.backward()
```
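Putting the steps above together, here is a hedged, self-contained sketch using the `amp` handle API referenced in this section; the model, optimizer, and loss are placeholders rather than this repository's actual training loop.
```
# Hedged sketch combining the AMP steps above; model, optimizer and loss
# computation are placeholders, not the repository's training code.
import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=2.6e-3, momentum=0.9)

amp_handle = amp.init(enabled=True, verbose=True)      # initialize AMP
optimizer = amp_handle.wrap_optimizer(optimizer)       # wrap the optimizer

inputs = torch.randn(32, 10).cuda()
losses = model(inputs).mean()                          # placeholder loss

with optimizer.scale_loss(losses) as scaled_losses:    # scale the loss
    scaled_losses.backward()                           # backpropagate
optimizer.step()
```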
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
## Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
### Training performance benchmark
The training benchmark was run in various scenarios on V100 16G GPUs. For each scenario, the batch size was set to 32. The benchmark does not require a checkpoint from a fully trained model.
To benchmark training, run:
```
python -m torch.distributed.launch --nproc_per_node={NGPU} \
main.py --batch-size {bs} \
--mode benchmark-training \
--benchmark-warmup 100 \
--benchmark-iterations 200 \
{fp16} \
--data {data}
```
where `{NGPU}` selects the number of GPUs used in the benchmark, `{bs}` is the desired batch size, `{fp16}` is set to `--fp16` if you want to benchmark training with Tensor Cores, and `{data}` is the location of the COCO 2017 dataset.
The benchmark warmup is specified to omit the first iterations of the first epoch, and the benchmark iterations value is the number of iterations used to measure performance; a sketch of this logic is shown below.
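The following hedged sketch illustrates the warmup/measurement split described above; it is illustrative only, since `main.py` implements its own benchmarking modes.
```
# Illustrative warmup/measurement split; not the actual benchmark code.
import time

def benchmark(step_fn, batch_size, warmup=100, iterations=200):
    for _ in range(warmup):          # warmup iterations are not measured
        step_fn()
    start = time.time()
    for _ in range(iterations):      # measured iterations
        step_fn()
    elapsed = time.time() - start
    return iterations * batch_size / elapsed   # throughput in images/second
```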
### Inference performance benchmark
The inference benchmark was run on a single V100 16G GPU. To benchmark inference, run:
```
python main.py --eval-batch-size {bs} \
--mode benchmark-inference \
--benchmark-warmup 100 \
--benchmark-iterations 200 \
{fp16} \
--data {data}
```
where `{bs}` is the desired batch size, `{fp16}` is set to `--fp16` if you want to benchmark inference with Tensor Cores, and `{data}` is the location of the COCO 2017 dataset.
The benchmark warmup is specified to omit the first iterations of the first epoch, and the benchmark iterations value is the number of iterations used to measure performance.
## Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
### Training accuracy results
Our results were obtained by running the `./examples/SSD300_FP{16,32}_{1,4,8}GPU.sh`
scripts in the pytorch-19.03-py3 NGC container on an NVIDIA DGX-1 with 8x V100 16G GPUs. The batch size was chosen to best utilize GPU memory: 32 for FP32 precision and 64 for mixed precision.
| **Number of GPUs** | **Mixed precision mAP** | **Training time with mixed precision** | **FP32 mAP** | **Training time with FP32** |
|:------------------:|:------------------------:|:-------------------------------------:|:------------:|:---------------------------:|
| 1 | 0.2494 | 10h 39min | 0.2483 | 21h 40min |
| 4 | 0.2495 | 2h 53min | 0.2478 | 5h 52min |
| 8 | 0.2489 | 1h 31min | 0.2475 | 2h 54min |
Here are example graphs of FP32 and FP16 training on an 8-GPU configuration:
![TrainingLoss](./img/training_loss.png)
![ValidationAccuracy](./img/validation_accuracy.png)
### Training performance results
Our results were obtained by running the `main.py` script with the
`--mode benchmark-training` flag in the pytorch-19.03-py3 NGC container on NVIDIA DGX-1 with V100 16G GPUs.
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision img/s (median)** | **FP32 img/s (median)** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with mixed precision** | **Multi-gpu weak scaling with FP32** |
|:------------------:|:----------------------:|:----------------------------------:|:-----------------------:|:---------------------------------:|:-----------------------------------------------:|:------------------------------------:|
| 1 | 32 | 217.052 | 102.495 | 2.12 | 1.00 | 1.00 |
| 4 | 32 | 838.457 | 397.797 | 2.11 | 3.86 | 3.88 |
| 8 | 32 | 1639.843 | 789.695 | 2.08 | 7.56 | 7.70 |
To achieve the same results, follow the [Quick start guide](#quick-start-guide) outlined above.
### Inference performance results
Our results were obtained by running the `main.py` script with the `--mode benchmark-inference` flag in the pytorch-19.03-py3 NGC container on an NVIDIA DGX-1 with a single V100 16G GPU.
| **Batch size** | **Mixed precision img/s (median)** | **FP32 img/s (median)** |
|:--------------:|:----------------------------------:|:-----------------------:|
| 2 | 163.12 | 147.91 |
| 4 | 296.60 | 201.62 |
| 8 | 412.52 | 228.16 |
| 16 | 470.10 | 280.57 |
| 32 | 520.54 | 302.43 |
To achieve the same results, follow the [Quick start guide](#quick-start-guide) outlined above.
## Changelog
March 2019
* Initial release
## Known issues
There are no known issues with this model.


@@ -0,0 +1,440 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include <THC/THC.h>
#include <cuda.h>
//#define DEBUG
// calculate the IoU of a single box against another box
__device__
float calc_single_iou(const float4 b1, const float4 b2) {
// (lt), (rb)
float l = max(b1.x, b2.x);
float t = max(b1.y, b2.y);
float r = min(b1.z, b2.z);
float b = min(b1.w, b2.w);
float first = (r - l);
first = (first < 0) ? 0 : first;
float second = (b - t);
second = (second < 0) ? 0 : second;
float intersection = first * second;
float area1 = (b1.w - b1.y) * (b1.z - b1.x);
float area2 = (b2.w - b2.y) * (b2.z - b2.x);
return intersection / (area1 + area2 - intersection);
}
__global__
// boxes1 : [N x 4]
// boxes2 : [M x 4]
// ious : [N x M]
void calc_ious_kernel(const int N_img, const float4 *box1, const int *box1_offsets,
const int M, const float4 *boxes2, float *ious) {
// launch N_img blocks
const int img = blockIdx.x;
// each block, i will run over the box1_N[i] source and M target boxes
// generating box1_N[i] x M outputs
// alias to start of boxes for this image
const float4 *b1 = &box1[box1_offsets[img]];
if (threadIdx.x == 0) {
//printf("offset for img %d : %d\n", img, box1_offsets[img]);
}
// number of boxes for this image from offsets
int N = box1_offsets[img+1] - box1_offsets[img];
for (int i = 0; i < N; ++i) {
// if (threadIdx.x == 0) printf("i : %d\n", i);
const float4 source = b1[i];
// for each source, loop over targets
for (int j = threadIdx.x; j < M; j += blockDim.x) {
const float4 target = boxes2[j];
float iou = calc_single_iou(source, target);
// store the calculated IoU in the correct spot
int out_idx = box1_offsets[img] * M + i * M + j;
ious[out_idx] = iou;
}
}
}
__device__
void reduce_val_idx(int N, volatile float *vals, volatile int *idx) {
// naive: single thread for now
if (threadIdx.x == 0) {
float max_val = vals[0];
int max_idx = idx[0];
for (int i = 1; i < N; ++i) {
if (vals[i] > max_val) {
max_val = vals[i];
max_idx = idx[i];
}
}
vals[0] = max_val;
idx[0] = max_idx;
}
}
/**
* perform remaining parts, storing temporary values in global workspace
* workspace needs N_img * M values, each of 8 bytes (float, int)
**/
template <int BLOCK_SIZE, int MAX_BBOXES_PER_BLOCK>
__global__
void encode(const int N_img, const float4 *bbox_in, const long *labels_in, const int *offsets,
const int M, const float4 *dboxes, // const float *ious,
const float criteria, uint8_t *workspace, float4 *bbox_out, long *label_out) {
// Each block will take a single image's IoU set
const int img = blockIdx.x;
// shared memory for intermediate results
__shared__ volatile float best_bbox_iou_tmp[BLOCK_SIZE];
__shared__ volatile int best_bbox_idx_tmp[BLOCK_SIZE];
// shared memory for final best_bbox_{iou, idx} values
__shared__ volatile float best_bbox_iou[MAX_BBOXES_PER_BLOCK];
__shared__ volatile int best_bbox_idx[MAX_BBOXES_PER_BLOCK];
// index into the global workspace - each image needs (float + int) * M values
volatile float *best_dbox_iou = (float *)&workspace[img * M * 8];
volatile int *best_dbox_idx = (int *)&workspace[img * M * 8 + M * 4];
// number of input bboxes for this image
const int N_rows = offsets[img+1] - offsets[img];
// Check for potential crash
assert(N_rows <= MAX_BBOXES_PER_BLOCK);
#ifdef DEBUG
if (threadIdx.x == 0)
printf("N rows: %d %d to %d (%p - %p)\n", N_rows, offsets[img], offsets[img+1], best_dbox_iou, best_dbox_idx);
#endif
for (int i = threadIdx.x; i < MAX_BBOXES_PER_BLOCK; i += blockDim.x) {
best_bbox_iou[i] = -FLT_MAX;
best_bbox_idx[i] = -1;
}
__syncthreads();
// loop serially over the rows of the IoU set that correspond to this image
int row_num = 0;
for (int i = offsets[img]; i < offsets[img+1]; ++i) {
// reset shmem tallies
best_bbox_iou_tmp[threadIdx.x] = -FLT_MAX;
best_bbox_idx_tmp[threadIdx.x] = -1;
// index into the input buffer
// const float *row = &ious[i * M];
const float4 input_bbox = bbox_in[i];
#ifdef DEBUG
if (threadIdx.x == 0)
printf("%d - %p\n", img, &input_bbox);
#endif
// loop by threads over the columns
for (int j = threadIdx.x; j < M; j += blockDim.x) {
// check and store new max if necessary
const float4 input_dbox = dboxes[j];
// float new_val = row[j];
float new_val = calc_single_iou(input_bbox, input_dbox);
// handle per-row max in shared memory
if (new_val > best_bbox_iou_tmp[threadIdx.x]) {
best_bbox_iou_tmp[threadIdx.x] = new_val;
best_bbox_idx_tmp[threadIdx.x] = j;
}
// handle per-col max in global workspace
if (new_val > best_dbox_iou[j]) {
best_dbox_iou[j] = new_val;
best_dbox_idx[j] = row_num;
#ifdef DEBUG
assert(best_dbox_idx[j] >= 0);
assert(best_dbox_idx[j] < N_rows);
#endif
}
}
// Now we have all the values for this row -- reduce
__syncthreads();
// reduce - output is in max_{val, idx}_row[0]
reduce_val_idx(blockDim.x, best_bbox_iou_tmp, best_bbox_idx_tmp);
#ifdef DEBUG
__syncthreads();
#endif
// store output for row i
if (threadIdx.x == 0) {
best_bbox_iou[row_num] = best_bbox_iou_tmp[0];
best_bbox_idx[row_num] = best_bbox_idx_tmp[0];
#ifdef DEBUG
assert(best_bbox_idx[row_num] >= 0);
assert(best_bbox_idx[row_num] < M);
#endif
}
__syncthreads();
// keep track of _local_ row
row_num++;
}
#ifdef DEBUG
if (threadIdx.x == 0) {
for (int i = 0; i < N_rows; ++i) {
printf("%d - row : %d : best bbox_idx: %d\n", img, i, best_bbox_idx[i]);
}
}
#endif
#ifdef DEBUG
// make sure all best_bbox_{iou, val} are seen by everyone
__syncthreads();
#endif
// At this point we have the maximum values & indices for both bbox and dbox
/*
best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)
idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
best_dbox_idx[best_bbox_idx[idx]] = idx
*/
for (int i = threadIdx.x; i < N_rows; i += blockDim.x) {
int idx = best_bbox_idx[i];
#ifdef DEBUG
assert(idx < M);
assert(idx >= 0);
#endif
best_dbox_iou[idx] = 2.;
best_dbox_idx[idx] = i;
#ifdef DEBUG
printf("%d - set best dbox_idx[%d] to %d\n", img, best_bbox_idx[i], i);
#endif
}
/**
# filter IoU > 0.5
masks = best_dbox_ious > criteria
labels_out = torch.zeros(self.nboxes, dtype=torch.long)
#print(maxloc.shape, labels_in.shape, labels_out.shape)
labels_out[masks] = labels_in[best_dbox_idx[masks]]
bboxes_out = self.dboxes.clone()
bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]
# Transform format to xywh format
x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \
0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \
-bboxes_out[:, 0] + bboxes_out[:, 2], \
-bboxes_out[:, 1] + bboxes_out[:, 3]
bboxes_out[:, 0] = x
bboxes_out[:, 1] = y
bboxes_out[:, 2] = w
bboxes_out[:, 3] = h
return bboxes_out, labels_out
**/
__syncthreads();
for (int i = threadIdx.x; i < M; i += blockDim.x) {
// offset into output arrays: M values per image
// int output_idx = offsets[img] * M + i;
int output_idx = img * M + i;
// reset output labels to background
// NOTE: bbox_out is already cloned from dbox outside of this kernel
label_out[output_idx] = 0;
// Filter IoU > 0.5
bool mask = best_dbox_iou[i] > criteria;
float4 bbox = bbox_out[output_idx];
// copy some labels and bboxes
if (mask) {
// copy label
#ifdef DEBUG
printf("%d : label: local input idx: %d, value: %d\n", i, best_dbox_idx[i], labels_in[offsets[img] + best_dbox_idx[i]]);
// printf("%d : label: local input idx: %d, value: %d\n", i, best_dbox_idx[i], labels_in[offsets[img] + i]);
#endif
label_out[output_idx] = labels_in[offsets[img] + best_dbox_idx[i]];
// grab original box
bbox = bbox_in[offsets[img] + best_dbox_idx[i]];
#ifdef DEBUG
printf("mask %d : %d : %f %f %f %f\n", i, best_dbox_idx[i], bbox.x, bbox.y, bbox.z, bbox.w);
#endif
}
// transfer to xywh
float4 bbox_tmp;
bbox_tmp.x = 0.5 * (bbox.x + bbox.z);
bbox_tmp.y = 0.5 * (bbox.y + bbox.w);
bbox_tmp.z = bbox.z - bbox.x;
bbox_tmp.w = bbox.w - bbox.y;
// write out
bbox_out[output_idx] = bbox_tmp;
}
}
/**
def encode(self, bboxes_in, labels_in, criteria = 0.5):
ious = calc_iou_tensor(bboxes_in, self.dboxes)
best_dbox_ious, best_dbox_idx = ious.max(dim=0)
best_bbox_ious, best_bbox_idx = ious.max(dim=1)
# set best ious 2.0
best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)
idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
best_dbox_idx[best_bbox_idx[idx]] = idx
# filter IoU > 0.5
masks = best_dbox_ious > criteria
labels_out = torch.zeros(self.nboxes, dtype=torch.long)
#print(maxloc.shape, labels_in.shape, labels_out.shape)
labels_out[masks] = labels_in[best_dbox_idx[masks]]
bboxes_out = self.dboxes.clone()
bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]
# Transform format to xywh format
x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \
0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \
-bboxes_out[:, 0] + bboxes_out[:, 2], \
-bboxes_out[:, 1] + bboxes_out[:, 3]
bboxes_out[:, 0] = x
bboxes_out[:, 1] = y
bboxes_out[:, 2] = w
bboxes_out[:, 3] = h
return bboxes_out, labels_out
**/
std::vector<at::Tensor> box_encoder(const int N_img,
const at::Tensor& bbox_input,
const at::Tensor& bbox_offsets,
const at::Tensor& labels_input,
const at::Tensor& dbox,
float criteria) {
// Check everything is on the device
AT_ASSERTM(bbox_input.type().is_cuda(), "bboxes must be a CUDA tensor");
AT_ASSERTM(bbox_offsets.type().is_cuda(), "bbox offsets must be a CUDA tensor");
AT_ASSERTM(labels_input.type().is_cuda(), "labels must be a CUDA tensor");
AT_ASSERTM(dbox.type().is_cuda(), "dboxes must be a CUDA tensor");
// Check at least offsets, bboxes and labels are consistent
// Note: offsets is N+1 vs. N for labels
AT_ASSERTM(N_img + 1 == bbox_offsets.numel(), "must have N_img+1 offsets");
auto num_bbox_total = bbox_offsets[bbox_offsets.numel()-1].item<int>();
#ifdef DEBUG
printf("%d : bboxes: %d\n", (int)bbox_offsets.numel(), num_bbox_total);
#endif
AT_ASSERTM(num_bbox_total <= 2048, "total num bboxes must be <= 2048");
AT_ASSERTM(bbox_input.size(0) == labels_input.size(0), "bbox and labels must have same leading dimension");
const int N = bbox_input.size(0);
const int M = dbox.size(0);
auto stream = at::cuda::getCurrentCUDAStream();
// allocate final outputs (known size)
#ifdef DEBUG
printf("%d x %d\n", N_img * M, 4);
// at::Tensor bbox_out = dbox.type().tensor({N_img * M, 4});
printf("allocating %lu bytes for output labels\n", N_img*M*sizeof(long));
#endif
at::Tensor labels_out = at::empty({N_img * M}, labels_input.options());
THCudaCheck(cudaGetLastError());
// copy default boxes to outputs
#ifdef DEBUG
printf("allocating %lu bytes for output bboxes\n", N_img*M*4*sizeof(float));
#endif
at::Tensor bbox_out = dbox.repeat({N_img, 1});
THCudaCheck(cudaGetLastError());
// need to allocate some workspace
#ifdef DEBUG
printf("allocating %lu bytes for workspace\n", 8*M*N_img);
#endif
// at::Tensor workspace = at::CUDA(at::kByte).zeros({8 * M * N_img});
at::Tensor workspace = at::zeros({8 * M * N_img}, at::CUDA(at::kByte));
THCudaCheck(cudaGetLastError());
// Encode the inputs
const int THREADS_PER_BLOCK = 256;
encode<THREADS_PER_BLOCK, 256><<<N_img, THREADS_PER_BLOCK, 0, stream.stream()>>>(N_img,
(float4*)bbox_input.data<float>(),
labels_input.data<long>(),
bbox_offsets.data<int>(),
M,
(float4*)dbox.data<float>(),
criteria,
workspace.data<uint8_t>(),
(float4*)bbox_out.data<float>(),
labels_out.data<long>());
THCudaCheck(cudaGetLastError());
return {bbox_out, labels_out};
}
at::Tensor calc_ious(const int N_img,
const at::Tensor& boxes1,
const at::Tensor& boxes1_offsets,
const at::Tensor& boxes2) {
const int N = boxes1.size(0);
const int M = boxes2.size(0);
auto stream = at::cuda::getCurrentCUDAStream();
// at::Tensor ious = at::CUDA(at::kFloat).zeros({N, M});
// at::Tensor ious = at::ones(at::CUDA(at::kFloat), {N, M});
at::Tensor ious = at::empty({N, M}, boxes1.options());
// Get IoU of all source x default box pairs
calc_ious_kernel<<<N_img, 256, 0, stream.stream()>>>(
N_img,
(float4*)boxes1.data<float>(),
boxes1_offsets.data<int>(),
M,
(float4*)boxes2.data<float>(),
ious.data<float>());
THCudaCheck(cudaGetLastError());
return ious;
}
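For orientation, here is a minimal Python-side sketch of how these kernels are driven once the extension is built. The module name (`SSD._C`) and the call pattern match the benchmark code added later in this PR; the tensor shapes and values below are illustrative placeholders, not valid boxes.

import torch
from SSD import _C as C

N_img = 2                                                      # images in the batch
dboxes = torch.rand(8732, 4).cuda()                            # [M, 4] default boxes, ltrb scaled to [0, 1]
bboxes = torch.rand(5, 4).cuda()                               # [N_total, 4] ground-truth boxes, ltrb scaled to [0, 1]
offsets = torch.tensor([0, 2, 5], dtype=torch.int32).cuda()    # [N_img + 1] per-image offsets into bboxes
labels = torch.randint(1, 81, (5,), dtype=torch.long).cuda()   # [N_total] class labels

# box_encoder returns xywh boxes of shape [N_img * M, 4] and labels of shape [N_img * M] (0 = background)
bbox_out, label_out = C.box_encoder(N_img, bboxes, offsets, labels, dboxes, 0.5)
# calc_ious returns the [N_total, M] IoU matrix between ground-truth boxes and default boxes
ious = C.calc_ious(N_img, bboxes, offsets, dboxes)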

View file

@ -0,0 +1,81 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
#include <ATen/ATen.h>
namespace py = pybind11;
// Box encoder
std::vector<at::Tensor> box_encoder(const int N_img,
const at::Tensor& bbox_input,
const at::Tensor& bbox_offsets,
const at::Tensor& labels_input,
const at::Tensor& dbox,
const float criteria = 0.5);
std::vector<at::Tensor> random_horiz_flip(
at::Tensor& img,
at::Tensor& bboxes,
const at::Tensor& bbox_offsets,
const float p,
const bool nhwc);
// Fused color jitter application
// ctm [4,4], img [H, W, C]
py::array_t<float> apply_transform(int H, int W, int C, py::array_t<float> img, py::array_t<float> ctm) {
auto img_buf = img.request();
auto ctm_buf = ctm.request();
// printf("H: %d, W: %d, C: %d\n", H, W, C);
py::array_t<float> result{img_buf.size};
auto res_buf = result.request();
float *img_ptr = (float *)img_buf.ptr;
float *ctm_ptr = (float *)ctm_buf.ptr;
float *res_ptr = (float *)res_buf.ptr;
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
float *ptr = &img_ptr[h * W * C + w * C];
float *out_ptr = &res_ptr[h * W * C + w * C];
// manually unroll over C
out_ptr[0] = ctm_ptr[0] * ptr[0] + ctm_ptr[1] * ptr[1] + ctm_ptr[2] * ptr[2] + ctm_ptr[3];
out_ptr[1] = ctm_ptr[4] * ptr[0] + ctm_ptr[5] * ptr[1] + ctm_ptr[6] * ptr[2] + ctm_ptr[7];
out_ptr[2] = ctm_ptr[8] * ptr[0] + ctm_ptr[9] * ptr[1] + ctm_ptr[10] * ptr[2] + ctm_ptr[11];
}
}
result.resize({H, W, C});
return result;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// batched box encoder
m.def("box_encoder", &box_encoder, "box_encoder");
m.def("random_horiz_flip", &random_horiz_flip, "random_horiz_flip");
// Apply fused color jitter
m.def("apply_transform", &apply_transform, "apply_transform");
}
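A small usage sketch for the fused color-jitter entry point above, assuming the extension has been built and is importable as `SSD._C` (the identity matrix here is just a placeholder color transform):

import numpy as np
from SSD import _C as C

H, W, Ch = 300, 300, 3
img = np.random.rand(H, W, Ch).astype(np.float32)    # HWC float image
ctm = np.eye(4, dtype=np.float32)                     # 4x4 color transform matrix; identity = no-op
out = C.apply_transform(H, W, Ch, img, ctm)           # returns an (H, W, C) float array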

View file

@ -0,0 +1,165 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include <THC/THC.h>
#include <cuda.h>
/**
* Each block handles one image (looping over all of its channels and pixels)
**/
template <typename T>
__global__
void HorizFlipImagesAndBoxes(
const int N,
const int C,
const int H,
const int W,
const T* img_in,
float* bboxes,
const int* offsets,
const float p,
const float* flip,
T* img_out,
const bool nhwc) {
// early return if not flipping
if (flip[blockIdx.x] < p) return;
// pointer offset into images
const int img_offset = blockIdx.x * C * H * W;
const T* img = &img_in[img_offset];
T* img_o = &img_out[img_offset];
// flip bboxes
auto bbox_offset_begin = offsets[blockIdx.x];
auto bbox_offset_end = offsets[blockIdx.x + 1];
auto num_bboxes = bbox_offset_end - bbox_offset_begin;
const int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
// bboxes in ltrb format, scaled to [0, 1]
for (int i = thread_idx; i < num_bboxes; i += blockDim.x * blockDim.y) {
float *bbox = &bboxes[(bbox_offset_begin + thread_idx) * 4];
// Could do this inplace, but not register constrained
auto bbox_0 = bbox[0];
auto bbox_2 = bbox[2];
bbox[0] = 1. - bbox_2;
bbox[2] = 1. - bbox_0;
}
if (nhwc) {
// loop over float3 pixels, handle 3 values / thread
for (int h = threadIdx.y; h < H; h += blockDim.y) {
for (int w = threadIdx.x; w < W; w += blockDim.x) {
const T* img_hw = &img[h * W * C + w * C];
T * img_out_hw = &img_o[h * W * C + (W - 1 - w) * C];
for (int c = 0; c < C; ++c) {
img_out_hw[c] = img_hw[c];
}
}
}
} else {
// loop over channels
for (int c = 0; c < C; ++c) {
const T* img_c = &img[c * H * W];
T *img_out_c = &img_o[c * H * W];
// handle tiles of (h, w) at a time
for (int h = threadIdx.y; h < H; h += blockDim.y) {
for (int w = threadIdx.x; w < W; w += blockDim.x) {
const int input_idx = h * W + w;
const int output_idx = h * W + (W - 1 - w);
img_out_c[output_idx] = img_c[input_idx];
}
}
}
}
}
/**
* Take images and their bboxes, randomly flip on horizontal axis
* In/Out: img: NCHW (or NHWC, see the nhwc flag) tensor of N C-channel images of constant (H, W)
* In/Out: bboxes: [N_i, 4] tensor of original bboxes in ltrb format
* In: bbox_offsets: [N] offset values into bboxes
* In: p \in [0, 1): probability of flipping each (img, bbox) pair
* In: nhwc: Tensor in NHWC format
* ----
* Note: allocates a temporary image buffer; bboxes are modified in place and the flipped image copy is returned
*/
std::vector<at::Tensor> random_horiz_flip(
at::Tensor& img,
at::Tensor& bboxes,
const at::Tensor& bbox_offsets,
const float p,
const bool nhwc) {
// dimensions
const int N = img.size(0);
int C, H, W;
if (nhwc) {
C = img.size(3);
H = img.size(1);
W = img.size(2);
} else {
C = img.size(1);
H = img.size(2);
W = img.size(3);
}
assert(img.type().is_cuda());
assert(bboxes.type().is_cuda());
assert(bbox_offsets.type().is_cuda());
// printf("%d %d %d %d\n", N, C, H, W);
// Need temp storage of size img
at::Tensor tmp_img = img.clone();
at::Tensor flip = at::zeros({N}, at::CUDA(at::kFloat)).uniform_(0., 1.);
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
img.type(),
"HorizFlipImagesAndBoxes",
[&] {
HorizFlipImagesAndBoxes<scalar_t><<<N, dim3(16, 16), 0, stream.stream()>>>(
N,
C,
H,
W,
img.data<scalar_t>(),
bboxes.data<float>(),
bbox_offsets.data<int>(),
p,
flip.data<float>(),
tmp_img.data<scalar_t>(),
nhwc);
THCudaCheck(cudaGetLastError());
});
// copy tmp_img -> img
// img = tmp_img;
return {tmp_img, bboxes};
}
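And a matching Python-side sketch for this op, mirroring the call used in the benchmark loop later in this PR (the tensors are placeholders; the last argument selects NHWC layout):

import torch
from SSD import _C as C

img = torch.rand(2, 3, 300, 300).cuda()                        # NCHW batch of 2 images
bboxes = torch.rand(5, 4).cuda()                               # ltrb boxes scaled to [0, 1]
offsets = torch.tensor([0, 2, 5], dtype=torch.int32).cuda()    # per-image offsets into bboxes
flipped_img, flipped_bboxes = C.random_horiz_flip(img, bboxes, offsets, 0.5, False)  # nhwc=False for NCHW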

View file

@ -0,0 +1,8 @@
# Get COCO 2017 data sets
COCO_DIR=${1:-"/coco"}
dir=$(pwd)
mkdir -p "$COCO_DIR"; cd "$COCO_DIR"
curl -O http://images.cocodataset.org/zips/train2017.zip; unzip train2017.zip
curl -O http://images.cocodataset.org/zips/val2017.zip; unzip val2017.zip
curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip; unzip annotations_trainval2017.zip
cd $dir

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP16 on 1 GPU with a batch size of 64
# Usage bash SSD300_FP16_1GPU.sh <path to this repository> <path to dataset> <additional flags>
python $1/main.py --backbone resnet50 --warmup 300 --bs 64 --fp16 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP16 on 4 GPUs using 256 batch size (64 per GPU)
# Usage ./SSD300_FP16_4GPU.sh <path to this repository> <path to dataset> <additional flags>
python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --fp16 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP16 on 8 GPUs using 512 batch size (64 per GPU)
# Usage ./SSD300_FP16_8GPU.sh <path to this repository> <path to dataset> <additional flags>
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --fp16 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script evaluates SSD300 model in FP16 using 32 batch size on 1 GPU
# Usage: ./SSD300_FP16_EVAL.sh <path to this repository> <path to dataset> <path to checkpoint> <additional flags>
python $1/main.py --backbone resnet50 --fp16 --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 inference benchmark in FP16 on 1 GPU with 64 batch size
# Usage bash SSD300_FP16_INFERENCE_BENCHMARK.sh <path to this repository> <path to dataset> <additional flags>
python $1/main.py --backbone resnet50 --mode benchmark-inference --bs 64 --fp16 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP32 on 1 GPU with a batch size of 32
# Usage ./SSD300_FP32_1GPU.sh <path to this repository> <path to dataset> <additional flags>
python $1/main.py --backbone resnet50 --bs 32 --warmup 300 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP32 on 4 GPUs using 128 batch size (32 per GPU)
# Usage ./SSD300_FP32_4GPU.sh <path to this repository> <path to dataset> <additional flags>
python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 training in FP32 on 8 GPUs using 256 batch size (32 per GPU)
# Usage ./SSD300_FP32_8GPU.sh <path to this repository> <path to dataset> <additional flags>
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --data $2 ${@:3}

View file

@ -0,0 +1,4 @@
# This script evaluates SSD300 model in FP32 using 32 batch size on 1 GPU
# Usage: ./SSD300_FP32_EVAL.sh <path to this repository> <path to dataset> <path to checkpoint> <additional flags>
python $1/main.py --backbone resnet50 --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}

View file

@ -0,0 +1,4 @@
# This script launches SSD300 inference benchmark in FP32 on 1 GPU with a batch size of 32
# Usage bash SSD300_FP32_INFERENCE_BENCHMARK.sh <path to this repository> <path to dataset> <additional flags>
python $1/main.py --backbone resnet50 --warmup 300 --mode benchmark-inference --bs 32 --data $2 ${@:3}

Binary file not shown.


Binary file not shown.


View file

@ -0,0 +1,240 @@
import os
import time
from argparse import ArgumentParser
import torch
import numpy as np
from torch.optim.lr_scheduler import MultiStepLR
import torch.utils.data.distributed
from src.model import SSD300, Loss
from src.utils import dboxes300_coco, Encoder
from src.logger import Logger, BenchLogger
from src.evaluate import evaluate
from src.train import train_loop, tencent_trick, load_checkpoint, benchmark_train_loop, benchmark_inference_loop
from src.data import get_train_loader, get_val_dataset, get_val_dataloader, get_coco_ground_truth
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex import amp
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
def generate_mean_std(args):
mean_val = [0.485, 0.456, 0.406]
std_val = [0.229, 0.224, 0.225]
mean = torch.tensor(mean_val).cuda()
std = torch.tensor(std_val).cuda()
view = [1, len(mean_val), 1, 1]
mean = mean.view(*view)
std = std.view(*view)
if args.fp16:
mean = mean.half()
std = std.half()
return mean, std
def make_parser():
parser = ArgumentParser(description="Train Single Shot MultiBox Detector"
" on COCO")
parser.add_argument('--data', '-d', type=str, default='/coco', required=True,
help='path to test and training data files')
parser.add_argument('--epochs', '-e', type=int, default=65,
help='number of epochs for training')
parser.add_argument('--batch-size', '--bs', type=int, default=32,
help='number of examples for each iteration')
parser.add_argument('--eval-batch-size', '--ebs', type=int, default=32,
help='number of examples for each evaluation iteration')
parser.add_argument('--no-cuda', action='store_true',
help='do not use CUDA even if GPUs are available')
parser.add_argument('--seed', '-s', type=int,
help='manually set random seed for torch')
parser.add_argument('--checkpoint', type=str, default=None,
help='path to model checkpoint file')
parser.add_argument('--save', action='store_true',
help='save model checkpoints')
parser.add_argument('--mode', type=str, default='training',
choices=['training', 'evaluation', 'benchmark-training', 'benchmark-inference'])
parser.add_argument('--evaluation', nargs='*', type=int, default=[21, 31, 37, 42, 48, 53, 59, 64],
help='epochs at which to evaluate')
parser.add_argument('--multistep', nargs='*', type=int, default=[43, 54],
help='epochs at which to decay learning rate')
# Hyperparameters
parser.add_argument('--learning-rate', '--lr', type=float, default=2.6e-3,
help='learning rate')
parser.add_argument('--momentum', '-m', type=float, default=0.9,
help='momentum argument for SGD optimizer')
parser.add_argument('--weight-decay', '--wd', type=float, default=0.0005,
help='weight decay argument for SGD optimizer')
parser.add_argument('--profile', type=int, default=None)
parser.add_argument('--warmup', type=int, default=None)
parser.add_argument('--benchmark-iterations', type=int, default=20, metavar='N',
help='Run N iterations while benchmarking (ignored in training and evaluation modes)')
parser.add_argument('--benchmark-warmup', type=int, default=20, metavar='N',
help='Number of warmup iterations for benchmarking')
parser.add_argument('--backbone', type=str, default='resnet50',
choices=['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'])
parser.add_argument('--num-workers', type=int, default=4)
parser.add_argument('--fp16', action='store_true')
parser.add_argument('--amp', action='store_true')
# Distributed
parser.add_argument('--local_rank', default=0, type=int,
help='Used for multi-process training. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
return parser
def train(train_loop_func, logger, args):
if args.amp:
amp_handle = amp.init(enabled=args.fp16)
# Check that GPUs are actually available
use_cuda = not args.no_cuda
# Setup multi-GPU if necessary
args.distributed = False
if 'WORLD_SIZE' in os.environ:
args.distributed = int(os.environ['WORLD_SIZE']) > 1
if args.distributed:
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.N_gpu = torch.distributed.get_world_size()
else:
args.N_gpu = 1
if args.seed is None:
args.seed = np.random.randint(1e4)
if args.distributed:
args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
# Setup data, defaults
dboxes = dboxes300_coco()
encoder = Encoder(dboxes)
cocoGt = get_coco_ground_truth(args)
train_loader = get_train_loader(args, args.seed - 2**31)
val_dataset = get_val_dataset(args)
val_dataloader = get_val_dataloader(val_dataset, args)
ssd300 = SSD300(backbone=args.backbone)
args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
start_epoch = 0
iteration = 0
loss_func = Loss(dboxes)
if use_cuda:
ssd300.cuda()
loss_func.cuda()
if args.fp16 and not args.amp:
ssd300 = network_to_half(ssd300)
if args.distributed:
ssd300 = DDP(ssd300)
optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
momentum=args.momentum, weight_decay=args.weight_decay)
scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
if args.fp16:
if args.amp:
optimizer = amp_handle.wrap_optimizer(optimizer)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)
if args.checkpoint is not None:
if os.path.isfile(args.checkpoint):
load_checkpoint(ssd300, args.checkpoint)
checkpoint = torch.load(args.checkpoint,
map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
start_epoch = checkpoint['epoch']
iteration = checkpoint['iteration']
scheduler.load_state_dict(checkpoint['scheduler'])
ssd300.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
else:
print('Provided checkpoint is not path to a file')
return
inv_map = {v: k for k, v in val_dataset.label_map.items()}
total_time = 0
if args.mode == 'evaluation':
acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
if args.local_rank == 0:
print('Model precision {} mAP'.format(acc))
return
mean, std = generate_mean_std(args)
for epoch in range(start_epoch, args.epochs):
start_epoch_time = time.time()
scheduler.step()
iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader, val_dataloader, encoder, iteration,
logger, args, mean, std)
end_epoch_time = time.time() - start_epoch_time
total_time += end_epoch_time
if args.local_rank == 0:
logger.update_epoch_time(epoch, end_epoch_time)
if epoch in args.evaluation:
acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
if args.local_rank == 0:
logger.update_epoch(epoch, acc)
if args.save and args.local_rank == 0:
print("saving model...")
obj = {'epoch': epoch + 1,
'iteration': iteration,
'optimizer': optimizer.state_dict(),
'scheduler': scheduler.state_dict(),
'label_map': val_dataset.label_info}
if args.distributed:
obj['model'] = ssd300.module.state_dict()
else:
obj['model'] = ssd300.state_dict()
torch.save(obj, './models/epoch_{}.pt'.format(epoch))
train_loader.reset()
print('total training time: {}'.format(total_time))
if __name__ == "__main__":
parser = make_parser()
args = parser.parse_args()
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
torch.backends.cudnn.benchmark = True
if args.mode == 'benchmark-training':
train_loop_func = benchmark_train_loop
logger = BenchLogger('Training benchmark')
args.epochs = 1
elif args.mode == 'benchmark-inference':
train_loop_func = benchmark_inference_loop
logger = BenchLogger('Inference benchmark')
args.epochs = 1
else:
train_loop_func = train_loop
logger = Logger('Training logger', print_freq=1)
train(train_loop_func, logger, args)

View file

@ -0,0 +1,31 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64, 128],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 191.25867003414876
},
"4": {
"images_per_second": 340.9537905548054
},
"8": {
"images_per_second": 517.2612062140391
},
"16": {
"images_per_second": 711.5516679788083
},
"32": {
"images_per_second": 812.9203401838566
},
"64": {
"images_per_second": 951.7432815456556
},
"128": {
"images_per_second": 876.1868813828711
}
}
}
}

View file

@ -0,0 +1,31 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64, 128],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 174.58768325581374
},
"4": {
"images_per_second": 254.24180710755593
},
"8": {
"images_per_second": 308.95847419165545
},
"16": {
"images_per_second": 419.60746029488445
},
"32": {
"images_per_second": 453.81433823995565
},
"64": {
"images_per_second": 592.6385687558369
},
"128": {
"images_per_second": 603.8453409148115
}
}
}
}

View file

@ -0,0 +1,59 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 40.71944999694824
},
"4": {
"images_per_second": 68.22257804870605
},
"8": {
"images_per_second": 121.42024612426758
},
"16": {
"images_per_second": 159.56442260742188
},
"32": {
"images_per_second": 185.69010543823242
}
},
"4": {
"2": {
"images_per_second": 40.75998783111572
},
"4": {
"images_per_second": 75.58991050720215
},
"8": {
"images_per_second": 142.64888381958008
},
"16": {
"images_per_second": 256.07005310058594
},
"32": {
"images_per_second": 300.8989944458008
}
},
"8": {
"2": {
"images_per_second": 61.28578186035156
},
"4": {
"images_per_second": 119.46021270751953
},
"8": {
"images_per_second": 231.7295379638672
},
"16": {
"images_per_second": 430.5494079589844
},
"32": {
"images_per_second": 454.2975769042969
}
}
}
}

View file

@ -0,0 +1,59 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 48.635780334472656
},
"4": {
"images_per_second": 66.06407419840494
},
"8": {
"images_per_second": 83.91736857096353
},
"16": {
"images_per_second": 102.67040761311848
},
"32": {
"images_per_second": 110.02347819010416
}
},
"4": {
"2": {
"images_per_second": 41.199180603027344
},
"4": {
"images_per_second": 79.85076141357422
},
"8": {
"images_per_second": 145.39981587727863
},
"16": {
"images_per_second": 247.95855712890625
},
"32": {
"images_per_second": 341.29132080078125
}
},
"8": {
"2": {
"images_per_second": 63.07561111450195
},
"4": {
"images_per_second": 123.25757344563802
},
"8": {
"images_per_second": 237.3413340250651
},
"16": {
"images_per_second": 376.59598795572913
},
"32": {
"images_per_second": 507.9451497395833
}
}
}
}

View file

@ -0,0 +1,34 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 470.099200788709
},
"2" : {
"images_per_second" : 163.117099093173
},
"32" : {
"images_per_second" : 520.538879400471
},
"4" : {
"images_per_second" : 296.604178917743
},
"8" : {
"images_per_second" : 412.522394180558
}
}
},
"ngpus" : [
1
]
}

View file

@ -0,0 +1,34 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 280.570005994299
},
"2" : {
"images_per_second" : 147.914221468741
},
"32" : {
"images_per_second" : 302.430594818483
},
"4" : {
"images_per_second" : 201.622430560779
},
"8" : {
"images_per_second" : 228.159516872363
}
}
},
"ngpus" : [
1
]
}

View file

@ -0,0 +1,52 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 192.623916625977
},
"2" : {
"images_per_second" : 48.7488899230957
},
"32" : {
"images_per_second" : 204.250648498535
},
"4" : {
"images_per_second" : 95.4697418212891
},
"8" : {
"images_per_second" : 164.66495513916
}
},
"4" : {
"16" : {
"images_per_second" : 701.366027832031
},
"2" : {
"images_per_second" : 154.449935913086
},
"32" : {
"images_per_second" : 771.171325683594
},
"4" : {
"images_per_second" : 300.332641601562
},
"8" : {
"images_per_second" : 550.924163818359
}
}
},
"ngpus" : [
1,
4
]
}

View file

@ -0,0 +1,45 @@
{
"bs" : [
2,
4,
8,
16
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 121.772495269775
},
"2" : {
"images_per_second" : 60.2171878814697
},
"4" : {
"images_per_second" : 90.5315437316895
},
"8" : {
"images_per_second" : 103.113033294678
}
},
"4" : {
"16" : {
"images_per_second" : 472.226806640625
},
"2" : {
"images_per_second" : 184.061141967773
},
"4" : {
"images_per_second" : 324.639801025391
},
"8" : {
"images_per_second" : 391.055908203125
}
}
},
"ngpus" : [
1,
4
]
}

View file

@ -0,0 +1,81 @@
import argparse
import subprocess
from qa.qa_utils import compare_benchmarks, load_json, save_json, OKBLUE, ENDC, FAIL
# parsing
def parse_testscript_args():
parser = argparse.ArgumentParser(description='PyTorch Benchmark Tests')
parser.add_argument('--bs', default=[1], type=int, nargs='+')
parser.add_argument('--ngpus', default=[1], type=int, nargs='+')
parser.add_argument('--benchmark-mode', default='training', choices=['training', 'inference'],
help='benchmark training or inference', required=True)
parser.add_argument('--bench-iterations', type=int, default=20, metavar='N',
help='Run N iterations while benchmarking (ignored in training and evaluation modes)')
parser.add_argument('--bench-warmup', type=int, default=10, metavar='N',
help='Number of warmup iterations for benchmarking')
parser.add_argument('--fp16', action='store_true', help='Run model in mixed precision.')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers')
parser.add_argument('--data', type=str, metavar='<PATH>', required=True,
help='path to the dataset')
parser.add_argument('--results-file', default='experiment_raport.json', type=str,
help='file in which to store the JSON experiment report')
parser.add_argument('--benchmark-file', type=str, metavar='FILE', required=True,
help='path to the file with baselines')
return parser.parse_args()
# job command
command_template = 'python3 {launcher} qa/qa_perf_main.py --bs {bs} --ebs {bs} ' \
'--benchmark-mode {mode} --benchmark-warmup {bw} --benchmark-iterations {bi} {fp16} ' \
'--backbone resnet50 --seed 1 --data {data} --results-file {results_file} --benchmark-file {benchmark_file}'
if __name__ == '__main__':
args = parse_testscript_args()
fp16 = '--fp16' if args.fp16 else ''
# create results json file
# todo: maybe some template json file?
results = {'ngpus': args.ngpus,
'bs': args.bs,
'metric_keys': ['images_per_second'],
'metrics': {}}
for gpu in args.ngpus:
results['metrics'][str(gpu)] = {}
for bs in args.bs:
results['metrics'][str(gpu)][str(bs)] = {'images_per_second': None}
save_json(args.results_file, results)
# run qa_perf_main.py tests one by one
for gpu in args.ngpus:
launcher = '' if gpu == 1 else '-m torch.distributed.launch --nproc_per_node={}'.format(gpu)
for bs in args.bs:
print('#' * 80)
command = command_template.format(launcher=launcher, bs=bs, workers=args.workers, mode=args.benchmark_mode,
bw=args.bench_warmup, bi=args.bench_iterations, fp16=fp16,
data=args.data, results_file=args.results_file,
benchmark_file=args.benchmark_file)
print('Running "{}"'.format(command))
process = subprocess.Popen(command, shell=True)
output, error = process.communicate()
if error is not None:
print(FAIL + 'Program exited with status {}. Data has not been collected'.format(error) + ENDC)
# elif results['metrics'][str(gpu)][str(bs)]['images_per_second'] is None:
# print(WARNING + 'Program did not end successfully. Data has not been collected.' + ENDC)
else:
print(OKBLUE + 'Program ended successfully. Data has been collected.' + ENDC)
results_data = load_json(args.results_file)
benchmark_data = load_json(args.benchmark_file)
exit_code = compare_benchmarks(results_data, benchmark_data, args, 0.16 if args.benchmark_mode == 'inference' else 0.1)
print(exit_code)
exit(exit_code)

View file

@ -0,0 +1 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [8.812795396454991, 5.914838795058071, 6, 5.092440919584583, 4.887887316499735, 4.744666463422983, 4.694560192557922, 4.567333741479565, 4.492525351620137, 6, 4.408311570055099, 4.334232046614567, 6, 4.263646488106407, 4.2514614595596445, 4.2171871953656055, 4.206751160226014, 4.1795772798196715, 4.156515416099515, 6, 4.108870625495911, 4.0985876759066855, 4.075221928967139, 4.080158276849438, 6, 4.033980131669857, 4.037739227952915, 6, 3.99941903534935, 6, 3.9875937877263565, 3.971811039999583, 3.980771179282509, 3.953947089124455, 3.9305202960968018, 3.9366443781873546, 3.9252991879350754, 3.8827156307395367, 3.9388060424005102, 3.88922161618695, 3.8874285418914396, 6, 3.8936942113018453, 3.537499847891029, 3.4058184228089177, 6, 6, 3.3219671837627627, 3.295458280363458, 3.262115957955606, 6, 6, 6, 3.2190717260910433, 3.213117691627236, 3.1739242191397987, 3.1791626058811704, 3.2088054501854177, 3.1719801842385507, 3.187761370792139, 3.1809213312432236, 3.1823803410259397, 3.1752594631311677, 3.1709555600928425, 3.1823559530957817], "val.acc": [0.025120322205631106, 0.06065902615325462, 0.08224594352985645, 0.09868630608427395, 0.11402055039858493, 0.11779455253460233, 0.1232203941357061, 0.13708232144631768, 0.13614397127135028, 0.13289094380937685, 0.14004009449749777, 0.1369843423424096, 0.13877603069457692, 0.15418866425831707, 0.1500001994042602, 0.1542573219664272, 0.14771151227315413, 0.15896497766306272, 0.1600724682809656, 0.15881491661088476, 0.16213217020726906, 0.16466781280171408, 0.15738430149539484, 0.16634155547369375, 0.1623110334880526, 0.16394517553182106, 0.1494171026560053, 0.16762167601953265, 0.16063595691096758, 0.16982898253523193, 0.17321918229909394, 0.17242960413896102, 0.1625123530546557, 0.18330429802960516, 0.16333127233412115, 0.17973452067250242, 0.16699022570278652, 0.17183956548028687, 0.17168756775917593, 0.17547718325478198, 0.1750019046551496, 0.18416070771679066, 0.1711460087987496, 0.231325087097653, 0.23716038401167305, 0.23886896590018106, 0.2403412383214709, 0.24380227870861898, 0.24383605475007317, 0.2449733300818802, 0.24508423152154857, 0.24252172333110344, 0.24566254540226004, 0.24661345705692578, 0.25123807624083877, 0.25184439401895475, 0.2519010236397111, 0.25191664071239706, 0.2522156441636805, 0.25215053241008767, 0.2525434296889651, 0.2524917808636186, 0.2527410425201369, 0.2534121449798447, 0.25279479287831214]}, "bs": [64], "model": "", "ngpus": [8]}

View file

@ -0,0 +1 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [9.887425426832973, 6.30290542835752, 5.566619733535567, 5.192713968618468, 4.943981836976963, 4.777146058311629, 4.682364774062644, 4.566371860462505, 4.479279315107254, 5, 4.398730874582149, 4.31779890601812, 4.293896813580043, 4.250142149529603, 4.219812418175577, 4.21572122303159, 4.187492328960302, 4.147948342119242, 4.134799897931028, 4.131298205737984, 4.071315974647822, 4.074750597299968, 4.0595350983882055, 4.042616275720722, 4.029284068070124, 4.02082926113012, 3.9983501902834298, 4.00984974094874, 3.9730074155799167, 5, 3.9646901324326294, 3.952598022061144, 3.944574903713043, 3.9182081201711596, 3.9252539055836775, 3.907297405092997, 3.8867245969813986, 3.87151758639573, 3.8793927009449254, 3.8687505586699107, 3.8750464156204956, 5, 3.8645522469516402, 3.504709825765618, 3.3920036476251862, 3.318732707260998, 5, 3.295415750237011, 3.2602547589347872, 5, 5, 5, 5, 3.199645553613854, 3.1623374312205086, 5, 3.147109237820821, 3.158245995575684, 3.1465386938319977, 3.1480963979746055, 3.151234711101482, 3.146022343739672, 3.1410668343956294, 3.142435818259893, 3.123337645718104], "val.acc": [0.01106397969239677, 0.04958324872172423, 0.07470961174804201, 0.08412781056028416, 0.1052591997157941, 0.11592629309116805, 0.1275672396324061, 0.12472585915140484, 0.13138377072048255, 0.1262696666605193, 0.13354663690485083, 0.14424123617821044, 0.14059169419863984, 0.14768715602101368, 0.15450788443085858, 0.14792122925940135, 0.1508861356435794, 0.157419558440425, 0.15279118544884585, 0.16075469826863828, 0.14747077091644412, 0.16340857637480236, 0.14427366437395484, 0.15709914018423293, 0.16324391683493303, 0.16440443232887508, 0.16479726175439752, 0.17508843799046686, 0.16142292492169025, 0.1643848499786872, 0.16912610131976924, 0.16376330941842296, 0.16894551721633602, 0.17771765128166106, 0.1749561896689298, 0.1695538322677119, 0.16778561571905298, 0.16380194923909086, 0.16994188486879763, 0.1716953661397215, 0.17755697810460197, 0.17187995479426885, 0.1742018462295355, 0.23426649845846764, 0.23613136034024038, 0.24175797706337981, 0.2425279583355936, 0.24352550398110506, 0.24411115979837528, 0.24656561042490024, 0.24383524308920906, 0.24686666489675338, 0.24814559219197632, 0.24840393696219026, 0.251965847689631, 0.25254138256097747, 0.2523565615073023, 0.2529904738785998, 0.253555154014026, 0.2530651493203877, 0.25358174010109197, 0.2537683728256746, 0.2539384684886946, 0.2540280117408162, 0.2534652864501853]}, "bs": [32], "model": "", "ngpus": [8]}

View file

@ -0,0 +1,20 @@
{
"metrics" : {
"val.acc" : [
0.0100971670737651
],
"train.loss" : [
9.85026645043801
]
},
"ngpus" : [
8
],
"metric_keys" : [
"train.loss",
"val.acc"
],
"bs" : [
64
]
}

View file

@ -0,0 +1,20 @@
{
"bs" : [
32
],
"metrics" : {
"train.loss" : [
8.79916159380589
],
"val.acc" : [
0.0238952010105531
]
},
"metric_keys" : [
"train.loss",
"val.acc"
],
"ngpus" : [
8
]
}

View file

@ -0,0 +1,73 @@
# core imports
import os
import numpy as np
# pytorch imports
import torch
import torch.utils.data.distributed
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
# project imports
from src.train import train_loop
from main import train, make_parser
from src.logger import Logger
from qa.qa_utils import load_json, create_json_file, compare_acc, save_json
RESULT = None
def add_benchmark_args(parser):
parser.add_argument('--benchmark-mode', type=str, default='epoch-accuracy',
choices=['full-accuracy', 'epoch-accuracy'], required=True)
parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
help='path to the file with baselines', required=True)
return parser
def main(args):
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
torch.backends.cudnn.benchmark = True
if args.benchmark_mode == 'epoch-accuracy':
args.epochs = 1
train_loop_func = train_loop
logger = Logger('Accuracy test', print_freq=10)
args.evaluation = list(range(90))
train(train_loop_func, logger, args)
exit_code = 0
if args.local_rank == 0:
train_loss_results, val_acc_results, train_time_results = logger.print_results()
print(train_time_results)
print(train_loss_results)
print(val_acc_results)
measured_results = create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=args.batch_size)
save_json('/results/results.json', measured_results)
print(measured_results)
benchmark_results = load_json(args.benchmark_file)
exit_code = compare_acc(measured_results, benchmark_results, args)
exit(exit_code)
if __name__ == "__main__":
parser = make_parser()
parser = add_benchmark_args(parser)
args = parser.parse_args()
print(args)
main(args)

View file

@ -0,0 +1,199 @@
# core imports
import os
import numpy as np
import json
from pprint import pprint
import time
# pytorch imports
import torch
import torch.utils.data.distributed
from torch.autograd import Variable
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
# project imports
from main import train, make_parser
from src.logger import BenchLogger
# from src.train import benchmark_inference_loop, benchmark_train_loop
from SSD import _C as C
RESULT = None
def add_benchmark_args(parser):
parser.add_argument('--benchmark-mode', type=str, choices=['training', 'inference'],
default='inference', required=True)
parser.add_argument('--results-file', default='experiment_raport.json', type=str,
help='file in which to store the JSON experiment report')
parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
help='path to the file with baselines')
return parser
def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
start_time = None
# tensor for results
result = torch.zeros((1,)).cuda()
for i, data in enumerate(loop(train_dataloader)):
if i >= args.benchmark_warmup:
start_time = time.time()
img = data[0][0][0]
bbox = data[0][1][0]
label = data[0][2][0]
label = label.type(torch.cuda.LongTensor)
bbox_offsets = data[0][3][0]
# handle random flipping outside of DALI for now
bbox_offsets = bbox_offsets.cuda()
img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)
if not args.no_cuda:
img = img.cuda()
bbox = bbox.cuda()
label = label.cuda()
bbox_offsets = bbox_offsets.cuda()
img.sub_(mean).div_(std)
N = img.shape[0]
if bbox_offsets[-1].item() == 0:
print("No labels in batch")
continue
bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)
M = bbox.shape[0] // N
bbox = bbox.view(N, M, 4)
label = label.view(N, M)
ploc, plabel = model(img)
ploc, plabel = ploc.float(), plabel.float()
trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
if not args.no_cuda:
label = label.cuda()
gloc = Variable(trans_bbox, requires_grad=False)
glabel = Variable(label, requires_grad=False)
loss = loss_func(ploc, plabel, gloc, glabel)
# loss scaling
if args.fp16:
if args.amp:
with optim.scale_loss(loss) as scale_loss:
scale_loss.backward()
else:
optim.backward(loss)
else:
loss.backward()
optim.step()
optim.zero_grad()
iteration += 1
# reduce all results from every gpu
if i >= args.benchmark_warmup + args.benchmark_iterations:
result.data[0] = logger.print_result()
if args.N_gpu > 1:
torch.distributed.reduce(result, 0)
if args.local_rank == 0:
global RESULT
RESULT = float(result.data[0])
return
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
def loop(dataloader):
while True:
for data in dataloader:
yield data
def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu'
start_time = None
model.eval()
i=-1
dataloader = loop(val_dataloader)
while True:
i+=1
with torch.no_grad():
torch.cuda.synchronize()
if i >= args.benchmark_warmup:
start_time = time.time()
data = next(dataloader)
img = data[0]
if not args.no_cuda:
img = img.cuda()
if args.fp16:
img = img.half()
img.sub_(mean).div_(std)
img = Variable(img, requires_grad=False)
_ = model(img)
torch.cuda.synchronize()
if i >= args.benchmark_warmup + args.benchmark_iterations:
global RESULT
RESULT = logger.print_result()
return
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
def main(args):
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
torch.backends.cudnn.benchmark = True
if args.benchmark_mode == 'training':
train_loop_func = benchmark_train_loop
logger = BenchLogger('Training benchmark')
else:
train_loop_func = benchmark_inference_loop
logger = BenchLogger('Inference benchmark')
args.epochs = 1
train(train_loop_func, logger, args)
if args.local_rank == 0:
global RESULT
with open(args.results_file) as f:
results = json.load(f)
results['metrics'][str(args.N_gpu)][str(args.batch_size)] = {'images_per_second': RESULT}
pprint(results)
with open(args.results_file, 'w') as f:
json.dump(results, f)
if __name__ == "__main__":
parser = make_parser()
parser = add_benchmark_args(parser)
args = parser.parse_args()
print(args)
main(args)

View file

@ -0,0 +1,115 @@
import json
# terminal stdout colors
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
# load results and benchmark
def load_json(filepath):
with open(filepath) as f:
data = json.load(f)
return data
def save_json(filepath, data):
with open(filepath, 'w') as f:
json.dump(data, f)
# compare func
def compare(measured_value, true_value, pmargin=0.1):
assert 0 < pmargin < 1, 'Margin should be in range (0, 1)'
return (1 - pmargin) * true_value < measured_value
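# Worked example of the margin check above (illustrative numbers, not real baselines):
# with pmargin=0.1, a measured throughput passes as long as it exceeds 90% of the baseline.
#   compare(95.0, 100.0, pmargin=0.1)  # True:  95.0 > 0.9 * 100.0
#   compare(85.0, 100.0, pmargin=0.1)  # False: 85.0 < 0.9 * 100.0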
# compare 2 benchmark json files
def compare_benchmarks(results, benchmark, args, pmargin=0.1):
# sanity check
for metric in results['metric_keys']:
if metric not in benchmark['metric_keys']:
assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)
assert len(args.bs) <= len(benchmark['bs']), 'len(args.bs) <= len(benchmark["bs"]) ({} <= {})'.format(len(args.bs), len(benchmark['bs']))
assert len(args.bs) == len(results['bs']), 'len(args.bs) == len(results["bs"]) ({} == {})'.format(len(args.bs), len(results['bs']))
for bs in results['bs']:
if bs not in benchmark['bs']:
assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)
assert len(args.ngpus) <= len(benchmark['ngpus']), 'len(args.ngpus) <= len(benchmark["ngpus"]) ({} <= {})'.format(len(args.ngpus), len(benchmark['ngpus']))
assert len(args.ngpus) == len(results['ngpus']), 'len(args.ngpus) == len(results["ngpus"]) ({} == {})'.format(len(args.ngpus), len(results['ngpus']))
for gpu in results['ngpus']:
if gpu not in benchmark['ngpus']:
assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)
# compare measured numbers with benchmark
exit = 0
for metric in results['metric_keys']:
for gpu in results['ngpus']:
for bs in results['bs']:
measured_metric = results['metrics'][str(gpu)][str(bs)][metric]
ground_truth_metric = benchmark['metrics'][str(gpu)][str(bs)][metric]
ok = compare(measured_metric, ground_truth_metric, pmargin)
if ok:
print(OKGREEN + 'BENCHMARK PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
else:
print(FAIL + 'BENCHMARK NOT PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
exit = 1
return exit
# compare 2 benchmark json files
def compare_acc(results, benchmark, args):
# sanity check
for metric in results['metric_keys']:
if metric not in benchmark['metric_keys']:
assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)
for bs in results['bs']:
if bs not in benchmark['bs']:
assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)
for gpu in results['ngpus']:
if gpu not in benchmark['ngpus']:
assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)
# compare measured numbers with benchmark
for i, (result, ground_truth) in enumerate(zip(results['metrics']['val.acc'], benchmark['metrics']['val.acc'])):
if i > 43: # before the first LR decay accuracy tends to vary by more than 15% (around the ~30th epoch), so only later epochs are compared
if ground_truth * 0.9 > result:
print(FAIL + 'ACCURACY TEST NOT PASSED' + ENDC)
return 1
# compare measured numbers with benchmark
for i, (result, ground_truth) in enumerate(zip(results['metrics']['train.loss'], benchmark['metrics']['train.loss'])):
if i > 43:
if ground_truth * 1.1 < result:
print(FAIL + 'LOSS TEST NOT PASSED' + ENDC)
return 1
print(OKGREEN + 'ACCURACY TEST PASSED' + ENDC)
return 0
def create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=32):
results = {"ngpus": [ngpus],
"bs": [bs],
"metric_keys": ["train.loss", "val.acc"],
"metrics": {
"train.loss": [],
"val.acc": []
}
}
for i, ((epoch1, acc), (epoch2, loss)) in enumerate(zip(val_acc_results, train_loss_results)):
assert i == epoch1 == epoch2
results['metrics']['train.loss'].append(loss)
results['metrics']['val.acc'].append(acc)
return results
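For reference, a minimal sketch of how these helpers fit together in the accuracy check (the numbers are made up and the baseline path is only an example taken from the QA scripts; note that compare_acc does not actually use its args parameter):

val_acc = [(0, 0.10), (1, 0.12)]          # (epoch, value) pairs as returned by the logger
train_loss = [(0, 5.2), (1, 4.8)]
measured = create_json_file(val_acc, train_loss, ngpus=8, bs=32)
save_json('/results/results.json', measured)
baseline = load_json('qa/curve_baselines/SSD300_pytorch_19.01_fp32_1epoch_run_acc_baseline.json')
exit_code = compare_acc(measured, baseline, args=None)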

View file

@ -0,0 +1,4 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp16_1epoch_run_acc_baseline.json --data $1

View file

@ -0,0 +1,4 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp16_full_run_acc_baseline.json --data $1

View file

@ -0,0 +1,4 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp32_1epoch_run_acc_baseline.json --data $1

View file

@ -0,0 +1,4 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp32_full_run_acc_baseline.json --data $1

View file

@ -0,0 +1,3 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_inference_fp16.json --data $1

View file

@ -0,0 +1,3 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_inference_fp32.json --data $1

View file

@ -0,0 +1,3 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp16.json --data $1

View file

@ -0,0 +1,3 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp32.json --data $1

View file

@ -0,0 +1 @@
Cython==0.28.4

View file

@ -0,0 +1,89 @@
#!/usr/bin/env python
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
requirements = ["torch", "torchvision"]
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "csrc")
source_cpu = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"))
print('c++: ', source_cpu)
print('cuda: ', source_cuda)
sources = source_cpu
extension = CppExtension
define_macros = []
if CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
extra_compile_flags= {'cxx' : []}
extra_compile_flags['nvcc'] = ['-DCUDA_HAS_FP16=1','-D__CUDA_NO_HALF_OPERATORS__','-D__CUDA_NO_HALF_CONVERSIONS__','-D__CUDA_NO_HALF2_OPERATORS__']
gencodes = [
#'-gencode', 'arch=compute_50,code=sm_50',
#'-gencode', 'arch=compute_52,code=sm_52',
#'-gencode', 'arch=compute_60,code=sm_60',
#'-gencode', 'arch=compute_61,code=sm_61',
'-gencode', 'arch=compute_70,code=sm_70',
'-gencode', 'arch=compute_70,code=compute_70',]
extra_compile_flags['nvcc'] += gencodes
ext_modules = [
extension(
"SSD._C",
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_flags,
)
]
return ext_modules
setup(
name="SSD",
version="0.1",
author="slayton",
url="",
description="SSD in pytorch",
packages=find_packages(exclude=("configs", "examples", "test",)),
# install_requires=requirements,
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
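For a quick smoke test of the extension defined above (standard setuptools/pip usage, not a script from this repository), something along these lines should work once the build succeeds:

# Build from the directory containing setup.py, e.g. `pip install .` or `python setup.py install`,
# then verify the compiled module exposes the bindings added in this PR:
from SSD import _C
print(sorted(n for n in dir(_C) if not n.startswith('_')))   # expect apply_transform, box_encoder, random_horiz_flip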

View file

433
PyTorch/Detection/SSD/src/coco.py Executable file
View file

@ -0,0 +1,433 @@
__author__ = 'tylin'
__version__ = '2.0'
# Interface for accessing the Microsoft COCO dataset.
# Microsoft COCO is a large image dataset designed for object detection,
# segmentation, and caption generation. pycocotools is a Python API that
# assists in loading, parsing and visualizing the annotations in COCO.
# Please visit http://mscoco.org/ for more information on COCO, including
# for the data, paper, and tutorials. The exact format of the annotations
# is also described on the COCO website. For example usage of the pycocotools
# please see pycocotools_demo.ipynb. In addition to this API, please download both
# the COCO images and annotations in order to run the demo.
# An alternative to using the API is to load the annotations directly
# into a Python dictionary.
# Using the API provides additional utility functions. Note that this API
# supports both *instance* and *caption* annotations. In the case of
# captions not all functions are defined (e.g. categories are undefined).
# The following API functions are defined:
# COCO - COCO api class that loads COCO annotation file and prepare data structures.
# decodeMask - Decode binary mask M encoded via run-length encoding.
# encodeMask - Encode binary mask M using run-length encoding.
# getAnnIds - Get ann ids that satisfy given filter conditions.
# getCatIds - Get cat ids that satisfy given filter conditions.
# getImgIds - Get img ids that satisfy given filter conditions.
# loadAnns - Load anns with the specified ids.
# loadCats - Load cats with the specified ids.
# loadImgs - Load imgs with the specified ids.
# annToMask - Convert segmentation in an annotation to binary mask.
# showAnns - Display the specified annotations.
# loadRes - Load algorithm results and create API for accessing them.
# download - Download COCO images from mscoco.org server.
# Throughout the API "ann"=annotation, "cat"=category, and "img"=image.
# Help on each function can be accessed by: "help COCO>function".
# See also COCO>decodeMask,
# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds,
# COCO>getImgIds, COCO>loadAnns, COCO>loadCats,
# COCO>loadImgs, COCO>annToMask, COCO>showAnns
# Microsoft COCO Toolbox. version 2.0
# Data, paper, and tutorials available at: http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
# Licensed under the Simplified BSD License [see bsd.txt]
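# A minimal usage sketch, following the API summary above (the annotation path is a placeholder):
#   coco = COCO('annotations/instances_val2017.json')
#   img_ids = coco.getImgIds()
#   ann_ids = coco.getAnnIds(imgIds=img_ids[:1], iscrowd=None)
#   anns = coco.loadAnns(ann_ids)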
import json
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon
import numpy as np
import copy
import itertools
from pycocotools import mask as maskUtils
import os
from collections import defaultdict
import sys
PYTHON_VERSION = sys.version_info[0]
if PYTHON_VERSION == 2:
from urllib import urlretrieve
elif PYTHON_VERSION == 3:
from urllib.request import urlretrieve
def _isArrayLike(obj):
return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
class COCO:
def __init__(self, annotation_file=None):
"""
Constructor of Microsoft COCO helper class for reading and visualizing annotations.
:param annotation_file (str): location of annotation file
:param image_folder (str): location to the folder that hosts images.
:return:
"""
# load dataset
self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict()
self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
if not annotation_file == None:
print('loading annotations into memory...')
tic = time.time()
dataset = json.load(open(annotation_file, 'r'))
assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
print('Done (t={:0.2f}s)'.format(time.time()- tic))
self.dataset = dataset
self.createIndex()
def createIndex(self):
# create index
print('creating index...')
anns, cats, imgs = {}, {}, {}
imgToAnns,catToImgs = defaultdict(list),defaultdict(list)
if 'annotations' in self.dataset:
for ann in self.dataset['annotations']:
imgToAnns[ann['image_id']].append(ann)
anns[ann['id']] = ann
if 'images' in self.dataset:
for img in self.dataset['images']:
imgs[img['id']] = img
if 'categories' in self.dataset:
for cat in self.dataset['categories']:
cats[cat['id']] = cat
if 'annotations' in self.dataset and 'categories' in self.dataset:
for ann in self.dataset['annotations']:
catToImgs[ann['category_id']].append(ann['image_id'])
print('index created!')
# create class members
self.anns = anns
self.imgToAnns = imgToAnns
self.catToImgs = catToImgs
self.imgs = imgs
self.cats = cats
def info(self):
"""
Print information about the annotation file.
:return:
"""
for key, value in self.dataset['info'].items():
print('{}: {}'.format(key, value))
def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
"""
Get ann ids that satisfy given filter conditions. default skips that filter
:param imgIds (int array) : get anns for given imgs
catIds (int array) : get anns for given cats
areaRng (float array) : get anns for given area range (e.g. [0 inf])
iscrowd (boolean) : get anns for given crowd label (False or True)
:return: ids (int array) : integer array of ann ids
"""
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
catIds = catIds if _isArrayLike(catIds) else [catIds]
if len(imgIds) == len(catIds) == len(areaRng) == 0:
anns = self.dataset['annotations']
else:
if not len(imgIds) == 0:
lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns]
anns = list(itertools.chain.from_iterable(lists))
else:
anns = self.dataset['annotations']
anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds]
anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]]
if not iscrowd == None:
ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
else:
ids = [ann['id'] for ann in anns]
return ids
def getCatIds(self, catNms=[], supNms=[], catIds=[]):
"""
filtering parameters. default skips that filter.
:param catNms (str array) : get cats for given cat names
:param supNms (str array) : get cats for given supercategory names
:param catIds (int array) : get cats for given cat ids
:return: ids (int array) : integer array of cat ids
"""
catNms = catNms if _isArrayLike(catNms) else [catNms]
supNms = supNms if _isArrayLike(supNms) else [supNms]
catIds = catIds if _isArrayLike(catIds) else [catIds]
if len(catNms) == len(supNms) == len(catIds) == 0:
cats = self.dataset['categories']
else:
cats = self.dataset['categories']
cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms]
cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds]
ids = [cat['id'] for cat in cats]
return ids
def getImgIds(self, imgIds=[], catIds=[]):
'''
Get img ids that satisfy given filter conditions.
:param imgIds (int array) : get imgs for given ids
:param catIds (int array) : get imgs with all given cats
:return: ids (int array) : integer array of img ids
'''
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
catIds = catIds if _isArrayLike(catIds) else [catIds]
if len(imgIds) == len(catIds) == 0:
ids = self.imgs.keys()
else:
ids = set(imgIds)
for i, catId in enumerate(catIds):
if i == 0 and len(ids) == 0:
ids = set(self.catToImgs[catId])
else:
ids &= set(self.catToImgs[catId])
return list(ids)
def loadAnns(self, ids=[]):
"""
Load anns with the specified ids.
:param ids (int array) : integer ids specifying anns
:return: anns (object array) : loaded ann objects
"""
if _isArrayLike(ids):
return [self.anns[id] for id in ids]
elif type(ids) == int:
return [self.anns[ids]]
def loadCats(self, ids=[]):
"""
Load cats with the specified ids.
:param ids (int array) : integer ids specifying cats
:return: cats (object array) : loaded cat objects
"""
if _isArrayLike(ids):
return [self.cats[id] for id in ids]
elif type(ids) == int:
return [self.cats[ids]]
def loadImgs(self, ids=[]):
"""
Load imgs with the specified ids.
:param ids (int array) : integer ids specifying img
:return: imgs (object array) : loaded img objects
"""
if _isArrayLike(ids):
return [self.imgs[id] for id in ids]
elif type(ids) == int:
return [self.imgs[ids]]
def showAnns(self, anns):
"""
Display the specified annotations.
:param anns (array of object): annotations to display
:return: None
"""
if len(anns) == 0:
return 0
if 'segmentation' in anns[0] or 'keypoints' in anns[0]:
datasetType = 'instances'
elif 'caption' in anns[0]:
datasetType = 'captions'
else:
raise Exception('datasetType not supported')
if datasetType == 'instances':
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in anns:
c = (np.random.random((1, 3))*0.6+0.4).tolist()[0]
if 'segmentation' in ann:
if type(ann['segmentation']) == list:
# polygon
for seg in ann['segmentation']:
poly = np.array(seg).reshape((int(len(seg)/2), 2))
polygons.append(Polygon(poly))
color.append(c)
else:
# mask
t = self.imgs[ann['image_id']]
if type(ann['segmentation']['counts']) == list:
rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width'])
else:
rle = [ann['segmentation']]
m = maskUtils.decode(rle)
img = np.ones( (m.shape[0], m.shape[1], 3) )
if ann['iscrowd'] == 1:
color_mask = np.array([2.0,166.0,101.0])/255
if ann['iscrowd'] == 0:
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack( (img, m*0.5) ))
if 'keypoints' in ann and type(ann['keypoints']) == list:
# turn skeleton into zero-based index
sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1
kp = np.array(ann['keypoints'])
x = kp[0::3]
y = kp[1::3]
v = kp[2::3]
for sk in sks:
if np.all(v[sk]>0):
plt.plot(x[sk],y[sk], linewidth=3, color=c)
plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2)
plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2)
p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4)
ax.add_collection(p)
p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2)
ax.add_collection(p)
elif datasetType == 'captions':
for ann in anns:
print(ann['caption'])
def loadRes(self, resFile):
"""
Load result file and return a result api object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = COCO()
res.dataset['images'] = [img for img in self.dataset['images']]
print('Loading and preparing results...')
tic = time.time()
if type(resFile) == str: #or type(resFile) == unicode:
anns = json.load(open(resFile))
elif type(resFile) == np.ndarray:
anns = self.loadNumpyAnnotations(resFile)
else:
anns = resFile
assert type(anns) == list, 'results is not an array of objects'
annsImgIds = [ann['image_id'] for ann in anns]
assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
'Results do not correspond to current coco set'
if 'caption' in anns[0]:
imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
for id, ann in enumerate(anns):
ann['id'] = id+1
elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
bb = ann['bbox']
x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]]
if not 'segmentation' in ann:
ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
ann['area'] = bb[2]*bb[3]
ann['id'] = id+1
ann['iscrowd'] = 0
elif 'segmentation' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
# now only support compressed RLE format as segmentation results
ann['area'] = maskUtils.area(ann['segmentation'])
if not 'bbox' in ann:
ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
ann['id'] = id+1
ann['iscrowd'] = 0
elif 'keypoints' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
s = ann['keypoints']
x = s[0::3]
y = s[1::3]
x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y)
ann['area'] = (x1-x0)*(y1-y0)
ann['id'] = id + 1
ann['bbox'] = [x0,y0,x1-x0,y1-y0]
print('DONE (t={:0.2f}s)'.format(time.time()- tic))
res.dataset['annotations'] = anns
res.createIndex()
return res
def download(self, tarDir = None, imgIds = [] ):
'''
Download COCO images from mscoco.org server.
:param tarDir (str): COCO results directory name
imgIds (list): images to be downloaded
:return:
'''
if tarDir is None:
print('Please specify target directory')
return -1
if len(imgIds) == 0:
imgs = self.imgs.values()
else:
imgs = self.loadImgs(imgIds)
N = len(imgs)
if not os.path.exists(tarDir):
os.makedirs(tarDir)
for i, img in enumerate(imgs):
tic = time.time()
fname = os.path.join(tarDir, img['file_name'])
if not os.path.exists(fname):
urlretrieve(img['coco_url'], fname)
print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
def loadNumpyAnnotations(self, data):
"""
Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class}
:param data (numpy.ndarray)
:return: annotations (python nested list)
"""
print('Converting ndarray to lists...')
assert(type(data) == np.ndarray)
print(data.shape)
assert(data.shape[1] == 7)
N = data.shape[0]
ann = []
for i in range(N):
if i % 1000000 == 0:
print('{}/{}'.format(i,N))
ann += [{
'image_id' : int(data[i, 0]),
'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
'score' : data[i, 5],
'category_id': int(data[i, 6]),
}]
return ann
def annToRLE(self, ann):
"""
Convert annotation which can be polygons or uncompressed RLE to RLE.
:return: RLE (run-length-encoded mask)
"""
t = self.imgs[ann['image_id']]
h, w = t['height'], t['width']
segm = ann['segmentation']
if type(segm) == list:
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = maskUtils.frPyObjects(segm, h, w)
rle = maskUtils.merge(rles)
elif type(segm['counts']) == list:
# uncompressed RLE
rle = maskUtils.frPyObjects(segm, h, w)
else:
# rle
rle = ann['segmentation']
return rle
def annToMask(self, ann):
"""
Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
:return: binary mask (numpy 2D array)
"""
rle = self.annToRLE(ann)
m = maskUtils.decode(rle)
return m
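
For reference, a minimal usage sketch of the COCO helper defined above; the annotation path is a placeholder and must point to a real COCO-format JSON file.

```
from src.coco import COCO   # the module shown above; pycocotools.coco.COCO behaves the same way

coco = COCO("annotations/instances_val2017.json")    # placeholder path
cat_ids = coco.getCatIds(catNms=["person"])          # ids of the "person" category
img_ids = coco.getImgIds(catIds=cat_ids)             # images containing that category
ann_ids = coco.getAnnIds(imgIds=img_ids[:1], catIds=cat_ids, iscrowd=None)
anns = coco.loadAnns(ann_ids)
print("annotations in first image:", len(anns))
mask = coco.annToMask(anns[0])                       # binary numpy mask for the first annotation
```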

View file

@ -0,0 +1,267 @@
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import ctypes
import logging
import numpy as np
# DALI imports
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
import time
class COCOPipeline(Pipeline):
def __init__(self, batch_size, device_id, file_root, annotations_file, num_gpus,
output_fp16=False, output_nhwc=False, pad_output=False, num_threads=1, seed=15):
super(COCOPipeline, self).__init__(batch_size=batch_size, device_id=device_id,
num_threads=num_threads, seed = seed)
try:
shard_id = torch.distributed.get_rank()
except RuntimeError:
shard_id = 0
self.input = ops.COCOReader(file_root = file_root, annotations_file = annotations_file,
shard_id = shard_id, num_shards = num_gpus, ratio=True, ltrb=True, random_shuffle=True,
skip_empty=True)
self.decode = ops.HostDecoder(device = "cpu", output_type = types.RGB)
# Augmentation techniques
self.crop = ops.SSDRandomCrop(device="cpu", num_attempts=1)
self.twist = ops.ColorTwist(device="gpu")
self.resize = ops.Resize(device = "gpu", resize_x = 300, resize_y = 300)
output_dtype = types.FLOAT16 if output_fp16 else types.FLOAT
output_layout = types.NHWC if output_nhwc else types.NCHW
self.normalize = ops.CropMirrorNormalize(device="gpu", crop=(300, 300),
mean=[0.0, 0.0, 0.0],
std=[255.0, 255.0, 255.0],
mirror=0,
output_dtype=output_dtype,
output_layout=output_layout,
pad_output=pad_output)
# Random variables
self.rng1 = ops.Uniform(range=[0.5, 1.5])
self.rng2 = ops.Uniform(range=[0.875, 1.125])
self.rng3 = ops.Uniform(range=[-0.5, 0.5])
def define_graph(self):
saturation = self.rng1()
contrast = self.rng1()
brightness = self.rng2()
hue = self.rng3()
inputs, bboxes, labels = self.input()
images = self.decode(inputs)
images, bboxes, labels = self.crop(images, bboxes, labels)
images = self.resize(images.gpu())
images = self.twist(images.gpu(), saturation=saturation, contrast=contrast, brightness=brightness, hue=hue)
images = self.normalize(images)
# bboxes and images and labels on GPU
return (images, bboxes.gpu(), labels.gpu())
to_torch_type = {
np.dtype(np.float32) : torch.float32,
np.dtype(np.float64) : torch.float64,
np.dtype(np.float16) : torch.float16,
np.dtype(np.uint8) : torch.uint8,
np.dtype(np.int8) : torch.int8,
np.dtype(np.int16) : torch.int16,
np.dtype(np.int32) : torch.int32,
np.dtype(np.int64) : torch.int64
}
def feed_ndarray(dali_tensor, arr):
"""
Copy contents of DALI tensor to pyTorch's Tensor.
Parameters
----------
`dali_tensor` : nvidia.dali.backend.TensorCPU or nvidia.dali.backend.TensorGPU
Tensor from which to copy
`arr` : torch.Tensor
Destination of the copy
"""
assert dali_tensor.shape() == list(arr.size()), \
("Shapes do not match: DALI tensor has size {0}"
", but PyTorch Tensor has size {1}".format(dali_tensor.shape(), list(arr.size())))
#turn raw int to a c void pointer
c_type_pointer = ctypes.c_void_p(arr.data_ptr())
dali_tensor.copy_to_external(c_type_pointer)
return arr
class DALICOCOIterator(object):
"""
COCO DALI iterator for pyTorch.
Parameters
----------
pipelines : list of nvidia.dali.pipeline.Pipeline
List of pipelines to use
size : int
Epoch size.
"""
def __init__(self, pipelines, size):
if not isinstance(pipelines, list):
pipelines = [pipelines]
self._num_gpus = len(pipelines)
assert pipelines is not None, "Number of provided pipelines has to be at least 1"
self.batch_size = pipelines[0].batch_size
self._size = size
self._pipes = pipelines
# Build all pipelines
for p in self._pipes:
p.build()
# Use double-buffering of data batches
self._data_batches = [[None, None, None, None] for i in range(self._num_gpus)]
self._counter = 0
self._current_data_batch = 0
self.output_map = ["image", "bboxes", "labels"]
# We need data about the batches (like shape information),
# so we need to run a single batch as part of setup to get that info
self._first_batch = None
self._first_batch = self.next()
def __next__(self):
if self._first_batch is not None:
batch = self._first_batch
self._first_batch = None
return batch
if self._counter > self._size:
raise StopIteration
# Gather outputs
outputs = []
for p in self._pipes:
p._prefetch()
for p in self._pipes:
outputs.append(p._share_outputs())
for i in range(self._num_gpus):
dev_id = self._pipes[i].device_id
out_images = []
bboxes = []
labels = []
# segregate outputs into image/labels/bboxes entries
for j, out in enumerate(outputs[i]):
if self.output_map[j] == "image":
out_images.append(out)
elif self.output_map[j] == "bboxes":
bboxes.append(out)
elif self.output_map[j] == "labels":
labels.append(out)
# Change DALI TensorLists into Tensors
images = [x.as_tensor() for x in out_images]
images_shape = [x.shape() for x in images]
# Prepare bboxes shapes
bboxes_shape = []
for j in range(len(bboxes)):
bboxes_shape.append([])
for k in range(len(bboxes[j])):
bboxes_shape[j].append(bboxes[j].at(k).shape())
# Prepare labels shapes and offsets
labels_shape = []
bbox_offsets = []
torch.cuda.synchronize()
for j in range(len(labels)):
labels_shape.append([])
bbox_offsets.append([0])
for k in range(len(labels[j])):
lshape = labels[j].at(k).shape()
bbox_offsets[j].append(bbox_offsets[j][k] + lshape[0])
labels_shape[j].append(lshape)
# We always need to allocate new memory as bboxes and labels vary in shape
images_torch_type = to_torch_type[np.dtype(images[0].dtype())]
bboxes_torch_type = to_torch_type[np.dtype(bboxes[0].at(0).dtype())]
labels_torch_type = to_torch_type[np.dtype(labels[0].at(0).dtype())]
torch_gpu_device = torch.device('cuda', dev_id)
torch_cpu_device = torch.device('cpu')
pyt_images = [torch.zeros(shape, dtype=images_torch_type, device=torch_gpu_device) for shape in images_shape]
pyt_bboxes = [[torch.zeros(shape, dtype=bboxes_torch_type, device=torch_gpu_device) for shape in shape_list] for shape_list in bboxes_shape]
pyt_labels = [[torch.zeros(shape, dtype=labels_torch_type, device=torch_gpu_device) for shape in shape_list] for shape_list in labels_shape]
pyt_offsets = [torch.zeros(len(offset), dtype=torch.int32, device=torch_cpu_device) for offset in bbox_offsets]
self._data_batches[i][self._current_data_batch] = (pyt_images, pyt_bboxes, pyt_labels, pyt_offsets)
# Copy data from DALI Tensors to torch tensors
for j, i_arr in enumerate(images):
feed_ndarray(i_arr, pyt_images[j])
for j, b_list in enumerate(bboxes):
for k in range(len(b_list)):
if (pyt_bboxes[j][k].shape[0] != 0):
feed_ndarray(b_list.at(k), pyt_bboxes[j][k])
pyt_bboxes[j] = torch.cat(pyt_bboxes[j])
for j, l_list in enumerate(labels):
for k in range(len(l_list)):
if (pyt_labels[j][k].shape[0] != 0):
feed_ndarray(l_list.at(k), pyt_labels[j][k])
pyt_labels[j] = torch.cat(pyt_labels[j]).squeeze(dim=1)
for j in range(len(pyt_offsets)):
pyt_offsets[j] = torch.IntTensor(bbox_offsets[j])
for p in self._pipes:
p._release_outputs()
p._start_run()
copy_db_index = self._current_data_batch
# Change index for double buffering
self._current_data_batch = (self._current_data_batch + 1) % 2
self._counter += self._num_gpus * self.batch_size
return [db[copy_db_index] for db in self._data_batches]
def next(self):
"""
Returns the next batch of data.
"""
return self.__next__()
def __iter__(self):
return self
def reset(self):
"""
Resets the iterator after the full epoch.
DALI iterators do not support resetting before the end of the epoch
and will ignore such request.
"""
if self._counter > self._size:
self._counter = self._counter % self._size
else:
logging.warning("DALI iterator does not support resetting while epoch is not finished. Ignoring...")
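
A single-GPU sketch of wiring the pipeline and iterator above together; the paths, batch size and epoch size are placeholders, and the installed DALI version must provide the ops used above.

```
pipe = COCOPipeline(batch_size=32, device_id=0,
                    file_root="/coco/train2017",                                   # placeholder path
                    annotations_file="/coco/annotations/instances_train2017.json", # placeholder path
                    num_gpus=1, num_threads=4, seed=42)
train_loader = DALICOCOIterator(pipe, size=118287)   # builds the pipeline internally

for data in train_loader:
    images = data[0][0][0]        # batch of images on GPU 0
    bboxes = data[0][1][0]        # concatenated boxes for the whole batch
    labels = data[0][2][0]        # concatenated labels
    bbox_offsets = data[0][3][0]  # per-image offsets into bboxes/labels
    break
```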

View file

@ -0,0 +1,54 @@
import os
import torch
from torch.utils.data import DataLoader
from src.utils import dboxes300_coco, COCODetection
from src.utils import SSDTransformer
from src.coco import COCO
#DALI import
from src.coco_pipeline import COCOPipeline, DALICOCOIterator
def get_train_loader(args, local_seed):
train_annotate = os.path.join(args.data, "annotations/instances_train2017.json")
train_coco_root = os.path.join(args.data, "train2017")
train_pipe = COCOPipeline(args.batch_size, args.local_rank, train_coco_root,
train_annotate, args.N_gpu, num_threads=args.num_workers,
output_fp16=args.fp16, output_nhwc=False,
pad_output=False, seed=local_seed)
train_pipe.build()
test_run = train_pipe.run()
train_loader = DALICOCOIterator(train_pipe, 118287 / args.N_gpu)
return train_loader
def get_val_dataset(args):
dboxes = dboxes300_coco()
val_trans = SSDTransformer(dboxes, (300, 300), val=True)
val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
val_coco_root = os.path.join(args.data, "val2017")
val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
return val_coco
def get_val_dataloader(dataset, args):
if args.distributed:
val_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
else:
val_sampler = None
val_dataloader = DataLoader(dataset,
batch_size=args.eval_batch_size,
shuffle=False, # Note: distributed sampler is shuffled :(
sampler=val_sampler,
num_workers=args.num_workers)
return val_dataloader
def get_coco_ground_truth(args):
val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
cocoGt = COCO(annotation_file=val_annotate)
return cocoGt
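
A hedged sketch of driving the helpers above with a hand-built argument object; in the full training script these fields come from the command-line parser, and the values below are placeholders.

```
from types import SimpleNamespace

args = SimpleNamespace(data="/coco", batch_size=32, eval_batch_size=32,
                       num_workers=4, local_rank=0, N_gpu=1, fp16=False,
                       distributed=False)

train_loader = get_train_loader(args, local_seed=42)
val_dataset = get_val_dataset(args)
val_loader = get_val_dataloader(val_dataset, args)
cocoGt = get_coco_ground_truth(args)
```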

View file

@ -0,0 +1,82 @@
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
import torch.distributed as dist
from torch.nn.modules import Module
'''
This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
launcher included with this example. It assumes that your run is using multiprocess with 1
GPU/process, that the model is on the correct device, and that torch.set_device has been
used to set the device.
Parameters are broadcast to the other processes on initialization of DistributedDataParallel,
and their gradients are all-reduced at the end of the backward pass.
'''
class DistributedDataParallel(Module):
def __init__(self, module):
super(DistributedDataParallel, self).__init__()
self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
self.module = module
for p in self.module.state_dict().values():
if not torch.is_tensor(p):
continue
if dist._backend == dist.dist_backend.NCCL:
assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU."
dist.broadcast(p, 0)
def allreduce_params():
if(self.needs_reduction):
self.needs_reduction = False
buckets = {}
for param in self.module.parameters():
if param.requires_grad and param.grad is not None:
tp = param.data.type()
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(param)
if self.warn_on_half:
if torch.cuda.HalfTensor in buckets:
print("WARNING: gloo dist backend for half parameters may be extremely slow." +
" It is recommended to use the NCCL backend in this case.")
self.warn_on_half = False
for tp in buckets:
bucket = buckets[tp]
grads = [param.grad.data for param in bucket]
coalesced = _flatten_dense_tensors(grads)
dist.all_reduce(coalesced)
coalesced /= dist.get_world_size()
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
buf.copy_(synced)
for param in list(self.module.parameters()):
def allreduce_hook(*unused):
param._execution_engine.queue_callback(allreduce_params)
if param.requires_grad:
param.register_hook(allreduce_hook)
def forward(self, *inputs, **kwargs):
self.needs_reduction = True
return self.module(*inputs, **kwargs)
'''
def _sync_buffers(self):
buffers = list(self.module._all_buffers())
if len(buffers) > 0:
# cross-node buffer sync
flat_buffers = _flatten_dense_tensors(buffers)
dist.broadcast(flat_buffers, 0)
for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
buf.copy_(synced)
def train(self, mode=True):
# Clear NCCL communicator and CUDA event cache of the default group ID,
# These caches will be recreated on a later call. This is currently a
# work-around for a potential NCCL deadlock.
if dist._backend == dist.dist_backend.NCCL:
dist._clear_group_cache()
super(DistributedDataParallel, self).train(mode)
self.module.train(mode)
'''
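
A hedged sketch of using the wrapper above with one process per GPU; it assumes the launcher sets LOCAL_RANK (or an equivalent) and that the process group can be initialized from environment variables.

```
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ.get("LOCAL_RANK", 0))        # assumed to be set by the launcher
dist.init_process_group(backend="nccl", init_method="env://")
torch.cuda.set_device(local_rank)

model = torch.nn.Linear(128, 10).cuda()                  # any CUDA model works here
model = DistributedDataParallel(model)                   # broadcasts params, all-reduces grads in backward()
```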

View file

@ -0,0 +1,124 @@
import torch
import time
import numpy as np
from contextlib import redirect_stdout
import io
from pycocotools.cocoeval import COCOeval
def evaluate(model, coco, cocoGt, encoder, inv_map, args):
if args.distributed:
N_gpu = torch.distributed.get_world_size()
else:
N_gpu = 1
model.eval()
if not args.no_cuda:
model.cuda()
ret = []
start = time.time()
# for idx, image_id in enumerate(coco.img_keys):
for nbatch, (img, img_id, img_size, _, _) in enumerate(coco):
print("Parsing batch: {}/{}".format(nbatch, len(coco)), end='\r')
with torch.no_grad():
inp = img.cuda()
if args.fp16:
inp = inp.half()
# Get predictions
ploc, plabel = model(inp)
ploc, plabel = ploc.float(), plabel.float()
# Handle the batch of predictions produced
# This is slow, but consistent with old implementation.
for idx in range(ploc.shape[0]):
# ease-of-use for specific predictions
ploc_i = ploc[idx, :, :].unsqueeze(0)
plabel_i = plabel[idx, :, :].unsqueeze(0)
try:
result = encoder.decode_batch(ploc_i, plabel_i, 0.50, 200)[0]
except:
# raise
print("")
print("No object detected in idx: {}".format(idx))
continue
htot, wtot = img_size[0][idx].item(), img_size[1][idx].item()
loc, label, prob = [r.cpu().numpy() for r in result]
for loc_, label_, prob_ in zip(loc, label, prob):
ret.append([img_id[idx], loc_[0] * wtot, \
loc_[1] * htot,
(loc_[2] - loc_[0]) * wtot,
(loc_[3] - loc_[1]) * htot,
prob_,
inv_map[label_]])
# Now we have all predictions from this rank, gather them all together
# if necessary
ret = np.array(ret).astype(np.float32)
# Multi-GPU eval
if args.distributed:
# NCCL backend means we can only operate on GPU tensors
ret_copy = torch.tensor(ret).cuda()
# Everyone exchanges the size of their results
ret_sizes = [torch.tensor(0).cuda() for _ in range(N_gpu)]
torch.cuda.synchronize()
torch.distributed.all_gather(ret_sizes, torch.tensor(ret_copy.shape[0]).cuda())
torch.cuda.synchronize()
# Get the maximum results size, as all tensors must be the same shape for
# the all_gather call we need to make
max_size = 0
sizes = []
for s in ret_sizes:
max_size = max(max_size, s.item())
sizes.append(s.item())
# Need to pad my output to max_size in order to use in all_gather
ret_pad = torch.cat([ret_copy, torch.zeros(max_size - ret_copy.shape[0], 7, dtype=torch.float32).cuda()])
# allocate storage for results from all other processes
other_ret = [torch.zeros(max_size, 7, dtype=torch.float32).cuda() for i in range(N_gpu)]
# Everyone exchanges (padded) results
torch.cuda.synchronize()
torch.distributed.all_gather(other_ret, ret_pad)
torch.cuda.synchronize()
# Now need to reconstruct the _actual_ results from the padded set using slices.
cat_tensors = []
for i in range(N_gpu):
cat_tensors.append(other_ret[i][:sizes[i]][:])
final_results = torch.cat(cat_tensors).cpu().numpy()
else:
# Otherwise full results are just our results
final_results = ret
if args.local_rank == 0:
print("")
print("Predicting Ended, total time: {:.2f} s".format(time.time() - start))
cocoDt = cocoGt.loadRes(final_results)
E = COCOeval(cocoGt, cocoDt, iouType='bbox')
E.evaluate()
E.accumulate()
if args.local_rank == 0:
E.summarize()
print("Current AP: {:.5f}".format(E.stats[0]))
else:
# fix for cocoeval indiscriminate prints
with redirect_stdout(io.StringIO()):
E.summarize()
# put the model back in training mode
model.train()
return E.stats[0] # Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]
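
A standalone sketch of the padded all_gather pattern used above: NCCL requires equal tensor shapes across ranks, so each rank pads its variable-length results to the largest size and slices the padding off after gathering. The function and variable names here are illustrative, and an initialized process group is assumed.

```
import torch
import torch.distributed as dist

def gather_variable_rows(local, world_size):
    """local: [n_i, 7] float tensor on this rank's GPU; returns all rows from all ranks."""
    n = torch.tensor(local.shape[0], device=local.device)
    sizes = [torch.zeros_like(n) for _ in range(world_size)]
    dist.all_gather(sizes, n)                                    # exchange row counts
    max_n = max(int(s.item()) for s in sizes)
    padded = torch.cat([local, local.new_zeros(max_n - local.shape[0], local.shape[1])])
    gathered = [local.new_zeros(max_n, local.shape[1]) for _ in range(world_size)]
    dist.all_gather(gathered, padded)                            # exchange padded tensors
    return torch.cat([g[:int(s.item())] for g, s in zip(gathered, sizes)])
```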

View file

@ -0,0 +1,103 @@
import math
import numpy as np
class EpochMeter:
def __init__(self, name):
self.name = name
self.data = []
def update(self, epoch, val):
self.data.append((epoch, val))
class IterationMeter:
def __init__(self, name):
self.name = name
self.data = []
def update(self, epoch, iteration, val):
self.data.append((epoch, iteration, val))
class IterationAverageMeter:
def __init__(self, name):
self.name = name
self.data = []
self.n = 0
self.sum = 0
def update_iter(self, val):
if math.isfinite(val): # sometimes loss == inf
self.n += 1
self.sum += 0 if math.isinf(val) else val
def update_epoch(self, epoch):
self.data.append((epoch, self.sum / self.n))
self.n = 0
self.sum = 0
class Logger:
def __init__(self, name, print_freq=20):
self.name = name
self.train_loss_logger = IterationAverageMeter("Training loss")
self.train_epoch_time_logger = EpochMeter("Training 1 epoch time")
self.val_acc_logger = EpochMeter("Validation accuracy")
self.print_freq = print_freq
def update_iter(self, epoch, iteration, loss):
self.train_loss_logger.update_iter(loss)
if iteration % self.print_freq == 0:
print('epoch: {}\titeration: {}\tloss: {}'.format(epoch, iteration, loss))
def update_epoch(self, epoch, acc):
self.train_loss_logger.update_epoch(epoch)
self.val_acc_logger.update(epoch, acc)
print('epoch: {}\tmAP accuracy: {}'.format(epoch, acc))
def update_epoch_time(self, epoch, time):
self.train_epoch_time_logger.update(epoch, time)
print('epoch: {}\ttime: {}'.format(epoch, time))
def print_results(self):
return self.train_loss_logger.data, self.val_acc_logger.data, self.train_epoch_time_logger
class BenchmarkMeter:
def __init__(self, name):
self.name = name
self.data = []
self.total_images = 0
self.total_time = 0
self.avr_images_per_second = 0
def update(self, bs, time):
self.total_images += bs
self.total_time += time
self.avr_images_per_second = self.total_images / self.total_time
self.data.append(bs / time)
class BenchLogger(Logger):
def __init__(self, name):
super().__init__(name)
self.name = name
self.images_per_ses = BenchmarkMeter(self.name)
def update(self, bs, time):
self.images_per_ses.update(bs, time)
def print_result(self):
total_bs = self.images_per_ses.total_images
total_time = self.images_per_ses.total_time
avr = self.images_per_ses.avr_images_per_second
med = np.median(self.images_per_ses.data)
print("Done benchmarking. Total images: {}\ttotal time: {:.3f}\tAverage images/sec: {:.3f}\tMedian images/sec: {:.3f}".format(
total_bs,
total_time,
avr,
med
))
return med
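
A small sketch of how the training script is expected to drive the meters above; the loss and accuracy numbers are made up for illustration.

```
logger = Logger("SSD300 training", print_freq=2)
for epoch in range(2):
    for iteration, loss in enumerate([0.9, 0.8, 0.7, 0.6]):
        logger.update_iter(epoch, iteration, loss)       # running average + periodic print
    logger.update_epoch(epoch, acc=0.20 + 0.01 * epoch)  # store per-epoch mAP
    logger.update_epoch_time(epoch, time=123.4)

bench = BenchLogger("Training benchmark")
bench.update(bs=32, time=0.05)                           # 32 images in 50 ms -> 640 img/s
median_fps = bench.print_result()
```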

View file

@ -0,0 +1,181 @@
import torch
import torch.nn as nn
from torchvision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152
class ResNet(nn.Module):
def __init__(self, backbone='resnet50'):
super().__init__()
if backbone == 'resnet18':
backbone = resnet18(pretrained=True)
self.out_channels = [256, 512, 512, 256, 256, 128]
elif backbone == 'resnet34':
backbone = resnet34(pretrained=True)
self.out_channels = [256, 512, 512, 256, 256, 256]
elif backbone == 'resnet50':
backbone = resnet50(pretrained=True)
self.out_channels = [1024, 512, 512, 256, 256, 256]
elif backbone == 'resnet101':
backbone = resnet101(pretrained=True)
self.out_channels = [1024, 512, 512, 256, 256, 256]
else: # backbone == 'resnet152':
backbone = resnet152(pretrained=True)
self.out_channels = [1024, 512, 512, 256, 256, 256]
self.feature_extractor = nn.Sequential(*list(backbone.children())[:7])
conv4_block1 = self.feature_extractor[-1][0]
conv4_block1.conv1.stride = (1, 1)
conv4_block1.conv2.stride = (1, 1)
conv4_block1.downsample[0].stride = (1, 1)
def forward(self, x):
x = self.feature_extractor(x)
return x
class SSD300(nn.Module):
def __init__(self, backbone='resnet50'):
super().__init__()
self.feature_extractor = ResNet(backbone=backbone)
self.label_num = 81 # number of COCO classes
self._build_additional_features(self.feature_extractor.out_channels)
self.num_defaults = [4, 6, 6, 6, 4, 4]
self.loc = []
self.conf = []
for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels):
self.loc.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
self.conf.append(nn.Conv2d(oc, nd * self.label_num, kernel_size=3, padding=1))
self.loc = nn.ModuleList(self.loc)
self.conf = nn.ModuleList(self.conf)
self._init_weights()
def _build_additional_features(self, input_size):
self.additional_blocks = []
for i, (input_size, output_size, channels) in enumerate(zip(input_size[:-1], input_size[1:], [256, 256, 128, 128, 128])):
if i < 3:
layer = nn.Sequential(
nn.Conv2d(input_size, channels, kernel_size=1, bias=False),
nn.BatchNorm2d(channels),
nn.ReLU(inplace=True),
nn.Conv2d(channels, output_size, kernel_size=3, padding=1, stride=2, bias=False),
nn.BatchNorm2d(output_size),
nn.ReLU(inplace=True),
)
else:
layer = nn.Sequential(
nn.Conv2d(input_size, channels, kernel_size=1, bias=False),
nn.BatchNorm2d(channels),
nn.ReLU(inplace=True),
nn.Conv2d(channels, output_size, kernel_size=3, bias=False),
nn.BatchNorm2d(output_size),
nn.ReLU(inplace=True),
)
self.additional_blocks.append(layer)
self.additional_blocks = nn.ModuleList(self.additional_blocks)
def _init_weights(self):
layers = [*self.additional_blocks, *self.loc, *self.conf]
for layer in layers:
for param in layer.parameters():
if param.dim() > 1: nn.init.xavier_uniform_(param)
# Shape the classifier to the view of bboxes
def bbox_view(self, src, loc, conf):
ret = []
for s, l, c in zip(src, loc, conf):
ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1)))
locs, confs = list(zip(*ret))
locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous()
return locs, confs
def forward(self, x):
x = self.feature_extractor(x)
detection_feed = [x]
for l in self.additional_blocks:
x = l(x)
detection_feed.append(x)
# Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
locs, confs = self.bbox_view(detection_feed, self.loc, self.conf)
# For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results
return locs, confs
class Loss(nn.Module):
"""
Implements the loss as the sum of the followings:
1. Confidence Loss: All labels, with hard negative mining
2. Localization Loss: Only on positive labels
Suppose input dboxes has the shape 8732x4
"""
def __init__(self, dboxes):
super(Loss, self).__init__()
self.scale_xy = 1.0/dboxes.scale_xy
self.scale_wh = 1.0/dboxes.scale_wh
self.sl1_loss = nn.SmoothL1Loss(reduce=False)
self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim = 0),
requires_grad=False)
# The two scale factors are from the following link:
# http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html
self.con_loss = nn.CrossEntropyLoss(reduce=False)
def _loc_vec(self, loc):
"""
Generate Location Vectors
"""
gxy = self.scale_xy*(loc[:, :2, :] - self.dboxes[:, :2, :])/self.dboxes[:, 2:, ]
gwh = self.scale_wh*(loc[:, 2:, :]/self.dboxes[:, 2:, :]).log()
return torch.cat((gxy, gwh), dim=1).contiguous()
def forward(self, ploc, plabel, gloc, glabel):
"""
ploc, plabel: Nx4x8732, Nxlabel_numx8732
predicted location and labels
gloc, glabel: Nx4x8732, Nx8732
ground truth location and labels
"""
mask = glabel > 0
pos_num = mask.sum(dim=1)
vec_gd = self._loc_vec(gloc)
# sum on four coordinates, and mask
sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1)
sl1 = (mask.float()*sl1).sum(dim=1)
# hard negative mining
con = self.con_loss(plabel, glabel)
# positive mask will never be selected
con_neg = con.clone()
con_neg[mask] = 0
_, con_idx = con_neg.sort(dim=1, descending=True)
_, con_rank = con_idx.sort(dim=1)
# number of negative three times positive
neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1)
neg_mask = con_rank < neg_num
#print(con.shape, mask.shape, neg_mask.shape)
closs = (con*(mask.float() + neg_mask.float())).sum(dim=1)
# avoid no object detected
total_loss = sl1 + closs
num_mask = (pos_num > 0).float()
pos_num = pos_num.float().clamp(min=1e-6)
ret = (total_loss*num_mask/pos_num).mean(dim=0)
return ret
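
A quick shape check of the network above on CPU; constructing it downloads the pretrained ResNet-50 backbone from torchvision, and the batch size of 2 is arbitrary.

```
import torch

model = SSD300(backbone="resnet50").eval()
with torch.no_grad():
    locs, confs = model(torch.randn(2, 3, 300, 300))
print(locs.shape)    # torch.Size([2, 4, 8732])
print(confs.shape)   # torch.Size([2, 81, 8732])
```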

View file

@ -0,0 +1,223 @@
from torch.autograd import Variable
import torch
import time
from SSD import _C as C
def train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
# for nbatch, (img, _, img_size, bbox, label) in enumerate(train_dataloader):
for nbatch, data in enumerate(train_dataloader):
img = data[0][0][0]
bbox = data[0][1][0]
label = data[0][2][0]
label = label.type(torch.cuda.LongTensor)
bbox_offsets = data[0][3][0]
# handle random flipping outside of DALI for now
bbox_offsets = bbox_offsets.cuda()
img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)
img.sub_(mean).div_(std)
if not args.no_cuda:
img = img.cuda()
bbox = bbox.cuda()
label = label.cuda()
bbox_offsets = bbox_offsets.cuda()
N = img.shape[0]
if bbox_offsets[-1].item() == 0:
print("No labels in batch")
continue
bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)
# output is ([N*8732, 4], [N*8732]); need [N, 8732, 4], [N, 8732] respectively
M = bbox.shape[0] // N
bbox = bbox.view(N, M, 4)
label = label.view(N, M)
ploc, plabel = model(img)
ploc, plabel = ploc.float(), plabel.float()
trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
if not args.no_cuda:
label = label.cuda()
gloc = Variable(trans_bbox, requires_grad=False)
glabel = Variable(label, requires_grad=False)
loss = loss_func(ploc, plabel, gloc, glabel)
if args.local_rank == 0:
logger.update_iter(epoch, iteration, loss.item())
if args.fp16:
if args.amp:
with optim.scale_loss(loss) as scale_loss:
scale_loss.backward()
else:
optim.backward(loss)
else:
loss.backward()
if args.warmup is not None:
warmup(optim, args.warmup, iteration, args.learning_rate)
optim.step()
optim.zero_grad()
iteration += 1
return iteration
def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
start_time = None
# tensor for results
result = torch.zeros((1,)).cuda()
for i, data in enumerate(loop(train_dataloader)):
if i >= args.benchmark_warmup:
start_time = time.time()
img = data[0][0][0]
bbox = data[0][1][0]
label = data[0][2][0]
label = label.type(torch.cuda.LongTensor)
bbox_offsets = data[0][3][0]
# handle random flipping outside of DALI for now
bbox_offsets = bbox_offsets.cuda()
img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)
if not args.no_cuda:
img = img.cuda()
bbox = bbox.cuda()
label = label.cuda()
bbox_offsets = bbox_offsets.cuda()
img.sub_(mean).div_(std)
N = img.shape[0]
if bbox_offsets[-1].item() == 0:
print("No labels in batch")
continue
bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)
M = bbox.shape[0] // N
bbox = bbox.view(N, M, 4)
label = label.view(N, M)
ploc, plabel = model(img)
ploc, plabel = ploc.float(), plabel.float()
trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
if not args.no_cuda:
label = label.cuda()
gloc = Variable(trans_bbox, requires_grad=False)
glabel = Variable(label, requires_grad=False)
loss = loss_func(ploc, plabel, gloc, glabel)
# loss scaling
if args.fp16:
if args.amp:
with optim.scale_loss(loss) as scale_loss:
scale_loss.backward()
else:
optim.backward(loss)
else:
loss.backward()
optim.step()
optim.zero_grad()
if i >= args.benchmark_warmup + args.benchmark_iterations:
break
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
result.data[0] = logger.print_result()
if args.N_gpu > 1:
torch.distributed.reduce(result, 0)
if args.local_rank == 0:
print('Training performance = {} FPS'.format(float(result.data[0])))
def loop(dataloader):
while True:
for data in dataloader:
yield data
def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu'
start_time = None
model.eval()
i = -1
val_datas = loop(val_dataloader)
while True:
i += 1
torch.cuda.synchronize()
if i >= args.benchmark_warmup:
start_time = time.time()
data = next(val_datas)
with torch.no_grad():
img = data[0]
if not args.no_cuda:
img = img.cuda()
if args.fp16:
img = img.half()
img.sub_(mean).div_(std)
img = Variable(img, requires_grad=False)
_ = model(img)
torch.cuda.synchronize()
if i >= args.benchmark_warmup + args.benchmark_iterations:
break
if i >= args.benchmark_warmup:
logger.update(args.eval_batch_size, time.time() - start_time)
logger.print_result()
def warmup(optim, warmup_iters, iteration, base_lr):
if iteration < warmup_iters:
new_lr = 1. * base_lr / warmup_iters * iteration
for param_group in optim.param_groups:
param_group['lr'] = new_lr
def load_checkpoint(model, checkpoint):
"""
Load model from checkpoint.
"""
print("loading model checkpoint", checkpoint)
od = torch.load(checkpoint)
# remove preceding 'N.' from checkpoint keys that come from the DDP wrapper
saved_model = od["model"]
model.load_state_dict(saved_model)
def tencent_trick(model):
"""
Divide parameters into 2 groups.
First group is BNs and all biases.
Second group is the remaining model's parameters.
Weight decay will be disabled in the first group (aka the tencent trick).
"""
decay, no_decay = [], []
for name, param in model.named_parameters():
if not param.requires_grad:
continue # frozen weights
if len(param.shape) == 1 or name.endswith(".bias"):
no_decay.append(param)
else:
decay.append(param)
return [{'params': no_decay, 'weight_decay': 0.0},
{'params': decay}]
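
A hedged sketch of feeding the two parameter groups above to an optimizer; the SSD300 import path is assumed from this commit's layout, and the hyperparameter values are placeholders.

```
import torch
from src.model import SSD300      # assumed module path for the SSD300 class shown earlier

model = SSD300()
optim = torch.optim.SGD(tencent_trick(model), lr=2.6e-3,
                        momentum=0.9, weight_decay=0.0005)
# the no_decay group keeps weight_decay=0.0; the decay group inherits 0.0005
```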

View file

@ -0,0 +1,578 @@
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
from PIL import Image
import os
import numpy as np
import random
import itertools
import torch.nn.functional as F
import json
import time
import bz2
import pickle
from math import sqrt
# This function is from https://github.com/kuangliu/pytorch-ssd.
def calc_iou_tensor(box1, box2):
""" Calculation of IoU based on two boxes tensor,
Reference to https://github.com/kuangliu/pytorch-src
input:
box1 (N, 4)
box2 (M, 4)
output:
IoU (N, M)
"""
N = box1.size(0)
M = box2.size(0)
be1 = box1.unsqueeze(1).expand(-1, M, -1)
be2 = box2.unsqueeze(0).expand(N, -1, -1)
# Left Top & Right Bottom
lt = torch.max(be1[:,:,:2], be2[:,:,:2])
#mask1 = (be1[:,:, 0] < be2[:,:, 0]) ^ (be1[:,:, 1] < be2[:,:, 1])
#mask1 = ~mask1
rb = torch.min(be1[:,:,2:], be2[:,:,2:])
#mask2 = (be1[:,:, 2] < be2[:,:, 2]) ^ (be1[:,:, 3] < be2[:,:, 3])
#mask2 = ~mask2
delta = rb - lt
delta[delta < 0] = 0
intersect = delta[:,:,0]*delta[:,:,1]
#*mask1.float()*mask2.float()
delta1 = be1[:,:,2:] - be1[:,:,:2]
area1 = delta1[:,:,0]*delta1[:,:,1]
delta2 = be2[:,:,2:] - be2[:,:,:2]
area2 = delta2[:,:,0]*delta2[:,:,1]
iou = intersect/(area1 + area2 - intersect)
return iou
# This function is from https://github.com/kuangliu/pytorch-ssd.
class Encoder(object):
"""
Inspired by https://github.com/kuangliu/pytorch-src
Transform between (bboxes, labels) <-> SSD output
dboxes: default boxes in size 8732 x 4,
encoder: input ltrb format, output xywh format
decoder: input xywh format, output ltrb format
encode:
input : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
criteria : IoU threshold of bboxes
decode:
input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
criteria : IoU threshold of bboxes
max_output : maximum number of output bboxes
"""
def __init__(self, dboxes):
self.dboxes = dboxes(order="ltrb")
self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0)
self.nboxes = self.dboxes.size(0)
self.scale_xy = dboxes.scale_xy
self.scale_wh = dboxes.scale_wh
def encode(self, bboxes_in, labels_in, criteria = 0.5):
ious = calc_iou_tensor(bboxes_in, self.dboxes)
best_dbox_ious, best_dbox_idx = ious.max(dim=0)
best_bbox_ious, best_bbox_idx = ious.max(dim=1)
# set best ious 2.0
best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)
idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
best_dbox_idx[best_bbox_idx[idx]] = idx
# filter IoU > 0.5
masks = best_dbox_ious > criteria
labels_out = torch.zeros(self.nboxes, dtype=torch.long)
labels_out[masks] = labels_in[best_dbox_idx[masks]]
bboxes_out = self.dboxes.clone()
bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]
# Transform format to xywh format
x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \
0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \
-bboxes_out[:, 0] + bboxes_out[:, 2], \
-bboxes_out[:, 1] + bboxes_out[:, 3]
bboxes_out[:, 0] = x
bboxes_out[:, 1] = y
bboxes_out[:, 2] = w
bboxes_out[:, 3] = h
return bboxes_out, labels_out
def scale_back_batch(self, bboxes_in, scores_in):
"""
Do scale and transform from xywh to ltrb
suppose input Nx4xnum_bbox Nxlabel_numxnum_bbox
"""
if bboxes_in.device == torch.device("cpu"):
self.dboxes = self.dboxes.cpu()
self.dboxes_xywh = self.dboxes_xywh.cpu()
else:
self.dboxes = self.dboxes.cuda()
self.dboxes_xywh = self.dboxes_xywh.cuda()
bboxes_in = bboxes_in.permute(0, 2, 1)
scores_in = scores_in.permute(0, 2, 1)
bboxes_in[:, :, :2] = self.scale_xy*bboxes_in[:, :, :2]
bboxes_in[:, :, 2:] = self.scale_wh*bboxes_in[:, :, 2:]
bboxes_in[:, :, :2] = bboxes_in[:, :, :2]*self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp()*self.dboxes_xywh[:, :, 2:]
# Transform format to ltrb
l, t, r, b = bboxes_in[:, :, 0] - 0.5*bboxes_in[:, :, 2],\
bboxes_in[:, :, 1] - 0.5*bboxes_in[:, :, 3],\
bboxes_in[:, :, 0] + 0.5*bboxes_in[:, :, 2],\
bboxes_in[:, :, 1] + 0.5*bboxes_in[:, :, 3]
bboxes_in[:, :, 0] = l
bboxes_in[:, :, 1] = t
bboxes_in[:, :, 2] = r
bboxes_in[:, :, 3] = b
return bboxes_in, F.softmax(scores_in, dim=-1)
def decode_batch(self, bboxes_in, scores_in, criteria = 0.45, max_output=200):
bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)
output = []
for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
bbox = bbox.squeeze(0)
prob = prob.squeeze(0)
output.append(self.decode_single(bbox, prob, criteria, max_output))
return output
# perform non-maximum suppression
def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
# Reference to https://github.com/amdegroot/ssd.pytorch
bboxes_out = []
scores_out = []
labels_out = []
for i, score in enumerate(scores_in.split(1, 1)):
# skip background
# print(score[score>0.90])
if i == 0: continue
# print(i)
score = score.squeeze(1)
mask = score > 0.05
bboxes, score = bboxes_in[mask, :], score[mask]
if score.size(0) == 0: continue
score_sorted, score_idx_sorted = score.sort(dim=0)
# select max_output indices
score_idx_sorted = score_idx_sorted[-max_num:]
candidates = []
#maxdata, maxloc = scores_in.sort()
while score_idx_sorted.numel() > 0:
idx = score_idx_sorted[-1].item()
bboxes_sorted = bboxes[score_idx_sorted, :]
bboxes_idx = bboxes[idx, :].unsqueeze(dim=0)
iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze()
# we only need iou < criteria
score_idx_sorted = score_idx_sorted[iou_sorted < criteria]
candidates.append(idx)
bboxes_out.append(bboxes[candidates, :])
scores_out.append(score[candidates])
labels_out.extend([i]*len(candidates))
bboxes_out, labels_out, scores_out = torch.cat(bboxes_out, dim=0), \
torch.tensor(labels_out, dtype=torch.long), \
torch.cat(scores_out, dim=0)
_, max_ids = scores_out.sort(dim=0)
max_ids = max_ids[-max_output:]
return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
class DefaultBoxes(object):
def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, \
scale_xy=0.1, scale_wh=0.2):
self.feat_size = feat_size
self.fig_size = fig_size
self.scale_xy_ = scale_xy
self.scale_wh_ = scale_wh
# According to https://github.com/weiliu89/caffe
# Calculation method slightly different from paper
self.steps = steps
self.scales = scales
fk = fig_size/np.array(steps)
self.aspect_ratios = aspect_ratios
self.default_boxes = []
# size of feature and number of feature
for idx, sfeat in enumerate(self.feat_size):
sk1 = scales[idx]/fig_size
sk2 = scales[idx+1]/fig_size
sk3 = sqrt(sk1*sk2)
all_sizes = [(sk1, sk1), (sk3, sk3)]
for alpha in aspect_ratios[idx]:
w, h = sk1*sqrt(alpha), sk1/sqrt(alpha)
all_sizes.append((w, h))
all_sizes.append((h, w))
for w, h in all_sizes:
for i, j in itertools.product(range(sfeat), repeat=2):
cx, cy = (j+0.5)/fk[idx], (i+0.5)/fk[idx]
self.default_boxes.append((cx, cy, w, h))
self.dboxes = torch.tensor(self.default_boxes)
self.dboxes.clamp_(min=0, max=1)
# For IoU calculation
self.dboxes_ltrb = self.dboxes.clone()
self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2]
self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3]
self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2]
self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3]
@property
def scale_xy(self):
return self.scale_xy_
@property
def scale_wh(self):
return self.scale_wh_
def __call__(self, order="ltrb"):
if order == "ltrb": return self.dboxes_ltrb
if order == "xywh": return self.dboxes
def dboxes300_coco():
figsize = 300
feat_size = [38, 19, 10, 5, 3, 1]
steps = [8, 16, 32, 64, 100, 300]
# use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
scales = [21, 45, 99, 153, 207, 261, 315]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios)
return dboxes
# This function is from https://github.com/chauhan-utk/ssd.DomainAdaptation.
class SSDCropping(object):
""" Cropping for SSD, according to original paper
Choose between following 3 conditions:
1. Preserve the original image
2. Random crop minimum IoU is among 0.1, 0.3, 0.5, 0.7, 0.9
3. Random crop
Reference to https://github.com/chauhan-utk/src.DomainAdaptation
"""
def __init__(self):
self.sample_options = (
# Do nothing
None,
# min IoU, max IoU
(0.1, None),
(0.3, None),
(0.5, None),
(0.7, None),
(0.9, None),
# no IoU requirements
(None, None),
)
def __call__(self, img, img_size, bboxes, labels):
# Ensure always return cropped image
while True:
mode = random.choice(self.sample_options)
if mode is None:
return img, img_size, bboxes, labels
htot, wtot = img_size
min_iou, max_iou = mode
min_iou = float("-inf") if min_iou is None else min_iou
max_iou = float("+inf") if max_iou is None else max_iou
# The reference implementation uses 50 iterations to find a candidate; a single attempt is made here
for _ in range(1):
# size of each sampled patch in [0.1, 1]; 0.3*0.3 approx. 0.1
w = random.uniform(0.3 , 1.0)
h = random.uniform(0.3 , 1.0)
if w/h < 0.5 or w/h > 2:
continue
# left 0 ~ wtot - w, top 0 ~ htot - h
left = random.uniform(0, 1.0 - w)
top = random.uniform(0, 1.0 - h)
right = left + w
bottom = top + h
ious = calc_iou_tensor(bboxes, torch.tensor([[left, top, right, bottom]]))
# tailor all the bboxes and return
if not ((ious > min_iou) & (ious < max_iou)).all():
continue
# discard any bboxes whose center not in the cropped image
xc = 0.5*(bboxes[:, 0] + bboxes[:, 2])
yc = 0.5*(bboxes[:, 1] + bboxes[:, 3])
masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom)
# if no such boxes, continue searching again
if not masks.any():
continue
bboxes[bboxes[:, 0] < left, 0] = left
bboxes[bboxes[:, 1] < top, 1] = top
bboxes[bboxes[:, 2] > right, 2] = right
bboxes[bboxes[:, 3] > bottom, 3] = bottom
bboxes = bboxes[masks, :]
labels = labels[masks]
left_idx = int(left*wtot)
top_idx = int(top*htot)
right_idx = int(right*wtot)
bottom_idx = int(bottom*htot)
img = img.crop((left_idx, top_idx, right_idx, bottom_idx))
bboxes[:, 0] = (bboxes[:, 0] - left)/w
bboxes[:, 1] = (bboxes[:, 1] - top)/h
bboxes[:, 2] = (bboxes[:, 2] - left)/w
bboxes[:, 3] = (bboxes[:, 3] - top)/h
htot = bottom_idx - top_idx
wtot = right_idx - left_idx
return img, (htot, wtot), bboxes, labels
class RandomHorizontalFlip(object):
def __init__(self, p=0.5):
self.p = p
def __call__(self, image, bboxes):
if random.random() < self.p:
bboxes[:, 0], bboxes[:, 2] = 1.0 - bboxes[:, 2], 1.0 - bboxes[:, 0]
return image.transpose(Image.FLIP_LEFT_RIGHT), bboxes
return image, bboxes
# Do data augmentation
class SSDTransformer(object):
""" SSD Data Augmentation, according to the original paper
Composed by several steps:
Cropping
Resize
Flipping
Jittering
"""
def __init__(self, dboxes, size = (300, 300), val=False):
# define vgg16 mean
self.size = size
self.val = val
self.dboxes_ = dboxes #DefaultBoxes300()
self.encoder = Encoder(self.dboxes_)
self.crop = SSDCropping()
self.img_trans = transforms.Compose([
transforms.Resize(self.size),
transforms.ColorJitter(brightness=0.125, contrast=0.5,
saturation=0.5, hue=0.05
),
transforms.ToTensor()
])
self.hflip = RandomHorizontalFlip()
# All Pytorch Tensor will be normalized
# https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683
self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
self.trans_val = transforms.Compose([
transforms.Resize(self.size),
transforms.ToTensor(),
#ToTensor(),
self.normalize,])
@property
def dboxes(self):
return self.dboxes_
def __call__(self, img, img_size, bbox=None, label=None, max_num=200):
#img = torch.tensor(img)
if self.val:
bbox_out = torch.zeros(max_num, 4)
label_out = torch.zeros(max_num, dtype=torch.long)
bbox_out[:bbox.size(0), :] = bbox
label_out[:label.size(0)] = label
return self.trans_val(img), img_size, bbox_out, label_out
img, img_size, bbox, label = self.crop(img, img_size, bbox, label)
img, bbox = self.hflip(img, bbox)
img = self.img_trans(img).contiguous()
img = self.normalize(img)
bbox, label = self.encoder.encode(bbox, label)
return img, img_size, bbox, label
# Implement a datareader for COCO dataset
class COCODetection(data.Dataset):
def __init__(self, img_folder, annotate_file, transform=None):
self.img_folder = img_folder
self.annotate_file = annotate_file
# Start processing annotation
with open(annotate_file) as fin:
self.data = json.load(fin)
self.images = {}
self.label_map = {}
self.label_info = {}
start_time = time.time()
# 0 stands for the background
cnt = 0
self.label_info[cnt] = "background"
for cat in self.data["categories"]:
cnt += 1
self.label_map[cat["id"]] = cnt
self.label_info[cnt] = cat["name"]
# build an index of images
for img in self.data["images"]:
img_id = img["id"]
img_name = img["file_name"]
img_size = (img["height"],img["width"])
if img_id in self.images: raise Exception("dulpicated image record")
self.images[img_id] = (img_name, img_size, [])
# read bboxes
for bboxes in self.data["annotations"]:
img_id = bboxes["image_id"]
category_id = bboxes["category_id"]
bbox = bboxes["bbox"]
bbox_label = self.label_map[bboxes["category_id"]]
self.images[img_id][2].append((bbox, bbox_label))
for k, v in list(self.images.items()):
if len(v[2]) == 0:
self.images.pop(k)
self.img_keys = list(self.images.keys())
self.transform = transform
@property
def labelnum(self):
return len(self.label_info)
@staticmethod
def load(pklfile):
with bz2.open(pklfile, "rb") as fin:
ret = pickle.load(fin)
return ret
def save(self, pklfile):
with bz2.open(pklfile, "wb") as fout:
pickle.dump(self, fout)
def __len__(self):
return len(self.images)
def __getitem__(self, idx):
img_id = self.img_keys[idx]
img_data = self.images[img_id]
fn = img_data[0]
img_path = os.path.join(self.img_folder, fn)
img = Image.open(img_path).convert("RGB")
htot, wtot = img_data[1]
bbox_sizes = []
bbox_labels = []
#for (xc, yc, w, h), bbox_label in img_data[2]:
for (l,t,w,h), bbox_label in img_data[2]:
r = l + w
b = t + h
#l, t, r, b = xc - 0.5*w, yc - 0.5*h, xc + 0.5*w, yc + 0.5*h
bbox_size = (l/wtot, t/htot, r/wtot, b/htot)
bbox_sizes.append(bbox_size)
bbox_labels.append(bbox_label)
bbox_sizes = torch.tensor(bbox_sizes)
bbox_labels = torch.tensor(bbox_labels)
if self.transform != None:
img, (htot, wtot), bbox_sizes, bbox_labels = \
self.transform(img, (htot, wtot), bbox_sizes, bbox_labels)
else:
pass
return img, img_id, (htot, wtot), bbox_sizes, bbox_labels
def draw_patches(img, bboxes, labels, order="xywh", label_map={}):
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Suppose bboxes in fractional coordinate:
# cx, cy, w, h
# img = img.numpy()
img = np.array(img)
labels = np.array(labels)
bboxes = bboxes.numpy()
if label_map:
labels = [label_map.get(l) for l in labels]
if order == "ltrb":
xmin, ymin, xmax, ymax = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
cx, cy, w, h = (xmin + xmax)/2, (ymin + ymax)/2, xmax - xmin, ymax - ymin
else:
cx, cy, w, h = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
htot, wtot,_ = img.shape
cx *= wtot
cy *= htot
w *= wtot
h *= htot
bboxes = zip(cx, cy, w, h)
plt.imshow(img)
ax = plt.gca()
for (cx, cy, w, h), label in zip(bboxes, labels):
if label == "background": continue
ax.add_patch(patches.Rectangle((cx-0.5*w, cy-0.5*h),
w, h, fill=False, color="r"))
bbox_props = dict(boxstyle="round", fc="y", ec="0.5", alpha=0.3)
ax.text(cx-0.5*w, cy-0.5*h, label, ha="center", va="center", size=15, bbox=bbox_props)
plt.show()
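
A short sketch of the default-box and encoder utilities above; the two ground-truth boxes and labels are invented, in fractional ltrb coordinates.

```
import torch

dboxes = dboxes300_coco()
print(dboxes(order="ltrb").shape)            # torch.Size([8732, 4])

encoder = Encoder(dboxes)
gt_boxes = torch.tensor([[0.10, 0.10, 0.40, 0.50],
                         [0.55, 0.20, 0.90, 0.80]])   # ltrb, fractional coordinates
gt_labels = torch.tensor([1, 18])
bboxes_out, labels_out = encoder.encode(gt_boxes, gt_labels, criteria=0.5)
print(bboxes_out.shape, labels_out.shape)    # torch.Size([8732, 4]) torch.Size([8732])
```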

View file

@ -1,25 +0,0 @@
# Basic Multiprocess Example based on the MNIST example
This example is based on [PyTorch's MNIST Example](https://github.com/pytorch/examples/tree/master/mnist).
This example demonstrates how to modify a network to use a basic but effective distributed data parallel module. This parallel method is designed to make multi-GPU runs on a single node easy. It was created because the parallel methods currently integrated into PyTorch can induce significant overhead due to the Python GIL. This method reduces the influence of those overheads and can provide a performance benefit, especially for networks with a significant number of fast-running operations.
## Getting started
Prior to running please run
```pip install -r requirements.txt```
and start a single-process run to allow the dataset to be downloaded (this will not work properly in multi-GPU; you can stop this job as soon as it starts iterating).
```python main.py```
You can now the code multi-gpu with
```python -m multiproc main.py ...```
adding any normal option you'd like.
## Converting your own model
To understand how to convert your own model to use the distributed module included, please see all sections of main.py within ```#=====START: ADDED FOR DISTRIBUTED======``` and ```#=====END: ADDED FOR DISTRIBUTED======``` flags.
Copy the distributed.py and multiproc.py files from here to your local workspace.
## Requirements
PyTorch master branch built from source. This is required in order to use NCCL as the distributed backend.
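For reference, a condensed sketch of the conversion described above (it mirrors the flagged sections in `main.py`; `args`, `Net`, and `train_dataset` stand in for your own script's objects):
```
import torch
import torch.utils.data
import torch.utils.data.distributed
import torch.distributed as dist
from distributed import DistributedDataParallel as DDP  # distributed.py from this example

# One process per GPU: `python -m multiproc main.py` fills in --rank and --world-size.
torch.cuda.set_device(args.rank % torch.cuda.device_count())
dist.init_process_group(args.dist_backend, init_method=args.dist_url,
                        world_size=args.world_size)

model = Net().cuda()   # move the model to its GPU first...
model = DDP(model)     # ...then wrap it; gradients are allreduced during backward()

train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(
    train_dataset, sampler=train_sampler, batch_size=args.batch_size,
    shuffle=(train_sampler is None))
```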

View file

@ -1,183 +0,0 @@
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
import torch.distributed as dist
from torch.nn.modules import Module
from torch.autograd import Variable
'''
This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
launcher included with this example. It assumes that your run is using multiprocess with 1
GPU per process, that the model is on the correct device, and that torch.cuda.set_device has been
used to set the device.
Parameters are broadcast to the other processes on initialization of DistributedDataParallel,
and will be allreduced at the end of the backward pass.
'''
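# Typical usage (sketch): initialize torch.distributed, move the model to its GPU, then wrap it
# once and train as usual; gradients are allreduced automatically at the end of backward():
#     model = DistributedDataParallel(model)
#     output = model(data)
#     loss.backward()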
def flat_dist_call(tensors, call, extra_args=None):
flat_dist_call.warn_on_half = True
buckets = {}
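# Group the tensors into buckets by type so each flatten/collective call only touches tensors of a single dtype.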
for tensor in tensors:
tp = tensor.type()
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(tensor)
if flat_dist_call.warn_on_half:
if torch.cuda.HalfTensor in buckets:
print("WARNING: gloo dist backend for half parameters may be extremely slow." +
" It is recommended to use the NCCL backend in this case.")
flat_dist_call.warn_on_half = False
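# Flatten each bucket, run the collective once per bucket, scale by 1/world_size, and copy the results back into the original tensors.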
for tp in buckets:
bucket = buckets[tp]
coalesced = _flatten_dense_tensors(bucket)
if extra_args is not None:
call(coalesced, *extra_args)
else:
call(coalesced)
coalesced /= dist.get_world_size()
for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
buf.copy_(synced)
class DistributedDataParallel(Module):
def __init__(self, module, message_size=10000000):
super(DistributedDataParallel, self).__init__()
self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
self.message_size = message_size
#reference to last iterations parameters to see if anything has changed
self.param_refs = []
self.reduction_stream = torch.cuda.Stream()
self.module = module
self.param_list = list(self.module.parameters())
if dist._backend == dist.dist_backend.NCCL:
for param in self.param_list:
assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."
self.record = []
self.create_hooks()
flat_dist_call([param.data for param in self.module.parameters()], dist.broadcast, (0,) )
def create_hooks(self):
#all reduce gradient hook
def allreduce_params():
if(self.needs_reduction):
self.needs_reduction = False
self.needs_refresh = False
else:
return
grads = [param.grad.data for param in self.module.parameters() if param.grad is not None]
flat_dist_call(grads, dist.all_reduce)
t_record = torch.cuda.IntTensor(self.record)
dist.broadcast(t_record, 0)
self.record = [int(entry) for entry in t_record]
def flush_buckets():
if not self.needs_reduction:
return
self.needs_reduction = False
ready = []
for i in range(len(self.param_state)):
if self.param_state[i] == 1:
param = self.param_list[self.record[i]]
if param.grad is not None:
ready.append(param.grad.data)
if(len(ready)>0):
orig_stream = torch.cuda.current_stream()
with torch.cuda.stream(self.reduction_stream):
self.reduction_stream.wait_stream(orig_stream)
flat_dist_call(ready, dist.all_reduce)
torch.cuda.current_stream().wait_stream(self.reduction_stream)
for param_i, param in enumerate(list(self.module.parameters())):
def wrapper(param_i):
def allreduce_hook(*unused):
if self.needs_refresh:
self.record.append(param_i)
Variable._execution_engine.queue_callback(allreduce_params)
else:
Variable._execution_engine.queue_callback(flush_buckets)
self.param_state[self.record.index(param_i)] = 1
self.comm_ready_buckets()
if param.requires_grad:
param.register_hook(allreduce_hook)
wrapper(param_i)
def comm_ready_buckets(self):
ready = []
counter = 0
while counter < len(self.param_state) and self.param_state[counter] == 2:
counter += 1
while counter < len(self.param_state) and self.param_state[counter] == 1:
ready.append(counter)
counter += 1
if not ready:
return
grads = []
for ind in ready:
param_ind = self.record[ind]
if self.param_list[param_ind].grad is not None:
grads.append(self.param_list[param_ind].grad.data)
bucket = []
bucket_inds = []
while grads:
bucket.append(grads.pop(0))
bucket_inds.append(ready.pop(0))
cumm_size = 0
for ten in bucket:
cumm_size += ten.numel()
if cumm_size < self.message_size:
continue
evt = torch.cuda.Event()
evt.record(torch.cuda.current_stream())
evt.wait(stream=self.reduction_stream)
with torch.cuda.stream(self.reduction_stream):
flat_dist_call(bucket, dist.all_reduce)
for ind in bucket_inds:
self.param_state[ind] = 2
def forward(self, *inputs, **kwargs):
param_list = [param for param in list(self.module.parameters()) if param.requires_grad]
self.needs_refresh = True if not self.param_refs else any(
[param1 is not param2 for param1, param2 in zip(param_list, self.param_refs)]
)
if self.needs_refresh:
self.record = []
self.param_state = [0 for i in range(len(param_list))]
self.param_refs = param_list
self.needs_reduction = True
return self.module(*inputs, **kwargs)

View file

@ -1,196 +0,0 @@
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
#=====START: ADDED FOR DISTRIBUTED======
'''Add custom module for distributed'''
from distributed import DistributedDataParallel as DDP
'''Import distributed data loader'''
import torch.utils.data
import torch.utils.data.distributed
'''Import torch.distributed'''
import torch.distributed as dist
#=====END: ADDED FOR DISTRIBUTED======
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
help='how many batches to wait before logging training status')
#======START: ADDED FOR DISTRIBUTED======
'''
Add some distributed options. For explanation of dist-url and dist-backend please see
http://pytorch.org/tutorials/intermediate/dist_tuto.html
--world-size and --rank are required parameters as they will be used by the multiproc.py launcher
but do not have to be set explicitly.
'''
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--world-size', default=1, type=int,
help='Number of GPUs to use. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
parser.add_argument('--rank', default=0, type=int,
help='Used for multi-process training. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
#=====END: ADDED FOR DISTRIBUTED======
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
#======START: ADDED FOR DISTRIBUTED======
'''Add a convenience flag to see if we are running distributed'''
args.distributed = args.world_size > 1
'''Check that we are running with cuda, as distributed is only supported for cuda.'''
if args.distributed:
assert args.cuda, "Distributed mode requires running with CUDA."
if args.distributed:
'''
Set cuda device so everything is done on the right GPU.
THIS MUST BE DONE AS SOON AS POSSIBLE.
'''
torch.cuda.set_device(args.rank % torch.cuda.device_count())
'''Initialize distributed communication'''
dist.init_process_group(args.dist_backend, init_method=args.dist_url,
world_size=args.world_size)
#=====END: ADDED FOR DISTRIBUTED======
torch.manual_seed(args.seed)
if args.cuda:
torch.cuda.manual_seed(args.seed)
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
#=====START: ADDED FOR DISTRIBUTED======
'''
Change sampler to distributed if running distributed.
Shuffle data loader only if distributed.
'''
train_dataset = datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_dataset, sampler=train_sampler,
batch_size=args.batch_size, shuffle=(train_sampler is None), **kwargs
)
#=====END: ADDED FOR DISTRIBUTED======
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.conv2_drop = nn.Dropout2d()
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x)
model = Net()
if args.cuda:
model.cuda()
#=====START: ADDED FOR DISTRIBUTED======
'''
Wrap model in our version of DistributedDataParallel.
This must be done AFTER the model is converted to cuda.
'''
if args.distributed:
model = DDP(model)
#=====END: ADDED FOR DISTRIBUTED======
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
def train(epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data), Variable(target)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.data[0]))
def test():
model.eval()
test_loss = 0
correct = 0
for data, target in test_loader:
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data, volatile=True), Variable(target)
output = model(data)
test_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
correct += pred.eq(target.data.view_as(pred)).cpu().sum()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
for epoch in range(1, args.epochs + 1):
train(epoch)
test()

View file

@ -1,28 +0,0 @@
import torch
import sys
import subprocess
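# Launch one copy of the given script per visible GPU, filling in --world-size and --rank for each
# child process; stdout of every worker except rank 0 is redirected to a per-GPU log file.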
argslist = list(sys.argv)[1:]
world_size = torch.cuda.device_count()
if '--world-size' in argslist:
argslist[argslist.index('--world-size')+1] = str(world_size)
else:
argslist.append('--world-size')
argslist.append(str(world_size))
workers = []
for i in range(world_size):
if '--rank' in argslist:
argslist[argslist.index('--rank')+1] = str(i)
else:
argslist.append('--rank')
argslist.append(str(i))
stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w")
print(argslist)
p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
workers.append(p)
for p in workers:
p.wait()

View file

@ -1,2 +0,0 @@
torch
torchvision

View file

@ -1 +0,0 @@
python -m multiproc main.py

View file

@ -1,58 +0,0 @@
# Word-level language modeling RNN
This example is based on [PyTorch's Word-level language modeling RNN Example](https://github.com/pytorch/examples/tree/master/word_language_model).
This example trains a multi-layer RNN (Elman, GRU, or LSTM) on a language modeling task.
By default, the training script uses the Wikitext-2 dataset, which is provided with the example.
The trained model can then be used by the generate script to generate new text.
```bash
python main.py --cuda --epochs 6 # Train a LSTM on Wikitext-2 with CUDA, reaching perplexity of 117.61
python main.py --cuda --epochs 6 --tied # Train a tied LSTM on Wikitext-2 with CUDA, reaching perplexity of 110.44
python main.py --cuda --tied # Train a tied LSTM on Wikitext-2 with CUDA for 40 epochs, reaching perplexity of 87.17
python generate.py # Generate samples from the trained LSTM model.
```
The model uses the `nn.RNN` module (and its sister modules `nn.GRU` and `nn.LSTM`)
which will automatically use the cuDNN backend if run on CUDA with cuDNN installed.
During training, if a keyboard interrupt (Ctrl-C) is received,
training is stopped and the current model is evaluated against the test dataset.
The `main.py` script accepts the following arguments:
```bash
optional arguments:
-h, --help show this help message and exit
--data DATA location of the data corpus
--model MODEL type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)
--emsize EMSIZE size of word embeddings
--nhid NHID number of hidden units per layer
--nlayers NLAYERS number of layers
--lr LR initial learning rate
--clip CLIP gradient clipping
--epochs EPOCHS upper epoch limit
--batch-size N batch size
--bptt BPTT sequence length
--dropout DROPOUT dropout applied to layers (0 = no dropout)
--decay DECAY learning rate decay per epoch
--tied tie the word embedding and softmax weights
--seed SEED random seed
--cuda use CUDA
--log-interval N report interval
--save SAVE path to save the final model
```
With these arguments, a variety of models can be tested.
As an example, the following arguments produce slower but better models:
```bash
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 # Test perplexity of 80.97
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 --tied # Test perplexity of 75.96
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 # Test perplexity of 77.42
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied # Test perplexity of 72.30
```
Perplexities on PTB are equal to or better than
[Recurrent Neural Network Regularization (Zaremba et al. 2014)](https://arxiv.org/pdf/1409.2329.pdf)
and are similar to [Using the Output Embedding to Improve Language Models (Press & Wolf 2016)](https://arxiv.org/abs/1608.05859) and [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling (Inan et al. 2016)](https://arxiv.org/pdf/1611.01462.pdf), though both of these papers have improved perplexities by using a form of recurrent dropout ([variational dropout](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks)).
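A saved checkpoint can also be loaded directly in Python, the same way `generate.py` does. A minimal sketch, assuming the default `model.pt` filename produced by `--save`:
```python
import torch

# main.py saves the full model object with torch.save, so model.py must be importable here.
with open('model.pt', 'rb') as f:
    model = torch.load(f)
model.eval()  # disable dropout before evaluation or generation
```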

View file

@ -1,49 +0,0 @@
import os
import torch
class Dictionary(object):
def __init__(self):
self.word2idx = {}
self.idx2word = []
def add_word(self, word):
if word not in self.word2idx:
self.idx2word.append(word)
self.word2idx[word] = len(self.idx2word) - 1
return self.word2idx[word]
def __len__(self):
return len(self.idx2word)
class Corpus(object):
def __init__(self, path):
self.dictionary = Dictionary()
self.train = self.tokenize(os.path.join(path, 'train.txt'))
self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
self.test = self.tokenize(os.path.join(path, 'test.txt'))
def tokenize(self, path):
"""Tokenizes a text file."""
assert os.path.exists(path)
# Add words to the dictionary
with open(path, 'r') as f:
tokens = 0
for line in f:
words = line.split() + ['<eos>']
tokens += len(words)
for word in words:
self.dictionary.add_word(word)
# Tokenize file content
with open(path, 'r') as f:
ids = torch.LongTensor(tokens)
token = 0
for line in f:
words = line.split() + ['<eos>']
for word in words:
ids[token] = self.dictionary.word2idx[word]
token += 1
return ids

View file

@ -1,3 +0,0 @@
This is raw data from the wikitext-2 dataset.
See https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,36 +0,0 @@
import torch
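# Helpers for pseudo-fp16 training (see main.py): the model stores fp16 parameters while an fp32
# "master" copy is kept for the optimizer update; gradients are cast to fp32 before the update and
# the updated weights are copied back into the fp16 model.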
def params_to_type(params, totype):
new_params = []
for param in params:
new_params.append(param.type(totype))
return new_params
def params_to_16(params):
return params_to_type(params, torch.cuda.HalfTensor)
def params_to_32(params):
return params_to_type(params, torch.cuda.FloatTensor)
def clone_params(net):
new_params = []
for param in list(net.parameters()):
new_params.append(param.data.clone())
return new_params
def clone_grads(net):
new_params = []
for param in list(net.parameters()):
new_params.append(param.grad.data.clone())
return new_params
def copy_in_params(net, params):
net_params = list(net.parameters())
for i in range(len(params)):
net_params[i].data.copy_(params[i])

View file

@ -1,74 +0,0 @@
###############################################################################
# Language Modeling on Penn Tree Bank
#
# This file generates new sentences sampled from the language model
#
###############################################################################
import argparse
import torch
from torch.autograd import Variable
import data
parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Language Model')
# Model parameters.
parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--checkpoint', type=str, default='./model.pt',
help='model checkpoint to use')
parser.add_argument('--outf', type=str, default='generated.txt',
help='output file for generated text')
parser.add_argument('--words', type=int, default='1000',
help='number of words to generate')
parser.add_argument('--seed', type=int, default=1111,
help='random seed')
parser.add_argument('--cuda', action='store_true',
help='use CUDA')
parser.add_argument('--temperature', type=float, default=1.0,
help='temperature - higher will increase diversity')
parser.add_argument('--log-interval', type=int, default=100,
help='reporting interval')
args = parser.parse_args()
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
if not args.cuda:
print("WARNING: You have a CUDA device, so you should probably run with --cuda")
else:
torch.cuda.manual_seed(args.seed)
if args.temperature < 1e-3:
parser.error("--temperature has to be greater or equal 1e-3")
with open(args.checkpoint, 'rb') as f:
model = torch.load(f)
model.eval()
if args.cuda:
model.cuda()
else:
model.cpu()
corpus = data.Corpus(args.data)
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
if args.cuda:
input.data = input.data.cuda()
with open(args.outf, 'w') as outf:
for i in range(args.words):
output, hidden = model(input, hidden)
word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
word_idx = torch.multinomial(word_weights, 1)[0]
input.data.fill_(word_idx)
word = corpus.dictionary.idx2word[word_idx]
outf.write(word + ('\n' if i % 20 == 19 else ' '))
if i % args.log_interval == 0:
print('| Generated {}/{} words'.format(i, args.words))

View file

@ -1,236 +0,0 @@
# coding: utf-8
import argparse
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from fp16util import *
import data
import model
parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM Language Model')
parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
help='number of layers')
parser.add_argument('--lr', type=float, default=20,
help='initial learning rate')
parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
help='batch size')
parser.add_argument('--bptt', type=int, default=35,
help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
help='tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
help='random seed')
parser.add_argument('--cuda', action='store_true',
help='use CUDA')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
help='report interval')
parser.add_argument('--save', type=str, default='model.pt',
help='path to save the final model')
parser.add_argument('--fp16', action='store_true',
help='Run model in pseudo-fp16 mode (fp16 storage fp32 math).')
parser.add_argument('--loss_scale', type=float, default=1,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
args = parser.parse_args()
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
if not args.cuda:
print("WARNING: You have a CUDA device, so you should probably run with --cuda")
else:
torch.cuda.manual_seed(args.seed)
if args.fp16 and not args.cuda:
print("WARNING: --fp16 requires --cuda, ignoring --fp16 option")
###############################################################################
# Load data
###############################################################################
corpus = data.Corpus(args.data)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.
def batchify(data, bsz):
# Work out how cleanly we can divide the dataset into bsz parts.
nbatch = data.size(0) // bsz
# Trim off any extra elements that wouldn't cleanly fit (remainders).
data = data.narrow(0, 0, nbatch * bsz)
# Evenly divide the data across the bsz batches.
data = data.view(bsz, -1).t().contiguous()
if args.cuda:
data = data.cuda()
return data
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
# Build the model
###############################################################################
ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
model.type(torch.cuda.HalfTensor)
param_copy = params_to_32(clone_params(model))
elif args.cuda:
model.cuda()
criterion = nn.CrossEntropyLoss()
###############################################################################
# Training code
###############################################################################
def repackage_hidden(h):
"""Wraps hidden states in new Variables, to detach them from their history."""
if type(h) == Variable:
return Variable(h.data)
else:
return tuple(repackage_hidden(v) for v in h)
# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.
def get_batch(source, i, evaluation=False):
seq_len = min(args.bptt, len(source) - 1 - i)
data = Variable(source[i:i+seq_len], volatile=evaluation)
target = Variable(source[i+1:i+1+seq_len].view(-1))
return data, target
def evaluate(data_source):
# Turn on evaluation mode which disables dropout.
model.eval()
total_loss = 0
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(eval_batch_size)
for i in range(0, data_source.size(0) - 1, args.bptt):
data, targets = get_batch(data_source, i, evaluation=True)
output, hidden = model(data, hidden)
output_flat = output.view(-1, ntokens)
#total loss can overflow if accumulated in fp16.
total_loss += len(data) * criterion(output_flat, targets).data.float()
hidden = repackage_hidden(hidden)
return total_loss[0] / len(data_source)
def train():
# Turn on training mode which enables dropout.
model.train()
total_loss = 0
start_time = time.time()
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(args.batch_size)
for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
data, targets = get_batch(train_data, i)
# Starting each batch, we detach the hidden state from how it was previously produced.
# If we didn't, the model would try backpropagating all the way to start of the dataset.
hidden = repackage_hidden(hidden)
model.zero_grad()
output, hidden = model(data, hidden)
loss = criterion(output.view(-1, ntokens), targets)
loss = loss * args.loss_scale
loss.backward()
loss = loss / args.loss_scale
# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
if args.fp16 and args.cuda:
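# fp16 path: cast the (loss-scaled) fp16 gradients to fp32, apply the SGD update to the fp32
# master weights with the loss scale divided out, then copy the updated weights back into the
# fp16 model.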
grad = params_to_32(clone_grads(model))
for i, _ in enumerate(param_copy):
param_copy[i] = param_copy[i] - grad[i] * (lr/args.loss_scale)
copy_in_params(model, params_to_16(param_copy))
else:
for p in model.parameters():
p.data.add_(-lr/args.loss_scale, p.grad.data)
total_loss += loss.data
if batch % args.log_interval == 0 and batch > 0:
cur_loss = total_loss[0] / args.log_interval
elapsed = time.time() - start_time
print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
'loss {:5.2f} | ppl {:8.2f}'.format(
epoch, batch, len(train_data) // args.bptt, lr,
elapsed * 1000 / args.log_interval, cur_loss, math.exp(min(cur_loss, 20))))
total_loss = 0
start_time = time.time()
# Loop over epochs.
lr = args.lr
best_val_loss = None
# At any point you can hit Ctrl + C to break out of training early.
try:
for epoch in range(1, args.epochs+1):
epoch_start_time = time.time()
train()
val_loss = evaluate(val_data)
print('-' * 89)
print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
val_loss, math.exp(min(val_loss, 20))))
print('-' * 89)
# Save the model if the validation loss is the best we've seen so far.
if not best_val_loss or val_loss < best_val_loss:
with open(args.save, 'wb') as f:
torch.save(model, f)
best_val_loss = val_loss
else:
# Anneal the learning rate if no improvement has been seen in the validation dataset.
lr /= 4.0
except KeyboardInterrupt:
print('-' * 89)
print('Exiting from training early')
# Load the best saved model.
with open(args.save, 'rb') as f:
model = torch.load(f)
# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
test_loss, math.exp(test_loss)))
print('=' * 89)

View file

@ -1,59 +0,0 @@
import torch.nn as nn
from torch.autograd import Variable
class RNNModel(nn.Module):
"""Container module with an encoder, a recurrent module, and a decoder."""
def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
super(RNNModel, self).__init__()
self.drop = nn.Dropout(dropout)
self.encoder = nn.Embedding(ntoken, ninp)
if rnn_type in ['LSTM', 'GRU']:
self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
else:
try:
nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
except KeyError:
raise ValueError("""An invalid option for `--model` was supplied,
options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
self.decoder = nn.Linear(nhid, ntoken)
# Optionally tie weights as in:
# "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
# https://arxiv.org/abs/1608.05859
# and
# "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
# https://arxiv.org/abs/1611.01462
if tie_weights:
if nhid != ninp:
raise ValueError('When using the tied flag, nhid must be equal to emsize')
self.decoder.weight = self.encoder.weight
self.init_weights()
self.rnn_type = rnn_type
self.nhid = nhid
self.nlayers = nlayers
def init_weights(self):
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
self.decoder.bias.data.fill_(0)
self.decoder.weight.data.uniform_(-initrange, initrange)
def forward(self, input, hidden):
emb = self.drop(self.encoder(input))
output, hidden = self.rnn(emb, hidden)
output = self.drop(output)
decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
def init_hidden(self, bsz):
weight = next(self.parameters()).data
if self.rnn_type == 'LSTM':
return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
else:
return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())

View file

@ -1 +0,0 @@
torch

View file

@ -0,0 +1,4 @@
MaskRCNN PyTorch
This repository includes software from https://github.com/facebookresearch/maskrcnn-benchmark
licensed under the MIT License.

View file

@ -0,0 +1,480 @@
# Mask R-CNN For PyTorch
This repository provides a script and recipe to train and run inference on Mask R-CNN to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [The model](#the-model)
* [Default configuration](#default-configuration)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick start guide](#quick-start-guide)
* [Details](#details)
* [Command line arguments](#command-line-arguments)
* [Getting the data](#getting-the-data)
* [Training process](#training-process)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Benchmarking](#benchmarking)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
* [Inference performance results](#inference-performance-results)
* [NVIDIA DGX-1 16G (1x V100 16G)](#nvidia-dgx-1-16g-1x-v100-16g)
* [NVIDIA DGX-1 32G (1x V100 32G)](#nvidia-dgx-1-32g-1x-v100-32g)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## The model
Mask R-CNN is a convolution-based neural network for the task of object instance segmentation. The paper describing the model can be found [here](https://arxiv.org/abs/1703.06870). NVIDIA's Mask R-CNN 19.2 is an optimized version of [Facebook's implementation](https://github.com/facebookresearch/maskrcnn-benchmark), leveraging mixed precision arithmetic and Tensor Cores on V100 GPUs for 1.3x faster training times while maintaining target accuracy. Because this model trains with mixed precision Tensor Cores on Volta, researchers can get results much faster than training without Tensor Cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
The repository also contains scripts to interactively launch training, benchmarking and inference routines in a Docker container.
The major differences between the official implementation of the paper and our version of Mask R-CNN are as follows:
- Mixed precision support with [PyTorch AMP](https://github.com/NVIDIA/apex).
- Gradient accumulation to simulate larger batches.
- Custom fused CUDA kernels for faster computations.
These techniques/optimizations improve model performance and reduce training time by a factor of 1.3x, allowing you to perform more efficient instance segmentation with no additional effort.
Other publicly available implementations of Mask R-CNN include:
- [Matterport](https://github.com/matterport/Mask_RCNN)
- [Tensorpack](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN)
- [Google's TensorFlow model](https://github.com/tensorflow/models/tree/master/research/object_detection)
### Default Configuration
The default configuration of this model can be found at `pytorch/maskrcnn_benchmark/config/defaults.py`. The default hyper-parameters are as follows:
- General:
- Base Learning Rate set to 0.001
- Global batch size set to 16 images
- Steps set to 30000
- Images are resized with aspect ratio maintained and the shorter side length between 800 and 1333 pixels
- Global train batch size - 16
- Global test batch size - 8
- Feature extractor:
- Backbone network set to Resnet50_conv4
- Backbone network weights are frozen after second epoch
- Region Proposal Network (RPN):
- Anchor stride set to 16
- Anchor sizes set to (32, 64, 128, 256, 512)
- Foreground IOU Threshold set to 0.7, Background IOU Threshold set to 0.5
- RPN target fraction of positive proposals set to 0.5
- Train Pre-NMS Top proposals set to 12000
- Train Post-NMS Top proposals set to 2000
- Test Pre-NMS Top proposals set to 6000
- Test Post-NMS Top proposals set to 1000
- RPN NMS Threshold set to 0.7
- RoI heads:
- Foreground threshold set to 0.5
- Batch size per image set to 512
- Positive fraction of batch set to 0.25
This repository implements multi-GPU training and gradient accumulation to support larger batches, as well as mixed precision training. This implementation also includes the following optimizations:
- Target generation - Optimized GPU implementation for generating binary mask ground truths from the list of polygon coordinates that exist in the dataset.
- Custom CUDA kernels for:
- Box Intersection over Union (IoU) computation
- Proposal matcher
- Generate anchor boxes
- Pre NMS box selection - Selection of RoIs based on objectness score before NMS is applied.
The source files can be found under `maskrcnn_benchmark/csrc/cuda`.
## Setup
The following sections list the requirements in order to start training the Mask R-CNN model.
### Requirements
This repository contains a `Dockerfile` which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [PyTorch 19.02-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
- [NVIDIA Volta based GPU](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
- [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
## Quick Start Guide
To train your model using mixed precision with tensor cores or using FP32, perform the following steps using the default parameters of the Mask R-CNN model on the COCO 2014 dataset.
### 1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/PyTorch/Segmentation/MaskRCNN
```
### 2. Download and preprocess the dataset.
This repository provides scripts to download and extract the COCO 2014 dataset. Data will be downloaded to the current working directory on the host and extracted to a user-defined directory.
To download, verify, and extract the COCO dataset, use the following scripts:
```
cd Detectron_PyT
./download_dataset.sh <data/dir>
```
By default, the data is organized into the following structure:
```
<data/dir>
annotations/
instances_train2014.json
instances_val2014.json
train2014/
COCO_train2014_*.jpg
val2014/
COCO_val2014_*.jpg
```
### 3. Build the Mask R-CNN PyTorch NGC container.
```
bash scripts/docker/build.sh
```
### 4. Start an interactive session in the NGC container to run training/inference.
After you build the container image, you can start an interactive CLI session with
```
bash scripts/docker/interactive.sh <path/to/dataset/>
```
The `interactive.sh` script requires that the location on the dataset is specified. For example, `/home/<USER>/Detectron_PyT/detectron/lib/datasets/data/coco`
### 5. Start training.
```
bash scripts/train.sh
```
The `train.sh` script trains a model and performs evaluation on the COCO 2014 dataset. By default, the training script:
- Uses 8 GPUs.
- Saves a checkpoint every 2500 iterations and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
- Mixed precision training with Tensor Cores is invoked by adding `DTYPE \"float16\"` to the end of the above command as shown in the train script. This will override the default `DTYPE` configuration which is float32.
The `scripts/train.sh` script runs the following Python command:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml"
```
### 6. Start validation/evaluation.
```
bash scripts/eval.sh
```
Model evaluation on a checkpoint can be launched by running the `pytorch/scripts/eval.sh` script. The script requires:
- the location of the checkpoint folder to be specified and present within/mounted to the container.
- a text file named `last_checkpoint` which contains the path to the latest checkpoint. This mechanism is required in order to resume training from the latest checkpoint.
- The `last_checkpoint` file is automatically created at the end of the training process.
By default, evaluation is performed on the test dataset once training is complete. To skip evaluation at the end of training, issue the `--skip-test` flag.
Additionally, to perform evaluation after every epoch and terminate training on reaching a minimum required mAP score, set
- `PER_EPOCH_EVAL = True`
- `MIN_BBOX_MAP = <required value>`
- `MIN_MASK_MAP = <required value>`
### 7. Start inference/predictions.
Model predictions can be obtained on a test dataset and a model checkpoint by running the `scripts/inference.sh <config/file/path>` script. The script requires:
- the location of the checkpoint folder and dataset to be specified and present within/mounted to the container.
- a text file named `last_checkpoint` which contains the path to the checkpoint.
For example:
```
bash scripts/inference.sh configs/e2e_mask_rcnn_R_50_FPN_1x.yaml
```
Model predictions get saved in the `<OUTPUT_DIR>/inference` directory.
To perform inference and skip computation of mAP scores, issue the `--skip-eval` flag. Performance is reported in seconds per iteration per GPU. The benchmarking scripts can be used to extract frames per second on training and inference.
## Details
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Command line arguments
You can modify the training behaviour through the various flags in both the `train_net.py` script and through overriding specific parameters in the YAML config files. Flags in the `train_net.py` script are as follows:
`--config-file` - path to the config file containing model parameters
`--skip-test` - skips model testing after training
`--skip-eval` - skips computation of mAP scores
`--opts` - allows you to override specific parameters in the config file
For example:
```
python -m torch.distributed.launch --nproc_per_node=2 tools/train_net.py \
--config-file configs/e2e_faster_rcnn_R_50_FPN_1x.yaml \
--skip-eval \
DTYPE "float16" \
OUTPUT_DIR RESULTS \
SOLVER.BASE_LR 0.002 \
SOLVER.STEPS "(360000, 480000)"
```
### Getting the data
The Mask R-CNN model was trained on the [COCO 2014](http://cocodataset.org/#download) dataset. This dataset comes with a training and validation set.
This repository contains the `./download_dataset.sh`,`./verify_dataset.sh`, and `./extract_dataset.sh` scripts which automatically download and preprocess the training and validation sets.
In order to run on your own dataset, ensure your dataset is present/mounted to the Docker container with the following hierarchy:
```
my_dataset/
images_train/
images_val/
instances_train.json
instances_val.json
```
and add it to the `DATASETS` dictionary in `maskrcnn_benchmark/config/paths_catalog.py`:
```
DATASETS = {
"my_dataset_train": {
"img_dir": "data/images_train",
"ann_file": "data/instances_train.json"
},
"my_dataset_val": {
"img_dir": "data/images_val",
"ann_file": "data/instances_val.json"
},
}
```
Training on the custom dataset can then be launched with a command such as:
```
python -m torch.distributed.launch --nproc_per_node=<NUM_GPUS> tools/train_net.py \
--config-file <CONFIG> \
DATASETS.TRAIN "(\"my_dataset_train\")"\
DATASETS.TEST "(\"my_dataset_val\")"\
DTYPE "float16" \
OUTPUT_DIR <RESULTS> \
| tee <LOGFILE>
```
### Training Process
Training is performed using the `tools/train_net.py` script along with parameters defined in the config file. The default config files can be found in the `pytorch/configs/` directory.
The `e2e_mask_rcnn_R_50_FPN_1x.yaml` file was used to gather accuracy and performance metrics. This configuration sets the following parameters:
- Backbone weights to ResNet-50
- Feature extractor set to ResNet-50 with Feature Pyramid Networks (FPN)
- RPN uses FPN
- RoI Heads use FPN
- Dataset - COCO 2014
- Base Learning Rate - 0.02
- Global train batch size - 16
- Global test batch size - 8
- RPN batch size - 256
- ROI batch size - 512
- Solver steps - (60000, 80000)
- Max iterations - 90000
- Warmup iterations - 500
- Warmup factor = 0.33
- Initial learning rate = Base Learning Rate x Warmup factor
The default feature extractor can be changed by setting `CONV_BODY` parameter in `yaml` file to any of the following:
- R-50-C4
- R-50-C5
- R-101-C4
- R-101-C5
- R-101-FPN
The default backbone can be changed to a flavor of Resnet-50 or ResNet-101 by setting `WEIGHT` parameter in `yaml` file to any of the following:
- "catalog://ImageNetPretrained/MSRA/R-50-GN"
- "catalog://ImageNetPretrained/MSRA/R-101"
- "catalog://ImageNetPretrained/MSRA/R-101-GN"
This script outputs results to the current working directory by default. However, this can be changed by adding `OUTPUT_DIR <DIR_NAME>` to the end of the default command. Logs produced during training are also stored in the `OUTPUT_DIR` specified. The training log will contain information about:
- Loss, time per iteration, learning rate and memory metrics
- performance values such as time per step
- test accuracy and test performance values after evaluation
The training logs are located in the `<OUTPUT_DIR>/log` directory. The summary after each training epoch is printed in the following format:
```
INFO:maskrcnn_benchmark.trainer:eta: 4:42:15 iter: 20 loss: 1.8236 (2.7274) loss_box_reg: 0.0249 (0.0620) loss_classifier: 0.6086 (1.2918) loss_mask: 0.6996 (0.8026) loss_objectness: 0.5373 (0.4787) loss_rpn_box_reg: 0.0870 (0.0924) time: 0.2002 (0.3765) data: 0.0099 (0.1242) lr: 0.014347 max mem: 3508
```
The mean and median training losses are reported every 20 steps.
Multi-GPU and multi-node training are enabled with the PyTorch distributed launch module. The following example runs training on 8 GPUs:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file \"configs/e2e_mask_rcnn_R_50_FPN_1x.yaml\"
```
We have tested batch sizes of up to 4 on a 16 GB V100 and up to 16 on a 32 GB V100 with mixed precision. The repository also implements gradient accumulation functionality to simulate bigger batches. The following command can be used to run a batch of 64:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file \"configs/e2e_mask_rcnn_R_50_FPN_1x.yaml\" SOLVER.ACCUMULATE_GRAD True SOLVER.ACCUMULATE_STEPS 4
```
By default, training is performed using FP32, however training time can be reduced using tensor cores and mixed precision. This can be done by adding `DTYPE \"float16\"` to override the respective parameter in the config file.
__Note__: When training a global batch size >= 32, it is recommended to additionally set the following parameters:
- `SOLVER.WARMUP_ITERS 625`
- `SOLVER.WARMUP_FACTOR 0.01`
When experimenting with different global batch sizes for training and inference, make sure `SOLVER.IMS_PER_BATCH` and `TEST.IMS_PER_BATCH` are divisible by the number of GPUs.
#### Other training options
A sample single GPU config is provided under `configs/e2e_mask_rcnn_R_50_FPN_1x_1GPU.yaml`
For multi-gpu runs, `-m torch.distributed.launch --nproc_per_node num_gpus` is added prior to `tools/train_net.py`. For example, for an 8 GPU run:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml"
```
Training is terminated when either the required accuracies specified on the command line are reached or if the number of training iterations specified is reached.
To terminate training on reaching target accuracy on 8 GPUs, run:
```
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" PER_EPOCH_EVAL True MIN_BBOX_MAP 0.377 MIN_MASK_MAP 0.342
```
__Note__: The score is always the Average Precision (AP) at
- IoU = 0.50:0.95
- Area = all (includes small, medium and large)
- maxDets = 100
## Enabling mixed precision
[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP) library from [APEX](https://github.com/NVIDIA/apex), which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied easily by using the `scale_loss()` method provided by AMP. The scaling value can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
For an in-depth walk-through of AMP, check out sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Core performance.
To enable mixed precision, you can:
- Import AMP from APEX, for example:
```
from apex import amp
```
- Initialize an AMP handle, for example:
```
amp_handle = amp.init(enabled=True, verbose=True)
```
- Wrap your optimizer with the AMP handle, for example:
```
optimizer = amp_handle.wrap_optimizer(optimizer)
```
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
- Default backpropagate for FP32:
```
losses.backward()
```
- Scale loss and backpropagate with AMP:
```
with optimizer.scale_loss(losses) as scaled_losses:
scaled_losses.backward()
```
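Putting these pieces together, a minimal training-step sketch might look like the following. The `model`, `optimizer`, and `data_loader` names are placeholders, the per-head losses are summed as described earlier in this README, and the AMP calls are the ones shown in the snippets above.
```
from apex import amp

amp_handle = amp.init(enabled=True, verbose=True)
optimizer = amp_handle.wrap_optimizer(optimizer)

for images, targets in data_loader:
    loss_dict = model(images, targets)                 # per-head losses (box, class, mask, ...)
    losses = sum(loss for loss in loss_dict.values())
    optimizer.zero_grad()
    with optimizer.scale_loss(losses) as scaled_losses:
        scaled_losses.backward()                       # gradients are scaled to preserve small values
    optimizer.step()
```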
For information about:
- how to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training) blog.
## Benchmarking
Benchmarking can be performed for both training and inference. Both scripts run the Mask R-CNN model using the parameters defined in `configs/e2e_mask_rcnn_R_50_FPN_1x.yaml`. You can specify whether benchmarking is performed in FP16 or FP32 by specifying it as an argument to the benchmarking scripts.
Training benchmarking can be performed by running the script:
```
scripts/train_benchmark.sh <float16/float32>
```
Inference benchmarking can be performed by running the script:
```
scripts/inference_benchmark.sh <float16/float32>
```
## Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
### Training Accuracy Results
Our results were obtained by running the `tools/train_net.py` training script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
| **number of GPUs** | **batch size/GPU** | **Training time with FP16(hours)** | **Training time with FP32(hours)** |
| --- | --- | ----- | ----- |
| 8 | 4 | 4.81 | 6.35 |
LOSS CURVE:
![Loss Curve](./img/loss_curve.png)
Here, multihead loss is simply the summation of losses on the mask head and the bounding box head.
ACCURACY CURVE:
![Accuracy Curve](./img/accuracy_curve.png)
#### Training Stability Test
The following tables compare mAP scores across 5 different training runs with different seeds, for both FP16 and FP32 respectively. The runs showcase consistent convergence on all 5 seeds with very little deviation.
| **Config** | **Seed #1** | **Seed #2** | **Seed #3** | **Seed #4** | **Seed #5** | **mean** | **std** |
| --- | --- | ----- | ----- | --- | --- | ----- | ----- |
| 8 GPUs, fp16, final AP BBox | 0.377 | 0.376 | 0.376 | 0.378 | 0.377 | 0.377 | 0.001 |
| 8 GPUs, fp16, final AP Segm | 0.343 | 0.342 | 0.341 | 0.343 | 0.343 | 0.342 | 0.001 |
| **Config** | **Seed #1** | **Seed #2** | **Seed #3** | **Seed #4** | **Seed #5** | **mean** | **std** |
| --- | --- | ----- | ----- | --- | --- | ----- | ----- |
| 8 GPUs, fp32, final AP BBox | 0.377 | 0.377 | 0.376 | 0.378 | 0.378 | 0.377 | 0.001 |
| 8 GPUs, fp32, final AP Segm | 0.344 | 0.342 | 0.343 | 0.343 | 0.343 | 0.342 | 0.001 |
### Training Performance Results
#### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch.
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 2 | 8.47 | 10.77 | 1.27 | 1 | 1 |
| 4 | 2 | 30.23 | 36.88 | 1.22 | 3.67 | 3.53 |
| 8 | 2 | 56.35 | 70.45 | 1.25 | 6.96 | 6.51 |
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 4 | 9.29 | 12.73 | 1.37 | 1 | 1 |
| 4 | 4 | 34.07 | 44.95 | 1.32 | 3.67 | 3.53 |
| 8 | 4 | 62.7 | 82.9 | 1.32 | 6.75 | 6.51 |
To achieve these same results, follow the [Quick start guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs. Performance numbers (in items/images per second) were averaged over an entire training epoch.
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 4 | 9.06 | 13.14 | 1.45 | 1 | 1 |
| 4 | 4 | 32.87 | 50.70 | 1.54 | 3.86 | 3.63 |
| 8 | 4 | 62.93 | 82.30 | 1.31 | 6.94 | 6.26 |
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 8 | 9.35 | 13.05 | 1.40 | 1 | 1 |
| 4 | 8 | 33.38 | 46.69 | 1.40 | 3.57 | 3.57 |
| 8 | 8 | 71.85 | 87.10 | 1.21 | 7.68 | 7.68 |
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
| --- | --- | ----- | ----- | --- | --- | ----- |
| 1 | 16 | NA | 13.82 | NA | NA | 1 |
| 4 | 16 | NA | 48.41 | NA | NA | 3.50 |
| 8 | 16 | NA | 89.33 | NA | NA | 6.46 |
Note that values for FP32 runs with a batch size of 16 are not available due to out-of-memory errors; a batch size of 16 is only possible with FP16.
To achieve these same results, follow the [Quick start guide](#quick-start-guide) outlined above.
### Inference performance results
#### NVIDIA DGX-1 16G (1x V100 16G)
Our results were obtained by running the `scripts/inference.sh` inference script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 1x V100 16G GPU. Performance numbers (in items/images per second) were averaged over an entire inference run.
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speedup** |
| --- | --- | ----- | ----- | ----- |
| 1 | 8 | 15.3 | 16.94 | 1.107 |
To achieve these same results, follow the [Quick start guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-1 32G (1x V100 32G)
Our results were obtained by running the `scripts/inference.sh <config/file/path>` inference script in the PyTorch 19.02-py3 NGC container on NVIDIA DGX-1 with 1x V100 32G GPU. Performance numbers (in items/images per second) were averaged over an entire inference run.
| **number of GPUs** | **batch size/GPU** | **FP 32 items/sec** | **FP16 items/sec** | **Speedup** |
| --- | --- | ----- | ----- | ----- |
| 1 | 8 | 14.43 | 16.33 | 1.13 |
To achieve these same results, follow the [Quick start guide](#quick-start-guide) outlined above.
## Changelog
March 2019
- Initial release
## Known Issues
There are no known issues with this model.

View file

@ -0,0 +1,28 @@
DATA_DIR=$1
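# Download the COCO 2014 images and annotations, verify them against hashes.md5 (expected in the
# current directory), then move and extract them into $1. Note: the target directory must already
# exist, since the mkdir below is commented out.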
wget https://dl.fbaipublicfiles.com/detectron/coco/coco_annotations_minival.tgz
wget http://images.cocodataset.org/zips/train2014.zip
wget http://images.cocodataset.org/zips/val2014.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
if md5sum -c hashes.md5
then
echo "DOWNLOAD PASSED"
# mkdir $DATA_DIR
mv coco_annotations_minival.tgz $DATA_DIR
mv train2014.zip $DATA_DIR
mv val2014.zip $DATA_DIR
mv annotations_trainval2014.zip $DATA_DIR
cd $DATA_DIR
dtrx --one=here coco_annotations_minival.tgz
dtrx --one=here annotations_trainval2014.zip
mv annotations.1/* annotations/
dtrx train2014.zip
dtrx val2014.zip
echo "EXTRACTION COMPLETE"
else
echo "DOWNLOAD FAILED HASHCHECK"
fi

View file

@ -0,0 +1 @@
wget https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl

View file

@ -0,0 +1,4 @@
2d2b9d2283adb5e3b8d25eec88e65064 coco_annotations_minival.tgz
0da8c0bd3d6becc4dcb32757491aca88 train2014.zip
a3d79f5ed8d289b7a7554ce06a5782b3 val2014.zip
0a379cfc70b0e71301e0f377548639bd annotations_trainval2014.zip

Binary file not shown (image added: 55 KiB)

Binary file not shown (image added: 40 KiB)

View file

@ -0,0 +1,65 @@
## Abstractions
The main abstractions introduced by `maskrcnn_benchmark` that are useful to
have in mind are the following:
### ImageList
In PyTorch, the first dimension of the input to the network generally represents
the batch dimension, and thus all elements of the same batch have the same
height / width.
In order to support images with different sizes and aspect ratios in the same
batch, we created the `ImageList` class, which holds internally a batch of
images (of possibly different sizes). The images are padded with zeros such that
they have the same final size and batched over the first dimension. The original
sizes of the images before padding are stored in the `image_sizes` attribute,
and the batched tensor in `tensors`.
We provide a convenience function `to_image_list` that accepts a few different
input types, including a list of tensors, and returns an `ImageList` object.
```python
import torch

from maskrcnn_benchmark.structures.image_list import to_image_list

images = [torch.rand(3, 100, 200), torch.rand(3, 150, 170)]
batched_images = to_image_list(images)
# it is also possible to make the final batched image be a multiple of a number
batched_images_32 = to_image_list(images, size_divisible=32)
```
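As a quick illustration of the attributes mentioned above (a sketch; the exact padded shapes depend on the inputs), the padded batch is stored in `tensors` and the pre-padding sizes in `image_sizes`:

```python
# Continuing the example above (illustrative).
print(batched_images.tensors.shape)     # padded to the largest image in the batch, e.g. (2, 3, 150, 200)
print(batched_images.image_sizes)       # original, pre-padding sizes of the two images
print(batched_images_32.tensors.shape)  # spatial dims rounded up to a multiple of 32
```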
### BoxList
The `BoxList` class holds a set of bounding boxes (represented as a `Nx4` tensor) for
a specific image, as well as the size of the image as a `(width, height)` tuple.
It also contains a set of methods that allow performing geometric
transformations on the bounding boxes (such as cropping, scaling and flipping).
The class accepts bounding boxes from two different input formats:
- `xyxy`, where each box is encoded by its `x1`, `y1`, `x2` and `y2` coordinates, and
- `xywh`, where each box is encoded as `x1`, `y1`, `w` and `h`.
Each `BoxList` instance can also hold arbitrary additional information
for each bounding box, such as labels, visibility, probability scores, etc.
Here is an example of how to create a `BoxList` from a list of coordinates:
```python
import torch

from maskrcnn_benchmark.structures.bounding_box import BoxList, FLIP_LEFT_RIGHT
width = 100
height = 200
boxes = [
[0, 10, 50, 50],
[50, 20, 90, 60],
[10, 10, 50, 50]
]
# create a BoxList with 3 boxes
bbox = BoxList(boxes, image_size=(width, height), mode='xyxy')
# perform some box transformations; the API is similar to PIL.Image
bbox_scaled = bbox.resize((width * 2, height * 3))
bbox_flipped = bbox.transpose(FLIP_LEFT_RIGHT)
# add labels for each bbox
labels = torch.tensor([0, 10, 1])
bbox.add_field('labels', labels)
# BoxList also supports a few operations, like indexing
# here, we select boxes 0 and 2
bbox_subset = bbox[[0, 2]]
```
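Since boxes can be given in either `xyxy` or `xywh`, a natural follow-up is converting between the two encodings and reading fields back. A short sketch, assuming the `convert`, `fields` and `get_field` helpers behave as in the upstream maskrcnn-benchmark:

```python
# Continuing the example above (sketch; assumes upstream BoxList helpers).
bbox_xywh = bbox.convert('xywh')   # same boxes, re-encoded as x1, y1, w, h
print(bbox_xywh.bbox)              # underlying Nx4 tensor of coordinates
print(bbox.fields())               # names of the extra per-box fields, e.g. ['labels']
print(bbox.get_field('labels'))    # the labels tensor added above
```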

View file

@ -0,0 +1,5 @@
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
Please read the [full text](https://code.fb.com/codeofconduct/)
so that you can understand what actions will and will not be tolerated.

View file

@ -0,0 +1,39 @@
# Contributing to Mask-RCNN Benchmark
We want to make contributing to this project as easy and transparent as
possible.
## Our Development Process
Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `master`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 spaces for indentation rather than tabs
* 80 character line length
* PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/)
## License
By contributing to Mask-RCNN Benchmark, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.

View file

@ -0,0 +1,37 @@
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.02-py3
FROM ${FROM_IMAGE_NAME}
# Install Python dependencies
RUN pip install --upgrade --no-cache-dir pip \
&& pip install --no-cache-dir \
mlperf-compliance==0.0.10 \
opencv-python==3.4.1.15 \
yacs
WORKDIR /opt
RUN git clone -b v0.1 https://github.com/NVIDIA/cocoapi.git \
&& cd cocoapi/PythonAPI \
&& pip install -e .
# Copy detectron code and build
WORKDIR /workspace/object_detection
RUN mkdir -p /datasets/coco
RUN mkdir /results
COPY . .
RUN pip install -e .
ENV OMP_NUM_THREADS=1

View file

@ -0,0 +1,78 @@
## Installation
### Requirements:
- PyTorch 1.0 from a nightly release. Installation instructions can be found at https://pytorch.org/get-started/locally/
- torchvision from master
- cocoapi
- yacs
- matplotlib
- GCC >= 4.9
- (optional) OpenCV for the webcam demo
### Option 1: Step-by-step installation
```bash
# first, make sure that your conda is set up properly with the right environment
# for that, check that `which conda`, `which pip` and `which python` point to the
# right paths. From a clean conda env, this is what you need to do
conda create --name maskrcnn_benchmark
source activate maskrcnn_benchmark
# this installs the right pip and dependencies for the fresh python
conda install ipython
# maskrcnn_benchmark and coco api dependencies
pip install ninja yacs cython matplotlib
# follow PyTorch installation in https://pytorch.org/get-started/locally/
# we give the instructions for CUDA 9.0
conda install pytorch-nightly -c pytorch
# install torchvision
cd ~/github
git clone https://github.com/pytorch/vision.git
cd vision
python setup.py install
# install pycocotools
cd ~/github
git clone https://github.com/cocodataset/cocoapi.git
cd cocoapi/PythonAPI
python setup.py build_ext install
# install apex
cd ~/github
git clone https://github.com/NVIDIA/apex.git
cd apex
python setup.py install --cuda_ext --cpp_ext
# install PyTorch Detection
cd ~/github
git clone https://github.com/facebookresearch/maskrcnn-benchmark.git
cd maskrcnn-benchmark
# the following will install the lib with
# symbolic links, so that you can modify
# the files if you want and won't need to
# re-build it
python setup.py build develop
# or if you are on macOS
# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop
```
### Option 2: Docker Image (Requires CUDA, Linux only)
Build image with defaults (`CUDA=9.0`, `CUDNN=7`):
nvidia-docker build -t maskrcnn-benchmark docker/
Build image with other CUDA and CUDNN versions:
nvidia-docker build -t maskrcnn-benchmark --build-arg CUDA=9.2 --build-arg CUDNN=7 docker/
Build and run the image with a built-in Jupyter notebook (note that the password is used to log in to the notebook):
nvidia-docker build -t maskrcnn-benchmark-jupyter docker/docker-jupyter/
nvidia-docker run -td -p 8888:8888 -e PASSWORD=<password> -v <host-dir>:<container-dir> maskrcnn-benchmark-jupyter

View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 Facebook
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,82 @@
## Model Zoo and Baselines
### Hardware
- 8 NVIDIA V100 GPUs
### Software
- PyTorch version: 1.0.0a0+dd2c487
- CUDA 9.2
- CUDNN 7.1
- NCCL 2.2.13-1
### End-to-end Faster and Mask R-CNN baselines
All the baselines were trained using the exact same experimental setup as in Detectron.
We initialize the detection models with ImageNet weights from Caffe2, the same as used by Detectron.
The pre-trained models are available through the links in the model id column; a short loading sketch follows the table below.
backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | mask AP | model id
-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | --
R-50-C4 | Fast | 1x | 1 | 5.8 | 0.4036 | 20.2 | 0.17130 | 34.8 | - | [6358800](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_C4_1x.pth)
R-50-FPN | Fast | 1x | 2 | 4.4 | 0.3530 | 8.8 | 0.12580 | 36.8 | - | [6358793](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_FPN_1x.pth)
R-101-FPN | Fast | 1x | 2 | 7.1 | 0.4591 | 11.5 | 0.143149 | 39.1 | - | [6358804](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_101_FPN_1x.pth)
X-101-32x8d-FPN | Fast | 1x | 1 | 7.6 | 0.7007 | 35.0 | 0.209965 | 41.2 | - | [6358717](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_X_101_32x8d_FPN_1x.pth)
R-50-C4 | Mask | 1x | 1 | 5.8 | 0.4520 | 22.6 | 0.17796 + 0.028 | 35.6 | 31.5 | [6358801](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_C4_1x.pth)
R-50-FPN | Mask | 1x | 2 | 5.2 | 0.4536 | 11.3 | 0.12966 + 0.034 | 37.8 | 34.2 | [6358792](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_FPN_1x.pth)
R-101-FPN | Mask | 1x | 2 | 7.9 | 0.5665 | 14.2 | 0.15384 + 0.034 | 40.1 | 36.1 | [6358805](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_101_FPN_1x.pth)
X-101-32x8d-FPN | Mask | 1x | 1 | 7.8 | 0.7562 | 37.8 | 0.21739 + 0.034 | 42.2 | 37.8 | [6358718](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_X_101_32x8d_FPN_1x.pth)
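The linked checkpoints are regular PyTorch `.pth` files. As a minimal sketch (the exact layout of each checkpoint, e.g. whether the weights sit under a `model` key, is an assumption), one of them can be fetched and inspected like this:

```python
from torch.utils.model_zoo import load_url

# Sketch: download the R-50-FPN Faster R-CNN checkpoint from the table above and inspect it.
url = "https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_FPN_1x.pth"
checkpoint = load_url(url, map_location="cpu")
# Some checkpoints wrap the weights under a "model" key; otherwise fall back to the dict itself.
state_dict = checkpoint.get("model", checkpoint)
print(len(state_dict), "entries in the checkpoint")
```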
## Comparison with Detectron and mmdetection
In the following section, we compare our implementation with [Detectron](https://github.com/facebookresearch/Detectron)
and [mmdetection](https://github.com/open-mmlab/mmdetection).
The same remarks from [mmdetection](https://github.com/open-mmlab/mmdetection/blob/master/MODEL_ZOO.md#training-speed)
about different hardware apply here.
### Training speed
The numbers here are in seconds / iteration. The lower, the better.
type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100)
-- | -- | -- | --
Faster R-CNN R-50 C4 | 0.566 | - | 0.4036
Faster R-CNN R-50 FPN | 0.544 | 0.554 | 0.3530
Faster R-CNN R-101 FPN | 0.647 | - | 0.4591
Faster R-CNN X-101-32x8d FPN | 0.799 | - | 0.7007
Mask R-CNN R-50 C4 | 0.620 | - | 0.4520
Mask R-CNN R-50 FPN | 0.889 | 0.690 | 0.4536
Mask R-CNN R-101 FPN | 1.008 | - | 0.5665
Mask R-CNN X-101-32x8d FPN | 0.961 | - | 0.7562
### Training memory
The numbers here are training memory in GB. The lower, the better.
type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100)
-- | -- | -- | --
Faster R-CNN R-50 C4 | 6.3 | - | 5.8
Faster R-CNN R-50 FPN | 7.2 | 4.9 | 4.4
Faster R-CNN R-101 FPN | 8.9 | - | 7.1
Faster R-CNN X-101-32x8d FPN | 7.0 | - | 7.6
Mask R-CNN R-50 C4 | 6.6 | - | 5.8
Mask R-CNN R-50 FPN | 8.6 | 5.9 | 5.2
Mask R-CNN R-101 FPN | 10.2 | - | 7.9
Mask R-CNN X-101-32x8d FPN | 7.7 | - | 7.8
### Accuracy
The numbers are box AP (and, for Mask R-CNN rows, box AP & mask AP). The higher, the better.
type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100)
-- | -- | -- | --
Faster R-CNN R-50 C4 | 34.8 | - | 34.8
Faster R-CNN R-50 FPN | 36.7 | 36.7 | 36.8
Faster R-CNN R-101 FPN | 39.4 | - | 39.1
Faster R-CNN X-101-32x8d FPN | 41.3 | - | 41.2
Mask R-CNN R-50 C4 | 35.8 & 31.4 | - | 35.6 & 31.5
Mask R-CNN R-50 FPN | 37.7 & 33.9 | 37.5 & 34.4 | 37.8 & 34.2
Mask R-CNN R-101 FPN | 40.0 & 35.9 | - | 40.1 & 36.1
Mask R-CNN X-101-32x8d FPN | 42.1 & 37.3 | - | 42.2 & 37.8

View file

@ -0,0 +1,67 @@
# Troubleshooting
Here is a compilation of common issues that you might face
while compiling / running this code:
## Compilation errors when compiling the library
If you encounter build errors like the following:
```
/usr/include/c++/6/type_traits:1558:8: note: provided for template<class _From, class _To> struct std::is_convertible
struct is_convertible
^~~~~~~~~~~~~~
/usr/include/c++/6/tuple:502:1: error: body of constexpr function static constexpr bool std::_TC<<anonymous>, _Elements>::_NonNestedTuple() [with _SrcTuple = std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>&&; bool <anonymous> = true; _Elements = {at::Tensor, at::Tensor, at::Tensor, at::Tensor}] not a return-statement
}
^
error: command '/usr/local/cuda/bin/nvcc' failed with exit status 1
```
check your CUDA version and your `gcc` version:
```
nvcc --version
gcc --version
```
If you are using CUDA 9.0 and gcc 6.4.0, then refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/25,
which has a summary of the solution. Basically, CUDA 9.0 is not compatible with gcc 6.4.0.
## ImportError: No module named maskrcnn_benchmark.config when running webcam.py
This means that `maskrcnn-benchmark` has not been properly installed.
Refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/22 for a few possible issues.
Note that we now support Python 2 as well.
## ImportError: Undefined symbol: __cudaPopCallConfiguration error when importing _C
This probably means that the NVCC version used to compile the extension and the CUDAToolKit version of your conda package are inconsistent. This was first mentioned in https://github.com/facebookresearch/maskrcnn-benchmark/issues/45. All you need to do is:
```
# Check the NVCC compiler version (e.g.)
/usr/cuda-9.2/bin/nvcc --version
# Check the CUDAToolKit version (e.g.)
~/anaconda3/bin/conda list | grep cuda
# If you need to update your CUDAToolKit
~/anaconda3/bin/conda install -c anaconda cudatoolkit==9.2
```
Both of them should have the **same** version. For example, NVCC==9.2 with CUDAToolKit==9.2 is fine, while NVCC==9.2 with CUDAToolKit==9 fails.
## Segmentation fault (core dumped) when running the library
This probably means that you have compiled the library using GCC < 4.9, which is ABI incompatible with PyTorch.
Indeed, during installation, you probably saw a message like
```
Your compiler (g++ 4.8) may be ABI-incompatible with PyTorch!
Please use a compiler that is ABI-compatible with GCC 4.9 and above.
See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html.
See https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6
for instructions on how to install GCC 4.9 or higher.
```
Follow the instructions on https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6
to install GCC 4.9 or higher, and try recompiling `maskrcnn-benchmark` again, after cleaning the
`build` folder with
```
rm -rf build
```

Some files were not shown because too many files have changed in this diff Show more