Adding MNIST, CIFAR10

Simon Layton 2018-05-02 16:40:52 -04:00
parent 4ac4aa1504
commit 057d0c29cd
9 changed files with 688 additions and 0 deletions

@@ -0,0 +1,19 @@
#!/usr/bin/env sh
# This script downloads the CIFAR10 (binary version) data and unpacks it.
set -e
cd "$( cd "$(dirname "$0")" ; pwd -P )"
echo "Downloading..."
wget --no-check-certificate http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
echo "Unzipping..."
tar -xf cifar-10-binary.tar.gz && rm -f cifar-10-binary.tar.gz
mv cifar-10-batches-bin/* . && rm -rf cifar-10-batches-bin
# Database creation is split into a separate script because leveldb sometimes
# causes a segfault and the databases need to be re-created.
echo "Done."

@@ -0,0 +1,7 @@
#!/bin/bash
set -e
cd "$( cd "$(dirname "$0")" ; pwd -P )"
# Create CIFAR10 train + test databases
make_cifar_db --db lmdb --input_folder "$(pwd)" --output_train_db_name cifar10_train_lmdb --output_test_db_name cifar10_test_lmdb
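
Taken together, the two scripts above fetch CIFAR10 and build the LMDBs that the training script below expects. A minimal end-to-end run might look like the following sketch (the file names are assumptions; this diff view omits them):

    ./download_cifar10.sh       # fetch and unpack the binary CIFAR10 archive
    ./make_databases.sh         # build cifar10_train_lmdb and cifar10_test_lmdb
    python train_cifar10.py --dtype float16 --all-gpus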

@@ -0,0 +1,208 @@
#!/usr/bin/env python
"""Example: train a model on CIFAR10."""
from __future__ import division, print_function
import argparse
import functools
import logging
import os.path
from caffe2.python import brew, core, data_parallel_model, optimizer, workspace
from caffe2.python.core import DataType
from caffe2.python.model_helper import ModelHelper
from caffe2.python.modeling.initializers import Initializer, pFP16Initializer
logging.basicConfig()
TRAIN_ENTRIES = 50000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 10
DISPLAY = 100
ACCURACY_MIN = 0.7
ACCURACY_MAX = 0.8
def AddInputOps(model, reader, batch_size, dtype):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=32, crop=32, mirror=1, color=True, mean=128.0,
output_type='float16' if dtype == DataType.FLOAT16 else 'float',
is_test=False)
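    # Stop gradients here so backprop does not flow into the data pipeline.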
data = model.StopGradient(data, data)
def AddForwardPassOps(model, loss_scale, dtype):
"""Add forward pass ops and return a list of losses."""
initializer = (pFP16Initializer if dtype == DataType.FLOAT16
else Initializer)
with brew.arg_scope([brew.conv, brew.fc],
WeightInitializer=initializer,
BiasInitializer=initializer):
conv1 = brew.conv(model, 'data', 'conv1', 3, 32, 5, pad=2,
weight_init=('GaussianFill',
{'std': 0.0001, 'mean': 0.0}))
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=3, stride=2)
relu1 = brew.relu(model, pool1, 'relu1')
conv2 = brew.conv(model, relu1, 'conv2', 32, 32, 5, pad=2,
weight_init=('GaussianFill', {'std': 0.01}))
conv2 = brew.relu(model, conv2, conv2)
pool2 = brew.average_pool(model, conv2, 'pool2', kernel=3, stride=2)
conv3 = brew.conv(model, pool2, 'conv3', 32, 64, 5, pad=2,
weight_init=('GaussianFill', {'std': 0.01}))
conv3 = brew.relu(model, conv3, conv3)
pool3 = brew.average_pool(model, conv3, 'pool3', kernel=3, stride=2)
fc1 = brew.fc(model, pool3, 'fc1', 64 * 3 * 3, 64,
weight_init=('GaussianFill', {'std': 0.1}))
fc2 = brew.fc(model, fc1, 'fc2', 64, 10,
weight_init=('GaussianFill', {'std': 0.1}))
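    # Cast back to fp32 for the loss and accuracy ops (as the fp16 MNIST example does).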
if dtype == DataType.FLOAT16:
fc2 = model.net.HalfToFloat(fc2, fc2 + '_fp32')
softmax, loss = model.SoftmaxWithLoss([fc2, 'label'], ['softmax', 'loss'])
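    # loss_scale is supplied by data_parallel_model (1/num_devices), so gradients
    # summed across GPUs match the single-device average.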
loss = model.Scale(loss, loss, scale=loss_scale)
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.add_weight_decay(model, 0.004)
stepsize = TRAIN_ENTRIES * EPOCHS // BATCH_SIZE
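    # stepsize equals the total number of training iterations (5000), so the 10x
    # decay only fires on the final step; the LR is effectively constant at 0.001.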
optimizer.build_sgd(
model, 0.001,
policy='step', stepsize=stepsize, gamma=0.1,
momentum=0.9, nesterov=False)
def AddPostSyncOps(model):
"""Add ops which run after the initial parameter sync."""
for param_info in model.GetOptimizationParamInfo(model.GetParams()):
if param_info.blob_copy is not None:
# Ensure copies are in sync after initial broadcast
model.param_init_net.HalfToFloat(
param_info.blob,
param_info.blob_copy[core.DataType.FLOAT]
)
def createTrainModel(lmdb_path, devices, dtype):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
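    # Parallelize_GPU builds one replica of the model per device; the global
    # batch is split evenly, so each GPU sees BATCH_SIZE // len(devices) images.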
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices)), dtype=dtype),
forward_pass_builder_fun=functools.partial(
AddForwardPassOps, dtype=dtype),
optimizer_builder_fun=AddOptimizerOps,
post_sync_builder_fun=AddPostSyncOps,
devices=devices, use_nccl=True)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path, devices, dtype):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices)), dtype=dtype),
forward_pass_builder_fun=functools.partial(
AddForwardPassOps, dtype=dtype),
param_update_builder_fun=None,
devices=devices)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'cifar10_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'cifar10_test_lmdb'))
parser.add_argument('--dtype', choices=['float', 'float16'],
default='float', help='Data type used for training')
parser.add_argument('--gpus',
help='Comma separated list of GPU devices to use')
parser.add_argument('--num_gpus', type=int, default=1,
help='Number of GPU devices (instead of --gpus)')
parser.add_argument('--all-gpus', action='store_true',
help='Use all GPUs in the system')
args = parser.parse_args()
args.dtype = (DataType.FLOAT16 if args.dtype == 'float16'
else DataType.FLOAT)
if args.all_gpus:
args.num_gpus = workspace.NumCudaDevices()
args.gpus = range(args.num_gpus)
else:
if args.gpus is not None:
args.gpus = [int(x) for x in args.gpus.split(',')]
args.num_gpus = len(args.gpus)
else:
args.gpus = range(args.num_gpus)
return args
def main(args):
"""Train and test."""
train_model = createTrainModel(args.train_lmdb, args.gpus, args.dtype)
test_model = createTestModel(args.test_lmdb, args.gpus, args.dtype)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
scope_prefix = 'gpu_%d/' % args.gpus[0]
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob(scope_prefix + 'loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
# Take average values across all GPUs
losses.append(sum(
workspace.FetchBlob('gpu_%d/loss' % g) for g in args.gpus
) / len(args.gpus))
accuracies.append(sum(
workspace.FetchBlob('gpu_%d/accuracy' % g) for g in args.gpus
) / len(args.gpus))
loss = sum(losses) / len(losses)
accuracy = sum(accuracies) / len(accuracies)
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())

@@ -0,0 +1,2 @@
*.mdb
*-ubyte

@@ -0,0 +1,14 @@
#!/usr/bin/env sh
# This script downloads the MNIST data and unpacks it.
cd "$( cd "$(dirname "$0")" ; pwd -P )"
echo "Downloading..."
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e "$fname" ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done

@@ -0,0 +1,7 @@
#!/bin/bash
cd "$( cd "$(dirname "$0")" ; pwd -P )"
# Create MNIST databases from previously downloaded data
make_mnist_db --db lmdb --image_file train-images-idx3-ubyte --label_file train-labels-idx1-ubyte --output_file mnist_train_lmdb
make_mnist_db --db lmdb --image_file t10k-images-idx3-ubyte --label_file t10k-labels-idx1-ubyte --output_file mnist_test_lmdb
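
As with CIFAR10, the MNIST flow is download, convert, train. A sketch, again with assumed file names:

    ./download_mnist.sh           # fetch the four IDX files
    ./make_mnist_databases.sh     # build mnist_train_lmdb and mnist_test_lmdb
    python train_mnist.py         # fp32 LeNet; fp16 and multi-GPU variants below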

@@ -0,0 +1,133 @@
#!/usr/bin/env python
"""Example: train LeNet on MNIST."""
from __future__ import division, print_function
import argparse
import os.path
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, optimizer, workspace
from caffe2.python.model_helper import ModelHelper
TRAIN_ENTRIES = 60000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
def AddInputOps(model, reader, batch_size):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
is_test=False)
data = model.StopGradient(data, data)
def AddForwardPassOps(model):
"""Add forward pass ops and return a list of losses."""
conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
fc3 = brew.relu(model, fc3, fc3)
pred = brew.fc(model, fc3, 'pred', 500, 10)
softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.build_sgd(model, 0.01,
policy='step', stepsize=1, gamma=0.999,
momentum=0.9, nesterov=False)
def createTrainModel(lmdb_path):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
losses = AddForwardPassOps(model)
model.AddGradientOperators(losses)
AddOptimizerOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
AddForwardPassOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'mnist_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'mnist_test_lmdb'))
args = parser.parse_args()
return args
def main(args):
"""Train and test."""
device = 0
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, device)):
train_model = createTrainModel(args.train_lmdb)
test_model = createTestModel(args.test_lmdb)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob('loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
losses.append(workspace.FetchBlob('loss'))
accuracies.append(workspace.FetchBlob('accuracy'))
loss = np.array(losses).mean()
accuracy = np.array(accuracies).mean()
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())

@@ -0,0 +1,139 @@
#!/usr/bin/env python
"""Example: train LeNet on MNIST (with fp16)."""
from __future__ import division, print_function
import argparse
import os.path
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, optimizer, workspace
from caffe2.python.model_helper import ModelHelper
from caffe2.python.modeling.initializers import pFP16Initializer
TRAIN_ENTRIES = 60000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
def AddInputOps(model, reader, batch_size):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
output_type='float16', is_test=True)
data = model.StopGradient(data, data)
def AddForwardPassOps(model):
"""Add forward pass ops and return a list of losses."""
with brew.arg_scope([brew.conv, brew.fc],
WeightInitializer=pFP16Initializer,
BiasInitializer=pFP16Initializer):
conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
fc3 = brew.relu(model, fc3, fc3)
pred = brew.fc(model, fc3, 'pred', 500, 10)
# Cast back to fp32 for remaining ops
pred = model.net.HalfToFloat(pred, pred + '_fp32')
softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.build_sgd(model, 0.01,
policy='step', stepsize=1, gamma=0.999,
momentum=0.9, nesterov=False)
def createTrainModel(lmdb_path):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
losses = AddForwardPassOps(model)
model.AddGradientOperators(losses)
AddOptimizerOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
AddInputOps(model, reader, BATCH_SIZE)
AddForwardPassOps(model)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'mnist_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'mnist_test_lmdb'))
args = parser.parse_args()
return args
def main(args):
"""Train and test."""
device = 0
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, device)):
train_model = createTrainModel(args.train_lmdb)
test_model = createTestModel(args.test_lmdb)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob('loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
losses.append(workspace.FetchBlob('loss'))
accuracies.append(workspace.FetchBlob('accuracy'))
loss = np.array(losses).mean()
accuracy = np.array(accuracies).mean()
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())

@@ -0,0 +1,159 @@
#!/usr/bin/env python
"""Example: train LeNet on MNIST (with multi-GPU)."""
from __future__ import division, print_function
import argparse
import functools
import logging
import os.path
from caffe2.python import brew, core, data_parallel_model, optimizer, workspace
from caffe2.python.model_helper import ModelHelper
logging.basicConfig()
TRAIN_ENTRIES = 60000
TEST_ENTRIES = 10000
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
def AddInputOps(model, reader, batch_size):
"""Add input ops."""
data, label = brew.image_input(
model, [reader], ['data', 'label'],
batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
is_test=True)
data = model.StopGradient(data, data)
def AddForwardPassOps(model, loss_scale):
"""Add forward pass ops and return a list of losses."""
conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
fc3 = brew.relu(model, fc3, fc3)
pred = brew.fc(model, fc3, 'pred', 500, 10)
softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
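    # Scale by loss_scale (1/num_devices) so gradients summed over GPUs average correctly.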
loss = model.Scale(loss, loss, scale=loss_scale)
brew.accuracy(model, [softmax, 'label'], 'accuracy')
return [loss]
def AddOptimizerOps(model):
"""Add optimizer ops."""
optimizer.build_sgd(model, 0.01,
policy='step', stepsize=1, gamma=0.999,
momentum=0.9, nesterov=False)
def createTrainModel(lmdb_path, devices):
"""Create and return a training model, complete with training ops."""
model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices))),
forward_pass_builder_fun=AddForwardPassOps,
optimizer_builder_fun=AddOptimizerOps,
devices=devices, use_nccl=True)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def createTestModel(lmdb_path, devices):
"""Create and return a test model. Does not include training ops."""
model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
init_params=False)
reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
data_parallel_model.Parallelize_GPU(
model,
input_builder_fun=functools.partial(
AddInputOps, reader=reader,
batch_size=(BATCH_SIZE // len(devices))),
forward_pass_builder_fun=AddForwardPassOps,
param_update_builder_fun=None,
devices=devices)
workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
return model
def getArgs():
"""Return command-line arguments."""
CURDIR = os.path.dirname(__file__)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-lmdb', help='Path to training LMDB',
default=os.path.join(CURDIR, 'mnist_train_lmdb'))
parser.add_argument('--test-lmdb', help='Path to test LMDB',
default=os.path.join(CURDIR, 'mnist_test_lmdb'))
parser.add_argument('--gpus',
help='Comma separated list of GPU devices to use')
parser.add_argument('--num_gpus', type=int, default=1,
help='Number of GPU devices (instead of --gpus)')
args = parser.parse_args()
if args.gpus is not None:
args.gpus = [int(x) for x in args.gpus.split(',')]
args.num_gpus = len(args.gpus)
else:
args.gpus = range(args.num_gpus)
return args
def main(args):
"""Train and test."""
train_model = createTrainModel(args.train_lmdb, args.gpus)
test_model = createTestModel(args.test_lmdb, args.gpus)
train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
scope_prefix = 'gpu_%d/' % args.gpus[0]
for epoch in range(1, EPOCHS + 1):
# Train
for iteration in range(1, train_iter_per_epoch + 1):
workspace.RunNet(train_model.net.Proto().name)
if not iteration % DISPLAY:
loss = workspace.FetchBlob(scope_prefix + 'loss')
print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
epoch, EPOCHS, iteration, train_iter_per_epoch, loss))
# Test
losses = []
accuracies = []
for _ in range(test_iter_per_epoch):
workspace.RunNet(test_model.net.Proto().name)
# Take average values across all GPUs
losses.append(sum(
workspace.FetchBlob('gpu_%d/loss' % g) for g in args.gpus
) / len(args.gpus))
accuracies.append(sum(
workspace.FetchBlob('gpu_%d/accuracy' % g) for g in args.gpus
) / len(args.gpus))
loss = sum(losses) / len(losses)
accuracy = sum(accuracies) / len(accuracies)
print("Test loss: %f, accuracy: %f" % (loss, accuracy))
if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
raise RuntimeError(
"Final accuracy %f is not in the expected range [%f, %f]" %
(accuracy, ACCURACY_MIN, ACCURACY_MAX))
if __name__ == '__main__':
core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
main(getArgs())