Adding MNIST, CIFAR10
This commit is contained in:
parent
4ac4aa1504
commit
057d0c29cd
19
Caffe2/Classification/cifar10/get_cifar10.sh
Executable file
19
Caffe2/Classification/cifar10/get_cifar10.sh
Executable file
|
@ -0,0 +1,19 @@
|
|||
#!/usr/bin/env sh
# This scripts downloads the CIFAR10 (binary version) data and unzips it.
# Aborts on the first failing command.
set -e

# Run from the directory containing this script.
cd "$( cd "$(dirname "$0")" ; pwd -P )"

echo "Downloading..."

# NOTE(review): --no-check-certificate disables TLS verification; confirm
# the mirror actually requires it before keeping this flag.
wget --no-check-certificate http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz

echo "Unzipping..."

# Extract, then flatten the archive's directory into the current one.
tar -xf cifar-10-binary.tar.gz && rm -f cifar-10-binary.tar.gz
mv cifar-10-batches-bin/* . && rm -rf cifar-10-batches-bin

# Creation is split out because leveldb sometimes causes segfault
# and needs to be re-created.

echo "Done."
|
7
Caffe2/Classification/cifar10/make_cifar10.sh
Executable file
7
Caffe2/Classification/cifar10/make_cifar10.sh
Executable file
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash
# Build the CIFAR10 train/test LMDB databases from the raw binary files
# downloaded by get_cifar10.sh. Aborts on the first failing command.
set -e

# Run from the directory containing this script.
cd "$( cd "$(dirname "$0")" ; pwd -P )"

# Create CIFAR10 train + test databases
make_cifar_db --db lmdb --input_folder "$(pwd)" --output_train_db_name cifar10_train_lmdb --output_test_db_name cifar10_test_lmdb
|
208
Caffe2/Classification/cifar10/train_cifar10.py
Executable file
208
Caffe2/Classification/cifar10/train_cifar10.py
Executable file
|
@ -0,0 +1,208 @@
|
|||
#!/usr/bin/env python
|
||||
"""Example: train a model on CIFAR10."""
|
||||
from __future__ import division, print_function
|
||||
|
||||
import argparse
|
||||
import functools
|
||||
import logging
|
||||
import os.path
|
||||
|
||||
from caffe2.python import brew, core, data_parallel_model, optimizer, workspace
|
||||
from caffe2.python.core import DataType
|
||||
from caffe2.python.model_helper import ModelHelper
|
||||
from caffe2.python.modeling.initializers import Initializer, pFP16Initializer
|
||||
|
||||
|
||||
logging.basicConfig()

# Dataset sizes and training schedule.
TRAIN_ENTRIES = 50000   # images in the CIFAR10 training set
TEST_ENTRIES = 10000    # images in the CIFAR10 test set
BATCH_SIZE = 100        # global batch size (split evenly across GPUs)
EPOCHS = 10
DISPLAY = 100           # print training loss every DISPLAY iterations
# Expected final-accuracy band; the run raises if outside it.
ACCURACY_MIN = 0.7
ACCURACY_MAX = 0.8
|
||||
|
||||
|
||||
def AddInputOps(model, reader, batch_size, dtype, is_test=False):
    """Add image/label input ops reading CIFAR10 from `reader`.

    Args:
        model: ModelHelper to add the ops to.
        reader: DB reader blob created via model.CreateDB.
        batch_size: images per iteration (per device).
        dtype: DataType.FLOAT or DataType.FLOAT16; selects the data type
            of the output 'data' blob.
        is_test: when True, marks the pipeline as evaluation-only so that
            training-time augmentation (random mirroring here) is disabled.
            Defaults to False to preserve previous behavior; the test-model
            builder should pass is_test=True.
    """
    data, label = brew.image_input(
        model, [reader], ['data', 'label'],
        batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
        scale=32, crop=32, mirror=1, color=True, mean=128.0,
        output_type='float16' if dtype == DataType.FLOAT16 else 'float',
        is_test=is_test)
    # Inputs are fixed data; do not backpropagate into the data blob.
    data = model.StopGradient(data, data)
|
||||
|
||||
|
||||
def AddForwardPassOps(model, loss_scale, dtype):
    """Add forward pass ops and return a list of losses.

    Builds a small conv net (3 conv + 3 pool + 2 fc layers). When dtype is
    FLOAT16, parameters are created via pFP16Initializer and the final fc
    output is cast back to fp32 before the softmax/loss ops.
    """
    initializer = (pFP16Initializer if dtype == DataType.FLOAT16
                   else Initializer)
    with brew.arg_scope([brew.conv, brew.fc],
                        WeightInitializer=initializer,
                        BiasInitializer=initializer):
        conv1 = brew.conv(model, 'data', 'conv1', 3, 32, 5, pad=2,
                          weight_init=('GaussianFill',
                                       {'std': 0.0001, 'mean': 0.0}))
        pool1 = brew.max_pool(model, conv1, 'pool1', kernel=3, stride=2)
        relu1 = brew.relu(model, pool1, 'relu1')
        conv2 = brew.conv(model, relu1, 'conv2', 32, 32, 5, pad=2,
                          weight_init=('GaussianFill', {'std': 0.01}))
        conv2 = brew.relu(model, conv2, conv2)
        pool2 = brew.average_pool(model, conv2, 'pool2', kernel=3, stride=2)
        conv3 = brew.conv(model, pool2, 'conv3', 32, 64, 5, pad=2,
                          weight_init=('GaussianFill', {'std': 0.01}))
        conv3 = brew.relu(model, conv3, conv3)
        pool3 = brew.average_pool(model, conv3, 'pool3', kernel=3, stride=2)
        fc1 = brew.fc(model, pool3, 'fc1', 64 * 3 * 3, 64,
                      weight_init=('GaussianFill', {'std': 0.1}))
        fc2 = brew.fc(model, fc1, 'fc2', 64, 10,
                      weight_init=('GaussianFill', {'std': 0.1}))

    if dtype == DataType.FLOAT16:
        # The loss/accuracy ops below run in fp32; cast the logits back.
        fc2 = model.net.HalfToFloat(fc2, fc2 + '_fp32')
    softmax, loss = model.SoftmaxWithLoss([fc2, 'label'], ['softmax', 'loss'])
    # loss_scale is supplied by data_parallel_model to compensate for
    # gradient averaging across devices.
    loss = model.Scale(loss, loss, scale=loss_scale)
    brew.accuracy(model, [softmax, 'label'], 'accuracy')
    return [loss]
|
||||
|
||||
|
||||
def AddOptimizerOps(model):
    """Attach weight decay and momentum-SGD update ops to the model."""
    # L2 regularization applied to all trainable weights.
    optimizer.add_weight_decay(model, 0.004)
    # stepsize equals the total number of training iterations, so the
    # 10x LR drop (gamma=0.1) is scheduled at the end of the run.
    total_iters = (TRAIN_ENTRIES * EPOCHS) // BATCH_SIZE
    optimizer.build_sgd(
        model, 0.001,
        policy='step', stepsize=total_iters, gamma=0.1,
        momentum=0.9, nesterov=False)
|
||||
|
||||
|
||||
def AddPostSyncOps(model):
    """Add ops which run after the initial parameter sync."""
    for param_info in model.GetOptimizationParamInfo(model.GetParams()):
        # blob_copy holds the fp32 shadow copy of an fp16 parameter;
        # refresh it from the freshly-broadcast fp16 master blob.
        if param_info.blob_copy is not None:
            # Ensure copies are in sync after initial broadcast
            model.param_init_net.HalfToFloat(
                param_info.blob,
                param_info.blob_copy[core.DataType.FLOAT]
            )
|
||||
|
||||
|
||||
def createTrainModel(lmdb_path, devices, dtype):
    """Create and return a training model, complete with training ops.

    Args:
        lmdb_path: path to the training LMDB database.
        devices: list of GPU ids to parallelize over.
        dtype: DataType.FLOAT or DataType.FLOAT16.
    """
    model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
    reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
    # Replicate the net across `devices`; the global batch is split evenly.
    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=functools.partial(
            AddInputOps, reader=reader,
            batch_size=(BATCH_SIZE // len(devices)), dtype=dtype),
        forward_pass_builder_fun=functools.partial(
            AddForwardPassOps, dtype=dtype),
        optimizer_builder_fun=AddOptimizerOps,
        post_sync_builder_fun=AddPostSyncOps,
        devices=devices, use_nccl=True)
    # Initialize parameters and materialize the net in the workspace.
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    return model
|
||||
|
||||
|
||||
def createTestModel(lmdb_path, devices, dtype):
    """Create and return a test model. Does not include training ops."""
    # init_params=False: reuse the parameters created by the train model.
    model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
                        init_params=False)
    reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
    # param_update_builder_fun=None builds a forward-only replicated net.
    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=functools.partial(
            AddInputOps, reader=reader,
            batch_size=(BATCH_SIZE // len(devices)), dtype=dtype),
        forward_pass_builder_fun=functools.partial(
            AddForwardPassOps, dtype=dtype),
        param_update_builder_fun=None,
        devices=devices)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    return model
|
||||
|
||||
|
||||
def getArgs():
    """Parse and return command-line arguments.

    Post-processing normalizes the result so that args.dtype is a caffe2
    DataType, args.gpus is always a list of device ids, and args.num_gpus
    is its length.
    """
    CURDIR = os.path.dirname(__file__)
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--train-lmdb', help='Path to training LMDB',
                        default=os.path.join(CURDIR, 'cifar10_train_lmdb'))
    parser.add_argument('--test-lmdb', help='Path to test LMDB',
                        default=os.path.join(CURDIR, 'cifar10_test_lmdb'))
    parser.add_argument('--dtype', choices=['float', 'float16'],
                        default='float', help='Data type used for training')
    parser.add_argument('--gpus',
                        help='Comma separated list of GPU devices to use')
    parser.add_argument('--num_gpus', type=int, default=1,
                        help='Number of GPU devices (instead of --gpus)')
    parser.add_argument('--all-gpus', action='store_true',
                        help='Use all GPUs in the system')
    args = parser.parse_args()

    args.dtype = (DataType.FLOAT16 if args.dtype == 'float16'
                  else DataType.FLOAT)

    # Resolve the GPU selection; precedence: --all-gpus, --gpus, --num_gpus.
    if args.all_gpus:
        args.num_gpus = workspace.NumCudaDevices()
        args.gpus = list(range(args.num_gpus))
    elif args.gpus is not None:
        args.gpus = [int(x) for x in args.gpus.split(',')]
        args.num_gpus = len(args.gpus)
    else:
        args.gpus = list(range(args.num_gpus))
    return args
|
||||
|
||||
|
||||
def main(args):
    """Train for EPOCHS epochs, evaluating on the test set after each."""
    train_model = createTrainModel(args.train_lmdb, args.gpus, args.dtype)
    test_model = createTestModel(args.test_lmdb, args.gpus, args.dtype)

    train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
    test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
    # Blobs live under per-GPU name scopes; report training loss from the
    # first selected device.
    scope_prefix = 'gpu_%d/' % args.gpus[0]

    for epoch in range(1, EPOCHS + 1):
        # Train
        for iteration in range(1, train_iter_per_epoch + 1):
            workspace.RunNet(train_model.net.Proto().name)
            if not iteration % DISPLAY:
                loss = workspace.FetchBlob(scope_prefix + 'loss')
                print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
                    epoch, EPOCHS, iteration, train_iter_per_epoch, loss))

        # Test
        losses = []
        accuracies = []
        for _ in range(test_iter_per_epoch):
            workspace.RunNet(test_model.net.Proto().name)
            # Take average values across all GPUs
            losses.append(sum(
                workspace.FetchBlob('gpu_%d/loss' % g) for g in args.gpus
            ) / len(args.gpus))
            accuracies.append(sum(
                workspace.FetchBlob('gpu_%d/accuracy' % g) for g in args.gpus
            ) / len(args.gpus))

        loss = sum(losses) / len(losses)
        accuracy = sum(accuracies) / len(accuracies)
        print("Test loss: %f, accuracy: %f" % (loss, accuracy))

    # Sanity check used by automated runs: fail loudly when the final
    # accuracy falls outside the expected band.
    if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
        raise RuntimeError(
            "Final accuracy %f is not in the expected range [%f, %f]" %
            (accuracy, ACCURACY_MIN, ACCURACY_MAX))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Initialize caffe2 with quiet logging before building any nets.
    core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    main(getArgs())
|
2
Caffe2/Classification/mnist/.gitignore
vendored
Normal file
2
Caffe2/Classification/mnist/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
*.mdb
|
||||
*-ubyte
|
14
Caffe2/Classification/mnist/get_mnist.sh
Executable file
14
Caffe2/Classification/mnist/get_mnist.sh
Executable file
|
@ -0,0 +1,14 @@
|
|||
#!/usr/bin/env sh
# This scripts downloads the mnist data and unzips it.
# Abort on the first failing command (consistent with get_cifar10.sh,
# which already uses set -e; previously a failed wget left a partial
# state that the existence check below would then skip forever).
set -e

# Run from the directory containing this script.
cd "$( cd "$(dirname "$0")" ; pwd -P )"

echo "Downloading..."

# Fetch and unpack each archive only when the extracted file is absent.
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
    if [ ! -e "$fname" ]; then
        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
        gunzip ${fname}.gz
    fi
done
|
7
Caffe2/Classification/mnist/make_mnist.sh
Executable file
7
Caffe2/Classification/mnist/make_mnist.sh
Executable file
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash
# Build the MNIST train/test LMDB databases from the raw files downloaded
# by get_mnist.sh.
# Abort on the first failing command (consistent with make_cifar10.sh,
# which already uses set -e).
set -e

# Run from the directory containing this script.
cd "$( cd "$(dirname "$0")" ; pwd -P )"

# Create MNIST databases from previously downloaded data
make_mnist_db --db lmdb --image_file train-images-idx3-ubyte --label_file train-labels-idx1-ubyte --output_file mnist_train_lmdb
make_mnist_db --db lmdb --image_file t10k-images-idx3-ubyte --label_file t10k-labels-idx1-ubyte --output_file mnist_test_lmdb
|
133
Caffe2/Classification/mnist/train_lenet.py
Executable file
133
Caffe2/Classification/mnist/train_lenet.py
Executable file
|
@ -0,0 +1,133 @@
|
|||
#!/usr/bin/env python
|
||||
"""Example: train LeNet on MNIST."""
|
||||
from __future__ import division, print_function
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from caffe2.proto import caffe2_pb2
|
||||
from caffe2.python import brew, core, optimizer, workspace
|
||||
from caffe2.python.model_helper import ModelHelper
|
||||
|
||||
|
||||
# Dataset sizes and training schedule.
TRAIN_ENTRIES = 60000   # images in the MNIST training set
TEST_ENTRIES = 10000    # images in the MNIST test set
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100           # print training loss every DISPLAY iterations
# Expected final-accuracy band; the run raises if outside it.
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
|
||||
|
||||
|
||||
def AddInputOps(model, reader, batch_size, is_test=False):
    """Add image/label input ops reading MNIST from `reader`.

    Args:
        model: ModelHelper to add the ops to.
        reader: DB reader blob created via model.CreateDB.
        batch_size: images per iteration.
        is_test: when True, marks the pipeline as evaluation-only (matches
            the flag exposed by the fp16/multi-GPU variants of this script).
            Defaults to False to preserve previous behavior.
    """
    # mean=128, std=256 normalizes pixels to roughly [-0.5, 0.5].
    data, label = brew.image_input(
        model, [reader], ['data', 'label'],
        batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
        scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
        is_test=is_test)
    # Inputs are fixed data; do not backpropagate into the data blob.
    data = model.StopGradient(data, data)
|
||||
|
||||
|
||||
def AddForwardPassOps(model):
    """Build the LeNet forward pass; return the list of loss blobs."""
    # Two conv/pool stages, then a hidden fc layer and a 10-way classifier.
    c1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
    p1 = brew.max_pool(model, c1, 'pool1', kernel=2, stride=2)
    c2 = brew.conv(model, p1, 'conv2', 20, 50, 5)
    p2 = brew.max_pool(model, c2, 'pool2', kernel=2, stride=2)
    hidden = brew.fc(model, p2, 'fc3', 50 * 4 * 4, 500)
    hidden = brew.relu(model, hidden, hidden)
    logits = brew.fc(model, hidden, 'pred', 500, 10)
    softmax, loss = model.SoftmaxWithLoss([logits, 'label'],
                                          ['softmax', 'loss'])
    brew.accuracy(model, [softmax, 'label'], 'accuracy')
    return [loss]
|
||||
|
||||
|
||||
def AddOptimizerOps(model):
    """Attach momentum-SGD ops with a per-iteration exponential LR decay."""
    optimizer.build_sgd(
        model,
        0.01,
        policy='step',
        stepsize=1,
        gamma=0.999,
        momentum=0.9,
        nesterov=False,
    )
|
||||
|
||||
|
||||
def createTrainModel(lmdb_path):
    """Create and return a training model, complete with training ops."""
    model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
    reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
    AddInputOps(model, reader, BATCH_SIZE)
    losses = AddForwardPassOps(model)
    # Backprop and SGD parameter-update ops.
    model.AddGradientOperators(losses)
    AddOptimizerOps(model)
    # Initialize parameters and materialize the net in the workspace.
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    return model
|
||||
|
||||
|
||||
def createTestModel(lmdb_path):
    """Create and return a test model. Does not include training ops."""
    # init_params=False: reuse the parameters created by the train model.
    model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
                        init_params=False)
    reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
    AddInputOps(model, reader, BATCH_SIZE)
    AddForwardPassOps(model)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    return model
|
||||
|
||||
|
||||
def getArgs():
    """Parse and return command-line arguments."""
    here = os.path.dirname(__file__)
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--train-lmdb',
                        default=os.path.join(here, 'mnist_train_lmdb'),
                        help='Path to training LMDB')
    parser.add_argument('--test-lmdb',
                        default=os.path.join(here, 'mnist_test_lmdb'),
                        help='Path to test LMDB')
    return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Train for EPOCHS epochs, evaluating on the test set after each."""
    device = 0
    # Build both nets under the same single-GPU device scope.
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, device)):
        train_model = createTrainModel(args.train_lmdb)
        test_model = createTestModel(args.test_lmdb)

    train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
    test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE

    for epoch in range(1, EPOCHS + 1):
        # Train
        for iteration in range(1, train_iter_per_epoch + 1):
            workspace.RunNet(train_model.net.Proto().name)
            if not iteration % DISPLAY:
                loss = workspace.FetchBlob('loss')
                print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
                    epoch, EPOCHS, iteration, train_iter_per_epoch, loss))

        # Test
        losses = []
        accuracies = []
        for _ in range(test_iter_per_epoch):
            workspace.RunNet(test_model.net.Proto().name)
            losses.append(workspace.FetchBlob('loss'))
            accuracies.append(workspace.FetchBlob('accuracy'))

        loss = np.array(losses).mean()
        accuracy = np.array(accuracies).mean()
        print("Test loss: %f, accuracy: %f" % (loss, accuracy))

    # Sanity check used by automated runs: fail loudly when the final
    # accuracy falls outside the expected band.
    if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
        raise RuntimeError(
            "Final accuracy %f is not in the expected range [%f, %f]" %
            (accuracy, ACCURACY_MIN, ACCURACY_MAX))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Initialize caffe2 with quiet logging before building any nets.
    core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    main(getArgs())
|
139
Caffe2/Classification/mnist/train_lenet_fp16.py
Executable file
139
Caffe2/Classification/mnist/train_lenet_fp16.py
Executable file
|
@ -0,0 +1,139 @@
|
|||
#!/usr/bin/env python
|
||||
"""Example: train LeNet on MNIST (with fp16)."""
|
||||
from __future__ import division, print_function
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from caffe2.proto import caffe2_pb2
|
||||
from caffe2.python import brew, core, optimizer, workspace
|
||||
from caffe2.python.model_helper import ModelHelper
|
||||
from caffe2.python.modeling.initializers import pFP16Initializer
|
||||
|
||||
|
||||
# Dataset sizes and training schedule.
TRAIN_ENTRIES = 60000   # images in the MNIST training set
TEST_ENTRIES = 10000    # images in the MNIST test set
BATCH_SIZE = 100
EPOCHS = 4
DISPLAY = 100           # print training loss every DISPLAY iterations
# Expected final-accuracy band; the run raises if outside it.
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
|
||||
|
||||
|
||||
def AddInputOps(model, reader, batch_size):
    """Add input ops producing an fp16 'data' blob and an int 'label' blob."""
    # NOTE(review): is_test=True is passed even when this feeds the training
    # net; with mirror=False and crop == scale there appears to be no
    # augmentation to disable, but confirm this is intentional.
    # mean=128, std=256 normalizes pixels to roughly [-0.5, 0.5].
    data, label = brew.image_input(
        model, [reader], ['data', 'label'],
        batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
        scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
        output_type='float16', is_test=True)
    # Inputs are fixed data; do not backpropagate into the data blob.
    data = model.StopGradient(data, data)
|
||||
|
||||
|
||||
def AddForwardPassOps(model):
    """Add forward pass ops and return a list of losses.

    All conv/fc parameters are created as fp16 via pFP16Initializer; the
    final logits are cast back to fp32 before the softmax/loss ops.
    """
    with brew.arg_scope([brew.conv, brew.fc],
                        WeightInitializer=pFP16Initializer,
                        BiasInitializer=pFP16Initializer):
        conv1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
        pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
        conv2 = brew.conv(model, pool1, 'conv2', 20, 50, 5)
        pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
        fc3 = brew.fc(model, pool2, 'fc3', 50 * 4 * 4, 500)
        fc3 = brew.relu(model, fc3, fc3)
        pred = brew.fc(model, fc3, 'pred', 500, 10)

    # Cast back to fp32 for remaining ops
    pred = model.net.HalfToFloat(pred, pred + '_fp32')
    softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss'])
    brew.accuracy(model, [softmax, 'label'], 'accuracy')
    return [loss]
|
||||
|
||||
|
||||
def AddOptimizerOps(model):
    """Attach momentum-SGD ops with a gentle per-iteration LR decay."""
    base_lr = 0.01
    optimizer.build_sgd(model, base_lr,
                        policy='step', stepsize=1, gamma=0.999,
                        momentum=0.9, nesterov=False)
|
||||
|
||||
|
||||
def createTrainModel(lmdb_path):
    """Create and return a training model, complete with training ops."""
    model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
    reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
    AddInputOps(model, reader, BATCH_SIZE)
    losses = AddForwardPassOps(model)
    # Backprop and SGD parameter-update ops.
    model.AddGradientOperators(losses)
    AddOptimizerOps(model)
    # Initialize parameters and materialize the net in the workspace.
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    return model
|
||||
|
||||
|
||||
def createTestModel(lmdb_path):
    """Create and return a test model. Does not include training ops."""
    # init_params=False: reuse the parameters created by the train model.
    model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
                        init_params=False)
    reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
    AddInputOps(model, reader, BATCH_SIZE)
    AddForwardPassOps(model)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    return model
|
||||
|
||||
|
||||
def getArgs():
    """Parse and return command-line arguments."""
    script_dir = os.path.dirname(__file__)
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--train-lmdb',
        default=os.path.join(script_dir, 'mnist_train_lmdb'),
        help='Path to training LMDB')
    parser.add_argument(
        '--test-lmdb',
        default=os.path.join(script_dir, 'mnist_test_lmdb'),
        help='Path to test LMDB')
    return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Train for EPOCHS epochs, evaluating on the test set after each."""
    device = 0
    # Build both nets under the same single-GPU device scope.
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, device)):
        train_model = createTrainModel(args.train_lmdb)
        test_model = createTestModel(args.test_lmdb)

    train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
    test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE

    for epoch in range(1, EPOCHS + 1):
        # Train
        for iteration in range(1, train_iter_per_epoch + 1):
            workspace.RunNet(train_model.net.Proto().name)
            if not iteration % DISPLAY:
                loss = workspace.FetchBlob('loss')
                print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
                    epoch, EPOCHS, iteration, train_iter_per_epoch, loss))

        # Test
        losses = []
        accuracies = []
        for _ in range(test_iter_per_epoch):
            workspace.RunNet(test_model.net.Proto().name)
            losses.append(workspace.FetchBlob('loss'))
            accuracies.append(workspace.FetchBlob('accuracy'))

        loss = np.array(losses).mean()
        accuracy = np.array(accuracies).mean()
        print("Test loss: %f, accuracy: %f" % (loss, accuracy))

    # Sanity check used by automated runs: fail loudly when the final
    # accuracy falls outside the expected band.
    if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
        raise RuntimeError(
            "Final accuracy %f is not in the expected range [%f, %f]" %
            (accuracy, ACCURACY_MIN, ACCURACY_MAX))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Initialize caffe2 with quiet logging before building any nets.
    core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    main(getArgs())
|
159
Caffe2/Classification/mnist/train_lenet_mgpu.py
Executable file
159
Caffe2/Classification/mnist/train_lenet_mgpu.py
Executable file
|
@ -0,0 +1,159 @@
|
|||
#!/usr/bin/env python
|
||||
"""Example: train LeNet on MNIST (with multi-GPU)."""
|
||||
from __future__ import division, print_function
|
||||
|
||||
import argparse
|
||||
import functools
|
||||
import logging
|
||||
import os.path
|
||||
|
||||
from caffe2.python import brew, core, data_parallel_model, optimizer, workspace
|
||||
from caffe2.python.model_helper import ModelHelper
|
||||
|
||||
|
||||
logging.basicConfig()

# Dataset sizes and training schedule.
TRAIN_ENTRIES = 60000   # images in the MNIST training set
TEST_ENTRIES = 10000    # images in the MNIST test set
BATCH_SIZE = 100        # global batch size (split evenly across GPUs)
EPOCHS = 4
DISPLAY = 100           # print training loss every DISPLAY iterations
# Expected final-accuracy band; the run raises if outside it.
ACCURACY_MIN = 0.98
ACCURACY_MAX = 0.999
|
||||
|
||||
|
||||
def AddInputOps(model, reader, batch_size):
    """Add input ops producing 'data' and 'label' blobs (per device)."""
    # NOTE(review): is_test=True is passed even when this feeds the training
    # net; with mirror=False and crop == scale there appears to be no
    # augmentation to disable, but confirm this is intentional.
    # mean=128, std=256 normalizes pixels to roughly [-0.5, 0.5].
    data, label = brew.image_input(
        model, [reader], ['data', 'label'],
        batch_size=batch_size, use_caffe_datum=False, use_gpu_transform=True,
        scale=28, crop=28, mirror=False, color=False, mean=128.0, std=256.0,
        is_test=True)
    # Inputs are fixed data; do not backpropagate into the data blob.
    data = model.StopGradient(data, data)
|
||||
|
||||
|
||||
def AddForwardPassOps(model, loss_scale):
    """Build the LeNet forward pass; return the list of loss blobs."""
    # Two conv/pool stages, then a hidden fc layer and a 10-way classifier.
    c1 = brew.conv(model, 'data', 'conv1', 1, 20, 5)
    p1 = brew.max_pool(model, c1, 'pool1', kernel=2, stride=2)
    c2 = brew.conv(model, p1, 'conv2', 20, 50, 5)
    p2 = brew.max_pool(model, c2, 'pool2', kernel=2, stride=2)
    hidden = brew.fc(model, p2, 'fc3', 50 * 4 * 4, 500)
    hidden = brew.relu(model, hidden, hidden)
    logits = brew.fc(model, hidden, 'pred', 500, 10)
    softmax, loss = model.SoftmaxWithLoss([logits, 'label'],
                                          ['softmax', 'loss'])
    # loss_scale is supplied by data_parallel_model to compensate for
    # gradient averaging across devices.
    loss = model.Scale(loss, loss, scale=loss_scale)
    brew.accuracy(model, [softmax, 'label'], 'accuracy')
    return [loss]
|
||||
|
||||
|
||||
def AddOptimizerOps(model):
    """Attach momentum-SGD ops with a per-iteration exponential LR decay."""
    sgd_config = dict(policy='step', stepsize=1, gamma=0.999,
                      momentum=0.9, nesterov=False)
    optimizer.build_sgd(model, 0.01, **sgd_config)
|
||||
|
||||
|
||||
def createTrainModel(lmdb_path, devices):
    """Create and return a training model, complete with training ops.

    Args:
        lmdb_path: path to the training LMDB database.
        devices: list of GPU ids to parallelize over.
    """
    model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})
    reader = model.CreateDB('train_reader', db=lmdb_path, db_type='lmdb')
    # Replicate the net across `devices`; the global batch is split evenly.
    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=functools.partial(
            AddInputOps, reader=reader,
            batch_size=(BATCH_SIZE // len(devices))),
        forward_pass_builder_fun=AddForwardPassOps,
        optimizer_builder_fun=AddOptimizerOps,
        devices=devices, use_nccl=True)
    # Initialize parameters and materialize the net in the workspace.
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    return model
|
||||
|
||||
|
||||
def createTestModel(lmdb_path, devices):
    """Create and return a test model. Does not include training ops."""
    # init_params=False: reuse the parameters created by the train model.
    model = ModelHelper(name='test', arg_scope={'order': 'NCHW'},
                        init_params=False)
    reader = model.CreateDB('test_reader', db=lmdb_path, db_type='lmdb')
    # param_update_builder_fun=None builds a forward-only replicated net.
    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=functools.partial(
            AddInputOps, reader=reader,
            batch_size=(BATCH_SIZE // len(devices))),
        forward_pass_builder_fun=AddForwardPassOps,
        param_update_builder_fun=None,
        devices=devices)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    return model
|
||||
|
||||
|
||||
def getArgs():
    """Parse and return command-line arguments.

    Post-processing normalizes the result so that args.gpus is always a
    list of device ids and args.num_gpus is its length.
    """
    CURDIR = os.path.dirname(__file__)
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--train-lmdb', help='Path to training LMDB',
                        default=os.path.join(CURDIR, 'mnist_train_lmdb'))
    parser.add_argument('--test-lmdb', help='Path to test LMDB',
                        default=os.path.join(CURDIR, 'mnist_test_lmdb'))
    parser.add_argument('--gpus',
                        help='Comma separated list of GPU devices to use')
    parser.add_argument('--num_gpus', type=int, default=1,
                        help='Number of GPU devices (instead of --gpus)')
    args = parser.parse_args()

    # An explicit --gpus list wins over --num_gpus.
    if args.gpus is not None:
        args.gpus = [int(x) for x in args.gpus.split(',')]
        args.num_gpus = len(args.gpus)
    else:
        args.gpus = list(range(args.num_gpus))
    return args
|
||||
|
||||
|
||||
def main(args):
    """Train for EPOCHS epochs, evaluating on the test set after each."""
    train_model = createTrainModel(args.train_lmdb, args.gpus)
    test_model = createTestModel(args.test_lmdb, args.gpus)

    train_iter_per_epoch = TRAIN_ENTRIES // BATCH_SIZE
    test_iter_per_epoch = TEST_ENTRIES // BATCH_SIZE
    # Blobs live under per-GPU name scopes; report training loss from the
    # first selected device.
    scope_prefix = 'gpu_%d/' % args.gpus[0]

    for epoch in range(1, EPOCHS + 1):
        # Train
        for iteration in range(1, train_iter_per_epoch + 1):
            workspace.RunNet(train_model.net.Proto().name)
            if not iteration % DISPLAY:
                loss = workspace.FetchBlob(scope_prefix + 'loss')
                print("Epoch %d/%d, iteration %4d/%d, loss=%f" % (
                    epoch, EPOCHS, iteration, train_iter_per_epoch, loss))

        # Test
        losses = []
        accuracies = []
        for _ in range(test_iter_per_epoch):
            workspace.RunNet(test_model.net.Proto().name)
            # Take average values across all GPUs
            losses.append(sum(
                workspace.FetchBlob('gpu_%d/loss' % g) for g in args.gpus
            ) / len(args.gpus))
            accuracies.append(sum(
                workspace.FetchBlob('gpu_%d/accuracy' % g) for g in args.gpus
            ) / len(args.gpus))

        loss = sum(losses) / len(losses)
        accuracy = sum(accuracies) / len(accuracies)
        print("Test loss: %f, accuracy: %f" % (loss, accuracy))

    # Sanity check used by automated runs: fail loudly when the final
    # accuracy falls outside the expected band.
    if accuracy < ACCURACY_MIN or accuracy > ACCURACY_MAX:
        raise RuntimeError(
            "Final accuracy %f is not in the expected range [%f, %f]" %
            (accuracy, ACCURACY_MIN, ACCURACY_MAX))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Initialize caffe2 with quiet logging before building any nets.
    core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    main(getArgs())
|
Loading…
Reference in a new issue