Merge pull request #4 from NVIDIA/master

pull upstream
This commit is contained in:
Yang Zhang 2019-10-22 09:48:09 -07:00 committed by GitHub
commit 52c60d2ed2
331 changed files with 19749 additions and 4977 deletions


@ -19,5 +19,6 @@ set(cuda_kernel_files
)
add_library(fastertransformer STATIC ${cuda_kernel_files})
set_target_properties(fastertransformer PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(fastertransformer PUBLIC -lcublas -lcudart ${CMAKE_THREAD_LIBS_INIT})


@ -197,11 +197,9 @@ void add_bias_input_layernorm(__half* out, const __half* input, const __half* bi
template <typename T>
void add_bias_act_kernelLauncher(T* out, const T* bias, int m, int n, cudaStream_t stream)
{
// dim3 grid(m / 64);
dim3 grid(m / 4);
dim3 block(n / 4);
assert(block.x > 1024);
// dim3 block(n);
assert(block.x <= 1024);
add_bias_act<T><<<grid, block, 0, stream>>>(out, bias, m, n);
}
@ -209,9 +207,9 @@ template<typename T>
void add_bias_input_layernorm_kernelLauncher(T* out, const T* input, const T* bias,
const T* gamma, const T* beta, int m, int n, cudaStream_t stream)
{
assert(n > 1024);
dim3 grid(m);
dim3 block(n);
assert(block.x <= 1024);
add_bias_input_layernorm<T><<<grid, block, 0, stream>>>(out, input, bias, gamma, beta, m, n);
}
@ -220,9 +218,9 @@ template <>
void add_bias_input_layernorm_kernelLauncher(__half* out, const __half* input, const __half* bias,
const __half* gamma, const __half* beta, int m, int n, cudaStream_t stream)
{
assert(n / 2 > 1024);
dim3 grid(m);
dim3 block(n / 2);
assert(block.x <= 1024);
add_bias_input_layernorm<__half><<<grid, block, 0, stream>>>(out, input, bias, gamma, beta, m, n);
}


@ -88,7 +88,7 @@ T blockReduceMax(T val)
__syncthreads();
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : 0;
val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : -1e20f;
val = warpReduceMax(val);
return val;
@ -204,7 +204,7 @@ void softmax_kernel(T* qk_buf_, const T* attr_mask, const int batch_size, const
mask_val = (1.0f - mask_val) * -10000.0f;
float tmp = threadIdx.x < seq_len ? (float)(qk * (float)scaler + mask_val): -1e-20f;
float tmp = threadIdx.x < seq_len ? (float)(qk * (float)scaler + mask_val): -1e20f;
float max_val = blockReduceMax<float>(tmp);
@ -248,7 +248,7 @@ void softmax_kernel_v2(T* qk_buf_, const T* attr_mask, const int batch_size, con
mask_val = (1.0f - mask_val) * -10000.0f;
float tmp = threadIdx.x < seq_len ? (float)(qk * (float)scaler + mask_val) : -1e-20f;
float tmp = threadIdx.x < seq_len ? (float)(qk * (float)scaler + mask_val) : -1e20f;
float max_val = blockReduceMax<float>(tmp);
if(threadIdx.x == 0)
s_max = max_val;
@ -324,10 +324,9 @@ void OpenMultiHeadAttention<OpType_>::multiHeadAttr_nofuse_kernelLauncher(
if(OpType_ == OperationType::FP32)
{
// const int word_per_block = 32;
const int word_per_block = 1;
assert(k > 1024);
assert(m / word_per_block * 3 > 65536);
assert(k <= 1024);
assert(m / word_per_block * 3 <= 65536);
dim3 grid(m / word_per_block * 3);
dim3 block(k);
@ -340,8 +339,6 @@ void OpenMultiHeadAttention<OpType_>::multiHeadAttr_nofuse_kernelLauncher(
grid.x = batch_size * seq_len / word_per_block;
block.x = head_num * size_per_head * word_per_block / 2;
assert(block.x);
add_QKV_bias<DataType_><<<grid, block, 0, stream>>>(Q, bias_Q, K, bias_K, V, bias_V, q_buf_, k_buf_,
v_buf_, batch_size, seq_len, head_num, size_per_head / 2, word_per_block);
}
@ -400,11 +397,10 @@ void OpenMultiHeadAttention<OpType_>::multiHeadAttr_nofuse_kernelLauncher(
if(OpType_ == OperationType::HALF)
{
const int seq_per_block = 4;
// const int seq_per_block = 1;
grid.x = batch_size * head_num * seq_len / seq_per_block;
block.x = seq_per_block * size_per_head / 2;
assert(grid.x * seq_per_block != batch_size * head_num * seq_len);
assert(grid.x * seq_per_block == batch_size * head_num * seq_len);
transpose<DataType_><<<grid, block, 0, stream>>>(transpose_dst_, dst,
batch_size, seq_len, head_num, size_per_head / 2);


@ -25,4 +25,5 @@ add_definitions(-DGOOGLE_CUDA=1)
add_definitions(-DNDEBUG)
add_library(tf_fastertransformer SHARED ${tf_bert_transformer_files})
set_target_properties(tf_fastertransformer PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tf_fastertransformer PRIVATE -lcublas -lcudart -ltensorflow_framework ${CMAKE_THREAD_LIBS_INIT})


@ -363,7 +363,7 @@ with tf.Session(config=config) as sess:
print("#################################")
np_val1 = sess.run(output)
np_val2 = sess.run(output_own)
print("cross_check " + str(np.allclose(np_val1, np_val2, atol = 1e-5)))
print("cross_check " + str(np.allclose(np_val1, np_val2, atol = 1e-1)))
print("max diff " + str(np.fabs(np_val1 - np_val2).max()))
print("min diff " + str(np.fabs(np_val1 - np_val2).min()))
print np_val1


@ -361,7 +361,7 @@ with tf.Session(config=config) as sess:
print("#################################")
np_val1 = sess.run(output)
np_val2 = sess.run(output_own)
print("cross_check " + str(np.allclose(np_val1, np_val2, atol = 1e-5)))
print("cross_check " + str(np.allclose(np_val1, np_val2, atol = 1e-4)))
print("max diff " + str(np.fabs(np_val1 - np_val2).max()))
print("min diff " + str(np.fabs(np_val1 - np_val2).min()))


@ -22,7 +22,9 @@ set(gemm_fp32_files
)
add_executable(gemm_fp32 ${gemm_fp32_files})
set_target_properties(gemm_fp32 PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gemm_fp32 PUBLIC -lcublas -lcudart ${CMAKE_THREAD_LIBS_INIT})
add_executable(gemm_fp16 ${gemm_fp16_files})
set_target_properties(gemm_fp16 PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gemm_fp16 PUBLIC -lcublas -lcudart ${CMAKE_THREAD_LIBS_INIT})


@ -0,0 +1,3 @@
FROM nvcr.io/nvidia/mxnet:19.07-py3
COPY . /workspace/rn50
WORKDIR /workspace/rn50


@ -1,3 +1,4 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/


@ -1,6 +1,46 @@
# ResNet50 v1.5 For MXNet
# ResNet50 v1.5 for MXNet
## The model
This repository provides a script and recipe to train the ResNet50 v1.5 model to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
- [Model overview](#model-overview)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
- [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-(8x-v100-16G))
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-(16x-v100-32G))
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (8x V100 16G)](#inference-performance-nvidia-dgx-1-(8x-v100-16G))
* [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
The difference between v1 and v1.5 is in the bottleneck blocks which require
@ -9,96 +49,448 @@ v1.5 has stride = 2 in the 3x3 convolution
This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1, but comes with a small performance drawback (~5% imgs/sec).
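To make the difference concrete, here is a minimal, hypothetical Gluon-style sketch of a v1.5 bottleneck block (the stride-2 downsampling sits in the 3x3 convolution; v1 would place it in the first 1x1). The repository's actual block definitions live in `models.py`.
```python
from mxnet.gluon import nn

class BottleneckV15(nn.HybridBlock):
    """Illustrative ResNet50 v1.5 bottleneck: the stride lives in the 3x3 conv."""
    def __init__(self, channels, stride, downsample=False, **kwargs):
        super(BottleneckV15, self).__init__(**kwargs)
        self.body = nn.HybridSequential()
        self.body.add(
            nn.Conv2D(channels // 4, kernel_size=1, strides=1, use_bias=False),
            nn.BatchNorm(), nn.Activation('relu'),
            # v1 puts strides=stride on the 1x1 above; v1.5 puts it here instead
            nn.Conv2D(channels // 4, kernel_size=3, strides=stride, padding=1, use_bias=False),
            nn.BatchNorm(), nn.Activation('relu'),
            nn.Conv2D(channels, kernel_size=1, strides=1, use_bias=False),
            nn.BatchNorm())
        self.downsample = None
        if downsample:
            self.downsample = nn.HybridSequential()
            self.downsample.add(
                nn.Conv2D(channels, kernel_size=1, strides=stride, use_bias=False),
                nn.BatchNorm())

    def hybrid_forward(self, F, x):
        residual = x if self.downsample is None else self.downsample(x)
        return F.Activation(self.body(x) + residual, act_type='relu')
```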
## Training procedure
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 3.5x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Optimizer
### Default configuration
This model trains for 90 epochs, with the standard ResNet v1.5 setup:
**Optimizer:**
* SGD with momentum (0.9)
* SGD with momentum (0.875)
* Learning rate = 0.256 for 256 batch size, for other batch sizes we linearly scale the learning rate.
* Learning rate schedule -- we use a cosine LR schedule
* Linear warmup of the learning rate during first 5 epochs according to [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677).
* Weight decay: 3.0517578125e-05 (1/32768).
* We do not apply WD on Batch Norm trainable parameters (gamma/bias)
* Label Smoothing: 0.1
* We train for:
* 50 Epochs -> configuration that reaches 75.9% top1 accuracy
* 90 Epochs -> 90 epochs is a standard for ResNet50
* 250 Epochs -> best possible accuracy. For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
* Learning rate = 0.1 for 256 batch size, for other batch sizes we linearly
scale the learning rate.
**Data augmentation:**
* Learning rate decay - multiply by 0.1 after 30, 60, and 80 epochs
This model uses the following data augmentation:
* Linear warmup of the learning rate during first 5 epochs
according to [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677).
* Weight decay: 1e-4
### Data Augmentation
During training, we perform the following augmentation techniques:
For training:
* Normalization
* Random resized crop to 224x224
* Scale from 5% to 100%
* Scale from 8% to 100%
* Aspect ratio from 3/4 to 4/3
* Random horizontal flip
During inference, we perform the following augmentation techniques:
For inference:
* Normalization
* Scale to 256x256
* Center crop to 224x224
See `data.py` for more info.
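As a rough sketch (illustrative names only, the real schedule lives in the training scripts), the learning-rate policy described above can be written as:
```python
import math

def learning_rate(epoch, batch_size, base_lr=0.256, base_batch=256,
                  warmup_epochs=5, total_epochs=90):
    """Linear scaling to the batch size, linear warmup, then cosine decay."""
    peak_lr = base_lr * batch_size / base_batch            # linear scaling rule
    if epoch < warmup_epochs:                              # warmup during first 5 epochs
        return peak_lr * (epoch + 1) / warmup_epochs
    progress = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * progress))   # cosine schedule

# e.g. learning_rate(0, 192) ~= 0.038 and learning_rate(45, 192) ~= 0.10
```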
### Feature support matrix
# Setup
## Requirements
Ensure your environment meets the following requirements:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [MXNet 18.12-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia%2Fmxnet) or newer
* [NVIDIA-DALI 0.5.0](https://github.com/NVIDIA/DALI) -- included in the MXNet container
* [Python 3.5](https://www.python.org) -- included in the MXNet container
* [CUDA 10](https://developer.nvidia.com/cuda-toolkit) -- included in the MXNet container
* [cuDNN 7.4.1](https://developer.nvidia.com/cudnn) -- included in the MXNet container
* (optional) NVIDIA Volta or Turing GPU (see section below) -- for best training performance using FP16
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running MXNet](https://docs.nvidia.com/deeplearning/dgx/mxnet-release-notes/running.html#running)
## Training using mixed precision with Tensor Cores
### Hardware requirements
Training with mixed precision on NVIDIA Tensor Cores requires an NVIDIA Volta-based or Turing-based GPU.
| **Feature** | **ResNet50 MXNet** |
|:---:|:--------:|
|[DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html)|yes|
|Horovod Multi-GPU|yes|
### Software changes
#### Features
The following features are supported by this model.
For information about how to train using mixed precision, see the
[Mixed Precision Training paper](https://arxiv.org/abs/1710.03740)
and
[Training With Mixed Precision documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).
NVIDIA DALI - NVIDIA Data Loading Library (DALI) is a collection of highly optimized building blocks, and an execution engine, to accelerate the pre-processing of the input data for deep learning applications. DALI provides both the performance and the flexibility for accelerating different data pipelines as a single library. This single library can then be easily integrated into different deep learning training and inference applications.
Horovod Multi-GPU - Horovod is a distributed training framework for TensorFlow, Keras, PyTorch and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod).
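As a hedged sketch (not this repository's exact wiring; see `fit.py` for the real integration), Horovod typically plugs into MXNet like this:
```python
import mxnet as mx
import horovod.mxnet as hvd

hvd.init()                               # one training process per GPU
ctx = mx.gpu(hvd.local_rank())           # pin this process to its local GPU

# FP32 master weights via multi_precision; LR scaled to the global batch size
opt = mx.optimizer.create('sgd', momentum=0.875, multi_precision=True,
                          learning_rate=0.256 * (hvd.size() * 192) / 256)

# With a Gluon network `net`, gradients are averaged across workers by:
# trainer = hvd.DistributedTrainer(net.collect_params(), opt)
# hvd.broadcast_parameters(net.collect_params(), root_rank=0)
```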
# Quick start guide
## Docker
### Mixed precision training
To run the MXNet Docker container, run:
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
`nvidia-docker run --rm -it --ipc=host -v <path to source of this repo>:/workspace/resnet50 -v <path to prepared dataset>:/data/imagenet/train-val-recordio-passthrough nvcr.io/nvidia/mxnet:18.12-py3`
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
It will also automatically start downloading the MXNet container if you haven't downloaded it yet. You can also download it manually by running:
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
`nvidia-docker pull nvcr.io/nvidia/mxnet:18.12-py3`
If you haven't prepared the dataset yet, download the raw ImageNet dataset (see the section below) and run:
`nvidia-docker run --rm -it --ipc=host -v <path to source of this repo>:/workspace/resnet50 -v <path where prepared dataset should be created>:/data/imagenet/train-val-recordio-passthrough -v <path to raw dataset>:/data/imagenet/raw nvcr.io/nvidia/mxnet:18.12-py3`
#### Enabling mixed precision
Using the Gluon API, perform the following steps to convert a model so that it supports computation with float16.
and follow the steps from the Prepare Dataset section.
1. Cast Gluon Blocks parameters and expected input type to float16 by calling the cast method of the Block representing the network.
```python
net = net.cast('float16')
```
## Prepare Dataset
2. Ensure the data input to the network is of float16 type. If your DataLoader or Iterator produces output in another datatype, then you have to cast your data. There are different ways you can do this. The easiest way is to use the `astype` method of NDArrays.
```python
data = data.astype('float16', copy=False)
```
3. If you are using images and DataLoader, you can also use a Cast transform. It is preferable to use the multi_precision mode of the optimizer when training in float16. This mode of optimizer maintains a master copy of the weights in float32 even when the training (forward and backward pass) is in float16. This helps increase the precision of the weight updates and can lead to faster convergence in some scenarios.
```python
optimizer = mx.optimizer.create('sgd', multi_precision=True, lr=0.01)
```
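Putting the three steps together, a minimal, hypothetical Gluon training step in float16 could look like the following (the repository itself drives training through `fit.py`/`train.py` rather than this exact code):
```python
import mxnet as mx
from mxnet import autograd, gluon

ctx = mx.gpu(0)
net = gluon.model_zoo.vision.resnet50_v2(classes=1000)
net.initialize(mx.init.Xavier(), ctx=ctx)
net.cast('float16')                                     # step 1: cast the network

optimizer = mx.optimizer.create('sgd', learning_rate=0.256, momentum=0.875,
                                multi_precision=True)   # step 3: FP32 master weights
trainer = gluon.Trainer(net.collect_params(), optimizer)
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

# Dummy batch for illustration; real data comes from the RecordIO/DALI pipeline
data = mx.nd.random.uniform(shape=(192, 3, 224, 224), ctx=ctx).astype('float16', copy=False)  # step 2
label = mx.nd.zeros((192,), ctx=ctx)

with autograd.record():
    loss = loss_fn(net(data), label)
loss.backward()
trainer.step(batch_size=192)
```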
## Setup
The following section lists the requirements in order to start training the ResNet50 v1.5 model.
### Requirements
This repository contains a Dockerfile which extends the MXNet NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [MXNet 19.07-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia%2Fmxnet)
- [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
- [Running MXNet](https://docs.nvidia.com/deeplearning/dgx/mxnet-release-notes/running.html#running)
For those unable to use the MXNet NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
**1. Clone the repository.**
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/MxNet/Classification/RN50v1.5
```
**2. Build the ResNet50 MXNet NGC container.**
After Docker is set up, you can build the ResNet50 image with:
```bash
docker build . -t nvidia_rn50_mx
```
**3. Start an interactive session in the NGC container to run preprocessing/training/inference.**
```bash
nvidia-docker run --rm -it --ipc=host -v <path to dataset>:/data/imagenet/train-val-recordio-passthrough nvidia_rn50_mx
```
**4. Download and preprocess the data.**
* Download the images from http://image-net.org/download-images.
* Extract the training and validation data:
```bash
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
cd ..
```
**5. Extract the validation data and move the images to subfolders.**
```bash
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
```
**6. Preprocess the dataset.**
```bash
./scripts/prepare_imagenet.sh <path to raw imagenet> <path where processed dataset will be created>
```
**7. Start training.**
```bash
./runner -n <number of gpus> -b <batch size per GPU (default 192)>
```
**8. Start validation/evaluation.**
```bash
./runner -n <number of gpus> -b <batch size per GPU (default 192)> --load <path to trained model> --mode val
```
**9. Start inference/predictions.**
```bash
./runner --load <path to trained model> --mode pred --data-pred <path to the image>
```
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
In the root directory, the most important files are:
* `runner`: A wrapper on the `train.py` script which is the main executable script for training/validation/predicting
* `benchmark.py`: A script for benchmarking
* `Dockerfile`: Recipe used to build the container
* `fit.py`: A file containing most of the training and validation logic
* `data.py`: Data loading and preprocessing code
* `dali.py`: Data loading and preprocessing code using DALI
* `models.py`: The model architecture
* `report.py`: A file containing JSON report structure and description of fields
In the `scripts` directory, the most important files are:
* `prepare_imagenet.sh`: A script that converts raw dataset format to RecordIO format
### Parameters
The complete list of available parameters contains:
```
Model:
--arch {resnetv1,resnetv15,resnextv1,resnextv15,xception}
model architecture (default: resnetv15)
--num-layers NUM_LAYERS
number of layers in the neural network, required by
some networks such as resnet (default: 50)
--num-groups NUM_GROUPS
number of groups for grouped convolutions, required by
some networks such as resnext (default: 32)
--num-classes NUM_CLASSES
the number of classes (default: 1000)
--batchnorm-eps BATCHNORM_EPS
the amount added to the batchnorm variance to prevent
output explosion. (default: 1e-05)
--batchnorm-mom BATCHNORM_MOM
the leaky-integrator factor controlling the batchnorm
mean and variance. (default: 0.9)
--fuse-bn-relu FUSE_BN_RELU
have batchnorm kernel perform activation relu
(default: 0)
--fuse-bn-add-relu FUSE_BN_ADD_RELU
have batchnorm kernel perform add followed by
activation relu (default: 0)
Training:
--mode {train_val,train,val,pred}
mode (default: train_val)
--seed SEED random seed (default: None)
-n NGPUS, --ngpus NGPUS
number of GPUs to use (default: 1)
--kv-store {device,horovod}
key-value store type (default: horovod)
--dtype {float32,float16}
Precision (default: float16)
--amp If enabled, turn on AMP (Automatic Mixed Precision)
(default: False)
-b BATCH_SIZE, --batch-size BATCH_SIZE
batch size per GPU (default: 192)
-e NUM_EPOCHS, --num-epochs NUM_EPOCHS
number of epochs (default: 90)
-l LR, --lr LR learning rate; IMPORTANT: true learning rate will be
calculated as `lr * batch_size / 256` (default: 0.256)
--lr-schedule {multistep,cosine}
learning rate schedule (default: cosine)
--lr-factor LR_FACTOR
the ratio to reduce lr on each step (default: 0.256)
--lr-steps LR_STEPS the epochs to reduce the lr, e.g. 30,60 (default: [])
--warmup-epochs WARMUP_EPOCHS
the epochs to ramp-up lr to scaled large-batch value
(default: 5)
--optimizer OPTIMIZER
the optimizer type (default: sgd)
--mom MOM momentum for sgd (default: 0.875)
--wd WD weight decay for sgd (default: 3.0517578125e-05)
--label-smoothing LABEL_SMOOTHING
label smoothing factor (default: 0.1)
--mixup MIXUP alpha parameter for mixup (if 0 then mixup is not
applied) (default: 0)
--disp-batches DISP_BATCHES
show progress for every n batches (default: 20)
--model-prefix MODEL_PREFIX
model checkpoint prefix (default: model)
--save-frequency SAVE_FREQUENCY
frequency of saving model in epochs (--model-prefix
must be specified). If -1 then save only best model.
If 0 then do not save anything. (default: -1)
--begin-epoch BEGIN_EPOCH
start the model from an epoch (default: 0)
--load LOAD checkpoint to load (default: None)
--test-io test reading speed without training (default: False)
--test-io-mode {train,val}
data to test (default: train)
--log LOG file where to save the log from the experiment
(default: log.log)
--report REPORT file where to save report (default: report.json)
--no-metrics do not calculate evaluation metrics (for benchmarking)
(default: False)
--benchmark-iters BENCHMARK_ITERS
run only benchmark-iters iterations from each epoch
(default: None)
Data:
--data-root DATA_ROOT
Directory with RecordIO data files (default:
/data/imagenet/train-val-recordio-passthrough)
--data-backend {dali,mxnet,synthetic}
data backend (default: dali)
--image-shape IMAGE_SHAPE
the image shape fed into the network (default: [3,
224, 224])
--rgb-mean RGB_MEAN a tuple of size 3 for the mean rgb (default: [123.68,
116.779, 103.939])
--rgb-std RGB_STD a tuple of size 3 for the std rgb (default: [58.393,
57.12, 57.375])
--input-layout {NCHW,NHWC}
the layout of the input data (default: NCHW)
--conv-layout {NCHW,NHWC}
the layout of the data assumed by the conv operation
(default: NCHW)
--batchnorm-layout {NCHW,NHWC}
the layout of the data assumed by the batchnorm
operation (default: NCHW)
--pooling-layout {NCHW,NHWC}
the layout of the data assumed by the pooling
operation (default: NCHW)
--num-examples NUM_EXAMPLES
the number of training examples (doesn't work with
mxnet data backend) (default: 1281167)
--data-val-resize DATA_VAL_RESIZE
base length of shorter edge for validation dataset
(default: 256)
DALI data backend:
entire group applies only to dali data backend
--dali-separ-val each process will perform independent validation on
whole val-set (default: False)
--dali-threads DALI_THREADS
number of threads per GPU for DALI (default: 3)
--dali-validation-threads DALI_VALIDATION_THREADS
number of threads per GPU for DALI for validation
(default: 10)
--dali-prefetch-queue DALI_PREFETCH_QUEUE
DALI prefetch queue depth (default: 2)
--dali-nvjpeg-memory-padding DALI_NVJPEG_MEMORY_PADDING
Memory padding value for nvJPEG (in MB) (default: 64)
MXNet data backend:
entire group applies only to mxnet data backend
--data-mxnet-threads DATA_MXNET_THREADS
number of threads for data decoding for mxnet data
backend (default: 40)
--random-crop RANDOM_CROP
if or not randomly crop the image (default: 0)
--random-mirror RANDOM_MIRROR
if or not randomly flip horizontally (default: 1)
--max-random-h MAX_RANDOM_H
max change of hue, whose range is [0, 180] (default:
0)
--max-random-s MAX_RANDOM_S
max change of saturation, whose range is [0, 255]
(default: 0)
--max-random-l MAX_RANDOM_L
max change of intensity, whose range is [0, 255]
(default: 0)
--min-random-aspect-ratio MIN_RANDOM_ASPECT_RATIO
min value of aspect ratio, whose value is either None
or a positive value. (default: 0.75)
--max-random-aspect-ratio MAX_RANDOM_ASPECT_RATIO
max value of aspect ratio. If min_random_aspect_ratio
is None, the aspect ratio range is
[1-max_random_aspect_ratio,
1+max_random_aspect_ratio], otherwise it is
[min_random_aspect_ratio, max_random_aspect_ratio].
(default: 1.33)
--max-random-rotate-angle MAX_RANDOM_ROTATE_ANGLE
max angle to rotate, whose range is [0, 360] (default:
0)
--max-random-shear-ratio MAX_RANDOM_SHEAR_RATIO
max ratio to shear, whose range is [0, 1] (default: 0)
--max-random-scale MAX_RANDOM_SCALE
max ratio to scale (default: 1)
--min-random-scale MIN_RANDOM_SCALE
min ratio to scale, should >= img_size/input_shape.
otherwise use --pad-size (default: 1)
--max-random-area MAX_RANDOM_AREA
max area to crop in random resized crop, whose range
is [0, 1] (default: 1)
--min-random-area MIN_RANDOM_AREA
min area to crop in random resized crop, whose range
is [0, 1] (default: 0.05)
--min-crop-size MIN_CROP_SIZE
Crop both width and height into a random size in
[min_crop_size, max_crop_size] (default: -1)
--max-crop-size MAX_CROP_SIZE
Crop both width and height into a random size in
[min_crop_size, max_crop_size] (default: -1)
--brightness BRIGHTNESS
brightness jittering, whose range is [0, 1] (default:
0)
--contrast CONTRAST contrast jittering, whose range is [0, 1] (default: 0)
--saturation SATURATION
saturation jittering, whose range is [0, 1] (default:
0)
--pca-noise PCA_NOISE
pca noise, whose range is [0, 1] (default: 0)
--random-resized-crop RANDOM_RESIZED_CROP
whether to use random resized crop (default: 1)
```
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option: `./runner --help` and `python train.py --help`. `./runner` acts as a wrapper on `train.py` and all additional flags will be passed to `train.py`.
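For illustration only, the pass-through pattern looks roughly like this (hypothetical code; the real wrapper also sets up the multi-GPU launch):
```python
import argparse, subprocess, sys

parser = argparse.ArgumentParser('runner (illustrative)')
parser.add_argument('-n', '--ngpus', type=int, default=1)
parser.add_argument('-b', '--batch-size', type=int, default=192)
args, other_args = parser.parse_known_args()    # keep unknown flags instead of erroring out

# Forward everything the wrapper does not recognize verbatim to train.py
cmd = [sys.executable, 'train.py',
       '--gpus', ','.join(str(i) for i in range(args.ngpus)),
       '--batch-size', str(args.batch_size)] + other_args
subprocess.check_call(cmd)
```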
`./runner` command-line options:
```
usage: runner [-h] [-n NGPUS] [-b BATCH_SIZE] [-e NUM_EPOCHS] [-l LR]
[--data-root DATA_ROOT] [--dtype {float32,float16}]
[--kv-store {device,horovod}]
[--data-backend {dali,mxnet,synthetic}]
```
`train.py` command-line options:
```
usage: train.py [-h]
[--arch {resnetv1,resnetv15,resnextv1,resnextv15,xception}]
[--num-layers NUM_LAYERS] [--num-groups NUM_GROUPS]
[--num-classes NUM_CLASSES] [--batchnorm-eps BATCHNORM_EPS]
[--batchnorm-mom BATCHNORM_MOM] [--fuse-bn-relu FUSE_BN_RELU]
[--fuse-bn-add-relu FUSE_BN_ADD_RELU]
[--mode {train_val,train,val,pred}] [--seed SEED]
[--gpus GPUS] [--kv-store {device,horovod}]
[--dtype {float32,float16}] [--amp] [--batch-size BATCH_SIZE]
[--num-epochs NUM_EPOCHS] [--lr LR]
[--lr-schedule {multistep,cosine}] [--lr-factor LR_FACTOR]
[--lr-steps LR_STEPS] [--warmup-epochs WARMUP_EPOCHS]
[--optimizer OPTIMIZER] [--mom MOM] [--wd WD]
[--label-smoothing LABEL_SMOOTHING] [--mixup MIXUP]
[--disp-batches DISP_BATCHES] [--model-prefix MODEL_PREFIX]
[--save-frequency SAVE_FREQUENCY] [--begin-epoch BEGIN_EPOCH]
[--load LOAD] [--test-io] [--test-io-mode {train,val}]
[--log LOG] [--report REPORT] [--no-metrics]
[--benchmark-iters BENCHMARK_ITERS] [--data-train DATA_TRAIN]
[--data-train-idx DATA_TRAIN_IDX] [--data-val DATA_VAL]
[--data-val-idx DATA_VAL_IDX] [--data-pred DATA_PRED]
[--data-backend {dali,mxnet,synthetic}]
[--image-shape IMAGE_SHAPE] [--rgb-mean RGB_MEAN]
[--rgb-std RGB_STD] [--input-layout {NCHW,NHWC}]
[--conv-layout {NCHW,NHWC}] [--batchnorm-layout {NCHW,NHWC}]
[--pooling-layout {NCHW,NHWC}] [--num-examples NUM_EXAMPLES]
[--data-val-resize DATA_VAL_RESIZE] [--dali-separ-val]
[--dali-threads DALI_THREADS]
[--dali-validation-threads DALI_VALIDATION_THREADS]
[--dali-prefetch-queue DALI_PREFETCH_QUEUE]
[--dali-nvjpeg-memory-padding DALI_NVJPEG_MEMORY_PADDING]
[--data-mxnet-threads DATA_MXNET_THREADS]
[--random-crop RANDOM_CROP] [--random-mirror RANDOM_MIRROR]
[--max-random-h MAX_RANDOM_H] [--max-random-s MAX_RANDOM_S]
[--max-random-l MAX_RANDOM_L]
[--min-random-aspect-ratio MIN_RANDOM_ASPECT_RATIO]
[--max-random-aspect-ratio MAX_RANDOM_ASPECT_RATIO]
[--max-random-rotate-angle MAX_RANDOM_ROTATE_ANGLE]
[--max-random-shear-ratio MAX_RANDOM_SHEAR_RATIO]
[--max-random-scale MAX_RANDOM_SCALE]
[--min-random-scale MIN_RANDOM_SCALE]
[--max-random-area MAX_RANDOM_AREA]
[--min-random-area MIN_RANDOM_AREA]
[--min-crop-size MIN_CROP_SIZE]
[--max-crop-size MAX_CROP_SIZE] [--brightness BRIGHTNESS]
[--contrast CONTRAST] [--saturation SATURATION]
[--pca-noise PCA_NOISE]
[--random-resized-crop RANDOM_RESIZED_CROP]
```
### Getting the data
The MXNet ResNet50 v1.5 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
You can download the images from http://image-net.org/download-images
You can download the images from http://image-net.org/download-images.
The recommended data format is
[RecordIO](http://mxnet.io/architecture/note_data_loading.html), which
@ -106,7 +498,7 @@ concatenates multiple examples into seekable binary files for better read
efficiency. MXNet provides a tool called `im2rec.py` located in the `/opt/mxnet/tools/` directory.
The tool converts individual images into `.rec` files.
To prepare RecordIO file containing ImageNet data, we first need to create .lst files
To prepare a RecordIO file containing ImageNet data, we first need to create `.lst` files
which consist of the labels and image paths. We assume that the original images were
downloaded to `/data/imagenet/raw/train-jpeg` and `/data/imagenet/raw/val-jpeg`.
@ -115,121 +507,216 @@ python /opt/mxnet/tools/im2rec.py --list --recursive train /data/imagenet/raw/tr
python /opt/mxnet/tools/im2rec.py --list --recursive val /data/imagenet/raw/val-jpeg
```
Then we generate the `.rec` (RecordIO files with data) and `.idx` (indexes required by DALI
Next, we generate the `.rec` (RecordIO files with data) and `.idx` (indexes required by DALI
to speed up data loading) files. To obtain the best training accuracy
we do not preprocess the images when creating RecordIO file.
we do not preprocess the images when creating the RecordIO file.
```bash
python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 train /data/imagenet/raw/train-jpeg
python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 val /data/imagenet/raw/val-jpeg
```
## Running training
#### Dataset guidelines
The process of loading, normalizing and augmenting the data contained in the dataset can be found in the `data.py` and `dali.py` files.
To run training for a standard configuration (1/4/8 GPUs, FP16/FP32),
run one of the scripts in the `./examples` directory
called `./examples/RN50_{FP16, FP32}_{1, 4, 8}GPU.sh`.
By default the training scripts run the validation and save a checkpoint after each epoch.
Checkpoints will be stored in `model-symbol.json` and `model-<number of epoch>.params` files.
The data is read from RecordIO format, which concatenates multiple examples into seekable binary files for better read efficiency.
If ImageNet is mounted in the `/data/imagenet/train-val-recordio-passthrough` directory, you don't have to specify the `--data-root` flag.
Data augmentation techniques are described in the [Default configuration](#default-configuration) section.
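For reference, a minimal sketch of reading the converted RecordIO pair with the plain MXNet backend (paths and values are illustrative; the repository's actual readers live in `data.py` and `dali.py`):
```python
import mxnet as mx

train_iter = mx.io.ImageRecordIter(
    path_imgrec='/data/imagenet/train-val-recordio-passthrough/train.rec',
    path_imgidx='/data/imagenet/train-val-recordio-passthrough/train.idx',
    data_shape=(3, 224, 224),        # images are decoded and cropped on the fly
    batch_size=192,
    shuffle=True,
    random_resized_crop=True,        # augmentation as described in Default configuration
    rand_mirror=True,
    preprocess_threads=40)

batch = next(iter(train_iter))       # batch.data[0]: (192, 3, 224, 224), batch.label[0]: (192,)
```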
To run a non-standard configuration, use:
#### Multi-dataset
`./runner -n <number of gpus> -b <batch size per gpu> --data-root <path to imagenet> --dtype <float32 or float16> --model-prefix <model prefix>`
In most cases, to train a model on a different dataset, no changes in the code are required, but the dataset has to be converted into RecordIO format.
Checkpoints will be stored in `<model prefix>-symbol.json` and `<model prefix>-<number of epoch>.params` files.
To generate a JSON report with performance and accuracy stats, use the `--report <path to report>` flag (see `report.py` for information about the JSON report file structure).
Use `./runner -h` and `python ./train.py -h` to obtain the list of available options.
## Running inference
To run inference on a checkpointed model run:
* For FP16
`./examples/SCORE_FP16.sh <model prefix> <epoch>`
* For FP32
`./examples/SCORE_FP32.sh <model prefix> <epoch>`
To convert a custom dataset, follow the steps from [Getting the data](#getting-the-data) section, and refer to the `scripts/prepare_dataset.py` script.
## Benchmark scripts
### Training process
To start training, run:
`./runner -n <number of gpus> -b <batch size per GPU> --data-root <path to imagenet> --dtype <float32 or float16>`
By default the training script runs the validation after each epoch:
* the best checkpoint will be stored in the `model_best.params` file in the working directory
* the log from training will be saved in the `log.log` file in the working directory
* the JSON report with statistics will be saved in the `report.json` file in the working directory
If ImageNet is mounted in the `/data/imagenet/train-val-recordio-passthrough` directory, you don't have to specify the `--data-root` flag.
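Checkpoints use the standard MXNet `<prefix>-symbol.json` / `<prefix>-<epoch>.params` naming, so they can be reloaded outside the runner; a minimal sketch (prefix, epoch and shapes are illustrative, and an FP32 checkpoint is assumed):
```python
import mxnet as mx

# e.g. model-symbol.json + model-0090.params saved with --model-prefix model
sym, arg_params, aux_params = mx.model.load_checkpoint('model', 90)

mod = mx.mod.Module(symbol=sym, context=mx.gpu(0), label_names=None)
mod.bind(for_training=False, data_shapes=[('data', (1, 3, 224, 224))])
mod.set_params(arg_params, aux_params, allow_missing=True)

# For an FP16 checkpoint, the input would also need to be cast to float16
mod.forward(mx.io.DataBatch(data=[mx.nd.random.uniform(shape=(1, 3, 224, 224))]), is_train=False)
scores = mod.get_outputs()[0]        # (1, 1000) class scores
```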
### Inference process
To start validation, run:
`./runner -n <number of gpus> -b <batch size per GPU> --data-root <path to imagenet> --dtype <float32 or float16> --mode val`
By default:
* the log from validation will be saved in the `log.log` file in the working directory
* the JSON report with statistics will be saved in the `report.json` file in the working directory
## Performance
### Benchmarking
To benchmark training and inference, run:
`python benchmark.py -n <numbers of gpus separated by comma> -b <batch sizes per GPU separated by comma> --data-root <path to imagenet> --dtype <float32 or float16> -o <path to benchmark report>`
`python benchmark.py -n <numbers of gpus separated by comma> -b <batch sizes per gpu separated by comma> --data-root <path to imagenet> --dtype <float32 or float16> -o <path to benchmark report>`
To control benchmark length per epoch, use `-i` flag (defaults to 100 iterations).
To control number of epochs, use `-e` flag.
To control number of warmup epochs (epochs which are not taken into account), use `-w` flag.
To limit length of dataset, use `--num-examples` flag.
To benchmark only inference, use `--only-inference` flag.
To control the benchmark length per epoch, use the `-i` flag (defaults to 100 iterations).
To control the number of epochs, use the `-e` flag.
To control the number of warmup epochs (epochs which are not taken into account), use the `-w` flag.
To limit the length of the dataset, use the `--num-examples` flag.
By default, the same parameters as in `./runner` will be used. Additional flags will be passed to `./runner`.
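The report written to `-o` follows the structure built in `benchmark.py` (shown later in this commit): top-level `ngpus`, `bs` and `metric_keys` lists plus a nested `metrics[ngpus][batch_size][metric]` mapping. A small sketch for reading it back, assuming that layout:
```python
import json

with open('benchmark_report_fp16.json') as f:        # file name from the -o flag
    report = json.load(f)

# Print training throughput (img/s) for every benchmarked (GPUs, batch size) pair
for n in report['ngpus']:
    for bs in report['bs']:
        metrics = report['metrics'][str(n)][str(bs)]
        if 'train.total_ips' in metrics:
            print('{} GPU(s), batch {}: {:.5g} img/s'.format(n, bs, metrics['train.total_ips']))
```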
#### Training performance benchmark
To benchmark only training, use the `--mode train` flag.
## Training accuracy results
The following results were obtained by running the `./examples/RN50_{FP16, FP32}_{1, 4, 8}GPU.sh` scripts in the
mxnet-18.12-py3 Docker container on NVIDIA DGX-1 with 8 V100 16G GPUs.
| **number of GPUs** | **FP16 top1** | **FP16 training time** | **FP32 top1** | **FP32 training time** |
|:------------------:|:-------------:|:----------------------:|:-------------:|:----------------------:|
| 1 | 76.424 | 22.9h | 76.462 | 82.0h |
| 4 | 76.328 | 6.2h | 76.448 | 21.1h |
| 8 | 76.490 | 3.3h | 76.668 | 11.1h |
Here are example graphs of FP32 and FP16 training on 8 GPU configuration:
![TrainingLoss](./img/training_loss.png)
![TrainingAccuracy](./img/training_accuracy.png)
![ValidationAccuracy](./img/validation_accuracy.png)
#### Inference performance benchmark
To benchmark only inference, use the `--mode val` flag.
## Training performance results
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
90 epochs configuration
Our results were obtained by running the `./runner -n <number of gpus> -b 96 --dtype float32` script for FP32 and the `./runner -n <number of gpus> -b 192` script for mixed precision in the mxnet-19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs.
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
| **GPUs** | **Accuracy - mixed precision** | **Accuracy - FP32** | **Time to train - mixed precision** | **Time to train - FP32** | **Time to train - speedup** |
|:---:|:---:|:---:|:---:|:---:|:---:|
|1|77.208|77.160|24.2|84.5|3.49|
|4|77.296|77.280|6.0|21.4|3.59|
|8|77.308|77.292|3.0|10.7|3.54|
##### Training stability test
Our results were obtained by running the following commands 8 times with different seeds.
* For 50 epochs
* `./runner -n 8 -b 96 --dtype float32 --num-epochs 50` for FP32
* `./runner -n 8 -b 192 --num-epochs 50` for mixed precision
* For 90 epochs
* `./runner -n 8 -b 96 --dtype float32` for FP32
* `./runner -n 8 -b 192` for mixed precision
* For 250 epochs
* `./runner -n 8 -b 96 --dtype float32 --num-epochs 250 --mixup 0.2` for FP32
* `./runner -n 8 -b 192 --num-epochs 250 --mixup 0.2` for mixed precision
| **# of epochs** | **mixed precision avg top1** | **FP32 avg top1** | **mixed precision standard deviation** | **FP32 standard deviation** | **mixed precision minimum top1** | **FP32 minimum top1** | **mixed precision maximum top1** | **FP32 maximum top1** |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
|50|76.156|76.185|0.118|0.082|76.010|76.062|76.370|76.304|
|90|77.105|77.224|0.097|0.060|76.982|77.134|77.308|77.292|
|250|78.317|78.400|0.073|0.102|78.202|78.316|78.432|78.570|
Plots for 250 epoch configuration
Here are example graphs of FP32 and mixed precision training on the 8-GPU, 250-epoch configuration:
![TrainingLoss](./img/dgx1-16g_250e_training_loss.png)
![ValidationTop1](./img/dgx1-16g_250e_validation_top1.png)
![ValidationTop5](./img/dgx1-16g_250e_validation_top5.png)
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
The following results were obtained by running the
`python benchmark.py -n 1,2,4,8 -b 192 --dtype float16 -o benchmark_report_fp16.json -i 500 -e 3 -w 1 --num-examples 32000 --mode train` script for mixed precision and the
`python benchmark.py -n 1,2,4,8 -b 96 --dtype float32 -o benchmark_report_fp32.json -i 500 -e 3 -w 1 --num-examples 32000 --mode train` script for FP32 in the mxnet-19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs.
The following results were obtained by running
`python benchmark.py -n 1,4,8 -b 208 --dtype float16 -o benchmark_report_fp16.json --data-root <path to imagenet> -i 100 -e 12 -w 4 --num-examples 25600` for FP16, and
`python benchmark.py -n 1,4,8 -b 96 --dtype float32 -o benchmark_report_fp32.json --data-root <path to imagenet> -i 100 -e 12 -w 4 --num-examples 12800` for FP32
in the mxnet-18.12-py3 Docker container on NVIDIA DGX-1 with V100 16G GPUs.
Training performance reported as Total IPS (data + compute time taken into account).
Weak scaling is calculated as the ratio of the speed for a given number of GPUs to the speed for 1 GPU.
| **number of GPUs** | **FP16 img/s** | **FP32 img/s** | **FP16 speedup** | **FP16 weak scaling** | **FP32 weak scaling** |
|:------------------:|:--------------:|:--------------:|:----------------:|:---------------------:|:---------------------:|
| 1 | 1442.6 | 400.2 | 3.60 | 1.00 | 1.00 |
| 4 | 5391.8 | 1558.6 | 3.46 | 3.74 | 3.89 |
| 8 | 10263.2 | 2957.4 | 3.47 | 7.11 | 7.39 |
| **GPUs** | **Throughput - mixed precision** | **Throughput - FP32** | **Throughput speedup (FP32 - mixed precision)** | **Weak scaling - mixed precision** | **Weak scaling - FP32** |
|:---:|:---:|:---:|:---:|:---:|:---:|
|1|1427|385|3.71|1.00|1.00|
|2|2820|768|3.67|1.98|2.00|
|4|5560|1513|3.68|3.90|3.93|
|8|10931|3023|3.62|7.66|7.86|
##### Training performance: NVIDIA DGX-2 (16x V100 32G)
## Inference performance results
The following results were obtained by running the
`python benchmark.py -n 1,4,8,16 -b 256 --dtype float16 -o benchmark_report_fp16.json -i 500 -e 3 -w 1 --num-examples 32000 --mode train` script for mixed precision and the
`python benchmark.py -n 1,4,8,16 -b 128 --dtype float32 -o benchmark_report_fp32.json -i 500 -e 3 -w 1 --num-examples 32000 --mode train` script for FP32 in the mxnet-19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs.
Training performance reported as Total IPS (data + compute time taken into account).
Weak scaling is calculated as the ratio of the speed for a given number of GPUs to the speed for 1 GPU.
| **GPUs** | **Throughput - mixed precision** | **Throughput - FP32** | **Throughput speedup (FP32 - mixed precision)** | **Weak scaling - mixed precision** | **Weak scaling - FP32** |
|:---:|:---:|:---:|:---:|:---:|:---:|
|1|1438|409|3.52|1.00|1.00|
|2|2868|817|3.51|1.99|2.00|
|4|5624|1617|3.48|3.91|3.96|
|8|11174|3214|3.48|7.77|7.86|
|16|20530|6356|3.23|14.28|15.54|
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (8x V100 16G)
The following results were obtained by running the
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,128,192,256 --dtype float16 -o inferbenchmark_report_fp16.json -i 500 -e 3 -w 1 --mode val` script for mixed precision and the
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,128,192,256 --dtype float32 -o inferbenchmark_report_fp32.json -i 500 -e 3 -w 1 --mode val` script for FP32 in the mxnet-19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs.
The following results were obtained by running
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,96,128,192,208 --dtype float16 -o inferbenchmark_report_fp16.json --data-root <path to imagenet> -i 200 -e 12 -w 4 --only-inference` for FP16, and
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,96 --dtype float32 -o inferbenchmark_report_fp32.json --data-root <path to imagenet> -i 200 -e 12 -w 4 --only-inference` for FP32
in the mxnet-18.12-py3 Docker container on NVIDIA DGX-1 using one V100 16G GPU.
Inference performance reported as Total IPS (data + compute time taken into account).
| **batch size** | **FP16 img/s** | **FP32 img/s** |
|:--------------:|:--------------:|:--------------:|
| 1 | 314 | 252 |
| 2 | 555 | 393 |
| 4 | 1024 | 601 |
| 8 | 1642 | 824 |
| 16 | 2144 | 1028 |
| 32 | 2954 | 1138 |
| 64 | 3428 | 1236 |
| 96 | 3546 | 1282 |
| 128 | 3690 | |
| 192 | 3828 | |
| 208 | 3832 | |
Reported mixed precision speedups are relative to FP32 numbers for the corresponding configuration.
| **Batch size** | **Throughput (img/sec) - mixed precision** | **Throughput - speedup** | **Avg latency (ms) - mixed precision** | **Avg latency - speedup** | **50% latency (ms) - mixed precision** | **50% latency - speedup** | **90% latency (ms) - mixed precision** | **90% latency - speedup** | **95% latency (ms) - mixed precision** | **95% latency - speedup** | **99% latency (ms) - mixed precision** | **99% latency - speedup** | **100% latency (ms) - mixed precision** | **100% latency - speedup** |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| 1 | 397 | 1.65 | 2.5 | 1.65 | 2.5 | 1.67 | 2.7 | 1.59 | 2.8 | 1.56 | 3.2 | 1.51 | 15.8 | 0.84 |
| 2 | 732 | 1.81 | 2.7 | 1.81 | 2.6 | 1.88 | 3.0 | 1.67 | 3.3 | 1.52 | 4.9 | 1.10 | 18.8 | 0.83 |
| 4 | 1269 | 2.08 | 3.2 | 2.08 | 3.0 | 2.21 | 3.5 | 1.92 | 4.0 | 1.72 | 7.5 | 0.97 | 14.5 | 0.54 |
| 8 | 2012 | 2.53 | 4.0 | 2.53 | 3.9 | 2.59 | 4.2 | 2.45 | 4.4 | 2.37 | 8.3 | 1.29 | 15.3 | 0.72 |
| 16 | 2667 | 2.64 | 6.0 | 2.64 | 5.9 | 2.66 | 6.3 | 2.54 | 6.4 | 2.52 | 8.3 | 2.02 | 16.9 | 1.05 |
| 32 | 3240 | 2.86 | 9.9 | 2.86 | 9.8 | 2.87 | 10.3 | 2.79 | 10.4 | 2.76 | 11.5 | 2.53 | 28.4 | 1.12 |
| 64 | 3776 | 3.10 | 17.0 | 3.10 | 17.0 | 3.09 | 17.5 | 3.03 | 17.7 | 3.01 | 18.1 | 3.01 | 18.7 | 2.99 |
| 128 | 3734 | 3.02 | 34.3 | 3.02 | 33.8 | 3.05 | 35.5 | 2.93 | 36.3 | 2.88 | 42.4 | 2.79 | 51.7 | 2.38 |
| 192 | 3641 | 2.90 | 52.7 | 2.90 | 52.4 | 2.90 | 55.2 | 2.77 | 56.2 | 2.74 | 65.4 | 2.76 | 77.1 | 2.41 |
| 256 | 3463 | 2.73 | 73.9 | 2.73 | 72.8 | 2.75 | 77.3 | 2.61 | 79.9 | 2.54 | 100.8 | 2.39 | 104.1 | 2.35 |
# Changelog
##### Inference performance: NVIDIA T4
1. Dec 19, 2018
The following results were obtained by running the
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,128,192,256 --dtype float16 -o inferbenchmark_report_fp16.json -i 500 -e 3 -w 1 --mode val` script for mixed precision and the
`python benchmark.py -n 1 -b 1,2,4,8,16,32,64,128,192,256 --dtype float32 -o inferbenchmark_report_fp32.json -i 500 -e 3 -w 1 --mode val` script for FP32 in the mxnet-19.07-py3 NGC container on an NVIDIA T4 GPU.
Inference performance reported as Total IPS (data + compute time taken into account).
Reported mixed precision speedups are relative to FP32 numbers for the corresponding configuration.
| **Batch size** | **Throughput (img/sec) - mixed precision** | **Throughput - speedup** | **Avg latency (ms) - mixed precision** | **Avg latency - speedup** | **50% latency (ms) - mixed precision** | **50% latency - speedup** | **90% latency (ms) - mixed precision** | **90% latency - speedup** | **95% latency (ms) - mixed precision** | **95% latency - speedup** | **99% latency (ms) - mixed precision** | **99% latency - speedup** | **100% latency (ms) - mixed precision** | **100% latency - speedup** |
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| 1 | 348 | 1.88 | 2.9 | 1.88 | 2.8 | 1.91 | 2.9 | 1.88 | 3.0 | 1.90 | 3.9 | 1.82 | 17.6 | 0.74 |
| 2 | 594 | 2.30 | 3.4 | 2.30 | 3.3 | 2.35 | 3.4 | 2.34 | 3.5 | 2.38 | 5.7 | 1.55 | 20.2 | 0.74 |
| 4 | 858 | 2.93 | 4.7 | 2.93 | 4.6 | 2.97 | 4.9 | 2.86 | 5.0 | 2.81 | 6.0 | 2.46 | 13.7 | 1.12 |
| 8 | 1047 | 3.17 | 7.6 | 3.17 | 7.6 | 3.19 | 7.9 | 3.10 | 8.2 | 3.02 | 9.1 | 2.77 | 15.0 | 1.72 |
| 16 | 1163 | 3.16 | 13.8 | 3.16 | 13.7 | 3.17 | 14.1 | 3.13 | 14.4 | 3.07 | 15.4 | 2.90 | 17.5 | 2.62 |
| 32 | 1225 | 3.22 | 26.1 | 3.22 | 26.1 | 3.22 | 27.0 | 3.15 | 27.3 | 3.12 | 28.3 | 3.05 | 30.5 | 2.89 |
| 64 | 1230 | 3.15 | 52.0 | 3.15 | 51.8 | 3.16 | 52.9 | 3.12 | 53.3 | 3.10 | 54.4 | 3.08 | 58.8 | 2.90 |
| 128 | 1260 | 3.21 | 101.6 | 3.21 | 101.3 | 3.22 | 102.7 | 3.21 | 103.2 | 3.20 | 115.0 | 2.89 | 121.8 | 2.86 |
| 192 | 1252 | 3.20 | 153.3 | 3.20 | 153.1 | 3.20 | 154.7 | 3.19 | 155.5 | 3.21 | 156.9 | 3.20 | 182.3 | 2.81 |
| 256 | 1251 | 3.22 | 204.6 | 3.22 | 204.3 | 3.23 | 206.4 | 3.21 | 207.1 | 3.21 | 209.3 | 3.18 | 241.9 | 2.76 |
## Release notes
### Changelog
1. Dec, 2018
* Initial release (based on https://github.com/apache/incubator-mxnet/tree/master/example/image-classification)
2. June, 2019
* Code refactor
* Label smoothing
* Cosine LR schedule
* MixUp regularization
* Better configurations
# Known Issues
### Known Issues
There are no known issues with this model.

MxNet/Classification/RN50v1.5/benchmark.py Normal file → Executable file

@ -1,3 +1,5 @@
#!/usr/bin/env python3
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -18,14 +20,21 @@ import sys
import tempfile
import json
import os
import traceback
import numpy as np
from collections import OrderedDict
from subprocess import Popen
parser = argparse.ArgumentParser(description='Benchmark')
def int_list(x):
return list(map(int, x.split(',')))
parser = argparse.ArgumentParser(description='Benchmark',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--executable', default='./runner', help='path to runner')
parser.add_argument('-n', '--ngpus', metavar='N1,[N2,...]',
parser.add_argument('-o', '--output', metavar='OUT', required=True, help="path to benchmark report")
parser.add_argument('-n', '--ngpus', metavar='N1,[N2,...]', type=int_list,
required=True, help='numbers of gpus separated by comma')
parser.add_argument('-b', '--batch-sizes', metavar='B1,[B2,...]',
parser.add_argument('-b', '--batch-sizes', metavar='B1,[B2,...]', type=int_list,
required=True, help='batch sizes separated by comma')
parser.add_argument('-i', '--benchmark-iters', metavar='I',
type=int, default=100, help='iterations')
@ -33,57 +42,83 @@ parser.add_argument('-e', '--epochs', metavar='E',
type=int, default=1, help='number of epochs')
parser.add_argument('-w', '--warmup', metavar='N',
type=int, default=0, help='warmup epochs')
parser.add_argument('-o', '--output', metavar='OUT', required=True, help="path to benchmark report")
parser.add_argument('--only-inference', action='store_true', help="benchmark inference only")
parser.add_argument('--timeout', metavar='T',
type=str, default='inf', help='timeout for each run')
parser.add_argument('--mode', metavar='MODE', choices=('train_val', 'train', 'val'), default='train_val',
help="benchmark mode")
args, other_args = parser.parse_known_args()
ngpus = list(map(int, args.ngpus.split(',')))
batch_sizes = list(map(int, args.batch_sizes.split(',')))
latency_percentiles = ['avg', 50, 90, 95, 99, 100]
harmonic_mean_metrics = ['train.total_ips', 'val.total_ips']
res = OrderedDict()
res['model'] = ''
res['ngpus'] = ngpus
res['bs'] = batch_sizes
if args.only_inference:
res['metric_keys'] = ['val.total_ips']
else:
res['metric_keys'] = ['train.total_ips', 'val.total_ips']
res['ngpus'] = args.ngpus
res['bs'] = args.batch_sizes
res['metric_keys'] = []
if args.mode == 'train' or args.mode == 'train_val':
res['metric_keys'].append('train.total_ips')
for percentile in latency_percentiles:
res['metric_keys'].append('train.latency_{}'.format(percentile))
if args.mode == 'val' or args.mode == 'train_val':
res['metric_keys'].append('val.total_ips')
for percentile in latency_percentiles:
res['metric_keys'].append('val.latency_{}'.format(percentile))
res['metrics'] = OrderedDict()
for n in ngpus:
for n in args.ngpus:
res['metrics'][str(n)] = OrderedDict()
for bs in batch_sizes:
for bs in args.batch_sizes:
res['metrics'][str(n)][str(bs)] = OrderedDict()
report_file = args.output + '-{},{}'.format(n, bs)
Popen([args.executable, '-n', str(n), '-b', str(bs),
Popen(['timeout', args.timeout, args.executable, '-n', str(n), '-b', str(bs),
'--benchmark-iters', str(args.benchmark_iters),
'-e', str(args.epochs), '--report', report_file,
*([] if not args.only_inference else ['--only-inference']),
'--no-metrics'] + other_args, stdout=sys.stderr).wait()
'--mode', args.mode, '--no-metrics'] + other_args,
stdout=sys.stderr).wait()
with open(report_file, 'r') as f:
report = json.load(f)
try:
for suffix in ['', *['-{}'.format(i) for i in range(1, n)]]:
try:
with open(report_file + suffix, 'r') as f:
report = json.load(f)
break
except FileNotFoundError:
pass
else:
with open(report_file, 'r') as f:
report = json.load(f)
for metric in res['metric_keys']:
data = report['metrics'][metric][args.warmup:]
avg = len(data) / sum(map(lambda x: 1 / x, data))
res['metrics'][str(n)][str(bs)][metric] = avg
for metric in res['metric_keys']:
if len(report['metrics'][metric]) != args.epochs:
raise ValueError('Wrong number epochs in report')
data = report['metrics'][metric][args.warmup:]
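# Throughput metrics (total_ips) are aggregated with a harmonic mean (total images / total time);
# the latency percentiles fall through to a plain arithmetic mean below.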
if metric in harmonic_mean_metrics:
avg = len(data) / sum(map(lambda x: 1 / x, data))
else:
avg = np.mean(data)
res['metrics'][str(n)][str(bs)][metric] = avg
except Exception as e:
traceback.print_exc()
for metric in res['metric_keys']:
res['metrics'][str(n)][str(bs)][metric] = float('nan')
column_len = 7
column_len = 11
for m in res['metric_keys']:
print(m, file=sys.stderr)
print(' ' * column_len, end='|', file=sys.stderr)
for bs in batch_sizes:
for bs in args.batch_sizes:
print(str(bs).center(column_len), end='|', file=sys.stderr)
print(file=sys.stderr)
print('-' * (len(batch_sizes) + 1) * (column_len + 1), file=sys.stderr)
for n in ngpus:
print('-' * (len(args.batch_sizes) + 1) * (column_len + 1), file=sys.stderr)
for n in args.ngpus:
print(str(n).center(column_len), end='|', file=sys.stderr)
for bs in batch_sizes:
print(str(round(res['metrics'][str(n)][str(bs)][m])).center(column_len), end='|', file=sys.stderr)
for bs in args.batch_sizes:
print('{:.5g}'.format(res['metrics'][str(n)][str(bs)][m]).center(column_len), end='|', file=sys.stderr)
print(file=sys.stderr)
print(file=sys.stderr)


@ -52,11 +52,14 @@ class BenchmarkingDataIter:
def __getattr__(self, attr):
return getattr(self.data_iter, attr)
def get_avg_time_and_clear(self):
def get_avg_time(self):
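# Average time per measured batch; with fewer than two batches there is nothing to report,
# and one batch is excluded from the denominator (presumably the first, warmup batch).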
if self.num <= 1:
avg = float('nan')
else:
avg = self.overall_time / (self.num - 1)
return avg
def reset(self):
self.overall_time = 0
self.num = 0
return avg
self.data_iter.reset()


@ -18,146 +18,166 @@ from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
from nvidia.dali.plugin.mxnet import DALIClassificationIterator
import horovod.mxnet as hvd
def add_dali_args(parser):
group = parser.add_argument_group('DALI', 'pipeline and augumentation')
group.add_argument('--use-dali', action='store_true',
help='use dalli pipeline and augunetation')
group.add_argument('--separ-val', action='store_true',
group = parser.add_argument_group('DALI data backend', 'entire group applies only to dali data backend')
group.add_argument('--dali-separ-val', action='store_true',
help='each process will perform independent validation on whole val-set')
group.add_argument('--dali-threads', type=int, default=3, help="number of threads" +\
"per GPU for DALI")
group.add_argument('--validation-dali-threads', type=int, default=10, help="number of threads" +\
group.add_argument('--dali-validation-threads', type=int, default=10, help="number of threads" +\
"per GPU for DALI for validation")
group.add_argument('--dali-prefetch-queue', type=int, default=3, help="DALI prefetch queue depth")
group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=16, help="Memory padding value for nvJPEG (in MB)")
group.add_argument('--dali-prefetch-queue', type=int, default=2, help="DALI prefetch queue depth")
group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=64, help="Memory padding value for nvJPEG (in MB)")
group.add_argument('--dali-fuse-decoder', type=int, default=1, help="0 or 1 whether to fuse decoder or not")
return parser
_mean_pixel = [255 * x for x in (0.485, 0.456, 0.406)]
_std_pixel = [255 * x for x in (0.229, 0.224, 0.225)]
class HybridTrainPipe(Pipeline):
def __init__(self, batch_size, num_threads, device_id, rec_path, idx_path,
shard_id, num_shards, crop_shape,
nvjpeg_padding, prefetch_queue=3,
output_layout=types.NCHW, pad_output=True, dtype='float16'):
super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id, prefetch_queue_depth = prefetch_queue)
self.input = ops.MXNetReader(path = [rec_path], index_path=[idx_path],
def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path,
shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3,
output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False):
super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, prefetch_queue_depth = prefetch_queue)
self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path],
random_shuffle=True, shard_id=shard_id, num_shards=num_shards)
self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB,
device_memory_padding = nvjpeg_padding,
host_memory_padding = nvjpeg_padding)
self.rrc = ops.RandomResizedCrop(device = "gpu", size = crop_shape)
self.cmnp = ops.CropMirrorNormalize(device = "gpu",
output_dtype = types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout = output_layout,
crop = crop_shape,
pad_output = pad_output,
image_type = types.RGB,
mean = _mean_pixel,
std = _std_pixel)
self.coin = ops.CoinFlip(probability = 0.5)
if dali_cpu:
dali_device = "cpu"
if args.dali_fuse_decoder:
self.decode = ops.HostDecoderRandomCrop(device=dali_device, output_type=types.RGB)
else:
self.decode = ops.HostDecoder(device=dali_device, output_type=types.RGB)
else:
dali_device = "gpu"
if args.dali_fuse_decoder:
self.decode = ops.nvJPEGDecoderRandomCrop(device="mixed", output_type=types.RGB,
device_memory_padding=nvjpeg_padding, host_memory_padding=nvjpeg_padding)
else:
self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB,
device_memory_padding=nvjpeg_padding, host_memory_padding=nvjpeg_padding)
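# With a fused decoder the random crop already happened inside the decode op, so a plain Resize to the
# target shape is enough; otherwise RandomResizedCrop performs the crop and resize on the decoded image.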
if args.dali_fuse_decoder:
self.resize = ops.Resize(device=dali_device, resize_x=crop_shape[1], resize_y=crop_shape[0])
else:
self.resize = ops.RandomResizedCrop(device=dali_device, size=crop_shape)
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout=output_layout, crop=crop_shape, pad_output=pad_output,
image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std)
self.coin = ops.CoinFlip(probability=0.5)
def define_graph(self):
rng = self.coin()
self.jpegs, self.labels = self.input(name = "Reader")
self.jpegs, self.labels = self.input(name="Reader")
images = self.decode(self.jpegs)
images = self.rrc(images)
output = self.cmnp(images, mirror = rng)
images = self.resize(images)
output = self.cmnp(images.gpu(), mirror=rng)
return [output, self.labels]
class HybridValPipe(Pipeline):
def __init__(self, batch_size, num_threads, device_id, rec_path, idx_path,
shard_id, num_shards, crop_shape,
nvjpeg_padding, prefetch_queue=3,
resize_shp=None,
output_layout=types.NCHW, pad_output=True, dtype='float16'):
super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id, prefetch_queue_depth = prefetch_queue)
self.input = ops.MXNetReader(path = [rec_path], index_path=[idx_path],
def __init__(self, args, batch_size, num_threads, device_id, rec_path, idx_path,
shard_id, num_shards, crop_shape, nvjpeg_padding, prefetch_queue=3, resize_shp=None,
output_layout=types.NCHW, pad_output=True, dtype='float16', dali_cpu=False):
super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id, prefetch_queue_depth=prefetch_queue)
self.input = ops.MXNetReader(path=[rec_path], index_path=[idx_path],
random_shuffle=False, shard_id=shard_id, num_shards=num_shards)
self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB,
device_memory_padding = nvjpeg_padding,
host_memory_padding = nvjpeg_padding)
self.resize = ops.Resize(device = "gpu", resize_shorter=resize_shp) if resize_shp else None
self.cmnp = ops.CropMirrorNormalize(device = "gpu",
output_dtype = types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout = output_layout,
crop = crop_shape,
pad_output = pad_output,
image_type = types.RGB,
mean = _mean_pixel,
std = _std_pixel)
if dali_cpu:
dali_device = "cpu"
self.decode = ops.HostDecoder(device=dali_device, output_type=types.RGB)
else:
dali_device = "gpu"
self.decode = ops.nvJPEGDecoder(device="mixed", output_type=types.RGB,
device_memory_padding=nvjpeg_padding,
host_memory_padding=nvjpeg_padding)
self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout=output_layout, crop=crop_shape, pad_output=pad_output,
image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std)
def define_graph(self):
self.jpegs, self.labels = self.input(name = "Reader")
self.jpegs, self.labels = self.input(name="Reader")
images = self.decode(self.jpegs)
if self.resize:
images = self.resize(images)
output = self.cmnp(images)
output = self.cmnp(images.gpu())
return [output, self.labels]
def get_rec_iter(args, kv=None):
# resize is default base length of shorter edge for dataset;
# all images will be reshaped to this size
resize = int(args.resize)
# target shape is final shape of images pipelined to network;
# all images will be cropped to this size
target_shape = tuple([int(l) for l in args.image_shape.split(',')])
pad_output = target_shape[0] == 4
gpus = list(map(int, filter(None, args.gpus.split(',')))) # filter out any empty strings
batch_size = args.batch_size//len(gpus)
def get_rec_iter(args, kv=None, dali_cpu=False):
gpus = args.gpus
num_threads = args.dali_threads
num_validation_threads = args.validation_dali_threads
#db_folder = "/data/imagenet/train-480-val-256-recordio/"
num_validation_threads = args.dali_validation_threads
pad_output = (args.image_shape[0] == 4)
# the input_layout w.r.t. the model is the output_layout of the image pipeline
output_layout = types.NHWC if args.input_layout == 'NHWC' else types.NCHW
rank = kv.rank if kv else 0
nWrk = kv.num_workers if kv else 1
if 'horovod' in args.kv_store:
rank = hvd.rank()
nWrk = hvd.size()
else:
rank = kv.rank if kv else 0
nWrk = kv.num_workers if kv else 1
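# Every GPU of every worker reads its own shard (gpus.index(gpu_id) + len(gpus)*rank out of
# len(gpus)*nWrk shards), and the global batch size is split evenly across workers and GPUs.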
trainpipes = [HybridTrainPipe(batch_size = batch_size,
batch_size = args.batch_size // nWrk // len(gpus)
trainpipes = [HybridTrainPipe(args = args,
batch_size = batch_size,
num_threads = num_threads,
device_id = gpu_id,
rec_path = args.data_train,
idx_path = args.data_train_idx,
shard_id = gpus.index(gpu_id) + len(gpus)*rank,
num_shards = len(gpus)*nWrk,
crop_shape = target_shape[1:],
crop_shape = args.image_shape[1:],
output_layout = output_layout,
pad_output = pad_output,
dtype = args.dtype,
pad_output = pad_output,
dali_cpu = dali_cpu,
nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024,
prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus]
valpipes = [HybridValPipe(batch_size = batch_size,
num_threads = num_validation_threads,
device_id = gpu_id,
rec_path = args.data_val,
idx_path = args.data_val_idx,
shard_id = 0 if args.separ_val
else gpus.index(gpu_id) + len(gpus)*rank,
num_shards = 1 if args.separ_val else len(gpus)*nWrk,
crop_shape = target_shape[1:],
resize_shp = resize,
output_layout = output_layout,
pad_output = pad_output,
dtype = args.dtype,
nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024,
prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus] if args.data_val else None
if args.data_val:
valpipes = [HybridValPipe(args = args,
batch_size = batch_size,
num_threads = num_validation_threads,
device_id = gpu_id,
rec_path = args.data_val,
idx_path = args.data_val_idx,
shard_id = 0 if args.dali_separ_val
else gpus.index(gpu_id) + len(gpus)*rank,
num_shards = 1 if args.dali_separ_val else len(gpus)*nWrk,
crop_shape = args.image_shape[1:],
resize_shp = args.data_val_resize,
output_layout = output_layout,
dtype = args.dtype,
pad_output = pad_output,
dali_cpu = dali_cpu,
nvjpeg_padding = args.dali_nvjpeg_memory_padding * 1024 * 1024,
prefetch_queue = args.dali_prefetch_queue) for gpu_id in gpus] if args.data_val else None
trainpipes[0].build()
if args.data_val:
valpipes[0].build()
worker_val_examples = valpipes[0].epoch_size("Reader")
if not args.dali_separ_val:
worker_val_examples = worker_val_examples // nWrk
if rank < valpipes[0].epoch_size("Reader") % nWrk:
worker_val_examples += 1
if args.num_examples < trainpipes[0].epoch_size("Reader"):
warnings.warn("{} training examples will be used, although full training set contains {} examples".format(args.num_examples, trainpipes[0].epoch_size("Reader")))
dali_train_iter = DALIClassificationIterator(trainpipes, args.num_examples // nWrk)
dali_val_iter = DALIClassificationIterator(valpipes, valpipes[0].epoch_size("Reader") // (1 if args.separ_val else nWrk), fill_last_batch = False) if args.data_val else None
return dali_train_iter, dali_val_iter
if args.data_val:
dali_val_iter = DALIClassificationIterator(valpipes, worker_val_examples, fill_last_batch = False) if args.data_val else None
else:
dali_val_iter = None
return dali_train_iter, dali_val_iter

View file

@ -1,7 +1,5 @@
# -----------------------------------------------------------------------
# Copyright 2017-2018 The Apache Software Foundation
#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@ -36,128 +34,61 @@
# limitations under the License.
import mxnet as mx
import mxnet.ndarray as nd
import random
import argparse
from mxnet.io import DataBatch, DataIter
import numpy as np
import horovod.mxnet as hvd
import dali
def add_data_args(parser):
data = parser.add_argument_group('Data', 'the input images')
def float_list(x):
return list(map(float, x.split(',')))
def int_list(x):
return list(map(int, x.split(',')))
data = parser.add_argument_group('Data')
data.add_argument('--data-train', type=str, help='the training data')
data.add_argument('--data-train-idx', type=str, default='', help='the index of training data')
data.add_argument('--data-val', type=str, help='the validation data')
data.add_argument('--data-val-idx', type=str, default='', help='the index of validation data')
data.add_argument('--rgb-mean', type=str, default='123.68,116.779,103.939',
data.add_argument('--data-pred', type=str, help='the image on which run inference (only for pred mode)')
data.add_argument('--data-backend', choices=('dali-gpu', 'dali-cpu', 'mxnet', 'synthetic'), default='dali-gpu',
help='set data loading & augmentation backend')
data.add_argument('--image-shape', type=int_list, default=[3, 224, 224],
help='the image shape fed into the network')
data.add_argument('--rgb-mean', type=float_list, default=[123.68, 116.779, 103.939],
help='a tuple of size 3 for the mean rgb')
data.add_argument('--rgb-std', type=str, default='1,1,1',
data.add_argument('--rgb-std', type=float_list, default=[58.393, 57.12, 57.375],
help='a tuple of size 3 for the std rgb')
data.add_argument('--pad-size', type=int, default=0,
help='padding the input image')
data.add_argument('--fill-value', type=int, default=127,
help='Set the padding pixels value to fill_value')
data.add_argument('--image-shape', type=str,
help='the image shape fed into the network, e.g. (3,224,224)')
data.add_argument('--num-classes', type=int, help='the number of classes')
data.add_argument('--num-examples', type=int, help='the number of training examples')
data.add_argument('--data-nthreads', type=int, default=4,
help='number of threads for data decoding')
data.add_argument('--benchmark-iters', type=int, default=None,
help='run only benchmark-iters iterations from each epoch')
data.add_argument('--input-layout', type=str, default='NCHW',
help='the layout of the input data (e.g. NCHW)')
data.add_argument('--conv-layout', type=str, default='NCHW',
help='the layout of the data assumed by the conv operation (e.g. NCHW)')
data.add_argument('--conv-algo', type=int, default=-1,
help='set the convolution algos (fwd, dgrad, wgrad)')
data.add_argument('--batchnorm-layout', type=str, default='NCHW',
help='the layout of the data assumed by the batchnorm operation (e.g. NCHW)')
data.add_argument('--batchnorm-eps', type=float, default=2e-5,
help='the amount added to the batchnorm variance to prevent output explosion.')
data.add_argument('--batchnorm-mom', type=float, default=0.9,
help='the leaky-integrator factor controlling the batchnorm mean and variance.')
data.add_argument('--pooling-layout', type=str, default='NCHW',
help='the layout of the data assumed by the pooling operation (e.g. NCHW)')
data.add_argument('--verbose', type=int, default=0,
help='turn on reporting of chosen algos for convolution, etc.')
data.add_argument('--seed', type=int, default=None,
help='set the seed for python, nd and mxnet rngs')
data.add_argument('--custom-bn-off', type=int, default=0,
help='disable use of custom batchnorm kernel')
data.add_argument('--fuse-bn-relu', type=int, default=0,
help='have batchnorm kernel perform activation relu')
data.add_argument('--fuse-bn-add-relu', type=int, default=0,
help='have batchnorm kernel perform add followed by activation relu')
data.add_argument('--force-tensor-core', type=int, default=0,
help='require conv algos to be tensor core')
data.add_argument('--input-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'),
help='the layout of the input data')
data.add_argument('--conv-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'),
help='the layout of the data assumed by the conv operation')
data.add_argument('--batchnorm-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'),
help='the layout of the data assumed by the batchnorm operation')
data.add_argument('--pooling-layout', type=str, default='NCHW', choices=('NCHW', 'NHWC'),
help='the layout of the data assumed by the pooling operation')
data.add_argument('--num-examples', type=int, default=1281167,
help="the number of training examples (doesn't work with mxnet data backend)")
data.add_argument('--data-val-resize', type=int, default=256,
help='base length of shorter edge for validation dataset')
return data
# Action to translate --set-resnet-aug flag to its component settings.
class SetResnetAugAction(argparse.Action):
def __init__(self, nargs=0, **kwargs):
if nargs != 0:
raise ValueError('nargs for SetResnetAug must be 0.')
super(SetResnetAugAction, self).__init__(nargs=nargs, **kwargs)
def __call__(self, parser, namespace, values, option_string=None):
# standard data augmentation setting for resnet training
setattr(namespace, 'random_crop', 1)
setattr(namespace, 'random_resized_crop', 1)
setattr(namespace, 'random_mirror', 1)
setattr(namespace, 'min_random_area', 0.08)
setattr(namespace, 'max_random_aspect_ratio', 4./3.)
setattr(namespace, 'min_random_aspect_ratio', 3./4.)
setattr(namespace, 'brightness', 0.4)
setattr(namespace, 'contrast', 0.4)
setattr(namespace, 'saturation', 0.4)
setattr(namespace, 'pca_noise', 0.1)
# record that this --set-resnet-aug 'macro arg' has been invoked
setattr(namespace, self.dest, 1)
# Similar to the above, but suitable for calling within a training script to set the defaults.
def set_resnet_aug(aug):
# standard data augmentation setting for resnet training
aug.set_defaults(random_crop=0, random_resized_crop=1)
aug.set_defaults(random_mirror=1)
aug.set_defaults(min_random_area=0.08)
aug.set_defaults(max_random_aspect_ratio=4./3., min_random_aspect_ratio=3./4.)
aug.set_defaults(brightness=0.4, contrast=0.4, saturation=0.4, pca_noise=0.1)
# Action to translate --set-data-aug-level <N> arg to its component settings.
class SetDataAugLevelAction(argparse.Action):
def __init__(self, option_strings, dest, nargs=None, **kwargs):
if nargs is not None:
raise ValueError("nargs not allowed")
super(SetDataAugLevelAction, self).__init__(option_strings, dest, **kwargs)
def __call__(self, parser, namespace, values, option_string=None):
level = values
# record that this --set-data-aug-level <N> 'macro arg' has been invoked
setattr(namespace, self.dest, level)
if level >= 1:
setattr(namespace, 'random_crop', 1)
setattr(namespace, 'random_mirror', 1)
if level >= 2:
setattr(namespace, 'max_random_h', 36)
setattr(namespace, 'max_random_s', 50)
setattr(namespace, 'max_random_l', 50)
if level >= 3:
setattr(namespace, 'max_random_rotate_angle', 10)
setattr(namespace, 'max_random_shear_ratio', 0.1)
setattr(namespace, 'max_random_aspect_ratio', 0.25)
# Similar to the above, but suitable for calling within a training script to set the defaults.
def set_data_aug_level(aug, level):
if level >= 1:
aug.set_defaults(random_crop=1, random_mirror=1)
if level >= 2:
aug.set_defaults(max_random_h=36, max_random_s=50, max_random_l=50)
if level >= 3:
aug.set_defaults(max_random_rotate_angle=10, max_random_shear_ratio=0.1, max_random_aspect_ratio=0.25)
def add_data_aug_args(parser):
aug = parser.add_argument_group(
'Image augmentations', 'implemented in src/io/image_aug_default.cc')
'MXNet data backend', 'entire group applies only to mxnet data backend')
aug.add_argument('--data-mxnet-threads', type=int, default=40,
help='number of threads for data decoding for mxnet data backend')
aug.add_argument('--random-crop', type=int, default=0,
help='whether or not to randomly crop the image')
aug.add_argument('--random-mirror', type=int, default=0,
aug.add_argument('--random-mirror', type=int, default=1,
help='whether or not to randomly flip the image horizontally')
aug.add_argument('--max-random-h', type=int, default=0,
help='max change of hue, whose range is [0, 180]')
@ -165,9 +96,9 @@ def add_data_aug_args(parser):
help='max change of saturation, whose range is [0, 255]')
aug.add_argument('--max-random-l', type=int, default=0,
help='max change of intensity, whose range is [0, 255]')
aug.add_argument('--min-random-aspect-ratio', type=float, default=None,
aug.add_argument('--min-random-aspect-ratio', type=float, default=0.75,
help='min value of aspect ratio, whose value is either None or a positive value.')
aug.add_argument('--max-random-aspect-ratio', type=float, default=0,
aug.add_argument('--max-random-aspect-ratio', type=float, default=1.33,
help='max value of aspect ratio. If min_random_aspect_ratio is None, '
'the aspect ratio range is [1-max_random_aspect_ratio, '
'1+max_random_aspect_ratio], otherwise it is '
@ -183,7 +114,7 @@ def add_data_aug_args(parser):
'otherwise use --pad-size')
aug.add_argument('--max-random-area', type=float, default=1,
help='max area to crop in random resized crop, whose range is [0, 1]')
aug.add_argument('--min-random-area', type=float, default=1,
aug.add_argument('--min-random-area', type=float, default=0.05,
help='min area to crop in random resized crop, whose range is [0, 1]')
aug.add_argument('--min-crop-size', type=int, default=-1,
help='Crop both width and height into a random size in '
@ -199,87 +130,200 @@ def add_data_aug_args(parser):
help='saturation jittering, whose range is [0, 1]')
aug.add_argument('--pca-noise', type=float, default=0,
help='pca noise, whose range is [0, 1]')
aug.add_argument('--random-resized-crop', type=int, default=0,
aug.add_argument('--random-resized-crop', type=int, default=1,
help='whether to use random resized crop')
aug.add_argument('--set-resnet-aug', action=SetResnetAugAction,
help='whether to employ standard resnet augmentations (see data.py)')
aug.add_argument('--set-data-aug-level', type=int, default=None, action=SetDataAugLevelAction,
help='set multiple data augmentations based on a `level` (see data.py)')
return aug
def get_data_loader(args):
if args.data_backend == 'dali-gpu':
return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, dali_cpu=False))
if args.data_backend == 'dali-cpu':
return (lambda *args, **kwargs: dali.get_rec_iter(*args, **kwargs, dali_cpu=True))
if args.data_backend == 'synthetic':
return get_synthetic_rec_iter
if args.data_backend == 'mxnet':
return get_rec_iter
raise ValueError('Wrong data backend')
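# DataGPUSplit wraps a single iterator and slices every batch across the given contexts, copying each slice
# to its GPU, casting it to the training dtype and recomputing the pad of the last, possibly incomplete batch.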
class DataGPUSplit:
def __init__(self, dataloader, ctx, dtype):
self.dataloader = dataloader
self.ctx = ctx
self.dtype = dtype
self.batch_size = dataloader.batch_size // len(ctx)
self._num_gpus = len(ctx)
def __iter__(self):
return DataGPUSplit(iter(self.dataloader), self.ctx, self.dtype)
def __next__(self):
data = next(self.dataloader)
ret = []
for i in range(len(self.ctx)):
start = i * len(data.data[0]) // len(self.ctx)
end = (i + 1) * len(data.data[0]) // len(self.ctx)
pad = max(0, min(data.pad - (len(self.ctx) - i - 1) * self.batch_size, self.batch_size))
ret.append(mx.io.DataBatch(
[data.data[0][start:end].as_in_context(self.ctx[i]).astype(self.dtype)],
[data.label[0][start:end].as_in_context(self.ctx[i])],
pad=pad))
return ret
def next(self):
return next(self)
def reset(self):
self.dataloader.reset()
def get_rec_iter(args, kv=None):
image_shape = tuple([int(l) for l in args.image_shape.split(',')])
if args.input_layout == 'NHWC':
image_shape = image_shape[1:] + (image_shape[0],)
if kv:
(rank, nworker) = (kv.rank, kv.num_workers)
gpus = args.gpus
if 'horovod' in args.kv_store:
rank = hvd.rank()
nworker = hvd.size()
gpus = [gpus[0]]
batch_size = args.batch_size // hvd.size()
else:
(rank, nworker) = (0, 1)
rgb_mean = [float(i) for i in args.rgb_mean.split(',')]
rgb_std = [float(i) for i in args.rgb_std.split(',')]
rank = kv.rank if kv else 0
nworker = kv.num_workers if kv else 1
batch_size = args.batch_size
if args.input_layout == 'NHWC':
raise ValueError('ImageRecordIter cannot handle layout {}'.format(args.input_layout))
train = mx.io.ImageRecordIter(
path_imgrec = args.data_train,
path_imgidx = args.data_train_idx,
label_width = 1,
mean_r = rgb_mean[0],
mean_g = rgb_mean[1],
mean_b = rgb_mean[2],
std_r = rgb_std[0],
std_g = rgb_std[1],
std_b = rgb_std[2],
data_name = 'data',
label_name = 'softmax_label',
data_shape = image_shape,
batch_size = args.batch_size,
rand_crop = args.random_crop,
max_random_scale = args.max_random_scale,
pad = args.pad_size,
fill_value = args.fill_value,
random_resized_crop = args.random_resized_crop,
min_random_scale = args.min_random_scale,
max_aspect_ratio = args.max_random_aspect_ratio,
min_aspect_ratio = args.min_random_aspect_ratio,
max_random_area = args.max_random_area,
min_random_area = args.min_random_area,
min_crop_size = args.min_crop_size,
max_crop_size = args.max_crop_size,
brightness = args.brightness,
contrast = args.contrast,
saturation = args.saturation,
pca_noise = args.pca_noise,
random_h = args.max_random_h,
random_s = args.max_random_s,
random_l = args.max_random_l,
max_rotate_angle = args.max_random_rotate_angle,
max_shear_ratio = args.max_random_shear_ratio,
rand_mirror = args.random_mirror,
preprocess_threads = args.data_nthreads,
shuffle = True,
num_parts = nworker,
part_index = rank)
train = DataGPUSplit(mx.io.ImageRecordIter(
path_imgrec = args.data_train,
path_imgidx = args.data_train_idx,
label_width = 1,
mean_r = args.rgb_mean[0],
mean_g = args.rgb_mean[1],
mean_b = args.rgb_mean[2],
std_r = args.rgb_std[0],
std_g = args.rgb_std[1],
std_b = args.rgb_std[2],
data_name = 'data',
label_name = 'softmax_label',
data_shape = args.image_shape,
batch_size = batch_size,
rand_crop = args.random_crop,
max_random_scale = args.max_random_scale,
random_resized_crop = args.random_resized_crop,
min_random_scale = args.min_random_scale,
max_aspect_ratio = args.max_random_aspect_ratio,
min_aspect_ratio = args.min_random_aspect_ratio,
max_random_area = args.max_random_area,
min_random_area = args.min_random_area,
min_crop_size = args.min_crop_size,
max_crop_size = args.max_crop_size,
brightness = args.brightness,
contrast = args.contrast,
saturation = args.saturation,
pca_noise = args.pca_noise,
random_h = args.max_random_h,
random_s = args.max_random_s,
random_l = args.max_random_l,
max_rotate_angle = args.max_random_rotate_angle,
max_shear_ratio = args.max_random_shear_ratio,
rand_mirror = args.random_mirror,
preprocess_threads = args.data_mxnet_threads,
shuffle = True,
num_parts = nworker,
part_index = rank,
seed = args.seed or '0',
), [mx.gpu(gpu) for gpu in gpus], args.dtype)
if args.data_val is None:
return (train, None)
val = mx.io.ImageRecordIter(
path_imgrec = args.data_val,
path_imgidx = args.data_val_idx,
label_width = 1,
mean_r = rgb_mean[0],
mean_g = rgb_mean[1],
mean_b = rgb_mean[2],
std_r = rgb_std[0],
std_g = rgb_std[1],
std_b = rgb_std[2],
data_name = 'data',
label_name = 'softmax_label',
batch_size = args.batch_size,
round_batch = False,
data_shape = image_shape,
preprocess_threads = args.data_nthreads,
rand_crop = False,
rand_mirror = False,
num_parts = nworker,
part_index = rank)
val = DataGPUSplit(mx.io.ImageRecordIter(
path_imgrec = args.data_val,
path_imgidx = args.data_val_idx,
label_width = 1,
mean_r = args.rgb_mean[0],
mean_g = args.rgb_mean[1],
mean_b = args.rgb_mean[2],
std_r = args.rgb_std[0],
std_g = args.rgb_std[1],
std_b = args.rgb_std[2],
data_name = 'data',
label_name = 'softmax_label',
batch_size = batch_size,
round_batch = False,
data_shape = args.image_shape,
preprocess_threads = args.data_mxnet_threads,
rand_crop = False,
rand_mirror = False,
num_parts = nworker,
part_index = rank,
resize = args.data_val_resize,
), [mx.gpu(gpu) for gpu in gpus], args.dtype)
return (train, val)
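# SyntheticDataIter builds one random batch per GPU up front and replays it max_iter times,
# so input pipeline cost is excluded from the measurement.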
class SyntheticDataIter(DataIter):
def __init__(self, num_classes, data_shape, max_iter, ctx, dtype):
self.batch_size = data_shape[0]
self.cur_iter = 0
self.max_iter = max_iter
self.dtype = dtype
label = np.random.randint(0, num_classes, [self.batch_size,])
data = np.random.uniform(-1, 1, data_shape)
self.data = []
self.label = []
self._num_gpus = len(ctx)
for dev in ctx:
self.data.append(mx.nd.array(data, dtype=self.dtype, ctx=dev))
self.label.append(mx.nd.array(label, dtype=self.dtype, ctx=dev))
def __iter__(self):
return self
def next(self):
self.cur_iter += 1
if self.cur_iter <= self.max_iter:
return [DataBatch(data=(data,), label=(label,), pad=0) for data, label in zip(self.data, self.label)]
else:
raise StopIteration
def __next__(self):
return self.next()
def reset(self):
self.cur_iter = 0
def get_synthetic_rec_iter(args, kv=None):
gpus = args.gpus
if 'horovod' in args.kv_store:
gpus = [gpus[0]]
batch_size = args.batch_size // hvd.size()
else:
batch_size = args.batch_size
if args.input_layout == 'NCHW':
data_shape = (batch_size, *args.image_shape)
elif args.input_layout == 'NHWC':
data_shape = (batch_size, *args.image_shape[1:], args.image_shape[0])
else:
raise ValueError('Wrong input layout')
train = SyntheticDataIter(args.num_classes, data_shape,
args.num_examples // args.batch_size,
[mx.gpu(gpu) for gpu in gpus], args.dtype)
if args.data_val is None:
return (train, None)
val = SyntheticDataIter(args.num_classes, data_shape,
args.num_examples // args.batch_size,
[mx.gpu(gpu) for gpu in gpus], args.dtype)
return (train, val)
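# Loads a single image for pred mode: resize to the network input size, normalize with the RGB mean/std,
# move to the target context, transpose to NCHW if required and append a zero channel when the model expects 4 input channels.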
def load_image(args, path, ctx=mx.cpu()):
image = mx.image.imread(path).astype('float32')
image = mx.image.imresize(image, *args.image_shape[1:])
image = (image - nd.array(args.rgb_mean)) / nd.array(args.rgb_std)
image = image.as_in_context(ctx)
if args.input_layout == 'NCHW':
image = image.transpose((2, 0, 1))
image = image.astype(args.dtype)
if args.image_shape[0] == 4:
dim = 0 if args.input_layout == 'NCHW' else 2
image = nd.concat(image, nd.zeros((1, *image.shape[1:]), dtype=image.dtype, ctx=image.context), dim=dim)
return image

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 inference benchmark in FP16 on 1 GPU with batch sizes 1,2,4,64,128,192,208
# Usage ./INFER_BENCHMARK_FP16.sh <additional flags>
python benchmark.py -n 1 -b 1,2,4,64,128,192,208 --only-inference -e 3 -w 1 -i 100 -o report.json $@

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP16 on 1 GPU using a batch size of 208 (208 per GPU)
# Usage ./RN50_FP16_1GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 1 -b 208 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP16 on 8 GPUs using a total batch size of 1664 (208 per GPU)
# Usage ./RN50_FP16_8GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 8 -b 208 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP32 on 1 GPU using a batch size of 96 (96 per GPU)
# Usage ./RN50_FP32_1GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 1 -b 96 --dtype float32 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP32 on 4 GPUs using a total batch size of 384 (96 per GPU)
# Usage ./RN50_FP32_4GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 4 -b 96 --dtype float32 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script launches ResNet50 training in FP32 on 8 GPUs using a total batch size of 768 (96 per GPU)
# Usage ./RN50_FP32_8GPU.sh <path to this repository> <additional flags>
"$1/runner" -n 8 -b 96 --dtype float32 --model-prefix model ${@:2}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script scores a ResNet50 checkpoint in FP16 on 1 GPU using a batch size of 128
# Usage ./SCORE_FP16.sh <model prefix> <epoch> <additional flags>
./runner -n 1 -b 128 --only-inference --model-prefix $1 --load-epoch $2 -e 1 ${@:3}

View file

@ -1,19 +0,0 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script scores a ResNet50 checkpoint in FP32 on 1 GPU using a batch size of 64
# Usage ./SCORE_FP32.sh <model prefix> <epoch> <additional flags>
./runner -n 1 -b 64 --dtype float32 --only-inference --model-prefix $1 --load-epoch $2 -e 1 ${@:3}

View file

@ -33,197 +33,408 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" example train fit utility """
""" train fit utility """
import logging
import os
import time
import re
import math
import sys
import random
from itertools import starmap
import numpy as np
import mxnet as mx
import mxnet.ndarray as nd
import horovod.mxnet as hvd
import mxnet.contrib.amp as amp
from mxnet import autograd as ag
from mxnet import gluon
from report import Report
from benchmarking import BenchmarkingDataIter
def get_epoch_size(args, kv):
return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size)
def _get_lr_scheduler(args, kv):
if 'lr_factor' not in args or args.lr_factor >= 1:
return (args.lr, None)
epoch_size = get_epoch_size(args, kv)
begin_epoch = args.load_epoch if args.load_epoch else 0
if 'pow' in args.lr_step_epochs:
lr = args.lr
max_up = args.num_epochs * epoch_size
pwr = float(re.sub('pow[- ]*', '', args.lr_step_epochs))
poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr)
return (lr, poly_sched)
step_epochs = [int(l) for l in args.lr_step_epochs.split(',')]
lr = args.lr
for s in step_epochs:
if begin_epoch >= s:
lr *= args.lr_factor
if lr != args.lr:
logging.info('Adjust learning rate to %e for epoch %d',
lr, begin_epoch)
steps = [epoch_size * (x - begin_epoch)
for x in step_epochs if x - begin_epoch > 0]
if steps:
if kv:
num_workers = kv.num_workers
else:
num_workers = 1
epoch_size = math.ceil(int(args.num_examples/num_workers)/args.batch_size)
return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor,
base_lr=args.lr, warmup_steps=epoch_size * args.warmup_epochs,
warmup_mode=args.warmup_strategy))
else:
return (lr, None)
def _load_model(args, rank=0):
if 'load_epoch' not in args or args.load_epoch is None:
return (None, None, None)
assert args.model_prefix is not None
model_prefix = args.model_prefix
if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)):
model_prefix += "-%d" % (rank)
sym, arg_params, aux_params = mx.model.load_checkpoint(
model_prefix, args.load_epoch)
logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch)
return (sym, arg_params, aux_params)
def _save_model(args, rank=0):
if args.model_prefix is None:
return None
return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % (
args.model_prefix, rank), period=args.save_period)
import data
def add_fit_args(parser):
"""
parser : argparse.ArgumentParser
return a parser added with args required by fit
"""
train = parser.add_argument_group('Training', 'model training')
train.add_argument('--num-layers', type=int,
help='number of layers in the neural network, \
required by some networks such as resnet')
train.add_argument('--gpus', type=str,
help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu')
train.add_argument('--kv-store', type=str, default='device',
def int_list(x):
return list(map(int, x.split(',')))
def float_list(x):
return list(map(float, x.split(',')))
train = parser.add_argument_group('Training')
train.add_argument('--mode', default='train_val', choices=('train_val', 'train', 'val', 'pred'),
help='mode')
train.add_argument('--seed', type=int, default=None,
help='random seed')
train.add_argument('--gpus', type=int_list, default=[0],
help='list of gpus to run, e.g. 0 or 0,2,5')
train.add_argument('--kv-store', type=str, default='device', choices=('device', 'horovod'),
help='key-value store type')
train.add_argument('--num-epochs', type=int, default=100,
help='max num of epochs')
train.add_argument('--dtype', type=str, default='float16', choices=('float32', 'float16'),
help='precision')
train.add_argument('--amp', action='store_true',
help='If enabled, turn on AMP (Automatic Mixed Precision)')
train.add_argument('--batch-size', type=int, default=192,
help='the batch size')
train.add_argument('--num-epochs', type=int, default=90,
help='number of epochs')
train.add_argument('--lr', type=float, default=0.1,
help='initial learning rate')
train.add_argument('--lr-factor', type=float, default=0.1,
train.add_argument('--lr-schedule', choices=('multistep', 'cosine'), default='cosine',
help='learning rate schedule')
train.add_argument('--lr-factor', type=float, default=0.256,
help='the ratio to reduce lr on each step')
train.add_argument('--lr-step-epochs', type=str,
train.add_argument('--lr-steps', type=float_list, default=[],
help='the epochs to reduce the lr, e.g. 30,60')
train.add_argument('--initializer', type=str, default='default',
help='the initializer type')
train.add_argument('--optimizer', type=str, default='sgd',
help='the optimizer type')
train.add_argument('--mom', type=float, default=0.9,
help='momentum for sgd')
train.add_argument('--wd', type=float, default=0.0001,
help='weight decay for sgd')
train.add_argument('--batch-size', type=int, default=208,
help='the batch size')
train.add_argument('--disp-batches', type=int, default=20,
help='show progress for every n batches')
train.add_argument('--model-prefix', type=str,
help='model prefix')
train.add_argument('--save-period', type=int, default=1, help='params saving period')
parser.add_argument('--monitor', dest='monitor', type=int, default=0,
help='log network parameters every N iters if larger than 0')
train.add_argument('--load-epoch', type=int,
help='load the model on an epoch using the model-load-prefix')
train.add_argument('--loss', type=str, default='',
help='show the cross-entropy or nll loss. ce stands for cross-entropy, nll-loss stands for negative log-likelihood loss')
train.add_argument('--test-io', type=int, default=0,
help='1 means test reading speed without training')
train.add_argument('--dtype', type=str, default='float16',
help='precision: float32 or float16')
train.add_argument('--gc-type', type=str, default='none',
help='type of gradient compression to use, \
takes `2bit` or `none` for now')
train.add_argument('--gc-threshold', type=float, default=0.5,
help='threshold for 2bit gradient compression')
# additional parameters for large batch sgd
train.add_argument('--macrobatch-size', type=int, default=0,
help='distributed effective batch size')
train.add_argument('--warmup-epochs', type=int, default=5,
help='the epochs to ramp-up lr to scaled large-batch value')
train.add_argument('--warmup-strategy', type=str, default='linear',
help='the ramping-up strategy for large batch sgd')
train.add_argument('--logging-dir', type=str, default='logs')
train.add_argument('--log', type=str, default='')
train.add_argument('--bn-gamma-init0', action='store_true')
train.add_argument('--epoch-size',type=int, default=0,
help='set number of batches in an epoch. useful for debugging')
#train.add_argument('--tensorboard', type=str, default='',
# help='log parameters to visualize in tensorboard every epoch. takes name to specify as tensorboard run. Empty means tensorboard logging is disabled')
train.add_argument('--profile-worker-suffix', type=str, default='',
help='profile worker actions into this file. During distributed training\
the saved filename will be rank1_ followed by this suffix')
train.add_argument('--profile-server-suffix', type=str, default='',
help='profile server actions into a file with name like rank1_ followed by this suffix \
during distributed training')
train.add_argument('--report', type=str, help='file where to save report')
train.add_argument('--only-inference', action='store_true', help='do not train, only inference (for benchmarking)')
train.add_argument('--optimizer', type=str, default='sgd',
help='the optimizer type')
train.add_argument('--mom', type=float, default=0.875,
help='momentum for sgd')
train.add_argument('--wd', type=float, default=1 / 32768,
help='weight decay for sgd')
train.add_argument('--label-smoothing', type=float, default=0.1,
help='label smoothing factor')
train.add_argument('--mixup', type=float, default=0,
help='alpha parameter for mixup (if 0 then mixup is not applied)')
train.add_argument('--disp-batches', type=int, default=20,
help='show progress for every n batches')
train.add_argument('--model-prefix', type=str, default='model',
help='model checkpoint prefix')
train.add_argument('--save-frequency', type=int, default=-1,
help='frequency of saving model in epochs (--model-prefix must be specified). '
'If -1 then save only best model. If 0 then do not save anything.')
train.add_argument('--begin-epoch', type=int, default=0,
help='start the model from an epoch')
train.add_argument('--load', help='checkpoint to load')
train.add_argument('--test-io', action='store_true',
help='test reading speed without training')
train.add_argument('--test-io-mode', default='train', choices=('train', 'val'),
help='data to test')
train.add_argument('--log', type=str, default='log.log',
help='file where to save the log from the experiment')
train.add_argument('--report', default='report.json', help='file where to save report')
train.add_argument('--no-metrics', action='store_true', help='do not calculate evaluation metrics (for benchmarking)')
train.add_argument('--benchmark-iters', type=int, default=None,
help='run only benchmark-iters iterations from each epoch')
return train
def get_epoch_size(args, kv):
return math.ceil(args.num_examples / args.batch_size)
def fit(args, network, data_loader, **kwargs):
def get_lr_scheduler(args):
def multistep_schedule(x):
lr = args.lr * (args.lr_factor ** (len(list(filter(lambda step: step <= x, args.lr_steps)))))
warmup_coeff = min(1, x / args.warmup_epochs)
return warmup_coeff * lr
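# Cosine schedule: linear warmup over the first warmup_epochs, then a cosine decay from one step boundary
# to the next, with the base lr additionally scaled by lr_factor**i after the i-th boundary.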
def cosine_schedule(x):
steps = args.lr_steps
if not steps or steps[0] > args.warmup_epochs:
steps = [args.warmup_epochs] + steps
elif not steps or steps[0] != 0:
steps = [0] + steps
if steps[-1] != args.num_epochs:
steps.append(args.num_epochs)
if x < args.warmup_epochs:
return args.lr * x / args.warmup_epochs
for i, (step, next_step) in enumerate(zip(steps, steps[1:])):
if next_step > x:
return args.lr * 0.5 * (1 + math.cos(math.pi * (x - step) / (next_step - step))) * (args.lr_factor ** i)
return 0
schedules = {
'multistep': multistep_schedule,
'cosine': cosine_schedule,
}
return schedules[args.lr_schedule]
def load_model(args, model):
if args.load is None:
return False
model.load_parameters(args.load)
logging.info('Loaded model {}'.format(args.load))
return True
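# Writes a checkpoint every save_frequency epochs and keeps a separate <prefix>_best.params copy whenever
# the validation top-1 improves; under Horovod only rank 0 saves.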
def save_checkpoint(net, epoch, top1, best_acc, model_prefix, save_frequency, kvstore):
if model_prefix is None or save_frequency == 0 or ('horovod' in kvstore and hvd.rank() != 0):
return
if save_frequency > 0 and (epoch + 1) % save_frequency == 0:
fname = '{}_{:04}.params'.format(model_prefix, epoch)
net.save_parameters(fname)
logging.info('[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(epoch, fname, top1))
if top1 > best_acc:
fname = '{}_best.params'.format(model_prefix)
net.save_parameters(fname)
logging.info('[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(epoch, fname, top1))
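# Records top-1/top-5, optional loss, latency percentiles and throughput; the first `warmup` measurements
# are dropped and total_ips = total_batch_size / mean latency.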
def add_metrics_to_report(report, mode, metric, durations, total_batch_size, loss=None, warmup=20):
if report is None:
return
top1 = metric.get('accuracy', None)
if top1 is not None:
report.add_value('{}.top1'.format(mode), top1)
top5 = metric.get('top_k_accuracy_5', None)
if top5 is not None:
report.add_value('{}.top5'.format(mode), top5)
if loss is not None:
report.add_value('{}.loss'.format(mode), loss.get_global()[1])
if len(durations) > warmup:
durations = durations[warmup:]
duration = np.mean(durations)
total_ips = total_batch_size / duration
report.add_value('{}.latency_avg'.format(mode), duration)
for percentile in [50, 90, 95, 99, 100]:
report.add_value('{}.latency_{}'.format(mode, percentile), np.percentile(durations, percentile))
report.add_value('{}.total_ips'.format(mode), total_ips)
def model_pred(args, model, image):
from imagenet_classes import classes
output = model(image.reshape(-1, *image.shape))[0].softmax().as_in_context(mx.cpu())
top = output.argsort(is_ascend=False)[:10]
for i, ind in enumerate(top):
ind = int(ind.asscalar())
logging.info('{:2d}. {:5.2f}% -> {}'.format(i + 1, output[ind].asscalar() * 100, classes[ind]))
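# Reduces metric values across Horovod workers with hvd.allreduce; returns the metrics unchanged for
# single-worker or non-horovod runs.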
def reduce_metrics(args, metrics, kvstore):
if 'horovod' not in kvstore or not metrics[0] or hvd.size() == 1:
return metrics
m = mx.ndarray.array(metrics[1], ctx=mx.gpu(args.gpus[0]))
reduced = hvd.allreduce(m)
values = reduced.as_in_context(mx.cpu()).asnumpy().tolist()
return (metrics[0], values)
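# Validation loop: waits on the previous iteration's outputs before launching the next forward pass,
# trims padded samples from the last batch, and reports the (possibly reduced) metric plus latency/throughput.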
def model_score(args, net, val_data, metric, kvstore, report=None):
if val_data is None:
logging.info('Omitting validation: no data')
return [], []
if not isinstance(metric, mx.metric.EvalMetric):
metric = mx.metric.create(metric)
metric.reset()
val_data.reset()
total_batch_size = val_data.batch_size * val_data._num_gpus * (hvd.size() if 'horovod' in kvstore else 1)
durations = []
tic = time.time()
outputs = []
for batches in val_data:
# synchronize to previous iteration
for o in outputs:
o.wait_to_read()
data = [b.data[0] for b in batches]
label = [b.label[0][:len(b.data[0]) - b.pad] for b in batches if len(b.data[0]) != b.pad]
outputs = [net(X) for X, b in zip(data, batches)]
outputs = [o[:len(b.data[0]) - b.pad] for o, b in zip(outputs, batches) if len(b.data[0]) != b.pad]
metric.update(label, outputs)
durations.append(time.time() - tic)
tic = time.time()
metric = reduce_metrics(args, metric.get_global(), kvstore)
add_metrics_to_report(report, 'val', dict(zip(*metric)), durations, total_batch_size)
return metric
class ScalarMetric(mx.metric.Loss):
def update(self, _, scalar):
self.sum_metric += scalar
self.global_sum_metric += scalar
self.num_inst += 1
self.global_num_inst += 1
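# Smoothed one-hot targets: the true class gets 1 - eta + eta/classes, every other class gets eta/classes.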
def label_smoothing(labels, classes, eta):
return labels.one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes)
def model_fit(args, net, train_data, eval_metric, optimizer,
optimizer_params, lr_scheduler, eval_data, kvstore, kv,
begin_epoch, num_epoch, model_prefix, report, print_loss):
if not isinstance(eval_metric, mx.metric.EvalMetric):
eval_metric = mx.metric.create(eval_metric)
loss_metric = ScalarMetric()
if 'horovod' in kvstore:
trainer = hvd.DistributedTrainer(net.collect_params(), optimizer, optimizer_params)
else:
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params,
kvstore=kv, update_on_kvstore=False)
if args.amp:
amp.init_trainer(trainer)
sparse_label_loss = (args.label_smoothing == 0 and args.mixup == 0)
loss = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
loss.hybridize(static_shape=True, static_alloc=True)
local_batch_size = train_data.batch_size
total_batch_size = local_batch_size * train_data._num_gpus * (hvd.size() if 'horovod' in kvstore else 1)
durations = []
epoch_size = get_epoch_size(args, kv)
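# mixup: blend each image (and its smoothed label) with the reversed batch using per-sample
# Beta(alpha, alpha) coefficients; with mixup disabled, only label smoothing is applied when requested.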
def transform_data(images, labels):
if args.mixup != 0:
coeffs = mx.nd.array(np.random.beta(args.mixup, args.mixup, size=images.shape[0])).as_in_context(images.context)
image_coeffs = coeffs.astype(images.dtype, copy=False).reshape(*coeffs.shape, 1, 1, 1)
ret_images = image_coeffs * images + (1 - image_coeffs) * images[::-1]
ret_labels = label_smoothing(labels, args.num_classes, args.label_smoothing)
label_coeffs = coeffs.reshape(*coeffs.shape, 1)
ret_labels = label_coeffs * ret_labels + (1 - label_coeffs) * ret_labels[::-1]
else:
ret_images = images
if not sparse_label_loss:
ret_labels = label_smoothing(labels, args.num_classes, args.label_smoothing)
else:
ret_labels = labels
return ret_images, ret_labels
best_accuracy = -1
for epoch in range(begin_epoch, num_epoch):
tic = time.time()
train_data.reset()
eval_metric.reset()
loss_metric.reset()
btic = time.time()
logging.info('Starting epoch {}'.format(epoch))
outputs = []
for i, batches in enumerate(train_data):
# synchronize to previous iteration
for o in outputs:
o.wait_to_read()
trainer.set_learning_rate(lr_scheduler(epoch + i / epoch_size))
data = [b.data[0] for b in batches]
label = [b.label[0].as_in_context(b.data[0].context) for b in batches]
orig_label = label
data, label = zip(*starmap(transform_data, zip(data, label)))
outputs = []
Ls = []
with ag.record():
for x, y in zip(data, label):
z = net(x)
L = loss(z, y)
# store the loss and do backward after we have done forward
# on all GPUs for better speed on multiple GPUs.
Ls.append(L)
outputs.append(z)
if args.amp:
with amp.scale_loss(Ls, trainer) as scaled_loss:
ag.backward(scaled_loss)
else:
ag.backward(Ls)
if 'horovod' in kvstore:
trainer.step(local_batch_size)
else:
trainer.step(total_batch_size)
if print_loss:
loss_metric.update(..., np.mean([l.asnumpy() for l in Ls]).item())
eval_metric.update(orig_label, outputs)
if args.disp_batches and not (i + 1) % args.disp_batches:
name, acc = eval_metric.get()
if print_loss:
name = [loss_metric.get()[0]] + name
acc = [loss_metric.get()[1]] + acc
logging.info('Epoch[{}] Batch [{}-{}]\tSpeed: {} samples/sec\tLR: {}\t{}'.format(
epoch, (i // args.disp_batches) * args.disp_batches, i,
args.disp_batches * total_batch_size / (time.time() - btic), trainer.learning_rate,
'\t'.join(list(map(lambda x: '{}: {:.6f}'.format(*x), zip(name, acc))))))
eval_metric.reset_local()
loss_metric.reset_local()
btic = time.time()
durations.append(time.time() - tic)
tic = time.time()
add_metrics_to_report(report, 'train', dict(eval_metric.get_global_name_value()), durations, total_batch_size, loss_metric if print_loss else None)
if args.mode == 'train_val':
logging.info('Validating epoch {}'.format(epoch))
score = model_score(args, net, eval_data, eval_metric, kvstore, report)
for name, value in zip(*score):
logging.info('Epoch[{}] Validation {:20}: {}'.format(epoch, name, value))
score = dict(zip(*score))
accuracy = score.get('accuracy', -1)
save_checkpoint(net, epoch, accuracy, best_accuracy, model_prefix, args.save_frequency, kvstore)
best_accuracy = max(best_accuracy, accuracy)
def fit(args, model, data_loader):
"""
train a model
args : argparse returns
network : the symbol definition of the neural network
model : the neural network model
data_loader : function that returns the train and val data iterators
"""
start_time = time.time()
report = Report(args.arch, len(args.gpus), sys.argv)
# select gpu for horovod process
if 'horovod' in args.kv_store:
hvd.init()
args.gpus = [args.gpus[hvd.local_rank()]]
if args.amp:
amp.init()
if args.seed is not None:
logging.info('Setting seeds to {}'.format(args.seed))
random.seed(args.seed)
np.random.seed(args.seed)
mx.random.seed(args.seed)
# kvstore
kv = mx.kvstore.create(args.kv_store)
if args.gc_type != 'none':
kv.set_gradient_compression({'type': args.gc_type,
'threshold': args.gc_threshold})
if args.profile_server_suffix:
mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server')
mx.profiler.set_state(state='run', profile_process='server')
if args.profile_worker_suffix:
if kv.num_workers > 1:
filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix
else:
filename = args.profile_worker_suffix
mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker')
mx.profiler.set_state(state='run', profile_process='worker')
# logging
head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)
logging.info('start with arguments %s', args)
epoch_size = get_epoch_size(args, kv)
# data iterators
(train, val) = data_loader(args, kv)
if 'dist' in args.kv_store and not 'async' in args.kv_store:
logging.info('Resizing training data to %d batches per machine', epoch_size)
# resize train iter to ensure each machine has same number of batches per epoch
# if not, dist_sync can hang at the end with one machine waiting for other machines
if not args.use_dali:
train = mx.io.ResizeIter(train, epoch_size)
if 'horovod' in args.kv_store:
kv = None
rank = hvd.rank()
num_workers = hvd.size()
else:
kv = mx.kvstore.create(args.kv_store)
rank = kv.rank
num_workers = kv.num_workers
if args.test_io:
train, val = data_loader(args, kv)
if args.test_io_mode == 'train':
data_iter = train
else:
data_iter = val
tic = time.time()
for i, batch in enumerate(train):
for i, batch in enumerate(data_iter):
if isinstance(batch, list):
for b in batch:
for j in b.data:
@ -232,232 +443,90 @@ def fit(args, network, data_loader, **kwargs):
for j in batch.data:
j.wait_to_read()
if (i + 1) % args.disp_batches == 0:
logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i,
args.disp_batches * args.batch_size / (time.time() - tic))
logging.info('Batch [{}]\tSpeed: {:.2f} samples/sec'.format(
i, args.disp_batches * args.batch_size / (time.time() - tic)))
tic = time.time()
return
# load model
if 'arg_params' in kwargs and 'aux_params' in kwargs:
arg_params = kwargs['arg_params']
aux_params = kwargs['aux_params']
else:
sym, arg_params, aux_params = _load_model(args, kv.rank)
# save model
checkpoint = _save_model(args, kv.rank)
epoch_end_callbacks = []
if checkpoint:
epoch_end_callbacks.append(checkpoint)
if not load_model(args, model):
# all initializers should be specified in the model definition.
# if not, this will raise an error
model.initialize(mx.init.Initializer())
# devices for training
devs = mx.cpu() if args.gpus is None or args.gpus == "" else [
mx.gpu(int(i)) for i in args.gpus.split(',')]
devs = list(map(mx.gpu, args.gpus))
model.collect_params().reset_ctx(devs)
if args.mode == 'pred':
logging.info('Inferring image {}'.format(args.data_pred))
model_pred(args, model, data.load_image(args, args.data_pred, devs[0]))
return
# learning rate
lr, lr_scheduler = _get_lr_scheduler(args, kv)
# create model
model = mx.mod.Module(
context=devs,
symbol=network
)
lr_scheduler = get_lr_scheduler(args)
optimizer_params = {
'learning_rate': lr,
'learning_rate': 0,
'wd': args.wd,
'lr_scheduler': lr_scheduler,
'multi_precision': True}
'multi_precision': True,
}
# Only a limited number of optimizers have 'momentum' property
has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'}
if args.optimizer in has_momentum:
optimizer_params['momentum'] = args.mom
monitor = mx.mon.Monitor(
args.monitor, pattern=".*") if args.monitor > 0 else None
# A limited number of optimizers have a warmup period
has_warmup = {'lbsgd', 'lbnag'}
if args.optimizer in has_warmup:
if 'dist' in args.kv_store:
nworkers = kv.num_workers
else:
nworkers = 1
epoch_size = args.num_examples / args.batch_size / nworkers
if epoch_size < 1:
epoch_size = 1
macrobatch_size = args.macrobatch_size
if macrobatch_size < args.batch_size * nworkers:
macrobatch_size = args.batch_size * nworkers
#batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999)
batch_scale = math.ceil(
float(macrobatch_size) / args.batch_size / nworkers)
optimizer_params['updates_per_epoch'] = epoch_size
optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0
optimizer_params['batch_scale'] = batch_scale
optimizer_params['warmup_strategy'] = args.warmup_strategy
optimizer_params['warmup_epochs'] = args.warmup_epochs
optimizer_params['num_epochs'] = args.num_epochs
if args.initializer == 'default':
initializer = mx.init.Xavier(
rnd_type='gaussian', factor_type="in", magnitude=2)
# initializer = mx.init.Xavier(factor_type="in", magnitude=2.34),
elif args.initializer == 'xavier':
initializer = mx.init.Xavier()
elif args.initializer == 'msra':
initializer = mx.init.MSRAPrelu()
elif args.initializer == 'orthogonal':
initializer = mx.init.Orthogonal()
elif args.initializer == 'normal':
initializer = mx.init.Normal()
elif args.initializer == 'uniform':
initializer = mx.init.Uniform()
elif args.initializer == 'one':
initializer = mx.init.One()
elif args.initializer == 'zero':
initializer = mx.init.Zero()
# evaluation metrices
if not args.no_metrics:
eval_metrics = ['crossentropy', 'accuracy']
eval_metrics = ['accuracy']
eval_metrics.append(mx.metric.create(
'top_k_accuracy', top_k=5))
else:
eval_metrics = []
supported_loss = ['ce', 'nll_loss']
if len(args.loss) > 0:
# ce or nll loss is only applicable to softmax output
loss_type_list = args.loss.split(',')
if 'softmax_output' in network.list_outputs():
for loss_type in loss_type_list:
loss_type = loss_type.strip()
if loss_type == 'nll':
loss_type = 'nll_loss'
if loss_type not in supported_loss:
logging.warning(loss_type + ' is not a valid loss type, only cross-entropy or ' \
'negative log-likelihood loss is supported!')
else:
eval_metrics.append(mx.metric.create(loss_type))
else:
logging.warning("The output is not softmax_output, loss argument will be skipped!")
# callbacks that run after each batch
batch_end_callbacks = []
batch_end_callbacks.append(mx.callback.Speedometer(
args.batch_size, args.disp_batches))
if 'batch_end_callback' in kwargs:
cbs = kwargs['batch_end_callback']
batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]
report = Report('resnet{}'.format(args.num_layers), len(args.gpus.split(',')), sys.argv)
train, val = data_loader(args, kv)
train = BenchmarkingDataIter(train, args.benchmark_iters)
val = BenchmarkingDataIter(val, args.benchmark_iters)
if val is not None:
val = BenchmarkingDataIter(val, args.benchmark_iters)
class Gatherer:
def __init__(self, report, mode, data_iter, total_bs=None):
self.report = report
self.mode = mode
self.total_bs = total_bs
self.data_iter = data_iter
self.clear()
def clear(self):
self.num = 0
self.top1 = 0
self.top5 = 0
self.loss = 0
self.time = 0
self.tic = 0
def gather_metrics(self, data):
params = dict(data.eval_metric.get_global_name_value())
if self.num != 0:
self.time += time.time() - self.tic
self.num += 1
if not args.no_metrics:
self.top1 = params['accuracy']
self.top5 = params['top_k_accuracy_5']
self.loss = params['cross-entropy']
self.tic = time.time()
def add_metrics(self, *a, **k):
top1 = self.top1 * 100
top5 = self.top5 * 100
loss = self.loss
if self.num <= 1:
time = float('nan')
else:
time = self.time / (self.num - 1)
data = self.data_iter.get_avg_time_and_clear()
if self.total_bs is not None:
compute_ips = self.total_bs / (time - data)
total_ips = self.total_bs / time
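# illustrative numbers (not from the source): with total_bs=1664 and an average iteration
# time of 0.5 s, of which 0.1 s is spent in the data pipeline, compute_ips = 1664 / 0.4 = 4160 img/s
# while total_ips = 1664 / 0.5 = 3328 img/s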
if not args.no_metrics:
self.report.add_value('{}.top1'.format(self.mode), top1)
self.report.add_value('{}.top5'.format(self.mode), top5)
self.report.add_value('{}.loss'.format(self.mode), loss)
self.report.add_value('{}.time'.format(self.mode), time)
# self.report.add_value('{}.data'.format(self.mode), data)
if self.total_bs is not None:
# self.report.add_value('{}.compute_ips'.format(self.mode), compute_ips)
self.report.add_value('{}.total_ips'.format(self.mode), total_ips)
self.clear()
def save_report(*a, **k):
report.set_total_duration(time.time() - start_time)
if args.report:
report.save(args.report)
train_gatherer = Gatherer(report, 'train', train, args.batch_size)
eval_gatherer = Gatherer(report, 'val', val, args.batch_size)
batch_end_callbacks = [train_gatherer.gather_metrics] + batch_end_callbacks
epoch_end_callbacks = [train_gatherer.add_metrics, save_report] + epoch_end_callbacks
eval_batch_end_callbacks = [eval_gatherer.gather_metrics]
eval_end_callbacks = [eval_gatherer.add_metrics, save_report]
if 'horovod' in args.kv_store:
# Fetch and broadcast parameters
params = model.collect_params()
if params is not None:
hvd.broadcast_parameters(params, root_rank=0)
# run
model.fit(train,
begin_epoch=args.load_epoch if args.load_epoch else 0,
num_epoch=args.num_epochs if not args.only_inference else 0,
eval_data=val,
eval_metric=eval_metrics,
kvstore=kv,
optimizer=args.optimizer,
optimizer_params=optimizer_params,
initializer=initializer,
arg_params=arg_params,
aux_params=aux_params,
batch_end_callback=batch_end_callbacks,
epoch_end_callback=epoch_end_callbacks,
eval_batch_end_callback=eval_batch_end_callbacks,
eval_end_callback=eval_end_callbacks,
allow_missing=True,
monitor=monitor)
if args.mode in ['train_val', 'train']:
model_fit(
args,
model,
train,
begin_epoch=args.begin_epoch,
num_epoch=args.num_epochs,
eval_data=val,
eval_metric=eval_metrics,
kvstore=args.kv_store,
kv=kv,
optimizer=args.optimizer,
optimizer_params=optimizer_params,
lr_scheduler=lr_scheduler,
report=report,
model_prefix=args.model_prefix,
print_loss=not args.no_metrics,
)
elif args.mode == 'val':
for epoch in range(args.num_epochs): # loop for benchmarking
score = model_score(args, model, val, eval_metrics, args.kv_store, report=report)
for name, value in zip(*score):
logging.info('Validation {:20}: {}'.format(name, value))
else:
raise ValueError('Wrong mode')
if args.only_inference:
for epoch in range(args.num_epochs):
score = model.score(val, eval_metrics, batch_end_callback=eval_batch_end_callbacks, score_end_callback=eval_end_callbacks, epoch=epoch)
print('-------------')
for name, value in score:
print('{}: {}'.format(name, value))
mx.nd.waitall()
if args.profile_server_suffix:
mx.profiler.set_state(state='run', profile_process='server')
if args.profile_worker_suffix:
mx.profiler.set_state(state='run', profile_process='worker')
report.set_total_duration(time.time() - start_time)
if args.report:
suffix = '-{}'.format(hvd.rank()) if 'horovod' in args.kv_store and hvd.rank() != 0 else ''
report.save(args.report + suffix)
save_report()
print('Experiment took: {} sec'.format(report.total_duration))
logging.info('Experiment took: {} sec'.format(report.total_duration))

File diff suppressed because it is too large

View file

Binary image files changed (image previews omitted).

View file

@ -0,0 +1,522 @@
# Copyright 2017-2018 The Apache Software Foundation
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -----------------------------------------------------------------------
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import mxnet as mx
from mxnet.gluon.block import HybridBlock
from mxnet.gluon import nn
def add_model_args(parser):
model = parser.add_argument_group('Model')
model.add_argument('--arch', default='resnetv15',
choices=['resnetv1', 'resnetv15',
'resnextv1', 'resnextv15',
'xception'],
help='model architecture')
model.add_argument('--num-layers', type=int, default=50,
help='number of layers in the neural network, \
required by some networks such as resnet')
model.add_argument('--num-groups', type=int, default=32,
help='number of groups for grouped convolutions, \
required by some networks such as resnext')
model.add_argument('--num-classes', type=int, default=1000,
help='the number of classes')
model.add_argument('--batchnorm-eps', type=float, default=1e-5,
help='the amount added to the batchnorm variance to prevent output explosion.')
model.add_argument('--batchnorm-mom', type=float, default=0.9,
help='the leaky-integrator factor controlling the batchnorm mean and variance.')
model.add_argument('--fuse-bn-relu', type=int, default=0,
help='have batchnorm kernel perform activation relu')
model.add_argument('--fuse-bn-add-relu', type=int, default=0,
help='have batchnorm kernel perform add followed by activation relu')
return model
class Builder:
def __init__(self, dtype, input_layout, conv_layout, bn_layout,
pooling_layout, bn_eps, bn_mom, fuse_bn_relu, fuse_bn_add_relu):
self.dtype = dtype
self.input_layout = input_layout
self.conv_layout = conv_layout
self.bn_layout = bn_layout
self.pooling_layout = pooling_layout
self.bn_eps = bn_eps
self.bn_mom = bn_mom
self.fuse_bn_relu = fuse_bn_relu
self.fuse_bn_add_relu = fuse_bn_add_relu
self.act_type = 'relu'
self.bn_gamma_initializer = lambda last: 'zeros' if last else 'ones'
self.linear_initializer = lambda groups=1: mx.init.Xavier(rnd_type='gaussian', factor_type="in",
magnitude=2 * (groups ** 0.5))
self.last_layout = self.input_layout
def copy(self):
return copy.copy(self)
def batchnorm(self, last=False):
gamma_initializer = self.bn_gamma_initializer(last)
bn_axis = 3 if self.bn_layout == 'NHWC' else 1
return self.sequence(
self.transpose(self.bn_layout),
nn.BatchNorm(axis=bn_axis, momentum=self.bn_mom, epsilon=self.bn_eps,
gamma_initializer=gamma_initializer,
running_variance_initializer=gamma_initializer)
)
def batchnorm_add_relu(self, last=False):
gamma_initializer = self.bn_gamma_initializer(last)
if self.fuse_bn_add_relu:
bn_axis = 3 if self.bn_layout == 'NHWC' else 1
return self.sequence(
self.transpose(self.bn_layout),
BatchNormAddRelu(axis=bn_axis, momentum=self.bn_mom,
epsilon=self.bn_eps, act_type=self.act_type,
gamma_initializer=gamma_initializer,
running_variance_initializer=gamma_initializer)
)
return NonFusedBatchNormAddRelu(self, last=last)
def batchnorm_relu(self, last=False):
gamma_initializer = self.bn_gamma_initializer(last)
if self.fuse_bn_relu:
bn_axis = 3 if self.bn_layout == 'NHWC' else 1
return self.sequence(
self.transpose(self.bn_layout),
nn.BatchNorm(axis=bn_axis, momentum=self.bn_mom,
epsilon=self.bn_eps, act_type=self.act_type,
gamma_initializer=gamma_initializer,
running_variance_initializer=gamma_initializer)
)
return self.sequence(self.batchnorm(last=last), self.activation())
def activation(self):
return nn.Activation(self.act_type)
def global_avg_pool(self):
return self.sequence(
self.transpose(self.pooling_layout),
nn.GlobalAvgPool2D(layout=self.pooling_layout)
)
def max_pool(self, pool_size, strides=1, padding=True):
padding = pool_size // 2 if padding is True else int(padding)
return self.sequence(
self.transpose(self.pooling_layout),
nn.MaxPool2D(pool_size, strides=strides, padding=padding,
layout=self.pooling_layout)
)
def conv(self, channels, kernel_size, padding=True, strides=1, groups=1, in_channels=0):
padding = kernel_size // 2 if padding is True else int(padding)
initializer = self.linear_initializer(groups=groups)
return self.sequence(
self.transpose(self.conv_layout),
nn.Conv2D(channels, kernel_size=kernel_size, strides=strides,
padding=padding, use_bias=False, groups=groups,
in_channels=in_channels, layout=self.conv_layout,
weight_initializer=initializer)
)
def separable_conv(self, channels, kernel_size, in_channels, padding=True, strides=1):
return self.sequence(
self.conv(in_channels, kernel_size, padding=padding,
strides=strides, groups=in_channels, in_channels=in_channels),
self.conv(channels, 1, in_channels=in_channels)
)
def dense(self, units, in_units=0):
return nn.Dense(units, in_units=in_units,
weight_initializer=self.linear_initializer())
def transpose(self, to_layout):
if self.last_layout == to_layout:
return None
ret = Transpose(self.last_layout, to_layout)
self.last_layout = to_layout
return ret
def sequence(self, *seq):
seq = list(filter(lambda x: x is not None, seq))
if len(seq) == 1:
return seq[0]
ret = nn.HybridSequential()
ret.add(*seq)
return ret
class Transpose(HybridBlock):
def __init__(self, from_layout, to_layout):
super().__init__()
supported_layouts = ['NCHW', 'NHWC']
if from_layout not in supported_layouts:
raise ValueError('Not prepared to handle layout: {}'.format(from_layout))
if to_layout not in supported_layouts:
raise ValueError('Not prepared to handle layout: {}'.format(to_layout))
self.from_layout = from_layout
self.to_layout = to_layout
def hybrid_forward(self, F, x):
# Insert transpose if from_layout and to_layout don't match
if self.from_layout == 'NCHW' and self.to_layout == 'NHWC':
return F.transpose(x, axes=(0, 2, 3, 1))
elif self.from_layout == 'NHWC' and self.to_layout == 'NCHW':
return F.transpose(x, axes=(0, 3, 1, 2))
else:
return x
def __repr__(self):
s = '{name}({content})'
if self.from_layout == self.to_layout:
content = 'passthrough ' + self.from_layout
else:
content = self.from_layout + ' -> ' + self.to_layout
return s.format(name=self.__class__.__name__,
content=content)
class LayoutWrapper(HybridBlock):
def __init__(self, op, io_layout, op_layout, **kwargs):
super(LayoutWrapper, self).__init__(**kwargs)
with self.name_scope():
self.layout1 = Transpose(io_layout, op_layout)
self.op = op
self.layout2 = Transpose(op_layout, io_layout)
def hybrid_forward(self, F, *x):
return self.layout2(self.op(*(self.layout1(y) for y in x)))
class BatchNormAddRelu(nn.BatchNorm):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self._kwargs.pop('act_type') != 'relu':
raise ValueError('BatchNormAddRelu can be used only with ReLU as activation')
def hybrid_forward(self, F, x, y, gamma, beta, running_mean, running_var):
return F.BatchNormAddRelu(data=x, addend=y, gamma=gamma, beta=beta,
moving_mean=running_mean, moving_var=running_var, name='fwd', **self._kwargs)
class NonFusedBatchNormAddRelu(HybridBlock):
def __init__(self, builder, **kwargs):
super().__init__()
self.bn = builder.batchnorm(**kwargs)
self.act = builder.activation()
def hybrid_forward(self, F, x, y):
return self.act(self.bn(x) + y)
# Blocks
class ResNetBasicBlock(HybridBlock):
def __init__(self, builder, channels, stride, downsample=False, in_channels=0,
version='1', resnext_groups=None, **kwargs):
super().__init__()
assert not resnext_groups
self.transpose = builder.transpose(builder.conv_layout)
builder_copy = builder.copy()
body = [
builder.conv(channels, 3, strides=stride, in_channels=in_channels),
builder.batchnorm_relu(),
builder.conv(channels, 3),
]
self.body = builder.sequence(*body)
self.bn_add_relu = builder.batchnorm_add_relu(last=True)
builder = builder_copy
if downsample:
self.downsample = builder.sequence(
builder.conv(channels, 1, strides=stride, in_channels=in_channels),
builder.batchnorm()
)
else:
self.downsample = None
def hybrid_forward(self, F, x):
if self.transpose is not None:
x = self.transpose(x)
residual = x
x = self.body(x)
if self.downsample:
residual = self.downsample(residual)
x = self.bn_add_relu(x, residual)
return x
class ResNetBottleNeck(HybridBlock):
def __init__(self, builder, channels, stride, downsample=False, in_channels=0,
version='1', resnext_groups=None):
super().__init__()
stride1 = stride if version == '1' else 1
stride2 = 1 if version == '1' else stride
mult = 2 if resnext_groups else 1
groups = resnext_groups or 1
self.transpose = builder.transpose(builder.conv_layout)
builder_copy = builder.copy()
body = [
builder.conv(channels * mult // 4, 1, strides=stride1, in_channels=in_channels),
builder.batchnorm_relu(),
builder.conv(channels * mult // 4, 3, strides=stride2),
builder.batchnorm_relu(),
builder.conv(channels, 1)
]
self.body = builder.sequence(*body)
self.bn_add_relu = builder.batchnorm_add_relu(last=True)
builder = builder_copy
if downsample:
self.downsample = builder.sequence(
builder.conv(channels, 1, strides=stride, in_channels=in_channels),
builder.batchnorm()
)
else:
self.downsample = None
def hybrid_forward(self, F, x):
if self.transpose is not None:
x = self.transpose(x)
residual = x
x = self.body(x)
if self.downsample:
residual = self.downsample(residual)
x = self.bn_add_relu(x, residual)
return x
class XceptionBlock(HybridBlock):
def __init__(self, builder, definition, in_channels, relu_at_beginning=True):
super().__init__()
self.transpose = builder.transpose(builder.conv_layout)
builder_copy = builder.copy()
body = []
if relu_at_beginning:
body.append(builder.activation())
last_channels = in_channels
for channels1, channels2 in zip(definition, definition[1:] + [0]):
if channels1 > 0:
body.append(builder.separable_conv(channels1, 3, in_channels=last_channels))
if channels2 > 0:
body.append(builder.batchnorm_relu())
else:
body.append(builder.batchnorm(last=True))
last_channels = channels1
else:
body.append(builder.max_pool(3, 2))
self.body = builder.sequence(*body)
builder = builder_copy
if any(map(lambda x: x <= 0, definition)):
self.shortcut = builder.sequence(
builder.conv(last_channels, 1, strides=2, in_channels=in_channels),
builder.batchnorm(),
)
else:
self.shortcut = builder.sequence()
def hybrid_forward(self, F, x):
return self.shortcut(x) + self.body(x)
# Nets
class ResNet(HybridBlock):
def __init__(self, builder, block, layers, channels, classes=1000,
version='1', resnext_groups=None):
super().__init__()
assert len(layers) == len(channels) - 1
self.version = version
with self.name_scope():
features = [
builder.conv(channels[0], 7, strides=2),
builder.batchnorm_relu(),
builder.max_pool(3, 2),
]
for i, num_layer in enumerate(layers):
stride = 1 if i == 0 else 2
features.append(self.make_layer(builder, block, num_layer, channels[i+1],
stride, in_channels=channels[i],
resnext_groups=resnext_groups))
features.append(builder.global_avg_pool())
self.features = builder.sequence(*features)
self.output = builder.dense(classes, in_units=channels[-1])
def make_layer(self, builder, block, layers, channels, stride,
in_channels=0, resnext_groups=None):
layer = []
layer.append(block(builder, channels, stride, channels != in_channels,
in_channels=in_channels, version=self.version,
resnext_groups=resnext_groups))
for _ in range(layers-1):
layer.append(block(builder, channels, 1, False, in_channels=channels,
version=self.version, resnext_groups=resnext_groups))
return builder.sequence(*layer)
def hybrid_forward(self, F, x):
x = self.features(x)
x = self.output(x)
return x
class Xception(HybridBlock):
def __init__(self, builder,
definition=([32, 64],
[[128, 128, 0], [256, 256, 0], [728, 728, 0],
*([[728, 728, 728]] * 8), [728, 1024, 0]],
[1536, 2048]),
classes=1000):
super().__init__()
definition1, definition2, definition3 = definition
with self.name_scope():
features = []
last_channels = 0
for i, channels in enumerate(definition1):
features += [
builder.conv(channels, 3, strides=(2 if i == 0 else 1), in_channels=last_channels),
builder.batchnorm_relu(),
]
last_channels = channels
for i, block_definition in enumerate(definition2):
features.append(XceptionBlock(builder, block_definition, in_channels=last_channels,
relu_at_beginning=False if i == 0 else True))
last_channels = list(filter(lambda x: x > 0, block_definition))[-1]
for i, channels in enumerate(definition3):
features += [
builder.separable_conv(channels, 3, in_channels=last_channels),
builder.batchnorm_relu(),
]
last_channels = channels
features.append(builder.global_avg_pool())
self.features = builder.sequence(*features)
self.output = builder.dense(classes, in_units=last_channels)
def hybrid_forward(self, F, x):
x = self.features(x)
x = self.output(x)
return x
resnet_spec = {18: (ResNetBasicBlock, [2, 2, 2, 2], [64, 64, 128, 256, 512]),
34: (ResNetBasicBlock, [3, 4, 6, 3], [64, 64, 128, 256, 512]),
50: (ResNetBottleNeck, [3, 4, 6, 3], [64, 256, 512, 1024, 2048]),
101: (ResNetBottleNeck, [3, 4, 23, 3], [64, 256, 512, 1024, 2048]),
152: (ResNetBottleNeck, [3, 8, 36, 3], [64, 256, 512, 1024, 2048])}
def create_resnet(builder, version, num_layers=50, resnext=False, classes=1000):
assert num_layers in resnet_spec, \
"Invalid number of layers: {}. Options are {}".format(
num_layers, str(resnet_spec.keys()))
block_class, layers, channels = resnet_spec[num_layers]
assert not resnext or num_layers >= 50, \
"Cannot create resnext with less then 50 layers"
net = ResNet(builder, block_class, layers, channels, version=version,
resnext_groups=args.num_groups if resnext else None)
return net
class fp16_model(mx.gluon.block.HybridBlock):
def __init__(self, net, **kwargs):
super(fp16_model, self).__init__(**kwargs)
with self.name_scope():
self._net = net
def hybrid_forward(self, F, x):
y = self._net(x)
y = F.cast(y, dtype='float32')
return y
def get_model(arch, num_classes, num_layers, image_shape, dtype, amp,
input_layout, conv_layout, batchnorm_layout, pooling_layout,
batchnorm_eps, batchnorm_mom, fuse_bn_relu, fuse_bn_add_relu, **kwargs):
builder = Builder(
dtype = dtype,
input_layout = input_layout,
conv_layout = conv_layout,
bn_layout = batchnorm_layout,
pooling_layout = pooling_layout,
bn_eps = batchnorm_eps,
bn_mom = batchnorm_mom,
fuse_bn_relu = fuse_bn_relu,
fuse_bn_add_relu = fuse_bn_add_relu,
)
if arch.startswith('resnet') or arch.startswith('resnext'):
version = '1' if arch in {'resnetv1', 'resnextv1'} else '1.5'
net = create_resnet(
builder = builder,
version = version,
resnext = arch.startswith('resnext'),
num_layers = num_layers,
classes = num_classes,
)
elif arch == 'xception':
net = Xception(builder, classes=num_classes)
else:
raise ValueError('Wrong model architecture')
net.hybridize(static_shape=True, static_alloc=True)
if not amp:
net.cast(dtype)
if dtype == 'float16':
net = fp16_model(net)
return net
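For orientation, a minimal sketch of how this factory might be called. The argument values are illustrative (mirroring the defaults exposed by add_model_args where possible), and the NHWC layouts assume the NVIDIA MXNet build this repository targets:
import mxnet as mx
net = get_model(arch='resnetv15', num_classes=1000, num_layers=50,
                image_shape='3,224,224', dtype='float16', amp=False,
                input_layout='NCHW', conv_layout='NHWC',
                batchnorm_layout='NHWC', pooling_layout='NHWC',
                batchnorm_eps=1e-5, batchnorm_mom=0.9,
                fuse_bn_relu=0, fuse_bn_add_relu=0)
# per-layer initializers are set inside the Builder, so a bare base Initializer suffices here
net.initialize(mx.init.Initializer(), ctx=mx.gpu(0))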

View file

@ -21,15 +21,21 @@
# - "metrics" : per epoch metrics for train and validation
# (some of below metrics may not exist in the report,
# depending on application arguments)
# - "train.top1" : training top1 accuracy in epoch.
# - "train.top5" : training top5 accuracy in epoch.
# - "train.loss" : training loss in epoch.
# - "train.time" : average training time of iteration in seconds.
# - "train.total_ips" : training speed (data and compute time taken into account) for epoch in images/sec.
# - "val.top1", "val.top5", "val.loss", "val.time", "val.total_ips" : the same but for validation.
# - "train.top1" : training top1 accuracy in epoch.
# - "train.top5" : training top5 accuracy in epoch.
# - "train.loss" : training loss in epoch.
# - "train.total_ips" : training speed (data and compute time taken into account) for epoch in images/sec.
# - "train.latency_avg" : average latency of one iteration in seconds.
# - "train.latency_50" : median latency of one iteration in seconds.
# - "train.latency_90" : 90th percentile latency of one iteration in seconds.
# - "train.latency_95" : 95th percentile latency of one iteration in seconds.
# - "train.latency_99" : 99th percentile latency of one iteration in seconds.
# - "train.latency_100" : highest observed latency of one iteration in seconds.
# - "val.top1", "val.top5", "val.time", "val.total_ips", "val.latency_avg", "val.latency_50",
# "val.latency_90", "val.latency_95", "val.latency_99", "val.latency_100" : the same but for validation.
import json
from collections import defaultdict, OrderedDict
from collections import OrderedDict
class Report:
def __init__(self, model_name, ngpus, cmd):
@ -37,15 +43,21 @@ class Report:
self.ngpus = ngpus
self.cmd = cmd
self.total_duration = 0
self.metrics = defaultdict(lambda: [])
self.metrics = OrderedDict()
def add_value(self, metric, value):
if metric not in self.metrics:
self.metrics[metric] = []
self.metrics[metric].append(value)
def set_total_duration(self, duration):
self.total_duration = duration
def save(self, filename):
with open(filename, 'w') as f:
f.write(self.get_report())
def get_report(self):
report = OrderedDict([
('model', self.model_name),
('ngpus', self.ngpus),
@ -53,5 +65,4 @@ class Report:
('cmd', self.cmd),
('metrics', self.metrics),
])
with open(filename, 'w') as f:
json.dump(report, f, indent=4)
return json.dumps(report, indent=4)
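A minimal usage sketch of the Report class described above (metric names and values are illustrative):
import sys
import time
report = Report('resnet50', ngpus=8, cmd=sys.argv)
start = time.time()
report.add_value('train.total_ips', 12345.6)   # one value is appended per epoch
report.add_value('val.top1', 0.761)
report.set_total_duration(time.time() - start)
report.save('report.json')                     # writes the OrderedDict above as indented JSON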

View file

@ -1,376 +0,0 @@
# Copyright 2017-2018 The Apache Software Foundation
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -----------------------------------------------------------------------
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
(Original author Wei Wu) by Antti-Pekka Hynninen
"Flexible Layout" (fl) version created by Dick Carter.
Implementing the original resnet ILSVRC 2015 winning network from:
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition"
'''
import mxnet as mx
import numpy as np
import random
# Transform a symbol from one layout to another, or do nothing if they have the same layout
def transform_layout(data, from_layout, to_layout):
supported_layouts = ['NCHW', 'NHWC']
if from_layout not in supported_layouts:
raise ValueError('Not prepared to handle layout: {}'.format(from_layout))
if to_layout not in supported_layouts:
raise ValueError('Not prepared to handle layout: {}'.format(to_layout))
# Insert transpose if from_layout and to_layout don't match
if from_layout == 'NCHW' and to_layout == 'NHWC':
return mx.sym.transpose(data, axes=(0, 2, 3, 1))
elif from_layout == 'NHWC' and to_layout == 'NCHW':
return mx.sym.transpose(data, axes=(0, 3, 1, 2))
else:
return data
# A BatchNorm wrapper that responds to the input layout
def batchnorm(data, io_layout, batchnorm_layout, **kwargs):
# Transpose as needed to batchnorm_layout
transposed_as_needed = transform_layout(data, io_layout, batchnorm_layout)
bn_axis = 3 if batchnorm_layout == 'NHWC' else 1
batchnormed = mx.sym.BatchNorm(data=transposed_as_needed, axis=bn_axis, **kwargs)
# Transpose back to i/o layout as needed
return transform_layout(batchnormed, batchnorm_layout, io_layout)
# A BatchNormAddRelu wrapper that responds to the input layout
def batchnorm_add_relu(data, addend, io_layout, batchnorm_layout, **kwargs):
# Transpose as needed to batchnorm_layout
transposed_data_as_needed = transform_layout(data, io_layout, batchnorm_layout)
transposed_addend_as_needed = transform_layout(addend, io_layout, batchnorm_layout)
bn_axis = 3 if batchnorm_layout == 'NHWC' else 1
batchnormed = mx.sym.BatchNormAddRelu(data=transposed_data_as_needed,
addend=transposed_addend_as_needed,
axis=bn_axis, **kwargs)
# Transpose back to i/o layout as needed
return transform_layout(batchnormed, batchnorm_layout, io_layout)
# A Pooling wrapper that responds to the input layout
def pooling(data, io_layout, pooling_layout, **kwargs):
# Pooling kernel, as specified by pooling_layout, may be in conflict with i/o layout.
transposed_as_needed = transform_layout(data, io_layout, pooling_layout)
pooled = mx.sym.Pooling(data=transposed_as_needed, layout=pooling_layout, **kwargs)
# Transpose back to i/o layout as needed
return transform_layout(pooled, pooling_layout, io_layout)
# Assumption is that data comes in and out in the 'conv_layout' format.
# If this format is different from the 'batchnorm_layout' format, then the batchnorm() routine
# will introduce transposes on both sides of the mx.sym.BatchNorm symbol
def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True,
workspace=256, memonger=False, conv_layout='NCHW', batchnorm_layout='NCHW',
verbose=False, cudnn_bn_off=False, bn_eps=2e-5, bn_mom=0.9, conv_algo=-1,
fuse_bn_relu=False, fuse_bn_add_relu=False, cudnn_tensor_core_only=False):
"""Return ResNet Unit symbol for building ResNet
Parameters
----------
data : str
Input data
num_filter : int
Number of output channels
bnf : int
Bottle neck channels factor with regard to num_filter
stride : tuple
Stride used in convolution
dim_match : Boolean
True means channel number between input and output is the same, otherwise means differ
name : str
Base name of the operators
workspace : int
Workspace used in convolution operator
"""
act = 'relu' if fuse_bn_relu else None
if bottle_neck:
conv1 = mx.sym.Convolution(data=data, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0),
no_bias=True, workspace=workspace, name=name + '_conv1', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
bn1 = batchnorm(data=conv1, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn1', cudnn_off=cudnn_bn_off, act_type=act)
act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') if not fuse_bn_relu else bn1
conv2 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(3,3), stride=stride, pad=(1,1),
no_bias=True, workspace=workspace, name=name + '_conv2', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
bn2 = batchnorm(data=conv2, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn2', cudnn_off=cudnn_bn_off, act_type=act)
act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') if not fuse_bn_relu else bn2
conv3 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
workspace=workspace, name=name + '_conv3', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
if dim_match:
shortcut = data
else:
conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
workspace=workspace, name=name+'_conv1sc', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
shortcut = batchnorm(data=conv1sc, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_sc', cudnn_off=cudnn_bn_off)
if memonger:
shortcut._set_attr(mirror_stage='True')
if fuse_bn_add_relu:
return batchnorm_add_relu(data=conv3, addend=shortcut, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn3', cudnn_off=cudnn_bn_off)
else:
bn3 = batchnorm(data=conv3, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name=name + '_bn3', cudnn_off=cudnn_bn_off)
return mx.sym.Activation(data=bn3 + shortcut, act_type='relu', name=name + '_relu3')
else:
conv1 = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1),
no_bias=True, workspace=workspace, name=name + '_conv1', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
bn1 = batchnorm(data=conv1, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn1', cudnn_off=cudnn_bn_off, act_type=act)
act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') if not fuse_bn_relu else bn1
conv2 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
no_bias=True, workspace=workspace, name=name + '_conv2', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
if dim_match:
shortcut = data
else:
conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
workspace=workspace, name=name+'_conv1sc', layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=cudnn_tensor_core_only)
shortcut = batchnorm(data=conv1sc, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_sc', cudnn_off=cudnn_bn_off)
if memonger:
shortcut._set_attr(mirror_stage='True')
if fuse_bn_add_relu:
return batchnorm_add_relu(data=conv2, addend=shortcut, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn2', cudnn_off=cudnn_bn_off)
else:
bn2 = batchnorm(data=conv2, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, momentum=bn_mom, eps=bn_eps, name=name + '_bn2', cudnn_off=cudnn_bn_off)
return mx.sym.Activation(data=bn2 + shortcut, act_type='relu', name=name + '_relu2')
def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, workspace=256, dtype='float32', memonger=False,
input_layout='NCHW', conv_layout='NCHW', batchnorm_layout='NCHW', pooling_layout='NCHW', verbose=False,
cudnn_bn_off=False, bn_eps=2e-5, bn_mom=0.9, conv_algo=-1,
fuse_bn_relu=False, fuse_bn_add_relu=False, force_tensor_core=False, use_dali=True):
"""Return ResNet symbol of
Parameters
----------
units : list
Number of units in each stage
num_stages : int
Number of stage
filter_list : list
Channel size of each stage
num_classes : int
Output size of symbol
dataset : str
Dataset type; only cifar10 and imagenet are supported
workspace : int
Workspace used in convolution operator
dtype : str
Precision (float32 or float16)
memonger : boolean
Activates "memory monger" to reduce the model's memory footprint
input_layout : str
interpretation (e.g. NCHW vs NHWC) of data provided by the i/o pipeline (may introduce transposes
if in conflict with 'layout' above)
conv_layout : str
interpretation (e.g. NCHW vs NHWC) of data for convolution operation.
batchnorm_layout : str
directs which kernel performs the batchnorm (may introduce transposes if in conflict with 'conv_layout' above)
pooling_layout : str
directs which kernel performs the pooling (may introduce transposes if in conflict with 'conv_layout' above)
"""
act = 'relu' if fuse_bn_relu else None
num_unit = len(units)
assert(num_unit == num_stages)
data = mx.sym.Variable(name='data')
if not use_dali:
# double buffering of data
if dtype == 'float32':
data = mx.sym.identity(data=data, name='id')
else:
if dtype == 'float16':
data = mx.sym.Cast(data=data, dtype=np.float16)
(nchannel, height, width) = image_shape
# Insert transpose as needed to get the input layout to match the desired processing layout
data = transform_layout(data, input_layout, conv_layout)
if height <= 32: # such as cifar10
body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1),
no_bias=True, name="conv0", workspace=workspace, layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=force_tensor_core)
# Is this BatchNorm supposed to be here?
body = batchnorm(data=body, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name='bn0', cudnn_off=cudnn_bn_off)
else: # often expected to be 224 such as imagenet
body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3),
no_bias=True, name="conv0", workspace=workspace, layout=conv_layout,
cudnn_algo_verbose=verbose,
cudnn_algo_fwd=conv_algo, cudnn_algo_bwd_data=conv_algo, cudnn_algo_bwd_filter=conv_algo,
cudnn_tensor_core_only=force_tensor_core)
body = batchnorm(data=body, io_layout=conv_layout, batchnorm_layout=batchnorm_layout,
fix_gamma=False, eps=bn_eps, momentum=bn_mom, name='bn0', cudnn_off=cudnn_bn_off, act_type=act)
if not fuse_bn_relu:
body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
body = pooling(data=body, io_layout=conv_layout, pooling_layout=pooling_layout,
kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max')
for i in range(num_stages):
body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False,
name='stage%d_unit%d' % (i + 1, 1),
bottle_neck=bottle_neck, workspace=workspace,
memonger=memonger, conv_layout=conv_layout, batchnorm_layout=batchnorm_layout,
verbose=verbose, cudnn_bn_off=cudnn_bn_off, bn_eps=bn_eps, bn_mom=bn_mom,
conv_algo=conv_algo, fuse_bn_relu=fuse_bn_relu, fuse_bn_add_relu=fuse_bn_add_relu,
cudnn_tensor_core_only=force_tensor_core)
for j in range(units[i]-1):
body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2),
bottle_neck=bottle_neck, workspace=workspace,
memonger=memonger, conv_layout=conv_layout, batchnorm_layout=batchnorm_layout,
verbose=verbose, cudnn_bn_off=cudnn_bn_off, bn_eps = bn_eps, bn_mom=bn_mom,
conv_algo=conv_algo, fuse_bn_relu=fuse_bn_relu, fuse_bn_add_relu=fuse_bn_add_relu,
cudnn_tensor_core_only=force_tensor_core)
# bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
# relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
# Although the kernel size is not used when global_pool=True, one must still be provided
pool1 = pooling(data=body, io_layout=conv_layout, pooling_layout=pooling_layout,
global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1')
flat = mx.sym.Flatten(data=pool1)
fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1', cublas_algo_verbose=verbose)
if dtype == 'float16':
fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
return mx.sym.SoftmaxOutput(data=fc1, name='softmax')
def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32',
input_layout='NCHW', conv_layout='NCHW', batchnorm_layout='NCHW', pooling_layout='NCHW',
verbose=False, seed=None, cudnn_bn_off=False, batchnorm_eps=2e-5, batchnorm_mom=0.9,
conv_algo=-1, fuse_bn_relu=False, fuse_bn_add_relu=False, force_tensor_core=False, use_dali=True, **kwargs):
"""
Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
(Original author Wei Wu) by Antti-Pekka Hynninen
Implementing the original resnet ILSVRC 2015 winning network from:
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition"
"""
if seed is not None:
print('Setting seeds to %s' % (seed,))
random.seed(seed)
np.random.seed(seed)
mx.random.seed(seed)
image_shape = [int(l) for l in image_shape.split(',')]
(nchannel, height, width) = image_shape
if height <= 28:
num_stages = 3
if (num_layers-2) % 9 == 0 and num_layers >= 164:
per_unit = [(num_layers-2)//9]
filter_list = [16, 64, 128, 256]
bottle_neck = True
elif (num_layers-2) % 6 == 0 and num_layers < 164:
per_unit = [(num_layers-2)//6]
filter_list = [16, 16, 32, 64]
bottle_neck = False
else:
raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
units = per_unit * num_stages
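# e.g. (illustrative): num_layers=110 -> (110-2) % 6 == 0, so per_unit = [18] and units = [18, 18, 18]; num_layers=164 -> per_unit = [(164-2)//9] = [18] with bottleneck units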
else:
if num_layers >= 50:
filter_list = [64, 256, 512, 1024, 2048]
bottle_neck = True
else:
filter_list = [64, 64, 128, 256, 512]
bottle_neck = False
num_stages = 4
if num_layers == 18:
units = [2, 2, 2, 2]
elif num_layers == 34:
units = [3, 4, 6, 3]
elif num_layers == 50:
units = [3, 4, 6, 3]
elif num_layers == 101:
units = [3, 4, 23, 3]
elif num_layers == 152:
units = [3, 8, 36, 3]
elif num_layers == 200:
units = [3, 24, 36, 3]
elif num_layers == 269:
units = [3, 30, 48, 8]
else:
raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
return resnet(units = units,
num_stages = num_stages,
filter_list = filter_list,
num_classes = num_classes,
image_shape = image_shape,
bottle_neck = bottle_neck,
workspace = conv_workspace,
dtype = dtype,
input_layout = input_layout,
conv_layout = conv_layout,
batchnorm_layout = batchnorm_layout,
pooling_layout = pooling_layout,
verbose = verbose,
cudnn_bn_off = cudnn_bn_off,
bn_eps = batchnorm_eps,
bn_mom = batchnorm_mom,
conv_algo = conv_algo,
fuse_bn_relu = fuse_bn_relu,
fuse_bn_add_relu = fuse_bn_add_relu,
force_tensor_core = force_tensor_core,
use_dali = use_dali)
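For reference, a sketch of how this (now removed) symbolic entry point was typically driven; values are illustrative and the NHWC layouts assume the NVIDIA MXNet build:
import mxnet as mx
sym = get_symbol(num_classes=1000, num_layers=50, image_shape='3,224,224',
                 dtype='float16', input_layout='NCHW', conv_layout='NHWC',
                 batchnorm_layout='NHWC', pooling_layout='NHWC')
mod = mx.mod.Module(symbol=sym, context=[mx.gpu(0)])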

View file

@ -14,77 +14,56 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os, socket
from argparse import ArgumentParser
import warnings
import os
import argparse
from pathlib import Path
optparser = ArgumentParser(description="train resnet50 with MXNet")
optparser.add_argument("-n", "--n-GPUs", type=int, default=8, help="number of GPUs to use; " +\
"default = 8")
optparser.add_argument("-b", "--batch-size", type=int, default=208, help="batch size per GPU; " +\
"default = 208")
optparser.add_argument("-e", "--num-epochs", type=int, default=90, help="number of epochs; " +\
"default = 90")
optparser.add_argument("-l", "--lr", type=float, default=0.1, help="learning rate; default = 0.1; " +\
"IMPORTANT: true learning rate will be calculated as `lr * batch_size/256`")
optparser.add_argument("--no-val", action="store_true",
help="if set no validation will be performed")
optparser.add_argument("--no-dali", action="store_true", default=False,
help="use default MXNet pipeline instead of DALI")
optparser.add_argument("--data-root", type=str, help="Directory with RecordIO data files", default="/data/imagenet/train-val-recordio-passthrough")
optparser.add_argument("--data-nthreads", type=int, help="number of threads for data loading; default = 40", default=40)
optparser.add_argument("--dtype", type=str, help="Precision, float16 or float32", default="float16")
optparser = argparse.ArgumentParser(description='Train classification models on ImageNet',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
optparser.add_argument('-n', '--ngpus', type=int, default=1, help='number of GPUs to use')
optparser.add_argument('-b', '--batch-size', type=int, default=192, help='batch size per GPU')
optparser.add_argument('-e', '--num-epochs', type=int, default=90, help='number of epochs')
optparser.add_argument('-l', '--lr', type=float, default=0.256, help='learning rate; '
'IMPORTANT: true learning rate will be calculated as `lr * batch_size / 256`')
optparser.add_argument('--data-root', type=Path, help='Directory with RecordIO data files', default=Path('/data/imagenet/train-val-recordio-passthrough'))
optparser.add_argument('--dtype', help='Precision', default='float16', choices=('float32', 'float16'))
optparser.add_argument('--kv-store', default='horovod', choices=('device', 'horovod'), help='key-value store type')
optparser.add_argument('--data-backend', default='dali-gpu', choices=('dali-gpu', 'dali-cpu', 'mxnet', 'synthetic'), help='data backend')
opts, args = optparser.parse_known_args()
if opts.dtype == "float16":
n_ch = str(4 - int(opts.no_dali))
if opts.dtype == 'float16':
n_ch = str(4 - int(opts.data_backend == 'mxnet'))
else:
n_ch = str(3)
opts.batch_size *= opts.n_GPUs
opts.batch_size *= opts.ngpus
opts.lr *= opts.batch_size / 256
opts.lr *= opts.batch_size/256
command = ""
command += "python "+os.path.dirname(__file__)+"/train.py"
command += " --num-layers 50"
command += " --data-train " + opts.data_root + "/train.rec"
command += " --data-train-idx " + opts.data_root + "/train.idx"
if not opts.no_val:
command += " --data-val " + opts.data_root + "/val.rec"
command += " --data-val-idx " + opts.data_root + "/val.idx"
command += " --data-nthreads " + str(opts.data_nthreads)
command += " --optimizer sgd --dtype " + opts.dtype
command += " --lr-step-epochs 30,60,80 --max-random-area 1"
command += " --min-random-area 0.05 --max-random-scale 1"
command += " --min-random-scale 1 --min-random-aspect-ratio 0.75"
command += " --max-random-aspect-ratio 1.33 --max-random-shear-ratio 0"
command += " --max-random-rotate-angle 0 --random-resized-crop 1"
command += " --random-crop 0 --random-mirror 1"
command += " --image-shape "+n_ch+",224,224 --warmup-epochs 5"
command += " --disp-batches 20"
command += " --batchnorm-mom 0.9 --batchnorm-eps 1e-5"
command = []
if 'horovod' in opts.kv_store:
command += ['horovodrun', '-np', str(opts.ngpus)]
command += ['python', str(Path(__file__).parent / "train.py")]
command += ['--data-train', str(opts.data_root / "train.rec")]
command += ['--data-train-idx', str(opts.data_root / "train.idx")]
command += ['--data-val', str(opts.data_root / "val.rec")]
command += ['--data-val-idx', str(opts.data_root / "val.idx")]
command += ['--dtype', opts.dtype]
command += ['--image-shape', n_ch + ',224,224']
if opts.dtype == 'float16':
command += " --fuse-bn-relu 1"
command += " --input-layout NHWC --conv-layout NHWC"
command += " --batchnorm-layout NHWC --pooling-layout NHWC"
command += " --conv-algo 1 --force-tensor-core 1"
command += " --fuse-bn-add-relu 1"
command += '--fuse-bn-relu 1 --fuse-bn-add-relu 1'.split()
command += '--input-layout NCHW --conv-layout NHWC ' \
'--batchnorm-layout NHWC --pooling-layout NHWC'.split()
command += " --kv-store device"
if not opts.no_dali:
command += " --use-dali"
command += " --dali-prefetch-queue 2 --dali-nvjpeg-memory-padding 64"
command += " --lr "+str(opts.lr)
command += " --gpus " + str(list(range(opts.n_GPUs))).replace(' ', '').replace('[', '').replace(']', '')
command += " --batch-size " + str(opts.batch_size)
command += " --num-epochs " + str(opts.num_epochs)
command += ['--kv-store', opts.kv_store]
command += ['--data-backend', opts.data_backend]
command += ['--lr', str(opts.lr)]
command += ['--gpus', ','.join(list(map(str, range(opts.ngpus))))]
command += ['--batch-size', str(opts.batch_size)]
command += ['--num-epochs', str(opts.num_epochs)]
command += args
for arg in args:
command += " " + arg
os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
@ -92,5 +71,11 @@ os.environ['MXNET_USE_TENSORRT'] = "0"
os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54"
os.environ['HOROVOD_CYCLE_TIME'] = "0.1"
os.environ['HOROVOD_FUSION_THRESHOLD'] = "67108864"
os.environ['HOROVOD_NUM_NCCL_STREAMS'] = "2"
os.environ['MXNET_HOROVOD_NUM_GROUPS'] = "16"
os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD'] = "999"
os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD'] = "25"
exit(os.system('/bin/bash -c "'+command+'"'))
os.execvp(command[0], command)

View file

@ -1,3 +1,4 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -12,8 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -lt 2 ] ; then
echo "usage: $0 raw_dataset prepared_dataset"
exit 1
fi
# This script launches the ResNet50 benchmark in FP16 on 1, 4 and 8 GPUs with batch sizes 64, 128, 192 and 208
# Usage: ./BENCHMARK_FP16.sh <additional flags>
python benchmark.py -n 1,4,8 -b 64,128,192,208 -e 2 -w 1 -i 100 -o report.json $@
cd "$2" &&
python /opt/mxnet/tools/im2rec.py --list --recursive train "$1/train" &&
python /opt/mxnet/tools/im2rec.py --list --recursive val "$1/val" &&
python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 train "$1/train" &&
python /opt/mxnet/tools/im2rec.py --pass-through --num-thread 40 val "$1/val" &&
echo "Dataset was prepared succesfully!"

View file

@ -34,58 +34,37 @@
# limitations under the License.
import os
import sys
import argparse
import logging
logging.basicConfig(level=logging.DEBUG)
import data, dali, fit
import mxnet as mx
import numpy as np
def set_imagenet_aug(aug):
# standard data augmentation setting for imagenet training
aug.set_defaults(rgb_mean='123.68,116.779,103.939', rgb_std='58.393,57.12,57.375')
aug.set_defaults(random_crop=0, random_resized_crop=1, random_mirror=1)
aug.set_defaults(min_random_area=0.08)
aug.set_defaults(max_random_aspect_ratio=4./3., min_random_aspect_ratio=3./4.)
aug.set_defaults(brightness=0.4, contrast=0.4, saturation=0.4, pca_noise=0.1)
import data, dali
import fit
import models
if __name__ == '__main__':
# parse args
parser = argparse.ArgumentParser(description="train resnet on imagenet",
def parse_args():
parser = argparse.ArgumentParser(description="Train classification models on ImageNet",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
models.add_model_args(parser)
fit.add_fit_args(parser)
data.add_data_args(parser)
dali.add_dali_args(parser)
data.add_data_aug_args(parser)
# Instead, to get standard resnet augmentation on a per-use basis, invoke as in:
# train_imagenet.py --set-resnet-aug ...
# Finally, to get the legacy MXNet v1.2 training settings on a per-use basis, invoke as in:
# train_imagenet.py --set-data-aug-level 3
parser.set_defaults(
# network
num_layers = 50,
return parser.parse_args()
# data
resize = 256,
num_classes = 1000,
num_examples = 1281167,
image_shape = '3,224,224',
min_random_scale = 1, # if input image has min size k, suggest to use
# 256.0/x, e.g. 0.533 for 480
# train
num_epochs = 90,
lr_step_epochs = '30,60,80',
dtype = 'float32'
)
args = parser.parse_args()
def setup_logging(args):
head = '{asctime}:{levelname}: {message}'
logging.basicConfig(level=logging.DEBUG, format=head, style='{',
handlers=[logging.StreamHandler(sys.stderr), logging.FileHandler(args.log)])
logging.info('Start with arguments {}'.format(args))
if not args.use_dali:
data.set_data_aug_level(parser, 0)
if __name__ == '__main__':
args = parse_args()
setup_logging(args)
# load network
import resnet as net
sym = net.get_symbol(**vars(args))
model = models.get_model(**vars(args))
data_loader = data.get_data_loader(args)
# train
fit.fit(args, sym, dali.get_rec_iter)
fit.fit(args, model, data_loader)

PyTorch/Detection/SSD/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
**/__pycache__

View file

@ -1,11 +1,11 @@
FROM nvcr.io/nvidia/pytorch:19.05-py3
FROM nvcr.io/nvidia/pytorch:19.08-py3
# Set working directory
WORKDIR /workspace
ENV PYTHONPATH "${PYTHONPATH}:/workspace"
RUN apt-get update && apt-get install -y python3-tk python-pip git tmux htop tree
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y python3-tk python-pip git tmux htop tree
# Necessary pip packages
RUN pip install --upgrade pip

View file

@ -242,11 +242,11 @@ The following section lists the requirements in order to start training the SSD3
### Requirements
This repository contains `Dockerfile` which extends the PyTorch 19.06 NGC container
This repository contains `Dockerfile` which extends the PyTorch 19.08 NGC container
and encapsulates some dependencies. Aside from these dependencies,
ensure you have the following software:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.06-py3+ NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* [PyTorch 19.08-py3+ NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
@ -256,7 +256,7 @@ Documentation:
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the [PyTorch 19.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch),
For those unable to use the [PyTorch 19.08-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch),
to set up the required environment or create your own container,
see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
@ -537,9 +537,9 @@ The flag `--save` flag enables storing checkpoints after each epoch under `./mod
Our scripts for SSD300 v1.1 present two ways to run inference.
To get meaningful results, you need a pre-trained model checkpoint.
One way is to run an interactive session on Jupyter notebook, as described in a [Quick Start Guide](#8-start-inferencepredictions).
One way is to run an interactive session in a Jupyter notebook, as described in the 8th step of the [Quick Start Guide](#quick-start-guide).
Another way is to run a script `src/SSD300_inference.py`. It contains the logic from the notebook, wrapped into a Python script. The script contains sample usage.
Another way is to run the script `examples/SSD300_inference.py`. It contains the logic from the notebook wrapped into a Python script, together with sample usage.
To use the inference example script in your own code, you can call the `main` function, providing input image URIs as an argument. The result will be a list of detections for each input image.
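As a rough illustration (the exact signature is documented by the sample usage inside `examples/SSD300_inference.py`; the import assumes the script's directory is on `PYTHONPATH`, and the image URL is only a placeholder):

```python
from SSD300_inference import main

# one list of detections is returned per input image URI
detections = main(['http://example.com/street_scene.jpg'])
print(detections[0])
```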
@ -597,16 +597,18 @@ The following sections provide details on how we achieved our performance and ac
##### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `./examples/SSD300_FP{16,32}_{1,4,8}GPU.sh`
script in the `pytorch-19.06-py3` NGC container on NVIDIA DGX-1 with 8x
script in the `pytorch-19.08-py3` NGC container on NVIDIA DGX-1 with 8x
V100 16G GPUs. Performance numbers (in items/images per second) were averaged
over an entire training epoch.
| **Number of GPUs** | **Mixed precision mAP** | **Training time with mixed precision** | **FP32 mAP** | **Training time with FP32** |
|:------------------:|:------------------------:|:-------------------------------------:|:------------:|:---------------------------:|
| 1 | 0.2494 | 10h 39min | 0.2483 | 21h 40min |
| 4 | 0.2495 | 2h 53min | 0.2478 | 5h 52min |
| 8 | 0.2489 | 1h 31min | 0.2475 | 2h 54min |
|GPUs |Batch size / GPU|Accuracy - FP32|Accuracy - mixed precision|Time to train - FP32|Time to train - mixed precision|Time to train speedup (FP32 to mixed precision)|
|-----------|----------------|---------------|---------------------------|--------------------|--------------------------------|------------------------------------------------|
|1 |32 |0.250 |0.250 |20:20:13 |10:23:46 |195.62% |
|4 |32 |0.249 |0.250 |5:11:17 |2:39:28 |195.20% |
|8 |32 |0.250 |0.250 |2:37:35 |1:25:38 |184.01% |
|1 |64 |<N/A> |0.252 |<N/A> |9:27:33 |215.00% |
|4 |64 |<N/A> |0.251 |<N/A> |2:24:43 |215.10% |
|8 |64 |<N/A> |0.252 |<N/A> |1:13:01 |215.85% |
Here are example graphs of FP32 and FP16 training on 8 GPU configuration:
@ -620,15 +622,18 @@ Here are example graphs of FP32 and FP16 training on 8 GPU configuration:
##### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `main.py` script with the `--mode
benchmark-training` flag in the `pytorch-19.06-py3` NGC container on NVIDIA
benchmark-training` flag in the `pytorch-19.08-py3` NGC container on NVIDIA
DGX-1 with 8x V100 16G GPUs. Performance numbers (in items/images per second)
were averaged over an entire training epoch.
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision img/s (median)** | **FP32 img/s (median)** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with mixed precision** | **Multi-gpu weak scaling with FP32** |
|:------------------:|:----------------------:|:----------------------------------:|:-----------------------:|:---------------------------------:|:-----------------------------------------------:|:------------------------------------:|
| 1 | 32 | 217.052 | 102.495 | 2.12 | 1.00 | 1.00 |
| 4 | 32 | 838.457 | 397.797 | 2.11 | 3.86 | 3.88 |
| 8 | 32 | 1639.843 | 789.695 | 2.08 | 7.56 | 7.70 |
|GPUs |Batch size / GPU|Throughput - FP32|Throughput - mixed precision|Throughput speedup (FP32 - mixed precision)|Weak scaling - FP32 |Weak scaling - mixed precision |
|-----------|----------------|-----------------|-----------------------------|-------------------------------------------|--------------------------------|------------------------------------------------|
|1 |32 |133.67 |215.30 |161.07% |100.00% |100.00% |
|4 |32 |532.05 |828.63 |155.74% |398.04% |384.88% |
|8 |32 |1,060.33 |1,647.74 |155.40% |793.27% |765.33% |
|1 |64 |<N/A> |232.22 |173.73% |<N/A> |100.00% |
|4 |64 |<N/A> |910.77 |171.18% |<N/A> |392.20% |
|8 |64 |<N/A> |1,769.48 |166.88% |<N/A> |761.99% |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
@ -638,16 +643,16 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
##### NVIDIA DGX-1 (1x V100 16G)
Our results were obtained by running the `main.py` script with `--mode
benchmark-inference` flag in the pytorch-19.06-py3 NGC container on NVIDIA
benchmark-inference` flag in the pytorch-19.08-py3 NGC container on NVIDIA
DGX-1 with (1x V100 16G) GPUs.
| **Batch size** | **Mixed precision img/s (median)** | **FP32 img/s (median)** |
|:--------------:|:----------------------------------:|:-----------------------:|
| 2 | 163.12 | 147.91 |
| 4 | 296.60 | 201.62 |
| 8 | 412.52 | 228.16 |
| 16 | 470.10 | 280.57 |
| 32 | 520.54 | 302.43 |
|Batch size |Throughput - FP32|Throughput - mixed precision|Throughput speedup (FP32 - mixed precision)|Weak scaling - FP32 |Weak scaling - mixed precision |
|-----------|-----------------|-----------------------------|-------------------------------------------|--------------------|--------------------------------|
|2 |148.99 |186.60 |125.24% |100.00% |100.00% |
|4 |203.35 |326.69 |160.66% |136.48% |175.08% |
|8 |227.32 |433.45 |190.68% |152.57% |232.29% |
|16 |278.02 |493.19 |177.39% |186.60% |264.31% |
|32 |299.81 |545.84 |182.06% |201.23% |292.53% |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
@ -655,6 +660,13 @@ To achieve these same results, follow the [Quick Start Guide](#quick-start-guide
### Changelog
August 2019
* upgrade the PyTorch container to 19.08
* update Results section in the README
* code updated to use DALI 0.12.0
* checkpoint loading fix
* fixed links in the README
July 2019
* script and notebook for inference
* use AMP instead of hand-crafted FP16 support
@ -666,7 +678,7 @@ July 2019
March 2019
* Initial release
### Known issues
## Known issues
There are no known issues with this model.

View file

@ -1,6 +1,6 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View file

@ -1,6 +1,6 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View file

@ -1,6 +1,6 @@
/******************************************************************************
*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import skimage

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import numpy as np
@ -10,7 +24,6 @@ from src.utils import dboxes300_coco, Encoder
def load_checkpoint(model, model_file):
cp = torch.load(model_file)['model']
cp = { k.replace('module.1.', ''): cp[k] for k in cp }
model.load_state_dict(cp)

File diff suppressed because one or more lines are too long

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
from argparse import ArgumentParser
@ -157,13 +171,12 @@ def train(train_loop_func, logger, args):
if args.checkpoint is not None:
if os.path.isfile(args.checkpoint):
load_checkpoint(ssd300, args.checkpoint)
load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint)
checkpoint = torch.load(args.checkpoint,
map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
start_epoch = checkpoint['epoch']
iteration = checkpoint['iteration']
scheduler.load_state_dict(checkpoint['scheduler'])
ssd300.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
else:
print('Provided checkpoint is not path to a file')

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

PyTorch/Detection/SSD/src/coco.py Executable file → Normal file
View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__author__ = 'tylin'
__version__ = '2.0'
# Interface for accessing the Microsoft COCO dataset.

View file

@ -1,4 +1,4 @@
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -43,7 +43,7 @@ class COCOPipeline(Pipeline):
self.input = ops.COCOReader(file_root = file_root, annotations_file = annotations_file,
shard_id = shard_id, num_shards = num_gpus, ratio=True, ltrb=True, random_shuffle=True,
skip_empty=True)
self.decode = ops.HostDecoder(device = "cpu", output_type = types.RGB)
self.decode = ops.ImageDecoder(device = "cpu", output_type = types.RGB)
# Augumentation techniques
self.crop = ops.SSDRandomCrop(device="cpu", num_attempts=1)
@ -163,7 +163,7 @@ class DALICOCOIterator(object):
for p in self._pipes:
p._prefetch()
for p in self._pipes:
outputs.append(p._share_outputs())
outputs.append(p.share_outputs())
for i in range(self._num_gpus):
dev_id = self._pipes[i].device_id
out_images = []
@ -237,8 +237,8 @@ class DALICOCOIterator(object):
pyt_offsets[j] = torch.IntTensor(bbox_offsets[j])
for p in self._pipes:
p._release_outputs()
p._run()
p.release_outputs()
p.schedule_run()
copy_db_index = self._current_data_batch
# Change index for double buffering

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
@ -18,7 +32,7 @@ def get_train_loader(args, local_seed):
output_fp16=args.amp, output_nhwc=False,
pad_output=False, seed=local_seed)
train_pipe.build()
test_run = train_pipe.run()
test_run = train_pipe.schedule_run(), train_pipe.share_outputs(), train_pipe.release_outputs()
train_loader = DALICOCOIterator(train_pipe, 118287 / args.N_gpu)
return train_loader

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import time
import numpy as np

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torchvision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch.autograd import Variable
import torch
import time

View file

@ -1,3 +1,17 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torchvision.transforms as transforms
import torch.utils.data as data

View file

@ -1,8 +1,20 @@
data/download/
data/extracted/
data/formatted_one_article_per_line/
data/sharded/
data/hdf5/
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
data/download
data/extracted
data/formatted_one_article_per_line
data/sharded
data/hdf5
vocab/
results/
checkpoints/*
results/

View file

@ -8,14 +8,11 @@ __pycache__/
# C extensions
*.so
#Data
#Data checkpoints and results
data/*/*/
data/*/*.zip
data/*
#checkpoints and results
checkpoints/*
results/*
checkpoints/
results/
# Distribution / packaging
.Python

View file

@ -1,24 +1,22 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.07-py3
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.08-py3
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
ENV BERT_PREP_WORKING_DIR /workspace/bert/data
WORKDIR /opt
RUN rm -rf /opt/pytorch/apex ; \
git clone https://github.com/NVIDIA/apex.git pytorch/apex ; \
cd pytorch/apex ; \
pip uninstall --yes apex; \
git checkout 880ab925bce9f817a93988b021e12db5f67f7787; \
git pull; \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
#WORKDIR /opt
#RUN cd pytorch/apex \
# && git fetch origin pull/334/head:multi_tensor_lamb_optimizer \
# && git checkout multi_tensor_lamb_optimizer \
# && python setup.py develop --cuda_ext --cpp_ext
WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git
RUN git clone https://github.com/soskek/bookcorpus.git

View file

@ -1,4 +1,3 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
@ -176,6 +175,8 @@
END OF TERMS AND CONDITIONS
Copyright 2019 NVIDIA CORPORATION. All rights reserved.
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following

View file

@ -1,8 +1,8 @@
# BERT For PyTorch
This repository provides a script and recipe to train the BERT model to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
This repository provides a script and recipe to train the BERT model for PyTorch to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
**Table Of Contents**
## Table Of Contents
- [Model overview](#model-overview)
* [Model architecture](#model-architecture)
@ -11,6 +11,7 @@ This repository provides a script and recipe to train the BERT model to achieve
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Glossary](#glossary)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
@ -18,14 +19,12 @@ This repository provides a script and recipe to train the BERT model to achieve
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Pre-training parameters](#pre-training-parameters)
* [Multi-node](#multi-node)
* [Fine-tuning parameters](#fine-tuning-parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Relocating hdf5 files](#relocating-hdf5-files)
* [Inter sequence-pair mixing](#inter-sequence-pair-mixing)
* [Retaining document-level granularity](#retaining-document-level-granularity)
* [Training process](#training-process)
* [Pre-training](#pre-training)
* [Fine-tuning](#fine-tuning)
@ -43,31 +42,34 @@ This repository provides a script and recipe to train the BERT model to achieve
* [Training stability test](#training-stability-test)
* [Pre-training stability test](#pre-training-stability-test)
* [Fine-tuning stability test](#fine-tuning-stability-test)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Pre-training NVIDIA DGX-1 With 16G](#pre-training-nvidia-dgx-1-with-16g)
* [Fine-tuning NVIDIA DGX-1 With 16G](#fine-tuning-nvidia-dgx-1-with-16g)
* [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
* [Pre-training NVIDIA DGX-1 With 32G](#pre-training-nvidia-dgx-1-with-32g)
* [Fine-tuning NVIDIA DGX-1 With 32G](#fine-tuning-nvidia-dgx-1-with-32g)
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
* [Pre-training NVIDIA DGX-2 With 32G](#pre-training-nvidia-dgx-2-with-32g)
* [Fine-tuning NVIDIA DGX-2 With 32G](#fine-tuning-nvidia-dgx-2-with-32g)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
* [Pre-training inference on NVIDIA DGX-1 with 16G](#pre-training-inference-on-nvidia-dgx-1-with-16g)
* [Fine-tuning inference on NVIDIA DGX-1 with 16G](#fine-tuning-inference-on-nvidia-dgx-1-with-16g)
* [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
* [Pre-training inference on NVIDIA DGX-1 with 32G](#pre-training-inference-on-nvidia-dgx-1-with-32g)
* [Fine-tuning inference on NVIDIA DGX-1 with 32G](#fine-tuning-inference-on-nvidia-dgx-1-with-32g)
* [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
* [Pre-training inference on NVIDIA DGX-2 with 32G](#pre-training-inference-on-nvidia-dgx-2-with-32g)
* [Fine-tuning inference on NVIDIA DGX-2 with 32G](#fine-tuning-inference-on-nvidia-dgx-2-with-32g)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Pre-training NVIDIA DGX-1 With 16G](#pre-training-nvidia-dgx-1-with-16g)
* [Pre-training on multiple NVIDIA DGX-1 With 16G](#pre-training-on-multiple-nvidia-dgx-1-with-16g)
* [Fine-tuning NVIDIA DGX-1 With 16G](#fine-tuning-nvidia-dgx-1-with-16g)
* [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
* [Pre-training NVIDIA DGX-1 With 32G](#pre-training-nvidia-dgx-1-with-32g)
* [Fine-tuning NVIDIA DGX-1 With 32G](#fine-tuning-nvidia-dgx-1-with-32g)
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
* [Pre-training NVIDIA DGX-2 With 32G](#pre-training-nvidia-dgx-2-with-32g)
* [Pre-training on multiple NVIDIA DGX-2H With 32G](#pre-training-on-multiple-nvidia-dgx-2h-with-32g)
* [Fine-tuning NVIDIA DGX-2 With 32G](#fine-tuning-nvidia-dgx-2-with-32g)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
* [Pre-training inference on NVIDIA DGX-1 with 16G](#pre-training-inference-on-nvidia-dgx-1-with-16g)
* [Fine-tuning inference on NVIDIA DGX-1 with 16G](#fine-tuning-inference-on-nvidia-dgx-1-with-16g)
* [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32g)
* [Pre-training inference on NVIDIA DGX-1 with 32G](#pre-training-inference-on-nvidia-dgx-1-with-32g)
* [Fine-tuning inference on NVIDIA DGX-1 with 32G](#fine-tuning-inference-on-nvidia-dgx-1-with-32g)
* [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32g)
* [Pre-training inference on NVIDIA DGX-2 with 32G](#pre-training-inference-on-nvidia-dgx-2-with-32g)
* [Fine-tuning inference on NVIDIA DGX-2 with 32G](#fine-tuning-inference-on-nvidia-dgx-2-with-32g)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's implementation of BERT is an optimized version of the [Hugging Face implementation](https://github.com/huggingface/pytorch-pretrained-BERT), leveraging mixed precision arithmetic and Tensor Cores on V100 GPUs for faster training times while maintaining target accuracy.
@ -75,22 +77,25 @@ BERT, or Bidirectional Encoder Representations from Transformers, is a new metho
The repository also contains scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for both pre-training and fine-tuning for tasks such as question answering. The major differences between the original implementation of the paper and this version of BERT are as follows:
- Scripts to download Wikipedia and BookCorpus datasets
- Scripts to preprocess downloaded data or a custom corpus into inputs and targets for pre-training in a modular fashion.
- Scripts to preprocess downloaded data or a custom corpus into inputs and targets for pre-training in a modular fashion
- Fused [LAMB](https://arxiv.org/pdf/1904.00962.pdf) optimizer to support training with larger batches
- Fused Adam optimizer for fine tuning tasks
- Fused CUDA kernels for better LayerNorm performance
- Automatic Mixed precision training support
- Automatic mixed precision (AMP) training support
- Scripts to launch on multiple nodes
Other publicly available implementations of BERT include:
1. [Google's official implementation](https://github.com/google-research/bert)
2. [codertimo](https://github.com/codertimo/BERT-pytorch)
1. [NVIDIA Tensorflow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT)
2. [Hugging Face](https://github.com/huggingface/pytorch-pretrained-BERT)
3. [codertimo](https://github.com/codertimo/BERT-pytorch)
4. [gluon-nlp](https://github.com/dmlc/gluon-nlp/tree/master/scripts/bert)
5. [Google's implementation](https://github.com/google-research/bert)
This model trains with mixed precision using Tensor Cores on Volta GPUs and provides a push-button solution to pretraining on a corpus of choice. As a result, researchers can get results 4x faster than training without Tensor Cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
The BERT architecture uses the same architecture as the encoder half of the Transformer. Input sequences are projected into an embedding space before being fed into the encoder structure. Additionally, a positional and segment encodings are added to the embeddings to preserve positional information. The encoder structure is simply a stack of Transformer blocks, which consist of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer accomplishes self-attention on multiple input representations.
The BERT model uses the same architecture as the encoder half of the Transformer. Input sequences are projected into an embedding space before being fed into the encoder structure. Additionally, positional and segment encodings are added to the embeddings to preserve positional information. The encoder structure is simply a stack of Transformer blocks, each consisting of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer accomplishes self-attention on multiple input representations.
An illustration of the architecture taken from the [Transformer paper](https://arxiv.org/pdf/1706.03762.pdf) is shown below.
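As a complement to the illustration, the block structure described above can be sketched in PyTorch as follows; this is a rough, illustrative example (not the code in `model.py`), with dimensions following the BERT-base configuration:

```
# Illustrative sketch of one Transformer encoder block as described above.
# This is not the repository's model.py implementation.
import torch
import torch.nn as nn

class EncoderBlock(nn.Module):
    def __init__(self, hidden=768, heads=12, ff=3072, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(hidden, heads, dropout=dropout)
        self.ff = nn.Sequential(nn.Linear(hidden, ff), nn.GELU(), nn.Linear(ff, hidden))
        self.norm1 = nn.LayerNorm(hidden)
        self.norm2 = nn.LayerNorm(hidden)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # Multi-head self-attention sublayer with residual connection and layer norm
        attn_out, _ = self.attn(x, x, x)
        x = self.norm1(x + self.drop(attn_out))
        # Feed-forward sublayer with residual connection and layer norm
        return self.norm2(x + self.drop(self.ff(x)))

x = torch.randn(128, 2, 768)    # (sequence, batch, hidden) token embeddings
print(EncoderBlock()(x).shape)  # torch.Size([128, 2, 768])
```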
@ -100,14 +105,14 @@ An illustration of the architecture taken from the [Transformer paper](https://a
The architecture of the BERT model is almost identical to the Transformer model that was first introduced in the [Attention Is All You Need paper](https://arxiv.org/pdf/1706.03762.pdf). The main innovation of BERT lies in the pre-training step, where the model is trained on two unsupervised prediction tasks using a large text corpus. Training on these unsupervised tasks produces a generic language model, which can then be quickly fine-tuned to achieve state-of-the-art performance on language processing tasks such as question answering.
The BERT paper reports results two configurations of BERT, each corresponding to a unique model size. This implementation provides the same configurations by default, which are described in the table below.
The BERT paper reports the results for two configurations of BERT, each corresponding to a unique model size. This implementation provides the same configurations by default, which are described in the table below.
| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |
|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|
|BERTBASE |12 encoder| 768| 12|4 x 768|512|110M|
|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|
Additionally, this implementation supports training on multiple GPUs. Mixed precision training and inference with dynamic loss scaling is also supported.
### Feature support matrix
@ -118,12 +123,13 @@ The following features are supported by this model.
|APEX AMP|Yes|
|APEX DDP|Yes|
|LAMB|Yes|
|Multi-node|Yes|
#### Features
[APEX](https://github.com/NVIDIA/apex) is a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training.
[APEX](https://github.com/NVIDIA/apex) is a PyTorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training, whereas [AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training.
[DDP](https://nvidia.github.io/apex/parallel.html) stands for DistributedDataParallel and is used for multi-GPU training, where as [AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training.
[DDP](https://nvidia.github.io/apex/parallel.html) stands for DistributedDataParallel and is used for multi-GPU training.
[LAMB](https://arxiv.org/pdf/1904.00962.pdf) stands for Layerwise Adaptive Moments based optimizer and is a large batch optimization technique that helps accelerate training of deep neural networks using large minibatches. It allows using a global batch size of 65536 and 32768 on sequence lengths 128 and 512 respectively, compared to a batch size of 256 for Adam. The optimized implementation accumulates 1024 gradient batches in phase 1 and 4096 steps in phase 2 before updating weights once. This results in a 15% training speedup. On multi-node systems, LAMB allows scaling up to 1024 GPUs, resulting in training speedups of up to 72x in comparison to [Adam](https://arxiv.org/pdf/1412.6980.pdf). Adam has limitations on the learning rate that can be used since it is applied globally on all parameters, whereas LAMB follows a layerwise learning rate strategy.
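The gradient-accumulation idea behind these large effective batches can be sketched as follows. This is a toy, self-contained example rather than the repository's training loop; the stand-in model, optimizer, and loader replace BERT, the fused LAMB optimizer, and the hdf5 data loaders:

```
# Toy illustration of gradient accumulation. The real scripts accumulate 1024
# (phase 1) or 4096 (phase 2) micro-batches before each LAMB weight update.
import torch

model = torch.nn.Linear(10, 2)                            # stand-in for BERT
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # stand-in for LAMB
loss_fn = torch.nn.CrossEntropyLoss()
loader = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(8)]

accumulation_steps = 4  # stand-in for 1024 / 4096

optimizer.zero_grad()
for step, (inputs, labels) in enumerate(loader):
    loss = loss_fn(model(inputs), labels)
    (loss / accumulation_steps).backward()  # gradients sum across micro-batches
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()                    # one update per accumulated large batch
        optimizer.zero_grad()
```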
@ -135,10 +141,9 @@ Mixed precision is the combined use of different numerical precisions in a compu
2. Adding loss scaling to preserve small gradient values.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
- APEX tools for mixed precision training, see the [NVIDIA APEX: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
#### Enabling mixed precision
@ -149,15 +154,35 @@ Automatic mixed precision can be enabled with the following code changes:
```
from apex import amp
if fp16:
# Wrap optimizer and model
model, optimizer = amp.initialize(model, optimizer, opt_level=<opt_level>, loss_scale="dynamic")
# Wrap optimizer and model
model, optimizer = amp.initialize(model, optimizer, opt_level=<opt_level>, loss_scale="dynamic")
if fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
```
Where `<opt_level>` is the optimization level. In the pretraining, “O2” is set as the optimization level. Mixed precision training can be turned on by passing the `fp16` argument to the pre-training and fine-tuning Python scripts. Shell scripts all have a positional argument available to enable mixed precision training.
Where `<opt_level>` is the optimization level. For pretraining, `O2` is set as the optimization level. Mixed precision training can be turned on by passing the `fp16` argument to `run_pretraining.py` and `run_squad.py`. All shell scripts have a positional argument available to enable mixed precision training.
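Put together, a self-contained sketch of where these two snippets sit in a training step might look as follows. The toy linear model and SGD optimizer are placeholders; the actual scripts wrap BERT and the LAMB/Adam optimizers in the same way, and `O2` mirrors the pretraining setting:

```
# Minimal AMP sketch mirroring the snippets above; requires APEX and a GPU.
import torch
from apex import amp

model = torch.nn.Linear(10, 2).cuda()            # placeholder for BERT
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

fp16 = True
if fp16:
    # Wrap optimizer and model
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic")

inputs = torch.randn(8, 10).cuda()
labels = torch.randint(0, 2, (8,)).cuda()
loss = torch.nn.functional.cross_entropy(model(inputs), labels)

if fp16:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()  # backward pass on the dynamically scaled loss
else:
    loss.backward()
optimizer.step()
```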
### Glossary
**Fine-tuning**
Training an already pretrained model further using a task-specific dataset for subject-specific refinements, by adding task-specific layers on top if required.
**Language Model**
Assigns a probability distribution over a sequence of words. Given a sequence of words, it assigns a probability to the whole sequence.
**Pre-training**
Training a model on vast amounts of data on the same (or different) task to build general understanding.
**Transformer**
The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another.
**Phase1**
Pretraining on samples of sequence length 128 and 20 masked predictions per sequence.
**Phase2**
Pretraining on samples of sequence length 512 and 80 masked predictions per sequence.
## Setup
@ -178,9 +203,14 @@ For more information about how to get started with NGC containers, see the follo
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
For multi-node, the sample provided in this repository requires [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) set up on a [SLURM](https://slurm.schedmd.com) cluster.
More information on how to set up and launch can be found in the [Multi-node Documentation](https://docs.nvidia.com/ngc/multi-node-bert-user-guide).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model. The default parameters for pretraining have been set to run on 8 x V100 32G cards. For the specifics concerning training and inference, see [Advanced](#advanced).
To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model. The default parameters for pretraining have been set to run on 8 x V100 32G cards. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
@ -190,11 +220,11 @@ To train your model using mixed precision with Tensor Cores or using FP32, perfo
`cd DeepLearningExamples/PyTorch/LanguageModeling/BERT`
2. Download NVIDIA pretrained checkpoint.
2. Download the NVIDIA pretrained checkpoint.
If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/catalog/models) and browse the available models. This downloaded checkpoint is used to fine-tune on SQuAD. Make sure to place the downloaded checkpoint in `checkpoints/` folder.
If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/catalog/models) and browse the available models. This downloaded checkpoint is used to fine-tune on SQuAD. Ensure you place the downloaded checkpoint in the `checkpoints/` folder.
3. Build the BERT 19.07 NGC container.
3. Build the BERT 19.08 NGC container.
`bash scripts/docker/build.sh`
@ -202,7 +232,7 @@ If you want to use a pretrained checkpoint, visit [NGC](https://ngc.nvidia.com/c
`bash scripts/docker/launch.sh`
Resultant logs and checkpoints of pretraining and finetuning routines get stored in the `results/` folder.
Resultant logs and checkpoints of pretraining and fine-tuning routines get stored in the `results/` folder.
`data` and `vocab.txt` are downloaded into the `data/` directory by default. Refer to the [Getting the data](#getting-the-data) section for more details on how to process a custom corpus as required for BERT pretraining.
@ -214,25 +244,29 @@ This repository provides scripts to download, verify and extract the following d
- Wikipedia (pre-training)
- BookCorpus (pre-training)
To download, verify, extract the datasets, and create the shards in hdf5 format, run:
To download, verify, extract the datasets, and create the shards in hdf5 format, run:
`/workspace/bert/data/create_datasets_from_start.sh`
6. Start pre-training.
Depending on the speed of your internet connection, this process takes about a day to complete.
BERT is designed to pre-train deep bidirectional representations for language representations. The following scripts are to replicate pretraining on Wikipedia+Book Corpus from this [paper](https://arxiv.org/pdf/1810.04805.pdf). These scripts are general and can be used for pre-training language representations on any corpus of choice.
6. Start pretraining.
From within the container, you can use the following script to run pre-training.
BERT is designed to pre-train deep bidirectional networks for language representations. The following scripts replicate pretraining on Wikipedia + BookCorpus from this [paper](https://arxiv.org/pdf/1810.04805.pdf). These scripts are general and can be used for pre-training language representations on any corpus of choice.
To run on a single node, from within the container, you can use the following script to run pre-training.
`bash scripts/run_pretraining.sh`
More details can be found in Details/Training Process
7. Start fine-tuning with the SQUAD dataset.
The default hyperparameters are set to run on 8 x V100 32G cards.
The above pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script launches fine-tuning for question answering with the SQuaD dataset.
To run on multiple nodes, see the [Multi-node](#multi-node) section.
7. Start fine-tuning with the SQuAD dataset.
The above pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script launches fine-tuning for question answering with the SQuAD dataset.
`bash scripts/run_squad.sh /workspace/checkpoints/<downloaded_checkpoint>`
Default arguments are listed below in order,
Default arguments are listed below in the order the script expects:
- Initial checkpoint - The default is `/workspace/checkpoints/bert_uncased.pt`.
- Number of training Epochs - The default is `2`.
@ -244,18 +278,18 @@ Default arguments are listed below in order,
- SQuAD directory - The default is `/workspace/bert/data/v1.1`.
- Vocabulary file (token to ID mapping) - The default is `/workspace/bert/vocab/vocab`.
- Output directory for result - The default is `/results/SQuAD`.
- Mode (“train”, “eval”, “train eval”, "predict") - The default is `train`.
- Config file for the bert model (It should be the same as the pretrained model) - The default is `/workspace/bert/bert_config.json`.
- Mode (`train`, `eval`, `train eval`, `predict`) - The default is `train`.
- Config file for the BERT model (It should be the same as the pretrained model) - The default is `/workspace/bert/bert_config.json`.
The script will save the final checkpoint to the `/results/SQuAD/pytorch_model.bin` file.
The script saves the final checkpoint to the `/results/SQuAD/pytorch_model.bin` file.
9. Start validation/evaluation.
Validation can be performed with the same script as above, setting `Mode` to "prediction".
Validation can be performed with the same script as above, setting `Mode` to `prediction`.
10. Start inference/predictions.
Inference can be performed with the same script as above, setting `Mode` to `eval`. Inference predictions get saved to `<OUTPUT_DIRECTORY>/predictions.json`.
Inference can be performed with the same script as above, setting `Mode` to `eval`. Inference predictions are saved to `<OUTPUT_DIRECTORY>/predictions.json`.
## Advanced
@ -273,7 +307,7 @@ Descriptions of the key scripts and folders are provided below.
- `create_pretraining_data.py` - Creates `.hdf5` files from shared text files in the final step of dataset creation.
- `model.py` - Implements the BERT pre-training and fine-tuning model architectures with PyTorch.
- `optimization.py` - Implements the LAMB optimizer with PyTorch.
- `run_squad.py` - Implements fine tuning training and evaluation for question answering on the [SQuaD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
- `run_squad.py` - Implements fine tuning training and evaluation for question answering on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
- `run_pretraining.py` - Implements BERT pre-training.
- `run_pretraining_inference.py` - Implements evaluation of a BERT pre-trained model.
@ -284,145 +318,169 @@ Descriptions of the key scripts and folders are provided below.
The complete list of the available parameters for the `run_pretraining.py` script are:
```
--input_dir INPUT_DIR - The input data directory.
Should contain .hdf5 files for the task.
--input_dir INPUT_DIR - The input data directory.
Should contain .hdf5 files for the task.
--config_file CONFIG_FILE - Path to a json file describing the BERT model
configuration. This file configures the model
architecture, such as the number of transformer
blocks, number of attention heads, etc.
--config_file CONFIG_FILE - Path to a json file describing the BERT model
configuration. This file configures the model
architecture, such as the number of transformer
blocks, number of attention heads, etc.
--bert_model BERT_MODEL - Specifies the type of BERT model to use;
should be one of the following:
bert-base-uncased
bert-large-uncased
bert-base-cased
bert-base-multilingual
bert-base-chinese
--bert_model BERT_MODEL - Specifies the type of BERT model to use;
should be one of the following:
bert-base-uncased
bert-large-uncased
bert-base-cased
bert-base-multilingual
bert-base-chinese
--output_dir OUTPUT_DIR - Path to the output directory where the model
checkpoints will be written.
--output_dir OUTPUT_DIR - Path to the output directory where the model
checkpoints will be written.
--max_seq_length MAX_SEQ_LENGTH
- The maximum total input sequence length after
WordPiece tokenization. Sequences longer than
this will be truncated, and sequences shorter
than this will be padded.
- The maximum total input sequence length after
WordPiece tokenization. Sequences longer than
this will be truncated, and sequences shorter
than this will be padded.
--max_predictions_per_seq MAX_PREDICTIONS_PER_SEQ
- The maximum total of masked tokens per input
sequence for Masked LM.
- The maximum total of masked tokens per input
sequence for Masked LM.
--train_batch_size TRAIN_BATCH_SIZE
- Batch size per GPU for training.
- Batch size per GPU for training.
--learning_rate LEARNING_RATE
- The initial learning rate for LAMB optimizer.
- The initial learning rate for LAMB optimizer.
--max_steps MAX_STEPS - Total number of training steps to perform.
--max_steps MAX_STEPS - Total number of training steps to perform.
--warmup_proportion WARMUP_PROPORTION
- Proportion of training to perform linear learning
rate warmup for. For example, 0.1 = 10% of training.
- Proportion of training to perform linear learning
rate warmup for. For example, 0.1 = 10% of training.
--seed SEED - Sets the seed to use for random number generation.
--seed SEED - Sets the seed to use for random number generation.
--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
- Number of update steps to accumulate before
performing a backward/update pass.
- Number of update steps to accumulate before
performing a backward/update pass.
--fp16 - If set, will perform computations using
automatic mixed precision.
--fp16 - If set, will perform computations using
automatic mixed precision.
--loss_scale LOSS_SCALE - Sets the loss scaling value to use when
mixed precision is used. The default value (0)
tells the script to use dynamic loss scaling
instead of fixed loss scaling.
--loss_scale LOSS_SCALE - Sets the loss scaling value to use when
mixed precision is used. The default value (0)
tells the script to use dynamic loss scaling
instead of fixed loss scaling.
--log_freq LOG_FREQ - If set, the script will output the training
loss every LOG_FREQ steps.
--log_freq LOG_FREQ - If set, the script will output the training
loss every LOG_FREQ steps.
--resume_from_checkpoint - If set, training will resume from a checkpoint
that currently exists in OUTPUT_DIR.
--resume_from_checkpoint - If set, training will resume from a checkpoint
that currently exists in OUTPUT_DIR.
--num_steps_per_checkpoint NUM_STEPS_PER_CHECKPOINT
- Number of update steps until a model checkpoint
is saved to disk.`
- Number of update steps until a model checkpoint
is saved to disk.
--phase2 - Specified if training on phase 2 only. If not specified, default pretraining is on phase 1.
--phase1_end_step - The number of steps phase 1 was trained for. In order to
resume phase 2 the correct way, phase1_end_step should correspond to the --max_steps phase 1 was trained for.
```
#### Multi-node
Multi-node runs can be launched on a pyxis/enroot Slurm cluster (see [Requirements](#requirements)) with the `run.sub` script with the following command for a 4-node DGX1 example for both phase 1 and phase 2:
```
BATCHSIZE=2048 LR=6e-3 GRADIENT_STEPS=128 PHASE=1 sbatch -N4 --ntasks-per-node=8 run.sub
BATCHSIZE=1024 LR=4e-3 GRADIENT_STEPS=256 PHASE=2 sbatch -N4 --ntasks-per-node=8 run.sub
```
The checkpoint after phase 1 will be saved in the `checkpointdir` specified in `run.sub`. The checkpoint will be automatically picked up to resume training on phase 2. Note that phase 2 should be run after phase 1.
Variables to re-run the [Training performance results](#training-performance-results) are available in the `configurations.yml` file.
The batch variables `BATCHSIZE`, `LR`, `GRADIENT_STEPS`, and `PHASE` refer to the Python arguments `train_batch_size`, `learning_rate`, `gradient_accumulation_steps`, and `phase2` respectively.
Note that the `run.sub` script is a starting point that has to be adapted depending on the environment. In particular, variables such as `datadir` handle the location of the files for each phase.
Refer to the file's contents to see the full list of variables to adjust for your system.
#### Fine-tuning parameters
The run_squad.py script contains many of the same arguments as `run_pretraining.py`.
The `run_squad.py` script contains many of the same arguments as `run_pretraining.py`.
The main script specific parameters are:
```
--bert_model BERT_MODEL - Specifies the type of BERT model to use;
should be one of the following:
bert-base-uncased
bert-large-uncased
bert-base-cased
bert-base-multilingual
bert-base-chinese
--bert_model BERT_MODEL - Specifies the type of BERT model to use;
should be one of the following:
bert-base-uncased
bert-large-uncased
bert-base-cased
bert-base-multilingual
bert-base-chinese
--train_file TRAIN_FILE - Path to the SQuAD json for training.
For example, train-v1.1.json.
--train_file TRAIN_FILE - Path to the SQuAD json for training.
For example, train-v1.1.json.
--predict_file PREDICT_FILE - Path to the SQuAD json for predictions.
For example, dev-v1.1.json or test-v1.1.json.
--predict_file PREDICT_FILE - Path to the SQuAD json for predictions.
For example, dev-v1.1.json or test-v1.1.json.
--max_seq_length MAX_SEQ_LENGTH
- The maximum total input sequence length
after WordPiece tokenization.
Sequences longer than this will be truncated,
and sequences shorter than this will be padded.
- The maximum total input sequence length
after WordPiece tokenization.
Sequences longer than this will be truncated,
and sequences shorter than this will be padded.
--doc_stride DOC_STRIDE - When splitting up a long document into chunks
this parameters sets how much stride to take
between chunks of tokens.
--doc_stride DOC_STRIDE - When splitting up a long document into chunks
this parameters sets how much stride to take
between chunks of tokens.
--max_query_length MAX_QUERY_LENGTH
- The maximum number of tokens for the question.
Questions longer than <max_query_length>
will be truncated to the value specified.
- The maximum number of tokens for the question.
Questions longer than <max_query_length>
will be truncated to the value specified.
--n_best_size N_BEST_SIZE - The total number of n-best predictions to
generate in the nbest_predictions.json
output file.
--n_best_size N_BEST_SIZE - The total number of n-best predictions to
generate in the nbest_predictions.json
output file.
--max_answer_length MAX_ANSWER_LENGTH
- The maximum length of an answer that can be
generated. This is needed because the start and
end predictions are not conditioned on one another.
- The maximum length of an answer that can be
generated. This is needed because the start and
end predictions are not conditioned on one another.
--verbose_logging - If true, all the warnings related to data
processing will be printed. A number of warnings
are expected for a normal SQuAD evaluation.
--verbose_logging - If true, all the warnings related to data
processing will be printed. A number of warnings
are expected for a normal SQuAD evaluation.
--do_lower_case - Whether to lower case the input text. Set to
true for uncased models and false for cased models.
--do_lower_case - Whether to lower case the input text. Set to
true for uncased models and false for cased models.
--version_2_with_negative - If true, the SQuAD examples contain questions
that do not have an answer.
--version_2_with_negative - If true, the SQuAD examples contain questions
that do not have an answer.
--null_score_diff_threshold NULL_SCORE_DIFF_THRESHOLD
- A null answer will be predicted if null_score if
best_non_null is greater than NULL_SCORE_DIFF_THRESHOLD.
- A null answer will be predicted if null_score -
best_non_null is greater than NULL_SCORE_DIFF_THRESHOLD.
```
### Command-line options
To see the full list of available options and their descriptions, use the -h or --help command line option, for example:
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
`python run_pretraining.py --help`
`python run_squad.py --help`
Detailed descriptions of command line options can be found in the Parameters section above.
Detailed descriptions of command-line options can be found in the [Parameters](#parameters) section.
### Getting the data
For pre-training BERT, we use the concatenation of Wikipedia (2500M words) as well as Book Corpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. BERT requires that datasets are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences.
For pre-training BERT, we use the concatenation of Wikipedia (2500M words) as well as BookCorpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. BERT requires that datasets are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences.
The preparation of the pre-training dataset is described in the `bertPrep.py` script found in the `data/` folder. The component steps in the automated scripts to prepare the datasets are as follows:
@ -436,12 +494,11 @@ The preparation of pre-training dataset is described in the `bertPrep.py` script
5. hdf5 file creation - each text file shard is processed by the `create_pretraining_data.py` script to produce a corresponding hdf5 file. The script generates input data and labels for masked language modeling and sentence prediction tasks for the input text shard.
The tools used for preparing the Bookcorpus and Wikipedia datasets can be applied to prepare an arbitrary corpus. The `create_datasets_from_start.sh` script in the `data/` directory applies sentence segmentation, sharding, and hdf5 file creation given an arbitrary text file containing a document-separated text corpus.
The tools used for preparing the BookCorpus and Wikipedia datasets can be applied to prepare an arbitrary corpus. The `create_datasets_from_start.sh` script in the `data/` directory applies sentence segmentation, sharding, and hdf5 file creation given an arbitrary text file containing a document-separated text corpus.
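To make the expected input concrete, here is a toy illustration of the document-separated format and the sharding idea. It is not the repository's code; the assumption that blank lines separate documents and the naive sentence split are only for illustration:

```
# Conceptual illustration of "document-separated corpus -> sentence-segmented
# shards"; bertPrep.py / create_datasets_from_start.sh implement the real flow.
corpus = """First document. It has two sentences.

Second document, a single sentence."""

# Assumption for this sketch: blank lines separate documents.
documents = [d.strip() for d in corpus.split("\n\n") if d.strip()]

# Naive sentence split as a stand-in for proper sentence segmentation.
segmented = [[s.strip() + "." for s in doc.split(".") if s.strip()] for doc in documents]

# Distribute documents round-robin into a fixed number of text shards.
num_shards = 2
shards = [segmented[i::num_shards] for i in range(num_shards)]
for i, shard in enumerate(shards):
    print(f"shard {i}: {len(shard)} document(s)")
```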
For fine-tuning a pre-trained BERT model for specific tasks, by default this repository prepares the following dataset:
- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering
- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering
#### Dataset guidelines
@ -469,7 +526,7 @@ The training process consists of two steps: pre-training and fine-tuning.
Pre-training is performed using the `run_pretraining.py` script along with parameters defined in the `scripts/run_pretraining.sh`.
The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using the Wikipedia and BookCorpus datasets as training data using LAMB optimizer. By default, the training script runs two phases of training with a hyperparameter recipe specific to 8 x V100 32G cards:
The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using Wikipedia and BookCorpus datasets as training data using the LAMB optimizer. By default, the training script runs two phases of training with a hyperparameter recipe specific to 8 x V100 32G cards:
Phase 1: (Maximum sequence length of 128)
- Runs on 8 GPUs with training batch size of 64 per GPU
@ -487,7 +544,7 @@ Phase 2: (Maximum sequence length of 512)
- Saves a checkpoint every 200 iterations (keeps only the latest 3 checkpoints) and at the end of training. All checkpoints, and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
- Creates a log file containing all the output
These parameters will train on Wikipedia and BooksCorpus to SoTA accuracy on a DGX-1 with 32GB V100 cards.
These parameters will train on Wikipedia and BookCorpus to SoTA accuracy on a DGX-1 with 32GB V100 cards.
`bash run_pretraining.sh <training_batch_size> <learning-rate> <precision> <num_gpus> <warmup_proportion> <training_steps> <save_checkpoint_steps> <resume_training> <create_logfile> <accumulate_gradients> <gradient_accumulation_steps> <seed> <job_name> <allreduce_post_accumulation> <allreduce_post_accumulation_fp16> <accumulate_into_fp16> <train_batch_size_phase2> <learning_rate_phase2> <warmup_proportion_phase2> <train_steps_phase2> <gradient_accumulation_steps_phase2>`
@ -496,20 +553,23 @@ Where:
- `<training_batch_size>` is per-GPU batch size used for training. Larger batch sizes run more efficiently, but require more memory.
- `<learning_rate>` is the base learning rate for training
- `<precision>` is the type of math in your model, can be either `fp32` or `fp16`. The options mean:
- FP32: 32-bit IEEE single precision floats.
- FP16: Mixed precision 16 and 32 bit floats.
- FP32: 32-bit IEEE single precision floats.
- FP16: Mixed precision 16 and 32 bit floats.
- `<num_gpus>` is the number of GPUs to use for training. Must be equal to or smaller than the number of GPUs attached to your node.
- `<warmup_proportion>` is the percentage of training steps used for warm-up at the start of training.
- `<training_steps>` is the total number of training steps.
- `<save_checkpoint_steps>` controls how often checkpoints are saved.
- `<resume_training>` if set to true, training should resume from latest model in /results/checkpoints. Default is false.
- `<create_logfile>` a flag indicating if output should be written to a log file or not (acceptable values are true or false. true indicates output should be saved to a log file.)
- `<resume_training>` if set to `true`, training should resume from latest model in `/results/checkpoints`. Default is `false`.
- `<create_logfile>` a flag indicating if output should be written to a log file or not (acceptable values are `true` or `false`. `true` indicates output should be saved to a log file.)
- `<accumulate_gradient>` a flag indicating whether a larger batch should be simulated with gradient accumulation.
- `<gradient_accumulation_steps>` an integer indicating the number of steps to accumulate gradients over. Effective batch size = `training_batch_size` / `gradient_accumulation_steps`.
- `<seed>` random seed for the run.
- `<allreduce_post_accumulation>` - If set to `true`, performs allreduce only after the defined number of gradient accumulation steps.
- `<allreduce_post_accumulation_fp16>` - If set to `true`, performs allreduce after gradient accumulation steps in FP16.
- `<accumulate_into_fp16>` - If set to `true`, accumulates/sums the gradients in FP16.
Note: The above three options need to be set to false when running on fp32.
- `<training_batch_size_phase2>` is per-GPU batch size used for training in phase 2. Larger batch sizes run more efficiently, but require more memory.
- `<learning_rate_phase2>` is the base learning rate for training phase 2.
- `<warmup_proportion_phase2>` is the percentage of training steps used for warm-up at the start of phase 2 training. An illustrative invocation follows this parameter list.
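To make the argument order concrete, here is a minimal sketch of a DGX-1 16G invocation. The numeric values mirror defaults that appear elsewhere in this document (batch sizes, learning rates, warm-up proportions, step counts, seed, checkpoint interval); the job name is a made-up placeholder, and the whole line should be read as an illustration of the positional order rather than a recommended configuration.

```bash
# Illustrative only: positional arguments follow the usage line above.
# Phase 1: batch 8192, LR 6e-3, accumulation 512; Phase 2: batch 4096, LR 4e-3, accumulation 1024.
bash scripts/run_pretraining.sh 8192 6e-3 fp16 8 0.2843 7038 200 false true true 512 42 \
    bert_lamb_pretraining true true true 4096 4e-3 0.128 1563 1024
```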
@ -522,7 +582,7 @@ For example:
Trains BERT-large from scratch on a DGX-1 32G using FP16 arithmetic. 90% of the training steps are done with sequence length 128 (phase1 of training) and 10% of the training steps are done with sequence length 512 (phase2 of training).
In order to train on a DGX-1 16G, set `gradient_accumulation_steps` to `512` and `gradient_accumulation_steps_phase2` to `1024` in `scripts/run_pretraining.sh`
In order to train on a DGX-1 16G, set `gradient_accumulation_steps` to `512` and `gradient_accumulation_steps_phase2` to `1024` in `scripts/run_pretraining.sh`.
In order to train on a DGX-2 32G, set `train_batch_size` to `4096`, `train_batch_size_phase2` to `2048`, `num_gpus` to `16`, `gradient_accumulation_steps` to `64` and `gradient_accumulation_steps_phase2` to `256` in `scripts/run_pretraining.sh`, for example as sketched below.
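A minimal sketch of those DGX-2 32G overrides, assuming they are applied to the variable defaults near the top of `scripts/run_pretraining.sh` (the variable names are the ones listed above; only the changed values are shown):

```bash
# DGX-2 32G (16x V100 32G) overrides named in the paragraph above.
train_batch_size=4096
train_batch_size_phase2=2048
num_gpus=16
gradient_accumulation_steps=64
gradient_accumulation_steps_phase2=256
```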
@ -538,17 +598,17 @@ By default, each Python script implements fine-tuning a pre-trained BERT model f
- Has FP16 precision enabled
- Saves a checkpoint at the end of training to the `/results/<dataset_name>` folder
Fine-tuning Python scripts implement support for mixed precision and multi-GPU training through NVIDIAs [Apex](https://github.com/NVIDIA/apex) library. For a full list of parameters and associated explanations, consult the [Parameters](#parameters) section.
Fine-tuning Python scripts implement support for mixed precision and multi-GPU training through NVIDIA's [APEX](https://github.com/NVIDIA/apex) library. For a full list of parameters and associated explanations, see the [Parameters](#parameters) section.
All fine-tuning shell scripts have the same positional arguments, outlined below:
`bash scripts/run_squad.sh <checkpoint_to_load> <epochs> <batch_size per GPU> <learning rate> <precision (either `fp16` or `fp32`)> <number of GPUs to use> <seed> <SQUAD_DATA_DIR> <VOCAB_FILE> <OUTPUT_DIR> <mode (either `train`, `eval` or `train eval`)> <CONFIG_FILE>`
```bash scripts/run_squad.sh <checkpoint_to_load> <epochs> <batch_size per GPU> <learning rate> <precision (either `fp16` or `fp32`)> <number of GPUs to use> <seed> <SQuAD_DATA_DIR> <VOCAB_FILE> <OUTPUT_DIR> <mode (either `train`, `eval` or `train eval`)> <CONFIG_FILE>```
By default, the `mode` positional argument is set to `train eval`. See the [Quick Start Guide](#quick-start-guide) for explanations of each positional argument.
Note: The first positional argument (the path to the checkpoint to load) is required.
Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory or separate path can be a command line input to `run_squad.sh`.
Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory, or a separate path can be provided as a command-line input to `run_squad.sh`. A sample training invocation is sketched below.
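To make the positional order concrete, a hedged sample training invocation is sketched below. Every path is a placeholder, and the epoch count and learning rate are illustrative values rather than defaults taken from this repository; the per-GPU batch size of 4 and the 8-GPU/FP16 setting mirror the fine-tuning configuration described later in this guide.

```bash
# Positional order: <checkpoint> <epochs> <batch/GPU> <LR> <precision> <num_gpus> <seed>
#                   <SQuAD dir> <vocab file> <output dir> <mode> <config file>
# All paths below are placeholders; substitute your own locations.
CHECKPOINT=/results/checkpoints/ckpt_7038.pt      # pre-trained checkpoint to load (assumed name)
SQUAD_DIR=/workspace/bert/data/squad/v1.1         # SQuAD dataset directory (placeholder)
VOCAB_FILE=/workspace/bert/vocab/vocab.txt        # vocabulary file (placeholder)
OUT_DIR=/results/SQuAD                            # output directory
CONFIG=/workspace/bert/bert_config.json           # BERT config file
bash scripts/run_squad.sh $CHECKPOINT 2 4 3e-5 fp16 8 42 $SQUAD_DIR $VOCAB_FILE $OUT_DIR train $CONFIG
```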
### Inference process
@ -578,13 +638,13 @@ Where:
- `<evaluation_batch_size>` is per-GPU batch size used for inference. Larger batch sizes run more efficiently, but require more memory.
- `<precision>` is the type of math in your model, can be either `fp32` or `fp16`. The options mean:
- `fp32`: 32-bit IEEE single precision floats
- `fp16`: 16-bit floats for 3.2x faster inference
- `fp32`: 32-bit IEEE single precision floats
- `fp16`: 16-bit floats for 3.2x faster inference
- `<num_gpus>` is the number of GPUs to use for inference. Must be equal to or smaller than the number of GPUs attached to your node.
- `<inference_mode>` is either `--eval` for evaluation or `--prediction` for inference
- `<model_checkpoint>` is the model checkpoint to run inference on. Default is `-1`, which takes the most recent model checkpoint from the checkpoints folder.
- `<model_checkpoint>` is the model checkpoint to run inference on. Default is `-1`, which takes the most recent model checkpoint from the `checkpoints` folder.
- `<inference_steps>` is the total number of inference steps per process. Default is `-1`, which iterates over the entire dataset.
- `<create_logfile>` a flag indicating if output should be written to a logfile or not (acceptable values are true or false. true indicates output should be saved to a logfile.)
- `<create_logfile>` a flag indicating if output should be written to a logfile or not (acceptable values are `true` or `false`. `true` indicates output should be saved to a logfile.)
For example:
@ -598,11 +658,10 @@ Evaluation fine-tuning is enabled by the same scripts as training:
The mode positional argument of the shell script is used to run in evaluation mode. The fine-tuned BERT model will be run on the evaluation dataset, and the evaluation loss and accuracy will be displayed.
Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting `mode` variable in the script to either `eval` or `prediction` flag, you can choose between running evaluation on a given dataset or doing prediction.
Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting the `mode` variable in the script to either `eval` or `prediction`, you can choose between running prediction and evaluating the predictions on a given dataset, or running prediction only.
`bash scripts/run_squad.sh <path to fine-tuned model checkpoint>`
Note: Fine-tuning evaluation is only supported on a single GPU, as reflected in the sketch below.
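A corresponding evaluation-only sketch, again with placeholder paths and illustrative hyperparameters, sets the mode argument to `eval` and uses one GPU:

```bash
# Evaluation-only sketch: same positional order as training, mode "eval", single GPU.
CHECKPOINT=/results/SQuAD/pytorch_model.bin       # fine-tuned checkpoint (assumed name)
bash scripts/run_squad.sh $CHECKPOINT 2 4 3e-5 fp16 1 42 \
    /workspace/bert/data/squad/v1.1 /workspace/bert/vocab/vocab.txt /results/SQuAD eval \
    /workspace/bert/bert_config.json
```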
## Performance
@ -612,11 +671,11 @@ The following section shows how to run benchmarks measuring the model performanc
#### Training performance benchmark
Training performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command line as described in [Training process](#training-process).
Training performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command-line as described in [Training process](#training-process).
To benchmark the training performance on a specific batch size, run:
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to squad dataset> <path to vocab set> <results directory> train <BERT config path] <max steps>`
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to SQuAD dataset> <path to vocab set> <results directory> train <BERT config path> <max steps>`
An example call used to generate throughput numbers:
@ -626,11 +685,11 @@ An example call used to generate throughput numbers:
#### Inference performance benchmark
Inference performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command line as described in [Inference process](#inference-process).
Inference performance benchmarks for both pretraining and fine-tuning can be obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` respectively. The required parameters can be passed through the command-line as described in [Inference process](#inference-process).
To benchmark the inference performance on a specific batch size, run:
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to squad dataset> <path to vocab set> <results directory> eval <BERT config path> <max steps>`
`bash scripts/run_squad.sh <pretrained model path> <epochs> <batch size> <learning rate> <fp16|fp32> <num_gpus> <seed> <path to SQuAD dataset> <path to vocab set> <results directory> eval <BERT config path> <max steps>`
An example call used to generate throughput numbers:
@ -644,18 +703,20 @@ An example call used to generate throughput numbers:
#### Training accuracy results
Our results were obtained by running `scripts/run_squad.sh` and `scripts/run_pretraining.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs for pretraining and NVIDIA DGX-1 with (8x V100 16G) GPUs for fine-tuning.
Note: Pretraining results were obtained with a dataset that was created using an earlier version of the data preprocessing scripts than are currently in this repository, and with an earlier snapshot of wikidumps. The results in the table will be updated soon with results using the latest data prep scripts. Early data show the results are quite similar.
Our results were obtained by running the `scripts/run_squad.sh` and `scripts/run_pretraining.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs for pretraining and NVIDIA DGX-1 with (8x V100 16G) GPUs for fine-tuning.
##### Pre-training loss results
| DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - FP32 | Final Loss - mixed precision | Time to train(days) - FP32 | Time to train(days) - mixed precision | Time to train speedup (FP32 to mixed precision)
| DGX System | GPUs | Accumulated Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - FP32 | Final Loss - mixed precision | Time to train(hours) - FP32 | Time to train(hours) - mixed precision | Time to train speedup (FP32 to mixed precision)
|---|---|---|---|---|---|---|---|---
| NVIDIA DGX-1 With 16G|8|8192 and 4196 |512 and 1024|-|1.53|-|6.84|-
| NVIDIA DGX-2 With 32G|16|4096 and 2048 |64 and 256|-|1.52|-|2.71|-
| 1 x NVIDIA DGX-1 With 16G|8|8192 and 4096 |512 and 1024|-|1.36|-|153.16|-
| 1 x NVIDIA DGX-2H With 32G|16|4096 and 2048 |64 and 256|-|1.35|-|58.4|-
| 4 x NVIDIA DGX-1 With 16G|8|2048 and 1024 |128 and 256|-|1.34|-|39.27|-
| 4 x NVIDIA DGX-2H With 32G|16|1024 and 512 |16 and 64|-|1.33|-|15.35|-
| 16 x NVIDIA DGX-1 With 16G|8|512 and 256 |32 and 64|-|1.329|-|10.36|-
| 16 x NVIDIA DGX-2H With 32G|16|256 and 128 |4 and 16|-|1.33|-|3.94|-
| 64 x NVIDIA DGX-2H With 32G|16|64 and 32 |(1 and 4)FP16 and (2 and 8)FP32|1.33|1.331|4.338|1.124|3.85
##### Fine-tuning accuracy results
@ -667,9 +728,9 @@ Note: Pretraining results were obtained with a dataset that was created using an
###### Pre-training stability test
| Accuracy Metric | Seed 1
|---|---
| Final Loss | 1.52
| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation
|---|---|---|---|---|---|---|---
|Final Loss| 1.344 | 1.328 | 1.324 | 1.326 | 1.333 | 1.331 | 0.009
###### Fine-tuning stability test
@ -680,11 +741,12 @@ Training stability with 8 GPUs, FP16 computations, batch size of 4:
|Exact Match %| 84.50 | 84.07 | 84.52 | 84.23 | 84.17 | 84.30 | 0.200
| f1 % | 91.29 | 91.01 | 91.14 | 91.10 | 90.85 | 91.08 | 0.162
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.shtraining scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a predefined number of training iterations.
Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in sequences per second) were averaged over a predefined number of training iterations.
###### Pre-training NVIDIA DGX-1 With 16G
@ -698,6 +760,18 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
| 8| 2| 4| 512| 56.16 |194.56 | 3.46| 7.43| 7.30
###### Pre-training on multiple NVIDIA DGX-1 With 16G
| Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|--------------
|1 |8 | N/A | 16| 128| N/A |874.24 |N/A |N/A | 1.00
|4 |8 | N/A | 16| 128| N/A |3089.76 | N/A| N/A| 3.53
|16 |8 | N/A | 16| 128| N/A |12144.64 | N/A| N/A| 13.89
|1 |8 | N/A | 4| 512| N/A |195.93 |N/A |N/A | 1.00
|4 |8 | N/A | 4| 512| N/A |700.16 | N/A| N/A| 3.57
|16| 8| N/A | 4| 512| N/A |2746.368 | N/A| N/A| 14.02
###### Fine-tuning NVIDIA DGX-1 With 16G
@ -713,7 +787,7 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
##### Training performance: NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (8x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
###### Pre-training NVIDIA DGX-1 With 32G
@ -729,6 +803,7 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
|4 |N/A | 10| 512|N/A |164.00 | N/A| N/A| 3.57
| 8|N/A | 10| 512|N/A |325.60| N/A| N/A| 7.08
###### Fine-tuning NVIDIA DGX-1 With 32G
| GPUs | Batch size / GPU | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
@ -743,7 +818,7 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
##### Training performance: NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
Our results were obtained by running the `scripts/run_pretraining.sh` and `scripts/run_squad.sh` training scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
###### Pre-training NVIDIA DGX-2 With 32G
@ -762,6 +837,22 @@ Our results were obtained by running `scripts/run_pretraining.sh` and `scripts/r
|8 | N/A | 10| 512| N/A| 325.60| N/A| N/A| 6.87
|16 | N/A | 10| 512| N/A| 648.00| N/A| N/A| 13.67
###### Pre-training on multiple NVIDIA DGX-2H With 32G
Note: Multi-node performance numbers below are on DGX-2H whereas the single node performance numbers above are on DGX-2.
| Nodes | GPUs | Batch size / GPU (FP32) | Batch size / GPU (FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------|---------------------
|1 |16 | N/A | 64| 128| N/A |3379.2 |N/A |N/A | 1.00
|4 |16 | N/A | 64| 128| N/A |12709.88 | N/A| N/A| 3.76
|16 |16 | N/A | 64| 128| N/A |51937.28 | N/A| N/A| 15.37
|64 |16 | 32 | 64| 128| 46628.86 |188088.32 | 4.03 | N/A| 55.66
|1 |16 | N/A | 8| 512| N/A |625.66 |N/A |N/A | 1.00
|4 |16 | N/A | 8| 512| N/A |2386.38 | N/A| N/A| 3.81
|16| 16| N/A | 8| 512| N/A |9932.8 | N/A| N/A| 15.87
|64| 16| 4 | 8| 512| 9543.68 |37478.4 | 3.92| N/A| 59.9
###### Fine-tuning NVIDIA DGX-2 With 32G
| GPUs | Batch size / GPU | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision
@ -781,7 +872,7 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
Our results were obtained by running `scripts/run_pretraining_inference.sh` on data of sequence length 512 and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
Our results were obtained by running the `scripts/run_pretraining_inference.sh` script on data of sequence length 512 and the `scripts/run_squad.sh` script in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 16G) GPUs.
###### Pre-training inference on NVIDIA DGX-1 with 16G
@ -797,7 +888,7 @@ Our results were obtained by running `scripts/run_pretraining_inference.sh` on d
##### Inference performance: NVIDIA DGX-1 (1x V100 32G)
Our results were obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
Our results were obtained by running the `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-1 with (1x V100 32G) GPUs.
###### Pre-training inference on NVIDIA DGX-1 with 32G
@ -813,13 +904,13 @@ Our results were obtained by running `scripts/run_pretraining_inference.sh` and
##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
Our results were obtained by running `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
Our results were obtained by running the `scripts/run_pretraining_inference.sh` and `scripts/run_squad.sh` scripts in the pytorch:19.07-py3 NGC container on NVIDIA DGX-2 with (1x V100 32G) GPUs.
###### Pre-training inference on NVIDIA DGX-2 with 32G
|GPUs | Throughput - FP32(sequences/sec)|Throughput - Mixed Precision(sequences/sec)
|---------- |---------|---------------
| 1| 30.24 97.72
| 1| 30.24| 97.72
###### Fine-tuning inference on NVIDIA DGX-2 with 32G
@ -835,16 +926,20 @@ The inference performance metrics used were items/second.
### Changelog
September 2019
- Scripts to support multi-node launch
- Update pretraining loss results based on the latest data preparation scripts
August 2019
- Pretraining support with LAMB optimizer
- Pre-training support with LAMB optimizer
- Updated Data download and Preprocessing
July 2019
- Initial release
### Known issues
There are no known issues with this model.

View file

@ -1,3 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import subprocess
import os

View file

@ -0,0 +1,182 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#1 DGX1 phase1
bert--DGX1:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "1"
BATCHSIZE: "8192"
LR: "6e-3"
GRADIENT_STEPS: "512"
PHASE: "1"
#4 DGX1 phase1
bert--DGX1_4x8x16x128:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "4"
BATCHSIZE: "2048"
LR: "6e-3"
GRADIENT_STEPS: "128"
PHASE: "1"
#16 DGX1 phase1
bert--DGX1_16x8x16x32:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "16"
BATCHSIZE: "512"
LR: "6e-3"
GRADIENT_STEPS: "32"
PHASE: "1"
#1 DGX2 phase1
bert--DGX2:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "1"
BATCHSIZE: "4096"
LR: "6e-3"
GRADIENT_STEPS: "64"
PHASE: "1"
#4 DGX2 phase1
bert--DGX2_4x16x64x16:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "4"
BATCHSIZE: "1024"
LR: "6e-3"
GRADIENT_STEPS: "16"
PHASE: "1"
#16 DGX2 phase1
bert--DGX2_16x16x64x4:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "16"
BATCHSIZE: "256"
LR: "6e-3"
GRADIENT_STEPS: "4"
PHASE: "1"
#64 DGX2 phase1
bert--DGX2_64x16x64:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "64"
BATCHSIZE: "64"
LR: "6e-3"
GRADIENT_STEPS: "1"
PHASE: "1"
#1 DGX1 phase2
bert--DGX1_1x8x4x1024:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "1"
BATCHSIZE: "4096"
LR: "4e-3"
GRADIENT_STEPS: "1024"
PHASE: "2"
#4 DGX1 phase2
bert--DGX1_4x8x4x256:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "4"
BATCHSIZE: "1024"
LR: "4e-3"
GRADIENT_STEPS: "256"
PHASE: "2"
#16 DGX1 phase2
bert--DGX1_16x8x4x64:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "16"
BATCHSIZE: "256"
LR: "4e-3"
GRADIENT_STEPS: "64"
PHASE: "2"
#1 DGX2 phase2
bert--DGX2_1x16x8x256:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "1"
BATCHSIZE: "2048"
LR: "4e-3"
GRADIENT_STEPS: "256"
PHASE: "2"
#4 DGX2 phase2
bert--DGX2_4x16x8x64:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "4"
BATCHSIZE: "512"
LR: "4e-3"
GRADIENT_STEPS: "64"
PHASE: "2"
#16 DGX2 phase2
bert--DGX2_16x16x8x16:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "16"
BATCHSIZE: "128"
LR: "4e-3"
GRADIENT_STEPS: "16"
PHASE: "2"
#64 DGX2 phase2
bert--DGX2_64x16x8x4:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "64"
BATCHSIZE: "32"
LR: "4e-3"
GRADIENT_STEPS: "4"
PHASE: "2"

View file

@ -1,6 +1,6 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import, division, print_function, unicode_literals

View file

@ -1,4 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
class BooksDownloader:

View file

@ -1,4 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os

View file

@ -1,4 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
from WikiDownloader import WikiDownloader

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import os

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from itertools import islice

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os
@ -43,6 +54,4 @@ class WikiDownloader:
subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
else:
assert False, 'WikiDownloader not implemented for this language yet.'
assert False, 'WikiDownloader not implemented for this language yet.'

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os

View file

@ -0,0 +1,12 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View file

@ -1,4 +1,15 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import BookscorpusTextFormatting
import Downloader
@ -70,14 +81,13 @@ def main(args):
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
assert os.stat(output_filename).st_size > 0, 'File glob did not pick up extracted wiki files from WikiExtractor.'
elif args.dataset == 'wikicorpus_zh':
assert False, 'wikicorpus_zh not fully supported at this time. The simplified/traditional Chinese data still needs to be translated and properly segmented, and should work once this step is added.'
if args.skip_wikiextractor == 0:
@ -85,6 +95,7 @@ def main(args):
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Download
python3 /workspace/bert/data/bertPrep.py --action download --dataset bookscorpus
python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
@ -26,4 +39,4 @@ python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset bo
# Create HDF5 files Phase 2
python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 512 \
--max_predictions_per_seq 80 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
--max_predictions_per_seq 80 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1

View file

@ -1,5 +1,18 @@
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Downloading MRPC data"
wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py

View file

@ -1,5 +1,18 @@
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Downloading dataset for squad..."
# Download SQuAD

View file

@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from a PyTorch BERT model."""
from __future__ import absolute_import

View file

@ -1,8 +1,22 @@
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json

View file

@ -1,7 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -13,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import, division, print_function, unicode_literals

View file

@ -13,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""
import math
@ -24,6 +25,7 @@ from torch.nn.utils import clip_grad_norm_
from apex.optimizers import FusedAdam
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
multi_tensor_l2norm = amp_C.multi_tensor_l2norm
lamb_compute_update = amp_C.multi_tensor_lamb_stage1_cuda
lamb_apply_update = amp_C.multi_tensor_lamb_stage2_cuda

View file

@ -0,0 +1,74 @@
#!/bin/bash
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -eux
# The following variables need to be set
# Base container to be used
readonly docker_image="nvcr.io/nvidia/pytorch:19.08-py3"
# Location of dataset for phase 1
readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
# Location of dataset for phase 2
readonly datadir_phase2="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"
# Path to where trained checkpoints will be saved on the system
readonly checkpointdir="$PWD/checkpoints"
readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"
PHASE1="\
--train_batch_size=${BATCHSIZE:-16} \
--learning_rate=${LR:-6e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.2843} \
--input_dir=/workspace/data \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=7038 \
--num_steps_per_checkpoint=2500 \
"
PHASE2="\
--train_batch_size=${BATCHSIZE:-4096} \
--learning_rate=${LR:-4e-3} \
--warmup_proportion=${WARMUP_UPDATES:-0.128} \
--input_dir=/workspace/data_phase2 \
--phase2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=1563 \
--num_steps_per_checkpoint=1000 \
--resume_from_checkpoint --phase1_end_step=7038 \
"
PHASES=( "$PHASE1" "$PHASE2" )
PHASE=${PHASE:-1}
BERT_CMD="\
python -u /workspace/bert/run_pretraining.py \
--seed=42 \
${PHASES[$((PHASE-1))]} \
--do_train \
--config_file=/workspace/bert/bert_config.json \
--output_dir=/results \
--fp16 \
--allreduce_post_accumulation --allreduce_post_accumulation_fp16 \
--gradient_accumulation_steps=${GRADIENT_STEPS:-2} \
--log_freq=1 \
--local_rank=\${SLURM_LOCALID}"
srun -l --container-image="${docker_image}" --container-mounts="${mounts}" sh -c "${BERT_CMD}"

View file

@ -1,7 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -13,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function

View file

@ -13,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
@ -65,7 +66,6 @@ def create_pretraining_dataset(input_file, max_pred_length, shared_list, args):
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu, num_workers=4,
pin_memory=True)
# shared_list["0"] = (train_dataloader, input_file)
return train_dataloader, input_file
class pretraining_dataset(Dataset):
@ -179,7 +179,7 @@ def parse_arguments():
type=float, default=0.0,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--log_freq',
type=float, default=50.0,
type=float, default=1.0,
help='frequency of logging loss.')
parser.add_argument('--checkpoint_activations',
default=False,
@ -253,7 +253,7 @@ def setup_training(args):
raise ValueError(" `do_train` must be True.")
if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
os.listdir(args.output_dir) and os.listdir(args.output_dir) != ['logfile.txt']):
os.listdir(args.output_dir) and any([i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not args.resume_from_checkpoint:
@ -478,8 +478,7 @@ def main():
for f_id in range(f_start_id + 1 , len(files)):
# torch.cuda.synchronize()
# f_start = time.time()
if torch.distributed.get_world_size() > num_files:
data_file = files[(f_id*torch.distributed.get_world_size()+torch.distributed.get_rank() + remainder*f_id)%num_files]
else:
@ -489,23 +488,10 @@ def main():
previous_file = data_file
# train_dataloader = shared_file_list["0"][0]
# thread = multiprocessing.Process(
# name="LOAD DATA:" + str(f_id) + ":" + str(data_file),
# target=create_pretraining_dataset,
# args=(data_file, args.max_predictions_per_seq, shared_file_list, args, n_gpu)
# )
# thread.start()
dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args)
# torch.cuda.synchronize()
# f_end = time.time()
# print('[{}] : shard overhead {}'.format(torch.distributed.get_rank(), f_end - f_start))
train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process() else train_dataloader
for step, batch in enumerate(train_iter):
# torch.cuda.synchronize()
# iter_start = time.time()
training_steps += 1
batch = [t.to(device) for t in batch]
@ -533,7 +519,7 @@ def main():
global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
if global_step >= args.max_steps:
last_num_steps = global_step % args.log_freq
last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
average_loss = average_loss / (last_num_steps * divisor)
@ -541,7 +527,7 @@ def main():
average_loss /= torch.distributed.get_world_size()
torch.distributed.all_reduce(average_loss)
if is_main_process():
logger.info("Total Steps:{} Final Loss = {}".format(training_steps, average_loss.item()))
logger.info("Total Steps:{} Final Loss = {}".format(training_steps / args.gradient_accumulation_steps, average_loss.item()))
elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
if is_main_process():
print("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss / (
@ -578,13 +564,6 @@ def main():
# thread.join()
return args
# torch.cuda.synchronize()
# iter_end = time.time()
# if torch.distributed.get_rank() == 0:
# print('step {} : {}'.format(global_step, iter_end - iter_start))
del train_dataloader
# thread.join()
# Make sure pool has finished and switch train_dataloader

View file

@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import

View file

@ -1,7 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -13,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD."""
from __future__ import absolute_import, division, print_function
@ -40,6 +40,7 @@ from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from optimization import BertAdam, warmup_linear
from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)
from utils import is_main_process
if sys.version_info[0] == 2:
import cPickle as pickle
@ -923,9 +924,11 @@ def main():
model = BertForQuestionAnswering(config)
# model = BertForQuestionAnswering.from_pretrained(args.bert_model,
# cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
print("USING CHECKOINT")
if is_main_process():
print("LOADING CHECKOINT")
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
print("USED CHECKPOINT \n\n")
if is_main_process():
print("LOADED CHECKPOINT")
model.to(device)
if args.fp16 and args.old:
model.half()

View file

@ -1,7 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -13,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
import argparse

View file

@ -1,3 +1,17 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
from torch.optim.optimizer import Optimizer

View file

@ -1,4 +1,18 @@
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
DATA_DIR=${1:-/workspace/bert/data}
# Download vocab files from pretrained model

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MRPC_DIR=/workspace/bert/data/glue/MRPC
OUT_DIR=/results/MRPC
@ -55,7 +68,8 @@ CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
sed -r 's/ |(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
sed -r 's/
|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Container nvidia build = " $NVIDIA_BUILD_ID
train_batch_size=${1:-8192}
learning_rate=${2:-"6e-3"}
@ -18,11 +31,11 @@ allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"true"}
accumulate_into_fp16=${16:-"false"}
train_batch_size_phase2=${1:-4096}
learning_rate_phase2=${2:-"4e-3"}
warmup_proportion_phase2=${5:-"0.128"}
train_steps_phase2=${6:-1563}
gradient_accumulation_steps_phase2=${11:-512}
train_batch_size_phase2=${17:-4096}
learning_rate_phase2=${18:-"4e-3"}
warmup_proportion_phase2=${19:-"0.128"}
train_steps_phase2=${20:-1563}
gradient_accumulation_steps_phase2=${21:-512}
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus # change this for other datasets
DATA_DIR=$BERT_PREP_WORKING_DIR/${DATASET}/
@ -108,13 +121,7 @@ CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $ACCUMULATE_INTO_FP16"
CMD+=" --do_train"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
CMD="python3 $CMD"
fi
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size \* $num_gpus)
@ -145,7 +152,7 @@ throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}'
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size')}')
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size' / '$gradient_accumulation_steps' )}')
echo " training throughput phase1: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"
@ -207,13 +214,7 @@ CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $ACCUMULATE_INTO_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
CMD="python3 $CMD"
fi
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
@ -239,7 +240,8 @@ throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}'
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size_phase2')}')
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size_phase2' / '$gradient_accumulation_steps_phase2')}')
echo " training throughput phase2: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Container nvidia build = " $NVIDIA_BUILD_ID
DATASET=wikipedia_corpus # change this for other datasets

View file

@ -1,7 +1,19 @@
#!/usr/bin/env bash
#OUT_DIR=/results/SQuAD
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#OUT_DIR=/results/SQuAD
echo "Container nvidia build = " $NVIDIA_BUILD_ID

View file

@ -1,5 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SWAG_DIR=/workspace/bert/data/swag
OUT_DIR=/results/SWAG
@ -54,7 +67,8 @@ CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
sed -r 's/ |(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
sed -r 's/
|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`

View file

@ -1,4 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# purpose: for multinode training on slurm clusters
node_type=${1:-"dgx1"}
num_nodes=${2:-1}

View file

@ -1,6 +1,6 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals

View file

@ -1,3 +1,16 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed as dist

View file

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.06-py3
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && \

Some files were not shown because too many files have changed in this diff.