Merge pull request #764 from NVIDIA/gh/release

[UNet medical/TF2] Fix
nv-kkudrynski 2020-11-23 18:12:17 +01:00 committed by GitHub
commit f3c6bdf656
20 changed files with 140 additions and 115 deletions


@@ -231,20 +231,20 @@ For the specifics concerning training and inference, see the [Advanced](#advance
This script will launch training on a single fold and store the model's checkpoints in the <path/to/checkpoint> directory.
The script can be run directly by modifying flags if necessary, especially the number of GPUs, which is defined after the `-np` flag. Since the test volume does not have labels, 20% of the training data is used for validation in a 5-fold cross-validation manner. The fold number can be changed using `--crossvalidation_idx` with an integer in the range 0-4. For example, to run with 4 GPUs using fold 1, use:
The script can be run directly by modifying flags if necessary, especially the number of GPUs, which is defined after the `-np` flag. Since the test volume does not have labels, 20% of the training data is used for validation in a 5-fold cross-validation manner. The fold number can be changed using `--fold` with an integer in the range 0-4. For example, to run with 4 GPUs using fold 1, use:
```bash
horovodrun -np 4 python main.py --data_dir /data --model_dir /results --batch_size 1 --exec_mode train --crossvalidation_idx 1 --xla --amp
horovodrun -np 4 python main.py --data_dir /data --model_dir /results --batch_size 1 --exec_mode train --fold 1 --xla --amp
```
Training will result in a checkpoint file being written to `./results` on the host machine.
6. Start validation/evaluation.
The trained model can be evaluated by passing the `--exec_mode evaluate` flag. Since evaluation is carried out on a validation dataset, the `--crossvalidation_idx` parameter must be set. For example:
The trained model can be evaluated by passing the `--exec_mode evaluate` flag. Since evaluation is carried out on a validation dataset, the `--fold` parameter must be set. For example:
```bash
python main.py --data_dir /data --model_dir /results --batch_size 1 --exec_mode evaluate --crossvalidation_idx 0 --xla --amp
python main.py --data_dir /data --model_dir /results --batch_size 1 --exec_mode evaluate --fold 0 --xla --amp
```
Evaluation can also be triggered jointly after training by passing the `--exec_mode train_and_evaluate` flag.
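For illustration, a joint training and evaluation run on fold 0 with 4 GPUs, assembled from the flags shown above, could look like:
```bash
horovodrun -np 4 python main.py --data_dir /data --model_dir /results --batch_size 1 --exec_mode train_and_evaluate --fold 0 --xla --amp
```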
@@ -291,19 +291,20 @@ Other folders included in the root directory are:
The complete list of the available parameters for the `main.py` script contains:
* `--exec_mode`: Select the execution mode to run the model (default: `train`). Modes available:
* `train` - trains the model from scratch.
* `evaluate` - loads a checkpoint (if available) and performs evaluation on the validation subset (requires `--crossvalidation_idx` other than `None`).
* `train_and_evaluate` - trains the model from scratch and performs validation at the end (requires `--crossvalidation_idx` other than `None`).
* `evaluate` - loads a checkpoint (if available) and performs evaluation on the validation subset (requires `--fold` other than `None`).
* `train_and_evaluate` - trains the model from scratch and performs validation at the end (requires `--fold` other than `None`).
* `predict` - loads a checkpoint (if available) and runs inference on the test set. Stores the results in the `--model_dir` directory.
* `train_and_predict` - trains the model from scratch and performs inference.
* `--model_dir`: Set the output directory for information related to the model (default: `/results`).
* `--log_dir`: Set the output directory for logs (default: None).
* `--data_dir`: Set the input directory containing the dataset (default: `None`).
* `--batch_size`: Size of each minibatch per GPU (default: `1`).
* `--crossvalidation_idx`: Selected fold for cross-validation (default: `None`).
* `--fold`: Selected fold for cross-validation (default: `None`).
* `--max_steps`: Maximum number of steps (batches) for training (default: `1000`).
* `--seed`: Set random seed for reproducibility (default: `0`).
* `--weight_decay`: Weight decay coefficient (default: `0.0005`).
* `--log_every`: Log performance every n steps (default: `100`).
* `--evaluate_every`: Evaluate every n steps (default: `0` - evaluate once at the end).
* `--learning_rate`: Model's learning rate (default: `0.0001`).
* `--augment`: Enable data augmentation (default: `False`).
* `--benchmark`: Enable performance benchmarking (default: `False`). If the flag is set, the script runs in benchmark mode - each iteration is timed and the performance result (in images per second) is printed at the end. Works for both `train` and `predict` execution modes; an illustrative command is shown below.
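For illustration, training throughput on a single GPU could be benchmarked with a command along these lines (the step counts and batch size are example values, not defaults):
```bash
horovodrun -np 1 python main.py --data_dir /data --model_dir /results --batch_size 8 --exec_mode train --fold 0 --benchmark --warmup_steps 200 --max_steps 600 --xla --amp
```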
@@ -324,8 +325,8 @@ usage: main.py [-h]
[--exec_mode {train,train_and_predict,predict,evaluate,train_and_evaluate}]
[--model_dir MODEL_DIR] --data_dir DATA_DIR [--log_dir LOG_DIR]
[--batch_size BATCH_SIZE] [--learning_rate LEARNING_RATE]
[--crossvalidation_idx CROSSVALIDATION_IDX]
[--max_steps MAX_STEPS] [--weight_decay WEIGHT_DECAY]
[--fold FOLD] [--max_steps MAX_STEPS]
[--evaluate_every EVALUATE_EVERY] [--weight_decay WEIGHT_DECAY]
[--log_every LOG_EVERY] [--warmup_steps WARMUP_STEPS]
[--seed SEED] [--augment] [--benchmark]
[--amp] [--xla]
@@ -333,34 +334,39 @@ usage: main.py [-h]
UNet-medical
optional arguments:
-h, --help show this help message and exit
--exec_mode {train,train_and_predict,predict,evaluate,train_and_evaluate}
Execution mode of running the model
--model_dir MODEL_DIR
Output directory for information related to the model
--data_dir DATA_DIR Input directory containing the dataset for training
the model
--log_dir LOG_DIR Output directory for training logs
--batch_size BATCH_SIZE
Size of each minibatch per GPU
--learning_rate LEARNING_RATE
Learning rate coefficient for AdamOptimizer
--crossvalidation_idx CROSSVALIDATION_IDX
Chosen fold for cross-validation. Use None to disable
cross-validation
--max_steps MAX_STEPS
Maximum number of steps (batches) used for training
--weight_decay WEIGHT_DECAY
Weight decay coefficient
--log_every LOG_EVERY
Log performance every n steps
--warmup_steps WARMUP_STEPS
Number of warmup steps
--seed SEED Random seed
--augment Perform data augmentation during training
--benchmark Collect performance metrics during training
--amp Train using TF-AMP
--xla Train using XLA
-h, --help show this help message and exit
--exec_mode {train,train_and_predict,predict,evaluate,train_and_evaluate}
Execution mode of running the model
--model_dir MODEL_DIR
Output directory for information related to the model
--data_dir DATA_DIR Input directory containing the dataset for training
the model
--log_dir LOG_DIR Output directory for training logs
--batch_size BATCH_SIZE
Size of each minibatch per GPU
--learning_rate LEARNING_RATE
Learning rate coefficient for AdamOptimizer
--fold FOLD Chosen fold for cross-validation. Use None to disable
cross-validation
--max_steps MAX_STEPS
Maximum number of steps (batches) used for training
--weight_decay WEIGHT_DECAY
Weight decay coefficient
--log_every LOG_EVERY
Log performance every n steps
--evaluate_every EVALUATE_EVERY
Evaluate every n steps
--warmup_steps WARMUP_STEPS
Number of warmup steps
--seed SEED Random seed
--augment Perform data augmentation during training
--no-augment
--benchmark Collect performance metrics during training
--no-benchmark
--use_amp, --amp Train using TF-AMP
--use_xla, --xla Train using XLA
--use_trt Use TF-TRT
--resume_training Resume training from a checkpoint
```
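For example, the new `--resume_training` and `--evaluate_every` options can be combined with a regular training run along these lines (an illustrative command, assuming a checkpoint already exists in `--model_dir`):
```bash
horovodrun -np 1 python main.py --data_dir /data --model_dir /results --batch_size 8 --exec_mode train --fold 0 --evaluate_every 200 --resume_training --xla --amp
```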
@@ -420,7 +426,7 @@ horovodrun -np <number/of/gpus> python main.py --data_dir /data [other parameter
The main results of training are the checkpoints stored by default in `./results/` on the host machine, and in `/results` in the container. This location can be controlled
by the `--model_dir` command-line argument, if a different location was mounted while starting the container. When training is run in `train_and_predict` mode, inference takes place after training is finished, and the inference results are stored in the `/results` directory.
If the `--exec_mode train_and_evaluate` parameter was used, and the `--crossvalidation_idx` parameter is set to an integer value in {0, 1, 2, 3, 4}, evaluation of the validation set takes place after the training is completed. The results of the evaluation will be printed to the console.
If the `--exec_mode train_and_evaluate` parameter was used, and the `--fold` parameter is set to an integer value in {0, 1, 2, 3, 4}, evaluation of the validation set takes place after the training is completed. The results of the evaluation will be printed to the console.
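Similarly, training followed immediately by inference can be launched with `--exec_mode train_and_predict`, for example (an illustrative command):
```bash
horovodrun -np 8 python main.py --data_dir /data --model_dir /results --batch_size 8 --exec_mode train_and_predict --fold 0 --augment --xla --amp
```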
### Inference process


@@ -15,4 +15,4 @@
# This script launches a U-Net run in FP32 on 1 GPU and trains for 6400 iterations with batch_size 8. Usage:
# bash unet_FP32_1GPU.sh <path to dataset> <path to results directory>
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --log_dir $2/log.json
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --fold 0 --augment --xla --log_dir $2/log.json


@@ -15,4 +15,4 @@
# This script launches a U-Net run in FP32 on 8 GPUs and trains for 6400 iterations with batch_size 8. Usage:
# bash unet_FP32_8GPU.sh <path to dataset> <path to results directory>
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --log_dir $2/log.json
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --fold 0 --augment --xla --log_dir $2/log.json


@@ -15,4 +15,4 @@
# This script launches a U-Net run in FP32 on 1 GPU for inference with batch_size 1. Usage:
# bash unet_INFER_FP32.sh <path to dataset> <path to results directory>
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size 1 --exec_mode predict --xla
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size 1 --exec_mode predict --xla --fold 0


@@ -15,4 +15,4 @@
# This script launches a U-Net run in FP32 on 1 GPU for inference benchmarking. Usage:
# bash unet_INFER_BENCHMARK_FP32.sh <path to dataset> <path to results directory> <batch size>
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size $3 --exec_mode predict --benchmark --warmup_steps 200 --max_steps 600 --xla
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size $3 --exec_mode predict --benchmark --warmup_steps 200 --max_steps 600 --xla --fold 0


@@ -15,4 +15,4 @@
# This script launches a U-Net run in FP16 on 1 GPU for inference benchmarking. Usage:
# bash unet_INFER_BENCHMARK_TF-AMP.sh <path to dataset> <path to results directory> <batch size>
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size $3 --exec_mode predict --benchmark --warmup_steps 200 --max_steps 600 --xla --amp
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size $3 --exec_mode predict --benchmark --warmup_steps 200 --max_steps 600 --xla --amp --fold 0


@@ -15,4 +15,4 @@
# This script launches a U-Net run in FP16 on 1 GPU for inference with batch_size 1. Usage:
# bash unet_INFER_TF-AMP.sh <path to dataset> <path to results directory>
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size 1 --exec_mode predict --xla --amp
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --batch_size 1 --exec_mode predict --xla --amp --fold 0


@@ -15,4 +15,4 @@
# This script launches a U-Net run in FP16 on 1 GPU and trains for 6400 iterations with batch_size 8. Usage:
# bash unet_TF-AMP_1GPU.sh <path to dataset> <path to results directory>
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --amp --log_dir $2/log.json
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --fold 0 --augment --xla --amp --log_dir $2/log.json


@@ -15,4 +15,4 @@
# This script launches a U-Net run in FP16 on 8 GPUs and trains for 6400 iterations with batch_size 8. Usage:
# bash unet_TF-AMP_8GPU.sh <path to dataset> <path to results directory>
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --amp --log_dir $2/log.json
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size 8 --exec_mode train_and_evaluate --fold 0 --augment --xla --amp --log_dir $2/log.json


@@ -16,9 +16,9 @@
# Usage:
# bash unet_TRAIN_FP32_1GPU.sh <path to dataset> <path to results directory> <batch size>
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla > $2/log_FP32_1GPU_fold0.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 1 --augment --xla > $2/log_FP32_1GPU_fold1.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 2 --augment --xla > $2/log_FP32_1GPU_fold2.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 3 --augment --xla > $2/log_FP32_1GPU_fold3.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 4 --augment --xla > $2/log_FP32_1GPU_fold4.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 0 --augment --xla > $2/log_FP32_1GPU_fold0.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 1 --augment --xla > $2/log_FP32_1GPU_fold1.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla > $2/log_FP32_1GPU_fold2.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla > $2/log_FP32_1GPU_fold3.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla > $2/log_FP32_1GPU_fold4.txt
python utils/parse_results.py --model_dir $2 --exec_mode convergence --env FP32_1GPU


@@ -16,9 +16,9 @@
# Usage:
# bash unet_TRAIN_FP32_8GPU.sh <path to dataset> <path to results directory> <batch size>
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla > $2/log_FP32_8GPU_fold0.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 1 --augment --xla > $2/log_FP32_8GPU_fold1.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 2 --augment --xla > $2/log_FP32_8GPU_fold2.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 3 --augment --xla > $2/log_FP32_8GPU_fold3.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 4 --augment --xla > $2/log_FP32_8GPU_fold4.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 0 --augment --xla > $2/log_FP32_8GPU_fold0.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 1 --augment --xla > $2/log_FP32_8GPU_fold1.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla > $2/log_FP32_8GPU_fold2.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla > $2/log_FP32_8GPU_fold3.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla > $2/log_FP32_8GPU_fold4.txt
python utils/parse_results.py --model_dir $2 --exec_mode convergence --env FP32_8GPU


@@ -16,9 +16,9 @@
# Usage:
# bash unet_TRAIN_TF-AMP_1GPU.sh <path to dataset> <path to results directory> <batch size>
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold0.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 1 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold1.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 2 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold2.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 3 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold3.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 4 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold4.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 0 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold0.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 1 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold1.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold2.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold3.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold4.txt
python utils/parse_results.py --model_dir $2 --exec_mode convergence --env TF-AMP_1GPU


@@ -16,9 +16,9 @@
# Usage:
# bash unet_TRAIN_TF-AMP_8GPU.sh <path to dataset> <path to results directory> <batch size>
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 0 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold0.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 1 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold1.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 2 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold2.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 3 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold3.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --crossvalidation_idx 4 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold4.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 0 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold0.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 1 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold1.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold2.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold3.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold4.txt
python utils/parse_results.py --model_dir $2 --exec_mode convergence --env TF-AMP_8GPU


@@ -26,10 +26,10 @@ Example:
import horovod.tensorflow as hvd
from model.unet import Unet
from run import train, evaluate, predict
from utils.setup import get_logger, set_flags, prepare_model_dir
from utils.cmd_util import PARSER, parse_args
from utils.data_loader import Dataset
from runtime.run import train, evaluate, predict
from runtime.setup import get_logger, set_flags, prepare_model_dir
from runtime.arguments import PARSER, parse_args
from data_loading.data_loader import Dataset
def main():
@@ -47,7 +47,7 @@ def main():
dataset = Dataset(data_dir=params.data_dir,
batch_size=params.batch_size,
fold=params.crossvalidation_idx,
fold=params.fold,
augment=params.augment,
gpu_id=hvd.rank(),
num_gpus=hvd.size(),


@@ -49,7 +49,7 @@ PARSER.add_argument('--learning_rate',
default=0.0001,
help="""Learning rate coefficient for AdamOptimizer""")
PARSER.add_argument('--crossvalidation_idx',
PARSER.add_argument('--fold',
type=int,
default=None,
help="""Chosen fold for cross-validation. Use None to disable cross-validation""")
@@ -69,6 +69,11 @@ PARSER.add_argument('--log_every',
default=100,
help="""Log performance every n steps""")
PARSER.add_argument('--evaluate_every',
type=int,
default=0,
help="""Evaluate every n steps""")
PARSER.add_argument('--warmup_steps',
type=int,
default=200,
@@ -110,10 +115,11 @@ def parse_args(flags):
'log_dir': flags.log_dir,
'batch_size': flags.batch_size,
'learning_rate': flags.learning_rate,
'crossvalidation_idx': flags.crossvalidation_idx,
'fold': flags.fold,
'max_steps': flags.max_steps,
'weight_decay': flags.weight_decay,
'log_every': flags.log_every,
'evaluate_every': flags.evaluate_every,
'warmup_steps': flags.warmup_steps,
'augment': flags.augment,
'benchmark': flags.benchmark,


@@ -17,21 +17,21 @@ import numpy as np
import argparse
def process_performance_stats(timestamps, params):
warmup_steps = params['warmup_steps']
batch_size = params['batch_size']
timestamps_ms = 1000 * timestamps[warmup_steps:]
timestamps_ms = timestamps_ms[timestamps_ms > 0]
latency_ms = timestamps_ms.mean()
std = timestamps_ms.std()
n = np.sqrt(len(timestamps_ms))
throughput_imgps = (1000.0 * batch_size / timestamps_ms).mean()
def process_performance_stats(timestamps, batch_size, mode):
""" Get confidence intervals
:param timestamps: Collection of timestamps
:param batch_size: Number of samples per batch
:param mode: Estimator's execution mode
:return: Stats
"""
timestamps_ms = 1000 * timestamps
throughput_imgps = (1000.0 * batch_size / timestamps_ms).mean()
stats = {f"throughput_{mode}": throughput_imgps,
f"latency_{mode}_mean": timestamps_ms.mean()}
for level in [90, 95, 99]:
stats.update({f"latency_{mode}_{level}": np.percentile(timestamps_ms, level)})
stats = [("Throughput Avg", str(throughput_imgps)),
('Latency Avg:', str(latency_ms))]
for ci, lvl in zip(["90%:", "95%:", "99%:"],
[1.645, 1.960, 2.576]):
stats.append(("Latency_"+ci, str(latency_ms + lvl * std / n)))
return stats
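A minimal usage sketch of the new interface (the timing values are made up; the import path matches the one used elsewhere in this diff):
```python
import numpy as np
from runtime.parse_results import process_performance_stats

# Per-step durations (seconds) collected after warmup, e.g. deltas between
# consecutive wall-clock timestamps recorded in the training loop.
step_durations = np.array([0.251, 0.248, 0.255, 0.250, 0.249])
stats = process_performance_stats(step_durations, batch_size=8, mode="train")
# stats holds e.g. 'throughput_train', 'latency_train_mean',
# 'latency_train_90', 'latency_train_95', 'latency_train_99'
```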


@@ -19,8 +19,8 @@ from PIL import Image
import horovod.tensorflow as hvd
import tensorflow as tf
from utils.losses import partial_losses
from utils.parse_results import process_performance_stats
from runtime.losses import partial_losses
from runtime.parse_results import process_performance_stats
def train(params, model, dataset, logger):
@@ -35,7 +35,7 @@ def train(params, model, dataset, logger):
ce_loss = tf.keras.metrics.Mean(name='ce_loss')
f1_loss = tf.keras.metrics.Mean(name='dice_loss')
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
if params.resume_training:
if params.resume_training and params.model_dir:
checkpoint.restore(tf.train.latest_checkpoint(params.model_dir))
@tf.function
@@ -69,26 +69,30 @@ def train(params, model, dataset, logger):
if params.benchmark:
assert max_steps * hvd.size() > params.warmup_steps, \
"max_steps value has to be greater than warmup_steps"
timestamps = np.zeros((hvd.size(), max_steps * hvd.size() + 1), dtype=np.float32)
timestamps = []
for iteration, (images, labels) in enumerate(dataset.train_fn(drop_remainder=True)):
t0 = time()
loss = train_step(images, labels, warmup_batch=iteration == 0).numpy()
timestamps[hvd.rank(), iteration] = time() - t0
if iteration > params.warmup_steps:
timestamps.append(time())
if iteration >= max_steps * hvd.size():
break
timestamps = np.mean(timestamps, axis=0)
if hvd.rank() == 0:
stats = process_performance_stats(timestamps, params)
logger.log(step=(),
data={metric: value for (metric, value) in stats})
deltas = np.array([timestamps[i + 1] - timestamps[i] for i in range(len(timestamps) - 1)])
stats = process_performance_stats(deltas, hvd.size() * params.batch_size, mode="train")
logger.log(step=(), data=stats)
else:
for iteration, (images, labels) in enumerate(dataset.train_fn()):
train_step(images, labels, warmup_batch=iteration == 0)
if (hvd.rank() == 0) and (iteration % params.log_every == 0):
logger.log(step=(iteration, max_steps),
data={"train_ce_loss": float(ce_loss.result()),
"train_dice_loss": float(f1_loss.result()),
"train_total_loss": float(f1_loss.result() + ce_loss.result())})
if hvd.rank() == 0:
if iteration % params.log_every == 0:
logger.log(step=(iteration, max_steps),
data={"train_ce_loss": float(ce_loss.result()),
"train_dice_loss": float(f1_loss.result()),
"train_total_loss": float(f1_loss.result() + ce_loss.result())})
if (params.evaluate_every > 0) and (iteration % params.evaluate_every == 0):
evaluate(params, model, dataset, logger, restore_checkpoint=False)
f1_loss.reset_states()
ce_loss.reset_states()
@@ -101,13 +105,15 @@ def train(params, model, dataset, logger):
logger.flush()
def evaluate(params, model, dataset, logger):
def evaluate(params, model, dataset, logger, restore_checkpoint=True):
if params.fold is None:
print("No fold specified for evaluation. Please use --fold [int] to select a fold.")
ce_loss = tf.keras.metrics.Mean(name='ce_loss')
f1_loss = tf.keras.metrics.Mean(name='dice_loss')
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(tf.train.latest_checkpoint(params.model_dir)).expect_partial()
if params.model_dir and restore_checkpoint:
checkpoint.restore(tf.train.latest_checkpoint(params.model_dir)).expect_partial()
@tf.function
def validation_step(features, labels):
output_map = model(features, training=False)
crossentropy_loss, dice_loss = partial_losses(output_map, labels)
@@ -130,7 +136,8 @@ def evaluate(params, model, dataset, logger):
def predict(params, model, dataset, logger):
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(tf.train.latest_checkpoint(params.model_dir)).expect_partial()
if params.model_dir:
checkpoint.restore(tf.train.latest_checkpoint(params.model_dir)).expect_partial()
@tf.function
def prediction_step(features):
@@ -139,16 +146,16 @@ def predict(params, model, dataset, logger):
if params.benchmark:
assert params.max_steps > params.warmup_steps, \
"max_steps value has to be greater than warmup_steps"
timestamps = np.zeros(params.max_steps + 1, dtype=np.float32)
timestamps = []
for iteration, images in enumerate(dataset.test_fn(count=None, drop_remainder=True)):
t0 = time()
prediction_step(images)
timestamps[iteration] = time() - t0
timestamps.append(time())
if iteration >= params.max_steps:
break
stats = process_performance_stats(timestamps, params)
logger.log(step=(),
data={metric: value for (metric, value) in stats})
deltas = np.array([timestamps[i + 1] - timestamps[i] for i in range(len(timestamps) - 1)])
stats = process_performance_stats(deltas, params.batch_size, mode="test")
logger.log(step=(), data=stats)
else:
predictions = np.concatenate([prediction_step(images).numpy()
for images in dataset.test_fn(count=1)], axis=0)
@@ -163,4 +170,6 @@ def predict(params, model, dataset, logger):
compression="tiff_deflate",
save_all=True,
append_images=multipage_tif[1:])
print("Predictions saved at {}".format(output_dir))
logger.flush()
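The predictions are saved as a multi-page TIFF; a short sketch for inspecting such a file (the file name below is an assumption - check the `--model_dir` output for the actual name):
```python
from PIL import Image

# Open the multi-page TIFF written by the predict mode and count its pages.
with Image.open("/results/predictions.tif") as tif:  # assumed output name
    print("pages:", tif.n_frames)
    tif.seek(0)               # jump to the first predicted slice
    first_slice = tif.copy()  # detach it as a standalone image
```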


@@ -13,11 +13,13 @@
# limitations under the License.
import os
import multiprocessing
import numpy as np
import tensorflow as tf
import horovod.tensorflow as hvd
import dllogger as logger
import horovod.tensorflow as hvd
from dllogger import StdOutBackend, Verbosity, JSONStreamBackend
@@ -32,6 +34,7 @@ def set_flags(params):
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
os.environ['TF_SYNC_ON_FINISH'] = '0'
os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '0'
np.random.seed(params.seed)
tf.random.set_seed(params.seed)
@@ -45,10 +48,11 @@ def set_flags(params):
if gpus:
tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(max(2, (multiprocessing.cpu_count() // hvd.size()) - 2))
if params.use_amp:
tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
else:
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '0'
def prepare_model_dir(params):