Merge pull request #823 from NVIDIA/gh/release

[UNet medical/TF2] Fixes
nv-kkudrynski 2021-02-02 16:07:03 +01:00 committed by GitHub
commit 1bc696b00d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 8 additions and 9 deletions


@@ -526,7 +526,7 @@ This command will launch a script which will run 5-fold cross-validation trainin
**Learning curves**
The following image shows the training loss as a function of iteration for training using DGX A100 (TF32 and TF-AMP) and DGX-1 V100 (FP32 and TF-AMP).
-![LearningCurves](images/UNetMed_TF2_conv.png)
+![LearningCurves](images/U-NetMed_TF2_conv.png)
#### Training performance results
@@ -634,6 +634,5 @@ February 2020
### Known issues
* Some set-ups suffer from an `ncclCommInitRank failed: unhandled system error`. This is a known issue with NCCL 2.7.5. The issue is solved in NCCL 2.7.8, which can be applied by changing the first line of the Dockerfile from `ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf2-py3` to `ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.08-tf2-py3` and rebuilding the Docker image.
* For TensorFlow 2.0, the training performance using AMP and XLA is around 30% lower than reported here. The issue was solved in TensorFlow 2.1.


@@ -21,4 +21,4 @@ horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --m
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla > $2/log_FP32_1GPU_fold2.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla > $2/log_FP32_1GPU_fold3.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla > $2/log_FP32_1GPU_fold4.txt
-python utils/parse_results.py --model_dir $2 --exec_mode convergence --env FP32_1GPU
+python runtime/parse_results.py --model_dir $2 --exec_mode convergence --env FP32_1GPU


@@ -21,4 +21,4 @@ horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --m
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla > $2/log_FP32_8GPU_fold2.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla > $2/log_FP32_8GPU_fold3.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla > $2/log_FP32_8GPU_fold4.txt
-python utils/parse_results.py --model_dir $2 --exec_mode convergence --env FP32_8GPU
+python runtime/parse_results.py --model_dir $2 --exec_mode convergence --env FP32_8GPU


@@ -21,4 +21,4 @@ horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --m
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold2.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold3.txt
horovodrun -np 1 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla --amp > $2/log_TF-AMP_1GPU_fold4.txt
-python utils/parse_results.py --model_dir $2 --exec_mode convergence --env TF-AMP_1GPU
+python runtime/parse_results.py --model_dir $2 --exec_mode convergence --env TF-AMP_1GPU


@@ -21,4 +21,4 @@ horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --m
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 2 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold2.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 3 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold3.txt
horovodrun -np 8 python main.py --data_dir $1 --model_dir $2 --log_every 100 --max_steps 6400 --batch_size $3 --exec_mode train_and_evaluate --fold 4 --augment --xla --amp > $2/log_TF-AMP_8GPU_fold4.txt
-python utils/parse_results.py --model_dir $2 --exec_mode convergence --env TF-AMP_8GPU
+python runtime/parse_results.py --model_dir $2 --exec_mode convergence --env TF-AMP_8GPU
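
The four cross-validation scripts above differ only in the number of Horovod workers (`-np 1` vs `-np 8`) and the presence of `--amp`; each trains folds 0-4 and then aggregates the per-fold logs with `runtime/parse_results.py` (the path corrected by this commit). As an illustration only, a hypothetical Python driver for the same loop might look like this (`run_cross_validation` is not part of the repository):

```python
# Illustrative only: a Python rendering of the per-fold loop the bash scripts
# above implement. run_cross_validation is hypothetical, not part of the repo.
import subprocess

def run_cross_validation(data_dir, model_dir, batch_size, num_gpus=1, amp=False):
    env = ("TF-AMP" if amp else "FP32") + f"_{num_gpus}GPU"
    for fold in range(5):  # 5-fold cross-validation, folds 0-4
        cmd = ["horovodrun", "-np", str(num_gpus), "python", "main.py",
               "--data_dir", data_dir, "--model_dir", model_dir,
               "--log_every", "100", "--max_steps", "6400",
               "--batch_size", str(batch_size),
               "--exec_mode", "train_and_evaluate",
               "--fold", str(fold), "--augment", "--xla"]
        if amp:
            cmd.append("--amp")
        # One log file per fold, mirroring the naming used by the scripts.
        with open(f"{model_dir}/log_{env}_fold{fold}.txt", "w") as log:
            subprocess.run(cmd, stdout=log, check=True)
    # Aggregate the five fold logs into convergence results
    # (note the runtime/ path introduced by this fix).
    subprocess.run(["python", "runtime/parse_results.py", "--model_dir", model_dir,
                    "--exec_mode", "convergence", "--env", env], check=True)
```

For example, `run_cross_validation("/data", "/results", 8, num_gpus=8, amp=True)` would correspond to the TF-AMP 8-GPU script above.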


@@ -1,3 +1,3 @@
Pillow
tf2onnx
-munch
+munch


@@ -32,8 +32,8 @@ def partial_losses(predict, target):
flat_labels = tf.reshape(target,
                         [tf.shape(input=predict)[0], -1, n_classes])
-crossentropy_loss = tf.reduce_mean(input_tensor=tf.keras.backend.binary_crossentropy(output=flat_logits,
-                                                                                      target=flat_labels),
+crossentropy_loss = tf.reduce_mean(input_tensor=tf.nn.softmax_cross_entropy_with_logits(logits=flat_logits,
+                                                                                         labels=flat_labels),
                                   name='cross_loss_ref')
dice_loss = tf.reduce_mean(input_tensor=1 - dice_coef(tf.keras.activations.softmax(flat_logits, axis=-1),
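
The change above replaces the binary cross-entropy term with a softmax cross-entropy on the flattened logits while keeping the Dice term. Below is a minimal, self-contained sketch of the resulting loss computation; `dice_coef` here is a generic soft-Dice implementation, and the exact arguments passed to it may differ from the repository's version.

```python
# Minimal sketch of the loss computation after this change (generic soft-Dice;
# the repository's dice_coef and its exact arguments may differ).
import tensorflow as tf

def dice_coef(predict, target, axis=1, eps=1e-6):
    # Soft Dice coefficient, reduced over the pixel axis and averaged.
    intersection = tf.reduce_sum(input_tensor=predict * target, axis=axis)
    union = tf.reduce_sum(input_tensor=predict * predict + target * target, axis=axis)
    return tf.reduce_mean(input_tensor=(2.0 * intersection + eps) / (union + eps))

def partial_losses(predict, target):
    n_classes = predict.shape[-1]
    # Flatten spatial dimensions so each row holds one pixel's class scores.
    flat_logits = tf.reshape(tf.cast(predict, tf.float32),
                             [tf.shape(input=predict)[0], -1, n_classes])
    flat_labels = tf.reshape(target,
                             [tf.shape(input=predict)[0], -1, n_classes])
    # Multi-class cross-entropy on logits (replaces the previous binary cross-entropy).
    crossentropy_loss = tf.reduce_mean(
        input_tensor=tf.nn.softmax_cross_entropy_with_logits(logits=flat_logits,
                                                              labels=flat_labels),
        name='cross_loss_ref')
    # Dice loss on softmax probabilities.
    dice_loss = tf.reduce_mean(
        input_tensor=1 - dice_coef(tf.keras.activations.softmax(flat_logits, axis=-1),
                                   flat_labels),
        name='dice_loss_ref')
    return crossentropy_loss, dice_loss
```

Note that the cross-entropy is computed directly on logits (no softmax applied beforehand), while the Dice term operates on softmax probabilities.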