From b169ad3ba15a16a1670f73f38bfb08ae9367a73e Mon Sep 17 00:00:00 2001
From: Adrian Lancucki
Date: Tue, 7 Sep 2021 07:27:31 -0700
Subject: [PATCH] [FastPitch/PyT] Update NGC checkpoint url

---
 PyTorch/SpeechSynthesis/FastPitch/README.md |  4 ++--
 .../FastPitch/common/text/cmudict.py        | 13 +++++++++++--
 .../FastPitch/scripts/download_cmudict.sh   |  6 ++++++
 .../FastPitch/scripts/download_dataset.sh   |  3 +--
 .../FastPitch/scripts/download_fastpitch.sh |  6 +++---
 .../FastPitch/scripts/inference_example.sh  |  7 ++++---
 6 files changed, 27 insertions(+), 12 deletions(-)
 create mode 100755 PyTorch/SpeechSynthesis/FastPitch/scripts/download_cmudict.sh

diff --git a/PyTorch/SpeechSynthesis/FastPitch/README.md b/PyTorch/SpeechSynthesis/FastPitch/README.md
index dc2aad59..93a54df6 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/README.md
+++ b/PyTorch/SpeechSynthesis/FastPitch/README.md
@@ -74,7 +74,7 @@ This is reflected in Mean Opinion Scores ([details](https://arxiv.org/abs/2006.0
 | FastPitch 1.0 | 4.080 ± 0.133 |
 
 The current version of the model offers even higher quality, as reflected
-in the pairwise preference scores.
+in the pairwise preference scores ([details](https://arxiv.org/abs/2108.10447)).
 
 | Model          | Average preference |
 |:---------------|:-------------------|
@@ -82,7 +82,7 @@ in the pairwise preference scores.
 | FastPitch 1.1  | 0.565 ± 0.068      |
 
 The FastPitch model is based on the [FastSpeech](https://arxiv.org/abs/1905.09263) model. The main differences between FastPitch and FastSpeech are that FastPitch:
-* no dependence on external aligner (Transformer TTS, Tacotron 2); in version 1.1, FastPitch aligns audio to transcriptions by itself,
+* no dependence on external aligner (Transformer TTS, Tacotron 2); in version 1.1, FastPitch aligns audio to transcriptions by itself as in [One TTS Alignment To Rule Them All](https://arxiv.org/abs/2108.10447),
 * explicitly learns to predict the pitch contour,
 * pitch conditioning removes harsh sounding artifacts and provides faster convergence,
 * no need for distilling mel-spectrograms with a teacher model,

diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/text/cmudict.py b/PyTorch/SpeechSynthesis/FastPitch/common/text/cmudict.py
index 54f4ca95..2543cc32 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/common/text/cmudict.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/common/text/cmudict.py
@@ -38,8 +38,17 @@ class CMUDict:
 
     def initialize(self, file_or_path, keep_ambiguous=True):
         if isinstance(file_or_path, str):
-            with open(file_or_path, encoding='latin-1') as f:
-                entries = _parse_cmudict(f)
+            try:
+                with open(file_or_path, encoding='latin-1') as f:
+                    entries = _parse_cmudict(f)
+            except FileNotFoundError:
+                print("CMUdict missing. Download with")
+                print()
+                print("    bash scripts/download_cmudict.sh")
+                print()
+                print("and re-run the script.")
+                import sys
+                sys.exit(0)
         else:
             entries = _parse_cmudict(file_or_path)
         if not keep_ambiguous:

diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_cmudict.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_cmudict.sh
new file mode 100755
index 00000000..f4bc67fe
--- /dev/null
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_cmudict.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+set -e
+
+echo "Downloading cmudict-0.7b ..."
+wget https://github.com/Alexir/CMUdict/raw/master/cmudict-0.7b -qO cmudict/cmudict-0.7b

diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_dataset.sh
index a5404c06..22a2acc9 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_dataset.sh
@@ -2,8 +2,7 @@
 
 set -e
 
-echo "Downloading cmudict-0.7b ..."
-wget https://github.com/Alexir/CMUdict/raw/master/cmudict-0.7b -qO cmudict/cmudict-0.7b
+scripts/download_cmudict.sh
 
 DATA_DIR="LJSpeech-1.1"
 LJS_ARCH="LJSpeech-1.1.tar.bz2"

diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_fastpitch.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_fastpitch.sh
index 2a0eefa0..bf969c73 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_fastpitch.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_fastpitch.sh
@@ -3,9 +3,9 @@
 set -e
 
 : ${MODEL_DIR:="pretrained_models/fastpitch"}
-MODEL_ZIP="nvidia_fastpitch_200518.zip"
-MODEL="nvidia_fastpitch_200518.pt"
-MODEL_URL="https://api.ngc.nvidia.com/v2/models/nvidia/fastpitch_pyt_amp_ckpt_v1/versions/20.02.0/zip"
+MODEL_ZIP="nvidia_fastpitch_210824.zip"
+MODEL="nvidia_fastpitch_210824.pt"
+MODEL_URL="https://api.ngc.nvidia.com/v2/models/nvidia/fastpitch_pyt_amp_ckpt_v1_1/versions/21.05.0/zip"
 
 mkdir -p "$MODEL_DIR"
 

diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/inference_example.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/inference_example.sh
index d1d185a1..7bc3b9c8 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/inference_example.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/inference_example.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 : ${WAVEGLOW:="pretrained_models/waveglow/nvidia_waveglow256pyt_fp16.pt"}
-: ${FASTPITCH:="output/FastPitch_checkpoint_1000.pt"}
+: ${FASTPITCH:="pretrained_models/fastpitch/nvidia_fastpitch_210824.pt"}
 : ${BATCH_SIZE:=32}
 : ${PHRASES:="phrases/devset10.tsv"}
 : ${OUTPUT_DIR:="./output/audio_$(basename ${PHRASES} .tsv)"}
@@ -13,6 +13,7 @@
 : ${DENOISING:=0.01}
 : ${WARMUP:=0}
 : ${REPEATS:=1}
+: ${CPU:=false}
 : ${SPEAKER:=0}
 : ${NUM_SPEAKERS:=1}
 
@@ -20,8 +21,6 @@
 echo -e "\nAMP=$AMP, batch_size=$BATCH_SIZE\n"
 
 ARGS=""
-ARGS+=" --cuda"
-ARGS+=" --cudnn-benchmark"
 ARGS+=" -i $PHRASES"
 ARGS+=" -o $OUTPUT_DIR"
 ARGS+=" --log-file $LOG_FILE"
@@ -35,6 +34,8 @@ ARGS+=" --repeats $REPEATS"
 ARGS+=" --warmup-steps $WARMUP"
 ARGS+=" --speaker $SPEAKER"
 ARGS+=" --n-speakers $NUM_SPEAKERS"
+[ "$CPU" = false ] && ARGS+=" --cuda"
+[ "$CPU" = false ] && ARGS+=" --cudnn-benchmark"
 [ "$AMP" = true ] && ARGS+=" --amp"
 [ "$PHONE" = "true" ] && ARGS+=" --p-arpabet 1.0"
 [ "$ENERGY" = "true" ] && ARGS+=" --energy-conditioning"
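
A minimal end-to-end sketch of applying this patch and exercising the updated scripts. The .patch filename is assumed from the Subject line; the script paths and the CPU variable come from the diff above:

    # Apply the patch from the repository root (filename assumed).
    git am 0001-FastPitch-PyT-Update-NGC-checkpoint-url.patch
    cd PyTorch/SpeechSynthesis/FastPitch

    # CMUdict now has its own downloader; common/text/cmudict.py prints a
    # pointer to it and exits if the dictionary file is missing. Note the
    # script writes into cmudict/ but does not create the directory itself.
    mkdir -p cmudict
    bash scripts/download_cmudict.sh

    # Fetch the FastPitch 1.1 checkpoint (nvidia_fastpitch_210824.pt) from NGC.
    bash scripts/download_fastpitch.sh

    # inference_example.sh now defaults FASTPITCH to the pretrained checkpoint;
    # CPU=true skips the --cuda and --cudnn-benchmark flags.
    CPU=true bash scripts/inference_example.sh

The inference example still expects a WaveGlow vocoder checkpoint at pretrained_models/waveglow/nvidia_waveglow256pyt_fp16.pt, which this patch does not download.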