From b169ad3ba15a16a1670f73f38bfb08ae9367a73e Mon Sep 17 00:00:00 2001
From: Adrian Lancucki
Date: Tue, 7 Sep 2021 07:27:31 -0700
Subject: [PATCH] [FastPitch/PyT] Update NGC checkpoint url

---
 PyTorch/SpeechSynthesis/FastPitch/README.md |  4 ++--
 .../FastPitch/common/text/cmudict.py        | 13 +++++++++++--
 .../FastPitch/scripts/download_cmudict.sh   |  6 ++++++
 .../FastPitch/scripts/download_dataset.sh   |  3 +--
 .../FastPitch/scripts/download_fastpitch.sh |  6 +++---
 .../FastPitch/scripts/inference_example.sh  |  7 ++++---
 6 files changed, 27 insertions(+), 12 deletions(-)
 create mode 100755 PyTorch/SpeechSynthesis/FastPitch/scripts/download_cmudict.sh

diff --git a/PyTorch/SpeechSynthesis/FastPitch/README.md b/PyTorch/SpeechSynthesis/FastPitch/README.md
index dc2aad59..93a54df6 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/README.md
+++ b/PyTorch/SpeechSynthesis/FastPitch/README.md
@@ -74,7 +74,7 @@ This is reflected in Mean Opinion Scores ([details](https://arxiv.org/abs/2006.0
 | FastPitch 1.0 | 4.080 ± 0.133 |
 
 The current version of the model offers even higher quality, as reflected
-in the pairwise preference scores.
+in the pairwise preference scores ([details](https://arxiv.org/abs/2108.10447)).
 
 | Model          | Average preference |
 |:---------------|:-------------------|
@@ -82,7 +82,7 @@ in the pairwise preference scores.
 | FastPitch 1.1  | 0.565 ± 0.068      |
 
 The FastPitch model is based on the [FastSpeech](https://arxiv.org/abs/1905.09263) model. The main differences between FastPitch and FastSpeech are that FastPitch:
-* no dependence on external aligner (Transformer TTS, Tacotron 2); in version 1.1, FastPitch aligns audio to transcriptions by itself,
+* no dependence on external aligner (Transformer TTS, Tacotron 2); in version 1.1, FastPitch aligns audio to transcriptions by itself as in [One TTS Alignment To Rule Them All](https://arxiv.org/abs/2108.10447),
 * explicitly learns to predict the pitch contour,
 * pitch conditioning removes harsh sounding artifacts and provides faster convergence,
 * no need for distilling mel-spectrograms with a teacher model,

diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/text/cmudict.py b/PyTorch/SpeechSynthesis/FastPitch/common/text/cmudict.py
index 54f4ca95..2543cc32 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/common/text/cmudict.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/common/text/cmudict.py
@@ -38,8 +38,17 @@ class CMUDict:
 
     def initialize(self, file_or_path, keep_ambiguous=True):
         if isinstance(file_or_path, str):
-            with open(file_or_path, encoding='latin-1') as f:
-                entries = _parse_cmudict(f)
+            try:
+                with open(file_or_path, encoding='latin-1') as f:
+                    entries = _parse_cmudict(f)
+            except FileNotFoundError:
+                print("CMUdict missing. Download with")
+                print()
+                print("    bash scripts/download_cmudict.sh")
+                print()
+                print("and re-run the script.")
+                import sys
+                sys.exit(0)
         else:
             entries = _parse_cmudict(file_or_path)
         if not keep_ambiguous:

diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_cmudict.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_cmudict.sh
new file mode 100755
index 00000000..f4bc67fe
--- /dev/null
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_cmudict.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+set -e
+
+echo "Downloading cmudict-0.7b ..."
+wget https://github.com/Alexir/CMUdict/raw/master/cmudict-0.7b -qO cmudict/cmudict-0.7b

diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_dataset.sh
index a5404c06..22a2acc9 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_dataset.sh
@@ -2,8 +2,7 @@
 
 set -e
 
-echo "Downloading cmudict-0.7b ..."
-wget https://github.com/Alexir/CMUdict/raw/master/cmudict-0.7b -qO cmudict/cmudict-0.7b
+scripts/download_cmudict.sh
 
 DATA_DIR="LJSpeech-1.1"
 LJS_ARCH="LJSpeech-1.1.tar.bz2"

diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_fastpitch.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_fastpitch.sh
index 2a0eefa0..bf969c73 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/download_fastpitch.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/download_fastpitch.sh
@@ -3,9 +3,9 @@
 set -e
 
 : ${MODEL_DIR:="pretrained_models/fastpitch"}
-MODEL_ZIP="nvidia_fastpitch_200518.zip"
-MODEL="nvidia_fastpitch_200518.pt"
-MODEL_URL="https://api.ngc.nvidia.com/v2/models/nvidia/fastpitch_pyt_amp_ckpt_v1/versions/20.02.0/zip"
+MODEL_ZIP="nvidia_fastpitch_210824.zip"
+MODEL="nvidia_fastpitch_210824.pt"
+MODEL_URL="https://api.ngc.nvidia.com/v2/models/nvidia/fastpitch_pyt_amp_ckpt_v1_1/versions/21.05.0/zip"
 
 mkdir -p "$MODEL_DIR"
 

diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/inference_example.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/inference_example.sh
index d1d185a1..7bc3b9c8 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/inference_example.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/inference_example.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 : ${WAVEGLOW:="pretrained_models/waveglow/nvidia_waveglow256pyt_fp16.pt"}
-: ${FASTPITCH:="output/FastPitch_checkpoint_1000.pt"}
+: ${FASTPITCH:="pretrained_models/fastpitch/nvidia_fastpitch_210824.pt"}
 : ${BATCH_SIZE:=32}
 : ${PHRASES:="phrases/devset10.tsv"}
 : ${OUTPUT_DIR:="./output/audio_$(basename ${PHRASES} .tsv)"}
@@ -13,6 +13,7 @@
 : ${DENOISING:=0.01}
 : ${WARMUP:=0}
 : ${REPEATS:=1}
+: ${CPU:=false}
 : ${SPEAKER:=0}
 : ${NUM_SPEAKERS:=1}
 
@@ -20,8 +21,6 @@
 echo -e "\nAMP=$AMP, batch_size=$BATCH_SIZE\n"
 
 ARGS=""
-ARGS+=" --cuda"
-ARGS+=" --cudnn-benchmark"
 ARGS+=" -i $PHRASES"
 ARGS+=" -o $OUTPUT_DIR"
 ARGS+=" --log-file $LOG_FILE"
@@ -35,6 +34,8 @@ ARGS+=" --repeats $REPEATS"
 ARGS+=" --warmup-steps $WARMUP"
 ARGS+=" --speaker $SPEAKER"
 ARGS+=" --n-speakers $NUM_SPEAKERS"
+[ "$CPU" = false ] && ARGS+=" --cuda"
+[ "$CPU" = false ] && ARGS+=" --cudnn-benchmark"
 [ "$AMP" = true ] && ARGS+=" --amp"
 [ "$PHONE" = "true" ] && ARGS+=" --p-arpabet 1.0"
 [ "$ENERGY" = "true" ] && ARGS+=" --energy-conditioning"
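
A minimal end-to-end sketch of applying this patch and exercising the updated scripts. The .patch filename is assumed from the Subject line; the script paths and the CPU variable come from the diff above:

    # Apply the patch from the repository root (filename assumed).
    git am 0001-FastPitch-PyT-Update-NGC-checkpoint-url.patch
    cd PyTorch/SpeechSynthesis/FastPitch

    # CMUdict now has its own downloader; common/text/cmudict.py prints a
    # pointer to it and exits if the dictionary file is missing. Note the
    # script writes into cmudict/ but does not create the directory itself.
    mkdir -p cmudict
    bash scripts/download_cmudict.sh

    # Fetch the FastPitch 1.1 checkpoint (nvidia_fastpitch_210824.pt) from NGC.
    bash scripts/download_fastpitch.sh

    # inference_example.sh now defaults FASTPITCH to the pretrained checkpoint;
    # CPU=true skips the --cuda and --cudnn-benchmark flags.
    CPU=true bash scripts/inference_example.sh

The inference example still expects a WaveGlow vocoder checkpoint at pretrained_models/waveglow/nvidia_waveglow256pyt_fp16.pt, which this patch does not download.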