[FastPitch/PyT] Update NGC checkpoint url

This commit is contained in:
Adrian Lancucki 2021-09-07 07:27:31 -07:00 committed by Krzysztof Kudrynski
parent 5d6d417ff5
commit b169ad3ba1
6 changed files with 27 additions and 12 deletions

View file

@@ -74,7 +74,7 @@ This is reflected in Mean Opinion Scores ([details](https://arxiv.org/abs/2006.0
| FastPitch 1.0 | 4.080 ± 0.133 | | FastPitch 1.0 | 4.080 ± 0.133 |
The current version of the model offers even higher quality, as reflected The current version of the model offers even higher quality, as reflected
in the pairwise preference scores. in the pairwise preference scores ([details](https://arxiv.org/abs/2108.10447)).
| Model | Average preference | | Model | Average preference |
|:---------------|:-------------------| |:---------------|:-------------------|
@@ -82,7 +82,7 @@ in the pairwise preference scores.
| FastPitch 1.1 | 0.565 ± 0.068 | | FastPitch 1.1 | 0.565 ± 0.068 |
The FastPitch model is based on the [FastSpeech](https://arxiv.org/abs/1905.09263) model. The main differences between FastPitch and FastSpeech are that FastPitch: The FastPitch model is based on the [FastSpeech](https://arxiv.org/abs/1905.09263) model. The main differences between FastPitch and FastSpeech are that FastPitch:
* no dependence on external aligner (Transformer TTS, Tacotron 2); in version 1.1, FastPitch aligns audio to transcriptions by itself, * no dependence on external aligner (Transformer TTS, Tacotron 2); in version 1.1, FastPitch aligns audio to transcriptions by itself as in [One TTS Alignment To Rule Them All](https://arxiv.org/abs/2108.10447),
* explicitly learns to predict the pitch contour, * explicitly learns to predict the pitch contour,
* pitch conditioning removes harsh sounding artifacts and provides faster convergence, * pitch conditioning removes harsh sounding artifacts and provides faster convergence,
* no need for distilling mel-spectrograms with a teacher model, * no need for distilling mel-spectrograms with a teacher model,

View file

@@ -38,8 +38,17 @@ class CMUDict:
def initialize(self, file_or_path, keep_ambiguous=True): def initialize(self, file_or_path, keep_ambiguous=True):
if isinstance(file_or_path, str): if isinstance(file_or_path, str):
with open(file_or_path, encoding='latin-1') as f: try:
entries = _parse_cmudict(f) with open(file_or_path, encoding='latin-1') as f:
entries = _parse_cmudict(f)
except FileNotFoundError:
print("CMUdict missing. Download with")
print()
print(" bash scripts/download_cmudict.sh")
print()
print("and re-run the script.")
import sys
sys.exit(0)
else: else:
entries = _parse_cmudict(file_or_path) entries = _parse_cmudict(file_or_path)
if not keep_ambiguous: if not keep_ambiguous:

View file

@@ -0,0 +1,6 @@
#!/usr/bin/env bash
set -e
echo "Downloading cmudict-0.7b ..."
wget https://github.com/Alexir/CMUdict/raw/master/cmudict-0.7b -qO cmudict/cmudict-0.7b

View file

@@ -2,8 +2,7 @@
set -e set -e
echo "Downloading cmudict-0.7b ..." scripts/download_cmudict.sh
wget https://github.com/Alexir/CMUdict/raw/master/cmudict-0.7b -qO cmudict/cmudict-0.7b
DATA_DIR="LJSpeech-1.1" DATA_DIR="LJSpeech-1.1"
LJS_ARCH="LJSpeech-1.1.tar.bz2" LJS_ARCH="LJSpeech-1.1.tar.bz2"

View file

@@ -3,9 +3,9 @@
set -e set -e
: ${MODEL_DIR:="pretrained_models/fastpitch"} : ${MODEL_DIR:="pretrained_models/fastpitch"}
MODEL_ZIP="nvidia_fastpitch_200518.zip" MODEL_ZIP="nvidia_fastpitch_210824.zip"
MODEL="nvidia_fastpitch_200518.pt" MODEL="nvidia_fastpitch_210824.pt"
MODEL_URL="https://api.ngc.nvidia.com/v2/models/nvidia/fastpitch_pyt_amp_ckpt_v1/versions/20.02.0/zip" MODEL_URL="https://api.ngc.nvidia.com/v2/models/nvidia/fastpitch_pyt_amp_ckpt_v1_1/versions/21.05.0/zip"
mkdir -p "$MODEL_DIR" mkdir -p "$MODEL_DIR"

View file

@@ -1,7 +1,7 @@
#!/usr/bin/env bash #!/usr/bin/env bash
: ${WAVEGLOW:="pretrained_models/waveglow/nvidia_waveglow256pyt_fp16.pt"} : ${WAVEGLOW:="pretrained_models/waveglow/nvidia_waveglow256pyt_fp16.pt"}
: ${FASTPITCH:="output/FastPitch_checkpoint_1000.pt"} : ${FASTPITCH:="pretrained_models/fastpitch/nvidia_fastpitch_210824.pt"}
: ${BATCH_SIZE:=32} : ${BATCH_SIZE:=32}
: ${PHRASES:="phrases/devset10.tsv"} : ${PHRASES:="phrases/devset10.tsv"}
: ${OUTPUT_DIR:="./output/audio_$(basename ${PHRASES} .tsv)"} : ${OUTPUT_DIR:="./output/audio_$(basename ${PHRASES} .tsv)"}
@@ -13,6 +13,7 @@
: ${DENOISING:=0.01} : ${DENOISING:=0.01}
: ${WARMUP:=0} : ${WARMUP:=0}
: ${REPEATS:=1} : ${REPEATS:=1}
: ${CPU:=false}
: ${SPEAKER:=0} : ${SPEAKER:=0}
: ${NUM_SPEAKERS:=1} : ${NUM_SPEAKERS:=1}
@@ -20,8 +21,6 @@
echo -e "\nAMP=$AMP, batch_size=$BATCH_SIZE\n" echo -e "\nAMP=$AMP, batch_size=$BATCH_SIZE\n"
ARGS="" ARGS=""
ARGS+=" --cuda"
ARGS+=" --cudnn-benchmark"
ARGS+=" -i $PHRASES" ARGS+=" -i $PHRASES"
ARGS+=" -o $OUTPUT_DIR" ARGS+=" -o $OUTPUT_DIR"
ARGS+=" --log-file $LOG_FILE" ARGS+=" --log-file $LOG_FILE"
@@ -35,6 +34,8 @@ ARGS+=" --repeats $REPEATS"
ARGS+=" --warmup-steps $WARMUP" ARGS+=" --warmup-steps $WARMUP"
ARGS+=" --speaker $SPEAKER" ARGS+=" --speaker $SPEAKER"
ARGS+=" --n-speakers $NUM_SPEAKERS" ARGS+=" --n-speakers $NUM_SPEAKERS"
[ "$CPU" = false ] && ARGS+=" --cuda"
[ "$CPU" = false ] && ARGS+=" --cudnn-benchmark"
[ "$AMP" = true ] && ARGS+=" --amp" [ "$AMP" = true ] && ARGS+=" --amp"
[ "$PHONE" = "true" ] && ARGS+=" --p-arpabet 1.0" [ "$PHONE" = "true" ] && ARGS+=" --p-arpabet 1.0"
[ "$ENERGY" = "true" ] && ARGS+=" --energy-conditioning" [ "$ENERGY" = "true" ] && ARGS+=" --energy-conditioning"