From 054fed043f8fe179eeba5bb5d0356d117e224209 Mon Sep 17 00:00:00 2001
From: Adrian Lancucki
Date: Thu, 7 Oct 2021 11:26:31 -0700
Subject: [PATCH] [FastPitch/PyT] Drop parselmouth dependency

---
 PyTorch/SpeechSynthesis/FastPitch/README.md  | 11 ++----
 .../FastPitch/fastpitch/data_function.py     | 35 ++-----
 .../FastPitch/prepare_dataset.py             |  2 +-
 .../FastPitch/requirements.txt               |  1 -
 .../FastPitch/scripts/prepare_dataset.sh     |  3 +-
 .../FastPitch/scripts/train_benchmark.sh     |  1 -
 PyTorch/SpeechSynthesis/FastPitch/train.py   |  4 +--
 7 files changed, 10 insertions(+), 47 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/README.md b/PyTorch/SpeechSynthesis/FastPitch/README.md
index 93a54df6..96d01135 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/README.md
+++ b/PyTorch/SpeechSynthesis/FastPitch/README.md
@@ -363,16 +363,11 @@ FastPitch 1.1 aligns input symbols to output mel-spectrogram frames automaticall
 on any external aligning model. FastPitch training can now be started on raw waveforms
 without any pre-processing: pitch values and mel-spectrograms will be calculated on-line.
 
-For every mel-spectrogram frame, its fundamental frequency in Hz is estimated with either
-the Probabilistic YIN algorithm or [Praat](http://praat.org).
-
-The former is more accurate but time consuming, and we recommend to pre-calculate
-pitch during the data processing step. The latter is suitable for on-line pitch calculation.
-Pitch values are then averaged over every character, in order to provide sparse
-pitch cues for the model.
+For every mel-spectrogram frame, its fundamental frequency in Hz is estimated with
+the Probabilistic YIN algorithm.

- Pitch estimates extracted with Praat
+ Pitch contour estimate

 Figure 2. Pitch estimates for mel-spectrogram frames of phrase "in being comparatively"
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index 06679083..0014a997 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -32,7 +32,6 @@ from pathlib import Path
 
 import librosa
 import numpy as np
-import parselmouth
 import torch
 import torch.nn.functional as F
 from scipy import ndimage
@@ -88,35 +87,7 @@ def estimate_pitch(wav, mel_len, method='pyin', normalize_mean=None,
     if type(normalize_std) is float or type(normalize_std) is list:
         normalize_std = torch.tensor(normalize_std)
 
-    if method == 'praat':
-
-        snd = parselmouth.Sound(wav)
-        pitch_mel = snd.to_pitch(time_step=snd.duration / (mel_len + 3)
-                                 ).selected_array['frequency']
-        assert np.abs(mel_len - pitch_mel.shape[0]) <= 1.0
-
-        pitch_mel = torch.from_numpy(pitch_mel).unsqueeze(0)
-
-        if n_formants > 1:
-            formant = snd.to_formant_burg(
-                time_step=snd.duration / (mel_len + 3))
-            formant_n_frames = formant.get_number_of_frames()
-            assert np.abs(mel_len - formant_n_frames) <= 1.0
-
-            formants_mel = np.zeros((formant_n_frames + 1, n_formants - 1))
-            for i in range(1, formant_n_frames + 1):
-                formants_mel[i] = np.asarray([
-                    formant.get_value_at_time(
-                        formant_number=f,
-                        time=formant.get_time_from_frame_number(i))
-                    for f in range(1, n_formants)
-                ])
-
-            pitch_mel = torch.cat(
-                [pitch_mel, torch.from_numpy(formants_mel).permute(1, 0)],
-                dim=0)
-
-    elif method == 'pyin':
+    if method == 'pyin':
 
         snd, sr = librosa.load(wav)
         pitch_mel, voiced_flag, voiced_probs = librosa.pyin(
@@ -181,7 +152,7 @@ class TTSDataset(torch.utils.data.Dataset):
                  pitch_online_dir=None,
                  betabinomial_online_dir=None,
                  use_betabinomial_interpolator=True,
-                 pitch_online_method='praat',
+                 pitch_online_method='pyin',
                  **ignored):
 
         # Expect a list of filenames
@@ -338,7 +309,7 @@ class TTSDataset(torch.utils.data.Dataset):
         if cached_fpath.is_file():
             return torch.load(cached_fpath)
 
-        # No luck so far - calculate or replace with praat
+        # No luck so far - calculate
         wav = audiopath
         if not wav.endswith('.wav'):
             wav = re.sub('/mels/', '/wavs/', wav)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index b64dc452..d93065b4 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -73,7 +73,7 @@ def parse_args(parser):
     parser.add_argument('--n-mel-channels', type=int, default=80)
     # Pitch extraction
     parser.add_argument('--f0-method', default='pyin', type=str,
-                        choices=('pyin', 'praat'), help='F0 estimation method')
+                        choices=['pyin'], help='F0 estimation method')
     # Performance
     parser.add_argument('-b', '--batch-size', default=1, type=int)
     parser.add_argument('--n-workers', type=int, default=16)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/requirements.txt b/PyTorch/SpeechSynthesis/FastPitch/requirements.txt
index 3ef24dc5..9f0fec9b 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/requirements.txt
+++ b/PyTorch/SpeechSynthesis/FastPitch/requirements.txt
@@ -4,6 +4,5 @@ inflect
 librosa==0.8.0
 scipy
 Unidecode
-praat-parselmouth==0.3.3
 tensorboardX==2.0
 git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
index 72a7700f..43525ef4 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
@@ -3,7 +3,6 @@
 set -e
 
 : ${DATA_DIR:=LJSpeech-1.1}
-: ${F0_METHOD:="pyin"}
 : ${ARGS="--extract-mels"}
 
 python prepare_dataset.py \
@@ -12,5 +11,5 @@ python prepare_dataset.py \
     --batch-size 1 \
     --dataset-path $DATA_DIR \
     --extract-pitch \
-    --f0-method $F0_METHOD \
+    --f0-method pyin \
     $ARGS
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train_benchmark.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train_benchmark.sh
index c18d3b4b..45fb8358 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train_benchmark.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train_benchmark.sh
@@ -6,7 +6,6 @@ set -a
 : ${NUM_GPUS_SEQUENCE:="1 4 8"}
 : ${EPOCHS:=30}
 : ${OUTPUT_DIR:="./output"}
-: ${F0_METHOD:=praat}
 : ${BATCH_SIZE:=16}
 
 for NUM_GPUS in $NUM_GPUS_SEQUENCE ; do
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index ef9cadfb..f3384251 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -147,8 +147,8 @@ def parse_args(parser):
                       'n_speakers > 1 enables speaker embeddings')
     cond.add_argument('--load-pitch-from-disk', action='store_true',
                       help='Use pitch cached on disk with prepare_dataset.py')
-    cond.add_argument('--pitch-online-method', default='praat',
-                      choices=['praat', 'pyin'],
+    cond.add_argument('--pitch-online-method', default='pyin',
+                      choices=['pyin'],
                       help='Calculate pitch on the fly during training')
     cond.add_argument('--pitch-online-dir', type=str, default=None,
                       help='A directory for storing pitch calculated on-line')
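
Note for reviewers: the hunk in data_function.py shows only the first two lines of the
surviving pyin branch of estimate_pitch(). The snippet below is a minimal, self-contained
sketch of that code path, built on librosa.pyin (available in the pinned librosa==0.8.0);
the fmin/fmax bounds and the pad-to-mel_len alignment are illustrative assumptions, not
the repository's exact settings.

# Minimal sketch of the retained pyin-only pitch path; frequency bounds and
# the pad/trim step are assumptions for illustration, not the repo's values.
import librosa
import numpy as np
import torch

def pyin_pitch_sketch(wav_path, mel_len):
    snd, sr = librosa.load(wav_path)                 # resamples to 22050 Hz by default
    f0, voiced_flag, voiced_probs = librosa.pyin(
        snd,
        fmin=librosa.note_to_hz('C2'),               # assumed floor, ~65 Hz
        fmax=librosa.note_to_hz('C7'))               # assumed ceiling, ~2093 Hz
    f0 = np.nan_to_num(f0)                           # pyin emits NaN for unvoiced frames
    if f0.shape[0] < mel_len:                        # align the contour length with
        f0 = np.pad(f0, (0, mel_len - f0.shape[0]))  # the mel-spectrogram frames
    return torch.from_numpy(f0[:mel_len]).unsqueeze(0).float()  # shape [1, mel_len]

As the removed README text itself noted, pYIN is more accurate but time-consuming, so
pre-extracting pitch with prepare_dataset.py and training with --load-pitch-from-disk
remains the practical default; --pitch-online-method pyin is now the only (and default)
on-line option.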