# DeepLearningExamples/CUDA-Optimized/FastSpeech/fastspeech/dataset/ljspeech_dataset.py
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import csv
import pprint

import librosa
from torch.utils.data import Dataset
import pandas as pd

from fastspeech.text_norm import text_to_sequence
from fastspeech import audio
from fastspeech.utils.logging import tprint

import os
import pathlib

import fire
import numpy as np
from tqdm import tqdm

from fastspeech import hparam as hp

pp = pprint.PrettyPrinter(indent=4, width=1000)

class LJSpeechDataset(Dataset):

    def __init__(self, root_path, meta_file="metadata.csv",
                 sr=22050, n_fft=1024, win_len=1024, hop_len=256, n_mels=80,
                 mel_fmin=0.0, mel_fmax=8000.0, exclude_mels=False, mels_path=None,
                 aligns_path=None, text_cleaner=['english_cleaners'],
                 sort_by_length=False):
        self.root_path = root_path
        self.meta_file = meta_file
        self.text_cleaner = text_cleaner
        self.sr = sr
        self.n_fft = n_fft
        self.win_len = win_len
        self.hop_len = hop_len
        self.n_mels = n_mels
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.aligns_path = aligns_path
        self.mels_path = mels_path
        self.exclude_mels = exclude_mels
        self.sort_by_length = sort_by_length

        # Read metadata file.
        # - column: <name, transcription, normalized_transcription>
        self.metas = pd.read_csv(os.path.join(root_path, meta_file),
                                 sep="|",
                                 header=None,
                                 keep_default_na=False,
                                 quoting=csv.QUOTE_NONE,
                                 names=["name", "transcription", "normalized_transcription"],
                                 )
        if sort_by_length:
            self.metas.insert(3, 'length', self.metas['normalized_transcription'].str.len())
            self.metas.sort_values('length', ascending=True, inplace=True)

    def __len__(self):
        return len(self.metas)
    def __getitem__(self, idx):
        name = self.metas.iloc[idx, 0]
        path = "{}/wavs/{}.wav".format(self.root_path, name)

        # Text normalization
        text = self.metas.iloc[idx, 1]
        text_norm = self.metas.iloc[idx, 2]
        text_encoded = np.array(text_to_sequence(text_norm, self.text_cleaner))
        text_pos = np.array([idx+1 for idx, _ in enumerate(text_encoded)])

        data = {
            "name": name,
            "text": text,
            "text_norm": text_norm,
            "text_encoded": text_encoded,
            "text_pos": text_pos,
            "text_len": text_encoded.shape[-1],
            "sr": self.sr
        }

        if not self.exclude_mels:
            wav, sr = librosa.load(path, sr=self.sr)  # wav is [-1.0, 1.0]
            if sr != self.sr:
                raise ValueError("{} SR doesn't match target {} SR".format(sr, self.sr))

            # Audio processing
            wav, _ = librosa.effects.trim(wav, frame_length=self.win_len, hop_length=self.hop_len)
            if self.mels_path:
                mel = np.load(os.path.join(self.mels_path, name + ".mel.npy"))
            else:
                mel = librosa.feature.melspectrogram(wav,
                                                     sr=sr,
                                                     n_fft=self.n_fft,
                                                     win_length=self.win_len,
                                                     hop_length=self.hop_len,
                                                     n_mels=self.n_mels,
                                                     fmin=self.mel_fmin,
                                                     fmax=self.mel_fmax,
                                                     power=1.0)
                mel = audio.dynamic_range_compression(mel)

            data_mel = {
                "wav": wav,
                "mel": mel,
                "mel_len": mel.shape[-1],
            }
            data.update(data_mel)

        if self.aligns_path:
            aligns = np.load(os.path.join(self.aligns_path, name + ".align.npy"))
            data['align'] = aligns

        return data
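
# NOTE: Illustrative usage sketch, not part of the original file. The dataset
# path below is a placeholder; the keys shown follow the dict assembled in
# __getitem__ above.
#
#     dataset = LJSpeechDataset("/path/to/LJSpeech-1.1")
#     sample = dataset[0]
#     # sample["text_encoded"]  -> symbol id sequence of the normalized text
#     # sample["mel"]           -> (n_mels, mel_len) log-compressed mel-spectrogram
#     # sample["wav"]           -> trimmed waveform in [-1.0, 1.0] at sr=22050
#     # sample["align"] is present only when aligns_path is given.
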
def preprocess_mel(hparam="base.yaml", **kwargs):
    """Script for preprocessing mel-spectrograms from the dataset.

    By default, this script loads parameters from the default config file,
    fastspeech/hparams/base.yaml.

    Besides the flags below, any parameter in the config file can be overridden
    on the command line. For example:
    --dataset_path=DATASET_PATH
        Path to the dataset directory.
    --mels_path=MELS_PATH
        Path to the output directory for preprocessed mels.

    Refer to fastspeech/hparams/base.yaml for more parameters.

    Args:
        hparam (str, optional): Path to the default config file. Defaults to "base.yaml".
    """
    hp.set_hparam(hparam, kwargs)
    tprint("Hparams:\n{}".format(pp.pformat(hp)))

    pathlib.Path(hp.mels_path).mkdir(parents=True, exist_ok=True)

    dataset = LJSpeechDataset(hp.dataset_path, mels_path=None)

    for data in tqdm(dataset):
        name = data["name"]
        mel = data["mel"]

        save_path = os.path.join(hp.mels_path, name + ".mel.npy")
        if os.path.exists(save_path):
            continue
        # print(name, mel)
        np.save(save_path, mel)


if __name__ == '__main__':
    fire.Fire(preprocess_mel)
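
# NOTE: Illustrative invocation, not part of the original file; the script path
# and flag values are placeholders. fire.Fire exposes preprocess_mel's arguments
# (and **kwargs hparam overrides such as dataset_path/mels_path, per the
# docstring) as command-line flags, e.g.:
#
#     python fastspeech/dataset/ljspeech_dataset.py \
#         --dataset_path=/path/to/LJSpeech-1.1 \
#         --mels_path=/path/to/preprocessed_mels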