177 lines
6.7 KiB
Python
177 lines
6.7 KiB
Python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of the NVIDIA CORPORATION nor the
|
|
# names of its contributors may be used to endorse or promote products
|
|
# derived from this software without specific prior written permission.
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
import csv
|
|
|
|
import pprint
|
|
|
|
import librosa
|
|
from torch.utils.data import Dataset
|
|
import pandas as pd
|
|
from fastspeech.text_norm import text_to_sequence
|
|
from fastspeech import audio
|
|
from fastspeech.utils.logging import tprint
|
|
|
|
import os
|
|
import pathlib
|
|
|
|
import fire
|
|
import numpy as np
|
|
from tqdm import tqdm
|
|
|
|
from fastspeech import hparam as hp
|
|
|
|
# Shared pretty-printer for dumping hyper-parameters; the very large width
# keeps each entry on a single line instead of wrapping at 80 columns.
pp = pprint.PrettyPrinter(width=1000, indent=4)
|
|
|
|
class LJSpeechDataset(Dataset):
    """LJSpeech-style dataset yielding encoded text and, optionally, mel features.

    The metadata file is read as a ``|``-separated table without a header row;
    its columns are named ``name``, ``transcription`` and
    ``normalized_transcription``. Audio for an entry is read from
    ``<root_path>/wavs/<name>.wav``.

    Args:
        root_path: Dataset directory containing the metadata file and ``wavs/``.
        meta_file: Metadata file name, relative to ``root_path``.
        sr: Sampling rate passed to ``librosa.load`` and to the mel computation.
        n_fft: FFT size used for the mel spectrogram.
        win_len: Window length for the mel spectrogram; also used as the frame
            length when trimming the waveform.
        hop_len: Hop length for the mel spectrogram and for trimming.
        n_mels: Number of mel bands.
        mel_fmin: Lowest frequency of the mel filter bank.
        mel_fmax: Highest frequency of the mel filter bank.
        exclude_mels: If true, items carry text fields only and no audio is read.
        mels_path: Optional directory of precomputed ``<name>.mel.npy`` files
            that are loaded instead of computing the mel spectrogram.
        aligns_path: Optional directory of ``<name>.align.npy`` files; when set,
            the loaded array is returned under the ``"align"`` key.
        text_cleaner: Cleaner names forwarded to ``text_to_sequence``.
            Defaults to ``['english_cleaners']``.
        sort_by_length: If true, entries are ordered by ascending length of the
            normalized transcription.
    """

    def __init__(self, root_path, meta_file="metadata.csv",
                 sr=22050, n_fft=1024, win_len=1024, hop_len=256, n_mels=80,
                 mel_fmin=0.0, mel_fmax=8000.0, exclude_mels=False, mels_path=None,
                 aligns_path=None, text_cleaner=None, sort_by_length=False):
        self.root_path = root_path
        self.meta_file = meta_file
        # Build the default list per instance; a list literal in the signature
        # would be one object shared (and mutable) across every instance.
        self.text_cleaner = ['english_cleaners'] if text_cleaner is None else text_cleaner
        self.sr = sr
        self.n_fft = n_fft
        self.win_len = win_len
        self.hop_len = hop_len
        self.n_mels = n_mels
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.aligns_path = aligns_path
        self.mels_path = mels_path
        self.exclude_mels = exclude_mels
        self.sort_by_length = sort_by_length

        # Read metadata file.
        # - column: <name, transcription, normalized_transcription>
        # QUOTE_NONE keeps quotation marks inside transcriptions literal, and
        # keep_default_na=False stops strings such as "NA" from becoming NaN.
        self.metas = pd.read_csv(os.path.join(root_path, meta_file),
                                 sep="|",
                                 header=None,
                                 keep_default_na=False,
                                 quoting=csv.QUOTE_NONE,
                                 names=["name", "transcription", "normalized_transcription"],
                                 )
        if sort_by_length:
            # The extra column goes after the three metadata columns, so the
            # positional lookups in __getitem__ (columns 0-2) are unaffected.
            self.metas.insert(3, 'length', self.metas['normalized_transcription'].str.len())
            self.metas.sort_values('length', ascending=True, inplace=True)

    def __len__(self):
        """Return the number of metadata entries."""
        return len(self.metas)

    def __getitem__(self, idx):
        """Return the sample at row position ``idx`` as a dict.

        The dict always holds ``name``, ``text``, ``text_norm``,
        ``text_encoded``, ``text_pos`` (1-based positions), ``text_len`` and
        ``sr``. Unless ``exclude_mels`` is set it also holds ``wav``, ``mel``
        and ``mel_len``; when ``aligns_path`` is set it also holds ``align``.

        Raises:
            ValueError: If the sampling rate reported by ``librosa.load``
                differs from ``self.sr``.
        """
        name = self.metas.iloc[idx, 0]

        # Text normalization
        text = self.metas.iloc[idx, 1]
        text_norm = self.metas.iloc[idx, 2]
        text_encoded = np.array(text_to_sequence(text_norm, self.text_cleaner))
        # 1-based position of every encoded symbol. np.arange avoids rebinding
        # ``idx`` the way the former comprehension's loop variable did.
        text_pos = np.arange(1, len(text_encoded) + 1)

        data = {
            "name": name,
            "text": text,
            "text_norm": text_norm,
            "text_encoded": text_encoded,
            "text_pos": text_pos,
            "text_len": text_encoded.shape[-1],
            "sr": self.sr
        }

        if not self.exclude_mels:
            path = os.path.join(self.root_path, "wavs", "{}.wav".format(name))
            wav, sr = librosa.load(path, sr=self.sr)  # wav is [-1.0, 1.0]
            if sr != self.sr:
                raise ValueError("{} SR doesn't match target {} SR".format(sr, self.sr))

            # Audio processing
            wav, _ = librosa.effects.trim(wav, frame_length=self.win_len, hop_length=self.hop_len)

            if self.mels_path:
                # Precomputed features take precedence over recomputation.
                mel = np.load(os.path.join(self.mels_path, name + ".mel.npy"))
            else:
                # ``y`` is passed by keyword: recent librosa releases reject a
                # positional waveform, while older ones accept the keyword too.
                mel = librosa.feature.melspectrogram(y=wav,
                                                     sr=sr,
                                                     n_fft=self.n_fft,
                                                     win_length=self.win_len,
                                                     hop_length=self.hop_len,
                                                     n_mels=self.n_mels,
                                                     fmin=self.mel_fmin,
                                                     fmax=self.mel_fmax,
                                                     power=1.0)
                mel = audio.dynamic_range_compression(mel)

            data_mel = {
                "wav": wav,
                "mel": mel,
                "mel_len": mel.shape[-1],
            }
            data.update(data_mel)

        if self.aligns_path:
            aligns = np.load(os.path.join(self.aligns_path, name + ".align.npy"))
            data['align'] = aligns

        return data
|
|
|
|
|
|
def preprocess_mel(hparam="base.yaml", **kwargs):
    """Precompute mel-spectrograms for the dataset and save them as ``.npy`` files.

    By default, parameters are loaded from the default config file, fastspeech/hparams/base.yaml.

    Besides the flags, you can also set parameters in the config file via the command-line. For example,
    --dataset_path=DATASET_PATH
        Path to dataset directory.
    --mels_path=MELS_PATH
        Path to output preprocessed mels directory.

    Refer to fastspeech/hparams/base.yaml to see more parameters.

    Entries whose ``<name>.mel.npy`` already exists in the output directory are
    skipped, so an interrupted run can be resumed.

    Args:
        hparam (str, optional): Path to default config file. Defaults to "base.yaml".
        **kwargs: Overrides forwarded to ``hp.set_hparam`` together with ``hparam``.
    """
    hp.set_hparam(hparam, kwargs)
    tprint("Hparams:\n{}".format(pp.pformat(hp)))

    pathlib.Path(hp.mels_path).mkdir(parents=True, exist_ok=True)

    # mels_path=None forces the dataset to compute mels instead of loading them.
    # NOTE(review): the dataset is built with its default audio parameters
    # (sr, n_fft, ...) rather than values from hp - confirm this is intended.
    dataset = LJSpeechDataset(hp.dataset_path, mels_path=None)

    for idx in tqdm(range(len(dataset))):
        # Look up the name from the metadata first so that an existing output
        # can be skipped before the audio is loaded and the mel is computed.
        name = dataset.metas.iloc[idx, 0]
        save_path = os.path.join(hp.mels_path, name + ".mel.npy")

        if os.path.exists(save_path):
            continue

        np.save(save_path, dataset[idx]["mel"])
|
|
|
|
|
|
# Command-line entry point: python-fire exposes preprocess_mel's parameters
# (and any extra keyword overrides) as command-line flags.
if __name__ == "__main__":
    fire.Fire(preprocess_mel)