# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import csv
import os
import pathlib
import pprint

import fire
import librosa
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from tqdm import tqdm

from fastspeech import audio
from fastspeech import hparam as hp
from fastspeech.text_norm import text_to_sequence
from fastspeech.utils.logging import tprint

pp = pprint.PrettyPrinter(indent=4, width=1000)


class LJSpeechDataset(Dataset):

    def __init__(self, root_path, meta_file="metadata.csv",
                 sr=22050, n_fft=1024, win_len=1024, hop_len=256,
                 n_mels=80, mel_fmin=0.0, mel_fmax=8000.0,
                 exclude_mels=False, mels_path=None, aligns_path=None,
                 text_cleaner=['english_cleaners'], sort_by_length=False):
        self.root_path = root_path
        self.meta_file = meta_file
        self.text_cleaner = text_cleaner
        self.sr = sr
        self.n_fft = n_fft
        self.win_len = win_len
        self.hop_len = hop_len
        self.n_mels = n_mels
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.aligns_path = aligns_path
        self.mels_path = mels_path
        self.exclude_mels = exclude_mels
        self.sort_by_length = sort_by_length

        # Read metadata file.
        # - columns: [name, transcription, normalized_transcription]
        self.metas = pd.read_csv(os.path.join(root_path, meta_file),
                                 sep="|",
                                 header=None,
                                 keep_default_na=False,
                                 quoting=csv.QUOTE_NONE,
                                 names=["name", "transcription", "normalized_transcription"],
                                 )

        if sort_by_length:
            self.metas.insert(3, 'length', self.metas['normalized_transcription'].str.len())
            self.metas.sort_values('length', ascending=True, inplace=True)

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        name = self.metas.iloc[idx, 0]
        path = "{}/wavs/{}.wav".format(self.root_path, name)

        # Text normalization.
        text = self.metas.iloc[idx, 1]
        text_norm = self.metas.iloc[idx, 2]
        text_encoded = np.array(text_to_sequence(text_norm, self.text_cleaner))
        text_pos = np.array([i + 1 for i, _ in enumerate(text_encoded)])

        data = {
            "name": name,
            "text": text,
            "text_norm": text_norm,
            "text_encoded": text_encoded,
            "text_pos": text_pos,
            "text_len": text_encoded.shape[-1],
            "sr": self.sr,
        }

        if not self.exclude_mels:
            wav, sr = librosa.load(path, sr=self.sr)  # wav is in [-1.0, 1.0]
            if sr != self.sr:
                raise ValueError("{} SR doesn't match target {} SR".format(sr, self.sr))

            # Audio processing: trim leading and trailing silence.
            wav, _ = librosa.effects.trim(wav, frame_length=self.win_len, hop_length=self.hop_len)

            if self.mels_path:
                mel = np.load(os.path.join(self.mels_path, name + ".mel.npy"))
            else:
                mel = librosa.feature.melspectrogram(y=wav,
                                                     sr=sr,
                                                     n_fft=self.n_fft,
                                                     win_length=self.win_len,
                                                     hop_length=self.hop_len,
                                                     n_mels=self.n_mels,
                                                     fmin=self.mel_fmin,
                                                     fmax=self.mel_fmax,
                                                     power=1.0)
                mel = audio.dynamic_range_compression(mel)

            data_mel = {
                "wav": wav,
                "mel": mel,
                "mel_len": mel.shape[-1],
            }
            data.update(data_mel)

        if self.aligns_path:
            aligns = np.load(os.path.join(self.aligns_path, name + ".align.npy"))
            data['align'] = aligns

        return data


def preprocess_mel(hparam="base.yaml", **kwargs):
    """The script for preprocessing mel-spectrograms from the dataset.

    By default, this script loads parameters from the default config file,
    fastspeech/hparams/base.yaml.

    In addition to this flag, parameters in the config file can be overridden
    via the command line. For example,
        --dataset_path=DATASET_PATH
            Path to dataset directory.
        --mels_path=MELS_PATH
            Path to output preprocessed mels directory.

    Refer to fastspeech/hparams/base.yaml to see more parameters.

    Args:
        hparam (str, optional): Path to default config file. Defaults to "base.yaml".
    """
    hp.set_hparam(hparam, kwargs)
    tprint("Hparams:\n{}".format(pp.pformat(hp)))

    pathlib.Path(hp.mels_path).mkdir(parents=True, exist_ok=True)

    dataset = LJSpeechDataset(hp.dataset_path, mels_path=None)

    for data in tqdm(dataset):
        name = data["name"]
        mel = data["mel"]

        save_path = os.path.join(hp.mels_path, name + ".mel.npy")
        if os.path.exists(save_path):
            continue

        np.save(save_path, mel)


if __name__ == '__main__':
    fire.Fire(preprocess_mel)
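
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, kept as comments so nothing runs on import).
# The paths below are placeholders for a local setup in which
# /data/LJSpeech-1.1 contains metadata.csv and a wavs/ directory, and the
# module filename is assumed.
#
#   Command line, via python-fire (flags follow the docstring of
#   preprocess_mel above):
#
#     python ljspeech_dataset.py \
#         --dataset_path=/data/LJSpeech-1.1 \
#         --mels_path=/data/LJSpeech-1.1/mels
#
#   In code, reading one sample directly from the dataset:
#
#     dataset = LJSpeechDataset(root_path="/data/LJSpeech-1.1")
#     sample = dataset[0]
#     # sample["text_encoded"]: (text_len,) int array of encoded symbols
#     # sample["mel"]:          (n_mels, mel_len) compressed mel-spectrogram
#     print(sample["name"], sample["text_len"], sample["mel"].shape)
# ---------------------------------------------------------------------------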