# DeepLearningExamples/CUDA-Optimized/FastSpeech/tacotron2/train.py
# 2020-07-31 14:59:15 +08:00
#
# 144 lines
# 5.5 KiB
# Python

# BSD 3-Clause License
# Copyright (c) 2018-2020, NVIDIA Corporation
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""https://github.com/NVIDIA/tacotron2"""
import os
from numpy import finfo
import torch
from tacotron2.distributed import apply_gradient_allreduce
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader
from tacotron2.model import Tacotron2
# Module-wide target device: CUDA when torch reports it as available,
# otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def reduce_tensor(tensor, n_gpus):
    """Return ``tensor`` summed across processes and divided by ``n_gpus``.

    The input tensor is not modified: a clone is summed in place with
    ``dist.all_reduce`` and the clone is then divided by ``n_gpus``.

    Args:
        tensor: Tensor to reduce.
        n_gpus: Divisor applied to the summed tensor.

    Returns:
        A new tensor holding the all-reduced sum divided by ``n_gpus``.
    """
    rt = tensor.clone()
    # ``dist.reduce_op`` is a deprecated alias; ``dist.ReduceOp`` is the
    # supported enum for reduction operations.
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt
def init_distributed(hparams, n_gpus, rank, group_name):
    """Bind this process to a GPU and join the distributed process group.

    Args:
        hparams: Object providing ``dist_backend`` and ``dist_url``.
        n_gpus: World size passed to ``dist.init_process_group``.
        rank: Rank of this process; also used to choose the CUDA device.
        group_name: Group name forwarded to ``dist.init_process_group``.

    Raises:
        AssertionError: If CUDA is not available.
    """
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Pick the GPU for this rank first so that everything that follows is
    # done on the right device.
    local_gpu = rank % torch.cuda.device_count()
    torch.cuda.set_device(local_gpu)

    # Set up distributed communication.
    group_settings = dict(
        backend=hparams.dist_backend,
        init_method=hparams.dist_url,
        world_size=n_gpus,
        rank=rank,
        group_name=group_name,
    )
    dist.init_process_group(**group_settings)

    print("Done initializing distributed")
def load_model(hparams):
    """Build a Tacotron2 model from ``hparams`` and place it on ``device``.

    Args:
        hparams: Hyper-parameter object passed to ``Tacotron2``; its
            ``fp16_run`` and ``distributed_run`` flags are read here.

    Returns:
        The model, wrapped by ``apply_gradient_allreduce`` when
        ``hparams.distributed_run`` is set.
    """
    net = Tacotron2(hparams)
    net = net.to(device)

    if hparams.fp16_run:
        # Use the smallest float16 value as the attention score mask value
        # for fp16 runs.
        half_min = finfo('float16').min
        net.decoder.attention_layer.score_mask_value = half_min

    if not hparams.distributed_run:
        return net
    return apply_gradient_allreduce(net)
def warm_start_model(checkpoint_path, model, ignore_layers):
    """Initialise ``model`` from the ``'state_dict'`` stored in a checkpoint.

    Entries whose key is listed in ``ignore_layers`` are not taken from the
    checkpoint; the model keeps its current values for them.

    Args:
        checkpoint_path: Path to a file written with ``torch.save`` that
            contains a ``'state_dict'`` entry.
        model: Module whose parameters are overwritten in place.
        ignore_layers: Collection of state-dict keys to leave untouched.
            ``None`` or an empty collection loads the checkpoint as is.

    Returns:
        The same ``model`` object, after loading.

    Raises:
        AssertionError: If ``checkpoint_path`` is not an existing file.
    """
    assert os.path.isfile(checkpoint_path), (
        "Checkpoint file not found: '{}'".format(checkpoint_path))
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    model_dict = checkpoint_dict['state_dict']
    # Truthiness covers both an empty collection and None.
    if ignore_layers:
        kept = {k: v for k, v in model_dict.items()
                if k not in ignore_layers}
        # Start from the model's own state so the ignored keys keep their
        # current values, then overlay what was kept from the checkpoint.
        merged = model.state_dict()
        merged.update(kept)
        model_dict = merged
    model.load_state_dict(model_dict)
    return model
def load_checkpoint(checkpoint_path, model, optimizer):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        checkpoint_path: Path to a checkpoint containing the keys
            ``'state_dict'``, ``'optimizer'``, ``'learning_rate'`` and
            ``'iteration'``.
        model: Module that receives ``'state_dict'``.
        optimizer: Optimizer that receives ``'optimizer'``.

    Returns:
        Tuple ``(model, optimizer, learning_rate, iteration)`` where the last
        two values are read from the checkpoint.

    Raises:
        AssertionError: If ``checkpoint_path`` is not an existing file.
    """
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))

    saved = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(saved['state_dict'])
    optimizer.load_state_dict(saved['optimizer'])

    saved_lr, saved_iteration = saved['learning_rate'], saved['iteration']
    print("Loaded checkpoint '{}' from iteration {}" .format(
        checkpoint_path, saved_iteration))
    return model, optimizer, saved_lr, saved_iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Write model and optimizer state plus training progress to ``filepath``.

    The saved dictionary has the keys ``'iteration'``, ``'state_dict'``,
    ``'optimizer'`` and ``'learning_rate'``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        learning_rate: Learning rate value stored alongside the states.
        iteration: Iteration counter stored alongside the states.
        filepath: Destination passed to ``torch.save``.
    """
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    snapshot = {
        'iteration': iteration,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
    }
    torch.save(snapshot, filepath)
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank):
    """Score ``model`` on ``valset`` and report the mean validation loss.

    The model is switched to eval mode for the pass and back to train mode
    afterwards. The per-batch losses are averaged over the number of batches,
    and that mean is what gets printed and logged on rank 0.

    Args:
        model: Model providing ``parse_batch(batch)`` and ``__call__``.
        criterion: Callable ``criterion(y_pred, y)`` returning a loss tensor.
        valset: Dataset to iterate over.
        iteration: Current training iteration, used for reporting only.
        batch_size: Batch size of the validation loader.
        n_gpus: Divisor used when reducing the loss across processes.
        collate_fn: Collate function passed to the ``DataLoader``.
        logger: Object providing
            ``log_validation(loss, model, y, y_pred, iteration)``.
        distributed_run: When true, a ``DistributedSampler`` is used and each
            batch loss is reduced across processes.
        rank: Rank of this process; only rank 0 prints and logs.
    """
    model.eval()
    val_loss = 0.0
    num_batches = 0
    y = y_pred = None
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)
        for batch in val_loader:
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
            num_batches += 1
    model.train()

    if num_batches == 0:
        # Nothing was scored: there is no loss to average and no batch to
        # hand to the logger, so skip reporting instead of failing.
        if rank == 0:
            print("Validation skipped at iteration {}: no batches".format(
                iteration))
        return

    val_loss = val_loss / num_batches
    if rank == 0:
        # Report the mean over all batches, not just the last batch's loss.
        print("Validation loss {}: {:9f} ".format(iteration, val_loss))
        logger.log_validation(val_loss, model, y, y_pred, iteration)