144 lines
5.5 KiB
Python
144 lines
5.5 KiB
Python
# BSD 3-Clause License
|
|
|
|
# Copyright (c) 2018-2020, NVIDIA Corporation
|
|
# All rights reserved.
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
# * Redistributions of source code must retain the above copyright notice, this
|
|
# list of conditions and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
# and/or other materials provided with the distribution.
|
|
|
|
# * Neither the name of the copyright holder nor the names of its
|
|
# contributors may be used to endorse or promote products derived from
|
|
# this software without specific prior written permission.
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
"""https://github.com/NVIDIA/tacotron2"""
|
|
|
|
import os
|
|
from numpy import finfo
|
|
|
|
import torch
|
|
from tacotron2.distributed import apply_gradient_allreduce
|
|
import torch.distributed as dist
|
|
from torch.utils.data.distributed import DistributedSampler
|
|
from torch.utils.data import DataLoader
|
|
|
|
from tacotron2.model import Tacotron2
|
|
|
|
|
|
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
|
|
|
|
|
def reduce_tensor(tensor, n_gpus):
    """Average a tensor's value across all distributed processes.

    Args:
        tensor: tensor to reduce (must have the same shape on every rank).
        n_gpus: number of participating processes/GPUs to divide by.

    Returns:
        A new tensor holding the element-wise mean over all ranks; the
        input tensor is left untouched.
    """
    rt = tensor.clone()  # clone so the caller's tensor is not mutated in place
    # `dist.reduce_op` was a deprecated alias and has been removed from
    # recent PyTorch releases; `dist.ReduceOp` is the canonical enum.
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt
|
|
|
|
|
|
def init_distributed(hparams, n_gpus, rank, group_name):
    """Join the multi-GPU process group and pin this process to a GPU.

    Args:
        hparams: hyper-parameters providing ``dist_backend`` and ``dist_url``.
        n_gpus: total number of processes in the group (world size).
        rank: this process's rank within the group.
        group_name: name identifying the distributed group.
    """
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Bind this process to its own GPU before any CUDA work happens.
    local_gpu = rank % torch.cuda.device_count()
    torch.cuda.set_device(local_gpu)

    # Join (or create) the communication group shared by all ranks.
    dist.init_process_group(
        backend=hparams.dist_backend,
        init_method=hparams.dist_url,
        world_size=n_gpus,
        rank=rank,
        group_name=group_name,
    )

    print("Done initializing distributed")
|
|
|
|
|
|
def load_model(hparams):
    """Build a Tacotron2 model on the default device, configured per hparams.

    Applies the fp16 attention-mask adjustment and the distributed
    gradient-allreduce wrapper when the corresponding hparams flags are set.
    """
    net = Tacotron2(hparams).to(device)

    if hparams.fp16_run:
        # float32's minimum overflows in half precision, so use the most
        # negative float16 value as the attention masking fill value.
        net.decoder.attention_layer.score_mask_value = finfo('float16').min

    if hparams.distributed_run:
        net = apply_gradient_allreduce(net)

    return net
|
|
|
|
|
|
def warm_start_model(checkpoint_path, model, ignore_layers):
    """Load checkpoint weights into ``model``, keeping some layers as-is.

    State-dict entries named in ``ignore_layers`` retain the weights the
    model already has; every other entry is overwritten from the checkpoint.

    Args:
        checkpoint_path: path to a checkpoint containing a 'state_dict' entry.
        model: the model to (partially) initialize.
        ignore_layers: state-dict keys to exclude from the checkpoint.

    Returns:
        The same ``model`` instance with updated parameters.
    """
    assert os.path.isfile(checkpoint_path)
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    pretrained = checkpoint_dict['state_dict']
    if ignore_layers:
        # Drop the ignored entries, then backfill from the model's own
        # state dict so load_state_dict still sees every expected key.
        pretrained = {key: value for key, value in pretrained.items()
                      if key not in ignore_layers}
        merged = model.state_dict()
        merged.update(pretrained)
        pretrained = merged
    model.load_state_dict(pretrained)
    return model
|
|
|
|
|
|
def load_checkpoint(checkpoint_path, model, optimizer):
    """Restore model/optimizer state plus training metadata from disk.

    Args:
        checkpoint_path: path to a checkpoint written by ``save_checkpoint``.
        model: model whose parameters will be overwritten in place.
        optimizer: optimizer whose state will be overwritten in place.

    Returns:
        Tuple of (model, optimizer, learning_rate, iteration) as stored
        in the checkpoint file.
    """
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))
    state = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state['state_dict'])
    optimizer.load_state_dict(state['optimizer'])
    learning_rate = state['learning_rate']
    iteration = state['iteration']
    print("Loaded checkpoint '{}' from iteration {}" .format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration
|
|
|
|
|
|
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Serialize training state (model, optimizer, LR, step) to ``filepath``."""
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    # Bundle everything load_checkpoint expects into one dictionary.
    state = {
        'iteration': iteration,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
    }
    torch.save(state, filepath)
|
|
|
|
|
|
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank):
    """Handles all the validation scoring and printing.

    Computes the criterion loss averaged over the whole validation set and,
    on rank 0 only, prints it and forwards it to the logger.

    Args:
        model: network exposing ``parse_batch`` and the forward call.
        criterion: loss function applied to (y_pred, y).
        valset: validation dataset fed to the DataLoader.
        iteration: current training step, used for logging.
        batch_size: validation batch size.
        n_gpus: world size, used to average the loss across ranks.
        collate_fn: batch collation function for the DataLoader.
        logger: object with a ``log_validation`` method.
        distributed_run: whether to sample/reduce across processes.
        rank: this process's rank; only rank 0 reports.
    """
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if distributed_run:
                # Average the loss across ranks so every process accumulates
                # the same validation number.
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        # Mean over all validation batches.
        val_loss = val_loss / (i + 1)

    model.train()
    if rank == 0:
        # BUG FIX: previously this reported `reduced_val_loss`, i.e. only
        # the LAST batch's loss, discarding the average computed above.
        # Report the dataset-wide mean instead.
        print("Validation loss {}: {:9f} ".format(iteration, val_loss))
        logger.log_validation(val_loss, model, y, y_pred, iteration)
|