# DeepLearningExamples/TensorFlow2/Classification/ConvNets/efficientnet/runtime/runner.py
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import multiprocessing
import warnings
import yaml
import time
import tensorflow as tf
import numpy as np
import horovod.tensorflow.keras as hvd
from utils import hvd_utils, optimizer_factory
from utils import callbacks as custom_callbacks
from runtime.runner_utils import get_optimizer_params, get_metrics, get_learning_rate_params, \
    build_model_params, get_models, get_dataset_builders, build_stats, \
    parse_inference_input, preprocess_image_files

__all__ = [
    'Runner',
]
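
# Maps both long and short precision names (e.g. 'float32' and 'fp32')
# to the corresponding TensorFlow dtypes.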
DTYPE_MAP = {
    'float32': tf.float32,
    'bfloat16': tf.bfloat16,
    'float16': tf.float16,
    'fp32': tf.float32,
    'bf16': tf.bfloat16,
}


class Runner(object):
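    """Orchestrates EfficientNet training, evaluation, and inference.

    A minimal usage sketch (an assumption based on how the surrounding scripts
    are expected to drive this class; `flags` is the parsed command-line
    namespace, `logger` a compatible logger, and Horovod is already initialized):

        runner = Runner(flags, logger)
        if 'train' in flags.mode:
            runner.train()
        if 'eval' in flags.mode:
            runner.evaluate()
    """
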
    def __init__(self, flags, logger):
        self.params = flags
        self.logger = logger

        if hvd.rank() == 0:
            self.serialize_config(model_dir=self.params.model_dir)

        # =================================================
        # Define Datasets
        # =================================================
        label_smoothing = flags.label_smoothing
        self.one_hot = label_smoothing and label_smoothing > 0

        builders = get_dataset_builders(self.params, self.one_hot)
        datasets = [builder.build() if builder else None for builder in builders]
        self.train_dataset, self.validation_dataset = datasets
        self.train_builder, self.validation_builder = builders

        self.initialize()

        # =================================================
        # Define Model
        # =================================================
        model_params = build_model_params(model_name=self.params.arch,
                                          is_training="predict" not in self.params.mode,
                                          batch_norm=self.params.batch_norm,
                                          num_classes=self.params.num_classes,
                                          activation=self.params.activation,
                                          dtype=DTYPE_MAP[self.params.dtype],
                                          weight_decay=self.params.weight_decay,
                                          weight_init=self.params.weight_init)

        models_dict = get_models()
        self.model = [model for model_name, model in models_dict.items()
                      if model_name in self.params.arch][0](**model_params)

        self.metrics = ['accuracy', 'top_5']

        if self.params.dataset == 'ImageNet':
            self.train_num_examples = 1281167
            self.eval_num_examples = 50000

    def initialize(self):
        """Initializes backend-specific settings (image data format, eager execution)."""
        if tf.config.list_physical_devices('GPU'):
            data_format = 'channels_first'
        else:
            data_format = 'channels_last'
        tf.keras.backend.set_image_data_format(data_format)
        if self.params.run_eagerly:
            # Enable eager execution to allow step-by-step debugging
            tf.config.experimental_run_functions_eagerly(True)
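
    # Loads the most recent checkpoint found in model_dir and returns the
    # optimizer's iteration counter (0 when there is no checkpoint to restore).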
    def load_model_weights(self, model_dir):
        latest_checkpoint = tf.train.latest_checkpoint(model_dir)
        if not latest_checkpoint:
            return 0
        self.model.load_weights(latest_checkpoint)
        return self.model.optimizer.iterations

    def resume_from_checkpoint(self,
                               model_dir: str,
                               train_steps: int) -> int:
        """Resumes training from the latest checkpoint, if available.

        Loads the model weights and optimizer settings from a checkpoint.
        This function should be used in case of preemption recovery.

        Args:
            model_dir: The directory where model weights were saved.
            train_steps: The number of training steps per epoch.

        Returns:
            The epoch of the latest checkpoint, or 0 if not restoring.
        """
        last_iteration = self.load_model_weights(model_dir)
        initial_epoch = last_iteration // train_steps
        return int(initial_epoch)

    def serialize_config(self, model_dir: str):
        """Serializes and saves the experiment config."""
        params_save_path = os.path.join(model_dir, 'params.yaml')
        with open(params_save_path, 'w') as outfile:
            yaml.dump(vars(self.params), outfile, default_flow_style=False)

    def train(self):
        train_epochs = self.params.max_epochs
        train_steps = self.params.steps_per_epoch if self.params.steps_per_epoch is not None \
            else self.train_num_examples // self.train_builder.global_batch_size
        if self.validation_builder is not None:
            validation_steps = self.eval_num_examples // self.validation_builder.global_batch_size
        else:
            validation_steps = None
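
        # Build the learning-rate schedule and the optimizer from the command-line flags.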
        learning_rate = optimizer_factory.build_learning_rate(
            params=get_learning_rate_params(name=self.params.lr_decay,
                                            initial_lr=self.params.lr_init,
                                            decay_epochs=self.params.lr_decay_epochs,
                                            decay_rate=self.params.lr_decay_rate,
                                            warmup_epochs=self.params.lr_warmup_epochs),
            batch_size=self.train_builder.global_batch_size,
            train_steps=train_steps,
            max_epochs=train_epochs)

        optimizer = optimizer_factory.build_optimizer(
            optimizer_name=self.params.optimizer,
            base_learning_rate=learning_rate,
            params=get_optimizer_params(name=self.params.optimizer,
                                        decay=self.params.decay,
                                        epsilon=self.params.epsilon,
                                        momentum=self.params.momentum,
                                        moving_average_decay=self.params.moving_average_decay,
                                        nesterov=self.params.nesterov,
                                        beta_1=self.params.beta_1,
                                        beta_2=self.params.beta_2))

        metrics_map = get_metrics(self.one_hot)
        metrics = [metrics_map[metric] for metric in self.metrics]
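
        # Wrap the optimizer so Horovod averages gradients across workers,
        # compressing the allreduced tensors to fp16 to reduce communication cost.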
        optimizer = hvd.DistributedOptimizer(optimizer, compression=hvd.Compression.fp16)

        if self.one_hot:
            loss_obj = tf.keras.losses.CategoricalCrossentropy(
                label_smoothing=self.params.label_smoothing)
        else:
            loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()

        # Training
        self.model.compile(optimizer=optimizer,
                           loss=loss_obj,
                           metrics=metrics,
                           experimental_run_tf_function=False)
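
        # When resuming, derive the starting epoch from the checkpointed optimizer step.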
        initial_epoch = 0
        if self.params.resume_checkpoint:
            initial_epoch = self.resume_from_checkpoint(model_dir=self.params.model_dir,
                                                        train_steps=train_steps)

        # Define callbacks
        callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
        callbacks += custom_callbacks.get_callbacks(
            model_checkpoint=self.params.enable_checkpoint_and_export,
            include_tensorboard=self.params.enable_tensorboard,
            time_history=self.params.time_history,
            track_lr=True,
            write_model_weights=self.params.write_model_weights,
            initial_step=initial_epoch * train_steps,
            batch_size=self.train_builder.global_batch_size,
            log_steps=self.params.log_steps,
            model_dir=self.params.model_dir,
            save_checkpoint_freq=train_steps * self.params.save_checkpoint_freq,
            logger=self.logger)
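
        # Only pass validation data to fit() when evaluation is part of the requested mode.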
if "eval" not in self.params.mode:
validation_kwargs = {}
else:
validation_kwargs = {
'validation_data': self.validation_dataset,
'validation_steps': validation_steps,
'validation_freq': self.params.num_epochs_between_eval,
}
history = self.model.fit(
self.train_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
initial_epoch=initial_epoch,
callbacks=callbacks,
verbose=2,
**validation_kwargs)
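
        # Optionally run a final evaluation; per-worker metrics are averaged
        # across ranks with a Horovod allreduce before being reported.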
        validation_output = None
        eval_callback = None
        if not self.params.skip_eval and self.validation_builder is not None:
            eval_callback = custom_callbacks.EvalTimeHistory(batch_size=self.params.eval_batch_size, logger=self.logger)
            worker_validation_output = self.model.evaluate(
                self.validation_dataset, steps=validation_steps, callbacks=eval_callback, verbose=2)
            validation_output = list(hvd.allreduce(worker_validation_output, average=True))

        build_stats(history, validation_output, callbacks, eval_callback, self.logger)

    def evaluate(self):
        if self.validation_builder is not None:
            validation_steps = self.eval_num_examples // self.validation_builder.global_batch_size
        else:
            validation_steps = None

        metrics_map = get_metrics(self.one_hot)
        metrics = [metrics_map[metric] for metric in self.metrics]

        if self.one_hot:
            loss_obj = tf.keras.losses.CategoricalCrossentropy(
                label_smoothing=self.params.label_smoothing)
        else:
            loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()

        # Compile so that the loss and metrics are attached; the optimizer is not
        # used during evaluation, so an "rmsprop" placeholder is sufficient.
        self.model.compile(optimizer="rmsprop",
                           loss=loss_obj,
                           metrics=metrics,
                           experimental_run_tf_function=False)

        _ = self.load_model_weights(self.params.model_dir)

        eval_callback = custom_callbacks.EvalTimeHistory(batch_size=self.params.eval_batch_size, logger=self.logger)
        results = self.model.evaluate(self.validation_dataset, steps=validation_steps, callbacks=eval_callback, verbose=1)
        build_stats(None, results, None, eval_callback, self.logger)

    def predict(self, to_predict, checkpoint_name=None, print_results=True):
        images = preprocess_image_files(directory_name=to_predict, arch=self.params.arch,
                                        batch_size=self.params.predict_batch_size, dtype=DTYPE_MAP[self.params.dtype])
        nb_samples = len(images)
        if checkpoint_name is not None:
            self.model.load_weights(checkpoint_name)
        try:
            file_names = images.filenames
            num_files = len(file_names)
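            # In benchmark mode, run 50x as many prediction steps to measure
            # throughput and latency over a longer run, and skip per-image output.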
            if self.params.benchmark:
                nb_samples *= 50
                print_results = False
                num_files *= 50

            start_time = time.time()
            inference_results = self.model.predict(images, verbose=1, steps=nb_samples)
            total_time = time.time() - start_time

            score = tf.nn.softmax(inference_results, axis=1)
            if print_results:
                for i, name in enumerate(file_names):
                    print(
                        "Image {} most likely belongs to class {} with {} percent confidence."
                        .format(name, tf.math.argmax(score[i]), 100 * tf.math.reduce_max(score[i]))
                    )
            print("Total time to infer {} images :: {}".format(num_files, total_time))
            print("Inference Throughput {}".format(num_files / total_time))
            print("Inference Latency {}".format(total_time / num_files))
        except KeyboardInterrupt:
            print("Keyboard interrupt")

        print('Ending Inference ...')