DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep/trainer/utils/setup.py

118 lines
3.6 KiB
Python

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import json
import logging
import os
import dllogger
import horovod.tensorflow.keras as hvd
import tensorflow as tf
from data.outbrain.dataloader import train_input_fn, eval_input_fn
from trainer.utils.gpu_affinity import set_affinity
def init_cpu(args, logger):
    """Set up a CPU-only run, then abort because it is unsupported.

    Sets ``CUDA_VISIBLE_DEVICES`` to ``-1``, configures full logging through
    ``init_logger``, warns that the computation would run on CPU and finally
    raises.

    Args:
        args: Parsed command-line namespace, forwarded to ``init_logger``.
        logger: Logger that gets configured and receives the warning.

    Raises:
        RuntimeError: Always, once logging has been configured.
    """
    os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})
    init_logger(args, True, logger)
    logger.warning("--gpu flag not set, running computation on CPU")

    unsupported_msg = "CPU not supported with nvTabular dataloader"
    raise RuntimeError(unsupported_msg)
def init_gpu(args, logger):
    """Initialise Horovod, logging and TensorFlow options for a GPU worker.

    Args:
        args: Parsed command-line namespace. Reads ``affinity``, ``amp`` and
            ``xla`` here; the whole namespace is forwarded to ``init_logger``.
        logger: Logger to configure; also receives the thread-affinity report.
    """
    hvd.init()

    # Only rank 0 gets full logging; init_logger restricts the other workers
    # to error-level output with no dllogger backends.
    init_logger(full=hvd.rank() == 0, args=args, logger=logger)

    if args.affinity != "disabled":
        gpu_id = hvd.local_rank()
        # nproc_per_node is a per-node quantity, so pass the number of Horovod
        # processes on this node. hvd.size() is the global world size and
        # over-counts on multi-node runs (the two agree on a single node).
        affinity = set_affinity(
            gpu_id=gpu_id, nproc_per_node=hvd.local_size(), mode=args.affinity
        )
        logger.warning(f"{gpu_id}: thread affinity: {affinity}")

    if args.amp:
        tf.keras.mixed_precision.set_global_policy("mixed_float16")

    if args.xla:
        tf.config.optimizer.set_jit(True)
def init_logger(args, full, logger):
    """Configure the Python logger and dllogger for this process.

    With ``full`` set, the logger is switched to INFO, dllogger writes to a
    JSON file in ``args.results_dir`` and to stdout, and the arguments are
    dumped to ``args.json`` in the same directory. Otherwise the logger is
    limited to ERROR and dllogger is initialised without backends. In both
    cases the arguments are then logged through dllogger under the
    ``"PARAMETER"`` step.

    Args:
        args: Parsed command-line namespace. Reads ``results_dir`` and
            ``log_filename``; ``vars(args)`` must be JSON-serialisable.
        full: Whether this process should produce full logging output.
        logger: The ``logging.Logger`` whose level is adjusted.
    """
    if full:
        logger.setLevel(logging.INFO)

        # exist_ok=True makes this idempotent and guarantees the directory is
        # present for every file written below, so no further existence check
        # is needed.
        os.makedirs(args.results_dir, exist_ok=True)
        log_path = os.path.join(args.results_dir, args.log_filename)

        dllogger.init(
            backends=[
                dllogger.JSONStreamBackend(
                    verbosity=dllogger.Verbosity.VERBOSE, filename=log_path
                ),
                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE),
            ]
        )
        logger.warning("command line arguments: %s", json.dumps(vars(args)))

        args_path = os.path.join(args.results_dir, "args.json")
        with open(args_path, "w") as f:
            json.dump(vars(args), f, indent=4)
    else:
        logger.setLevel(logging.ERROR)
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step="PARAMETER")
def create_config(args):
    """Validate arguments, initialise the runtime and build both datasets.

    Args:
        args: Parsed command-line namespace. Reads ``cpu``, ``amp``,
            ``benchmark``, ``benchmark_warmup_steps``, ``benchmark_steps``,
            ``global_batch_size``, ``eval_batch_size``,
            ``train_data_pattern`` and ``eval_data_pattern``.

    Returns:
        A dict with keys ``"train_dataset"`` and ``"eval_dataset"`` holding
        the values returned by ``train_input_fn`` and ``eval_input_fn``.

    Raises:
        AssertionError: If AMP is requested together with CPU execution, or
            if benchmarking is enabled and the warmup step count is not
            lower than the benchmark step count.
    """
    # Raised explicitly instead of via ``assert`` so the validation still
    # runs under ``python -O``. The exception type and messages are the ones
    # the assert statements produced, so existing callers are unaffected.
    if args.cpu and args.amp:
        raise AssertionError(
            "Automatic mixed precision conversion works only with GPU"
        )
    if args.benchmark and not (
        args.benchmark_warmup_steps < args.benchmark_steps
    ):
        raise AssertionError(
            "Number of benchmark steps must be higher than warmup steps"
        )

    logger = logging.getLogger("tensorflow")

    if args.cpu:
        init_cpu(args, logger)
    else:
        init_gpu(args, logger)

    # The configured batch sizes are split evenly across workers.
    num_gpus = 1 if args.cpu else hvd.size()
    train_batch_size = args.global_batch_size // num_gpus
    eval_batch_size = args.eval_batch_size // num_gpus

    # Sorted so the file order does not depend on glob's directory order.
    train_paths = sorted(glob.glob(args.train_data_pattern))
    valid_paths = sorted(glob.glob(args.eval_data_pattern))

    train_spec_input_fn = train_input_fn(
        train_paths=train_paths,
        records_batch_size=train_batch_size,
    )

    eval_spec_input_fn = eval_input_fn(
        valid_paths=valid_paths, records_batch_size=eval_batch_size
    )

    config = {
        "train_dataset": train_spec_input_fn,
        "eval_dataset": eval_spec_input_fn,
    }

    return config