# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import logging
import os

import dllogger
import horovod.tensorflow.keras as hvd
import tensorflow as tf

from data.outbrain.dataloader import train_input_fn, eval_input_fn
from trainer.utils.gpu_affinity import set_affinity


def init_cpu(args, logger):
    """Prepare a CPU-only run, then abort: nvTabular requires a GPU.

    Masks all CUDA devices from TensorFlow, configures logging at full
    verbosity, emits a warning, and raises.

    Args:
        args: parsed command-line namespace, forwarded to ``init_logger``.
        logger: the ``logging.Logger`` to configure.

    Raises:
        RuntimeError: always — the nvTabular dataloader has no CPU path.
    """
    # "-1" makes CUDA report no visible devices, forcing CPU execution.
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # Single process here, so always log at full verbosity.
    init_logger(full=True, args=args, logger=logger)

    logger.warning("--gpu flag not set, running computation on CPU")

    raise RuntimeError("CPU not supported with nvTabular dataloader")


def init_gpu(args, logger):
    """Initialize Horovod, logging, CPU affinity, AMP, and XLA for GPU runs.

    Args:
        args: parsed command-line namespace; reads ``affinity``, ``amp``
            and ``xla``.
        logger: the ``logging.Logger`` to configure.
    """
    hvd.init()

    # Only the rank-0 worker logs verbosely; every other rank stays quiet.
    init_logger(full=hvd.rank() == 0, args=args, logger=logger)

    if args.affinity != "disabled":
        gpu_id = hvd.local_rank()
        affinity = set_affinity(
            gpu_id=gpu_id,
            nproc_per_node=hvd.size(),
            mode=args.affinity,
        )
        logger.warning(f"{gpu_id}: thread affinity: {affinity}")

    if args.amp:
        # Compute-heavy ops run in float16 while variables stay float32.
        tf.keras.mixed_precision.set_global_policy("mixed_float16")

    if args.xla:
        # Turn on XLA JIT compilation globally.
        tf.config.optimizer.set_jit(True)


def init_logger(args, full, logger):
    """Configure Python logging and NVIDIA dllogger for this process.

    Args:
        args: parsed command-line namespace; reads ``results_dir`` and
            ``log_filename``.
        full: when True (rank 0 / single process), log at INFO level to
            stdout and a JSON file, and dump the arguments to
            ``args.json`` in the results directory; otherwise log errors
            only and disable all dllogger backends.
        logger: the ``logging.Logger`` to configure.
    """
    if full:
        logger.setLevel(logging.INFO)
        log_path = os.path.join(args.results_dir, args.log_filename)
        # exist_ok=True guarantees the directory exists from here on;
        # a separate exists()/mkdir() check would be dead code.
        os.makedirs(args.results_dir, exist_ok=True)
        dllogger.init(
            backends=[
                dllogger.JSONStreamBackend(
                    verbosity=dllogger.Verbosity.VERBOSE, filename=log_path
                ),
                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE),
            ]
        )
        logger.warning("command line arguments: {}".format(json.dumps(vars(args))))

        # Persist the full argument set for reproducibility.
        with open("{}/args.json".format(args.results_dir), "w") as f:
            json.dump(vars(args), f, indent=4)
    else:
        logger.setLevel(logging.ERROR)
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step="PARAMETER")


def create_config(args):
    """Validate arguments, initialize the runtime, and build the datasets.

    Args:
        args: parsed command-line namespace.

    Returns:
        dict with ``"train_dataset"`` and ``"eval_dataset"`` entries
        produced by the nvTabular input functions.

    Raises:
        AssertionError: on incompatible CPU/AMP flags or invalid
            benchmark step counts.
        RuntimeError: from ``init_cpu`` when ``args.cpu`` is set.
    """
    assert not args.cpu or not args.amp, \
        "Automatic mixed precision conversion works only with GPU"
    if args.benchmark:
        assert args.benchmark_warmup_steps < args.benchmark_steps, \
            "Number of benchmark steps must be higher than warmup steps"

    logger = logging.getLogger("tensorflow")

    # CPU initialization raises: the nvTabular dataloader needs a GPU.
    if args.cpu:
        init_cpu(args, logger)
    else:
        init_gpu(args, logger)

    # Global batch sizes are split evenly across Horovod workers.
    worker_count = 1 if args.cpu else hvd.size()
    per_worker_train_bs = args.global_batch_size // worker_count
    per_worker_eval_bs = args.eval_batch_size // worker_count

    # Sorting gives every worker an identical, deterministic file order.
    training_files = sorted(glob.glob(args.train_data_pattern))
    validation_files = sorted(glob.glob(args.eval_data_pattern))

    return {
        "train_dataset": train_input_fn(
            train_paths=training_files,
            records_batch_size=per_worker_train_bs,
        ),
        "eval_dataset": eval_input_fn(
            valid_paths=validation_files,
            records_batch_size=per_worker_eval_bs,
        ),
    }