[NCF/TF] Adding new logging
parent 0845aaa901
commit 43aa1de260

TensorFlow/Recommendation/NCF/.gitmodules (vendored, new file)
Dockerfile

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.06-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.10-py3
 FROM ${FROM_IMAGE_NAME}

 RUN apt-get update && \
README.md

@@ -217,7 +217,7 @@ To run evaluation on a specific checkpoint, simply run the following command:

 ```bash
 checkpoint=/data/checkpoints/model.ckpt
-python ncf.py --data /data/cache/ml-20m --mode test --checkpoint-dir $checkpoint
+python ncf.py --data /data/cache/ml-20m --mode test --load-checkpoint-path $checkpoint
 ```

 Note: TensorFlow checkpoints consist of 3 files each with a `*.ckpt` prefix.
@@ -389,11 +389,7 @@ performance in training and inference modes.
 To benchmark the training and inference performance, run:

 ```
-numgpu=4
-datadir=/data/cache/ml-20m
-mpirun -np $numgpu \
-    --allow-run-as-root \
-    python ncf.py --data $datadir
+mpirun -np 1 --allow-run-as-root python ncf.py --data /data/cache/ml-20m
 ```

 By default, the `ncf.py` script outputs metrics describing the following:
convert.py

@@ -32,16 +32,12 @@ from argparse import ArgumentParser
 import pandas as pd
 from load import implicit_load

-from logger.logger import LOGGER
-from logger import tags
-
 import tensorflow as tf

 MIN_RATINGS = 20
 USER_COLUMN = 'user_id'
 ITEM_COLUMN = 'item_id'

-LOGGER.model = 'ncf'

 def parse_args():
     parser = ArgumentParser()

@@ -59,7 +55,6 @@ def main():

     print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
     grouped = df.groupby(USER_COLUMN)
-    LOGGER.log(key=tags.PREPROC_HP_MIN_RATINGS, value=MIN_RATINGS)
     df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

     print("Mapping original user and item IDs to new sequential IDs")
@@ -1,3 +1,20 @@
+# -----------------------------------------------------------------------
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
 import numpy as np
 import cupy as cp
logger/analyzer.py (deleted)

@@ -1,131 +0,0 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-from collections import defaultdict
-import json
-
-from logger import logger as nvl
-from logger.parser import NVLogParser
-from logger import tags
-
-
-def collect_by_scope(loglines):
-
-    # dict to gather run scope results
-    run_stats = dict()
-    epoch_stats = dict()
-    iteration_stats = dict()
-
-    # TODO: check if there is only one tag per run scope
-    # gather all lines with run_scope events & variables
-    run_events = dict((l.tag, l) for l in loglines if l.scope == nvl.RUN_SCOPE)
-
-    # gather all variable tags
-    run_variables = dict(k for k in run_events.items() if k[1].value is not None)
-
-    # find all time block names
-    timed_blocks = [k[:-6] for k in run_events if k.endswith('_start')]
-
-    # measure times for the run scope
-    for prefix in timed_blocks:
-        # only when both start & stop are found
-        # TODO: assert when not paired
-        if prefix + "_start" in run_events and prefix + "_stop" in run_events:
-            start = run_events[prefix + "_start"].timestamp
-            stop = run_events[prefix + "_stop"].timestamp
-            run_stats[prefix + "_time"] = stop - start
-
-    # collect all variables - even nested
-    for k in run_variables:
-        e = run_events[k]
-        if isinstance(e.value, dict):
-            for d in e.value.keys():
-                run_stats[k + "_" + d] = e.value[d]
-        else:
-            run_stats[k] = e.value
-
-    # find epochs
-    epochs = sorted(list({int(l.epoch) for l in loglines if int(l.epoch) >= 0}))
-    epoch_stats['x'] = epochs
-
-    # gather eval_accuracy
-    eval_accuracy_dup = [l.value for l in loglines if l.tag == tags.EVAL_ACCURACY]
-    eval_accuracy = list({l['value']: l for l in eval_accuracy_dup})
-    epoch_stats['eval_accuracy'] = eval_accuracy
-
-    # gather it_per_sec
-    eval_it_per_sec = [l.value for l in loglines if l.tag == tags.PERF_IT_PER_SEC]
-    #eval_it_per_sec = list({l['value']:l for l in eval_it_per_sec_dup})
-    epoch_stats['it_per_sec'] = eval_it_per_sec
-
-
-    # gather all epoch-iter tuples
-    # TODO: l.iteration is always set to -1 in parser.py
-    all_iterations = {(int(l.epoch), int(l.iteration)) for l in loglines if int(l.iteration) >= 0}
-
-    # group by epoch
-    collected_iterations = defaultdict(list)
-    for el in all_iterations:
-        collected_iterations[el[0]].append(el[1])
-
-    # convert to list of lists
-    iterations = [sorted(collected_iterations[k]) for k in sorted(collected_iterations.keys())]
-    iteration_stats['x'] = iterations
-
-    # gather all epoch-iter-loss triples
-    all_loss_dicts = [l.value for l in loglines if l.tag == tags.TRAIN_ITERATION_LOSS]
-    all_loss = {(l['epoch'], l['iteration'], l['value']) for l in all_loss_dicts}
-
-    # group by epoch
-    collected_loss = defaultdict(list)
-    for el in all_loss:
-        collected_loss[el[0]].append(el[2])
-
-    # convert to list of lists
-    iterations_loss = [sorted(collected_loss[k]) for k in sorted(collected_loss.keys())]
-    iteration_stats['loss'] = iterations_loss
-
-    # find epoch events and variables
-    epoch_events = [l for l in loglines if l.scope == nvl.EPOCH_SCOPE]
-    epoch_event_names = {l.tag for l in epoch_events}
-    epoch_timed_blocks = {k[:-6] for k in epoch_event_names if k.endswith('_start')}
-    epoch_variables = {l.tag for l in epoch_events if l.value is not None}
-
-    # TODO: WIP
-
-
-    return {"run": run_stats, "epoch": epoch_stats, "iter": iteration_stats}
-
-
-def analyze(input_path, output_path=None):
-    parser = NVLogParser()
-    loglines, errors, worker_loglines = parser.parse_file(input_path)
-
-    stats = collect_by_scope(worker_loglines['(0)'])
-
-    if not output_path:
-        print(json.dumps(stats, indent=4))
-    else:
-        with open(output_path, 'w') as f:
-            json.dump(obj=stats, fp=f, indent=4)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print('usage: analyzer.py FILENAME')
-        print('       tests analyzing on the file.')
-        sys.exit(1)
-
-    analyze(input_path=sys.argv[1], output_path=None)
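For context on the removal: `analyzer.py` folded parsed loglines into run-, epoch-, and iteration-scoped statistics. A minimal sketch of how it was driven, assuming the pre-commit package layout; the log file name is hypothetical:

```python
# Sketch only: driving the removed analyzer. 'train.log' is a
# hypothetical file containing captured :::NVLOG output.
from logger.parser import NVLogParser
from logger.analyzer import collect_by_scope

parser = NVLogParser()
loglines, errors, worker_loglines = parser.parse_file('train.log')

# '(0)' is the default worker id the parser assigns to lines
# without an explicit worker prefix.
stats = collect_by_scope(worker_loglines['(0)'])
print(stats['run'])                     # e.g. '<prefix>_time' entries
print(stats['epoch']['eval_accuracy'])  # accuracy collected per epoch
```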
logger/autologging.py (deleted)

@@ -1,60 +0,0 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import subprocess
-import xml.etree.ElementTree as ET
-
-from logger.logger import LOGGER
-
-#TODO: print CUDA version, container version etc
-
-def log_hardware():
-    # TODO: asserts - what if you cannot launch those commands?
-    # number of CPU threads
-    cpu_info_command = 'cat /proc/cpuinfo'
-    cpu_info = subprocess.run(cpu_info_command.split(), stdout=subprocess.PIPE).stdout.split()
-    cpu_num_index = len(cpu_info) - cpu_info[::-1].index(b'processor') + 1
-    cpu_num = int(cpu_info[cpu_num_index]) + 1
-
-    # CPU name
-    cpu_name_begin_index = cpu_info.index(b'name')
-    cpu_name_end_index = cpu_info.index(b'stepping')
-    cpu_name = b' '.join(cpu_info[cpu_name_begin_index + 2:cpu_name_end_index]).decode('utf-8')
-
-    LOGGER.log(key='cpu_info', value={"num": cpu_num, "name": cpu_name}, stack_offset=1)
-
-    # RAM memory
-    ram_info_command = 'free -m -h'
-    ram_info = subprocess.run(ram_info_command.split(), stdout=subprocess.PIPE).stdout.split()
-    ram_index = ram_info.index(b'Mem:') + 1
-    ram = ram_info[ram_index].decode('utf-8')
-
-    LOGGER.log(key='mem_info', value={"ram": ram}, stack_offset=1)
-
-    # GPU
-    nvidia_smi_command = 'nvidia-smi -q -x'
-    nvidia_smi_output = subprocess.run(nvidia_smi_command.split(), stdout=subprocess.PIPE).stdout
-    nvidia_smi = ET.fromstring(nvidia_smi_output)
-    gpus = nvidia_smi.findall('gpu')
-    ver = nvidia_smi.findall('driver_version')
-
-    LOGGER.log(key="gpu_info",
-               stack_offset=1,
-               value={
-                   "driver_version": ver[0].text,
-                   "num": len(gpus),
-                   "name": [g.find('product_name').text for g in gpus],
-                   "mem": [g.find('fb_memory_usage').find('total').text for g in gpus]})
-
-
-def log_args(args):
-    LOGGER.log(key='args', value=vars(args), stack_offset=1)
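`autologging.py` scraped `/proc/cpuinfo`, `free`, and `nvidia-smi -q -x` to emit hardware records at startup. A sketch of how these helpers were called, per the `ncf.py` hunk further below (argument parsing is the script's own and elided here):

```python
# Sketch only: the removed helpers were invoked once, typically on the
# rank-0 worker, emitting 'cpu_info', 'mem_info', 'gpu_info' and 'args'.
from logger.autologging import log_hardware, log_args

args = parse_args()  # hypothetical argparse result from the host script
log_hardware()
log_args(args)
```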
logger/logger.py (deleted)

@@ -1,175 +0,0 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import time
-import json
-import logging
-import os
-import inspect
-import sys
-import re
-from contextlib import contextmanager
-import functools
-
-
-NVLOGGER_VERSION = '0.1.0'
-NVLOGGER_TOKEN = ':::NVLOG'
-NVLOGGER_NAME = "nv_dl_logger"
-NVLOGGER_FILE_NAME = "nv_dl_logger"
-
-RUN_SCOPE = 0
-EPOCH_SCOPE = 1
-TRAIN_ITER_SCOPE = 2
-EVAL_ITER_SCOPE = 3
-
-LOGGING_SCOPE = {
-    RUN_SCOPE,
-    EPOCH_SCOPE,
-    TRAIN_ITER_SCOPE,
-    EVAL_ITER_SCOPE
-}
-
-
-def get_caller(stack_index=2, root_dir=None):
-    caller = inspect.getframeinfo(inspect.stack()[stack_index][0])
-
-    # Trim the file names for readability.
-    filename = caller.filename
-    if root_dir is not None:
-        filename = re.sub("^" + root_dir + "/", "", filename)
-    return "%s:%d" % (filename, caller.lineno)
-
-
-class NVLogger(object):
-    __instance = None
-    token = NVLOGGER_TOKEN
-    version = NVLOGGER_VERSION
-    stack_offset = 0
-    extra_print = False
-    model = "NN"
-    root_dir = None
-    worker = [0]
-    prefix = ''
-    log_file = None
-    file_handler = None
-
-    @staticmethod
-    def get_instance():
-        if NVLogger.__instance is None:
-            NVLogger()
-        return NVLogger.__instance
-
-    def set_worker(self, worker):
-        if worker is None:
-            self.prefix = ''
-            self.worker = [0]
-        else:
-            self.prefix = json.dumps(worker)
-            self.worker = list(worker)
-
-    def set_file(self, file_name=None):
-
-        if file_name is None:
-            self.log_file = os.getenv(NVLOGGER_FILE_NAME)
-        else:
-            self.log_file = file_name
-
-        if self.log_file:
-            self.file_handler = logging.FileHandler(self.log_file)
-            self.file_handler.setLevel(logging.DEBUG)
-            self.logger.addHandler(self.file_handler)
-            self.stream_handler.setLevel(logging.INFO)
-        else:
-            self.stream_handler.setLevel(logging.DEBUG)
-
-    def __init__(self):
-
-        if NVLogger.__instance is None:
-            NVLogger.__instance = self
-        else:
-            raise Exception("This class is a singleton!")
-
-        self.logger = logging.getLogger(NVLOGGER_NAME)
-        self.logger.setLevel(logging.DEBUG)
-
-        self.stream_handler = logging.StreamHandler(stream=sys.stdout)
-        self.stream_handler.setLevel(logging.DEBUG)
-        self.logger.addHandler(self.stream_handler)
-
-    def print_vars(self, variables, forced=False, stack_offset=0):
-        if isinstance(variables, dict):
-            for v in variables.keys():
-                self.log(key=v, value=variables[v], forced=forced, stack_offset=stack_offset+1)
-
-    def print_vars2(self, key, variables, forced=False, stack_offset=0):
-        if isinstance(variables, dict):
-            self.log(key=key, value=variables, forced=forced, stack_offset=stack_offset+1)
-
-    def log(self, key, value=None, forced=False, stack_offset=0):
-
-        # only the 0-worker will log
-        if not forced and self.worker != 0:
-            pass
-
-        if value is None:
-            msg = key
-        else:
-            str_json = json.dumps(value)
-            msg = '{key}: {value}'.format(key=key, value=str_json)
-
-        call_site = get_caller(2 + self.stack_offset + stack_offset, root_dir=self.root_dir)
-        now = time.time()
-
-        message = msg
-        if self.extra_print:
-            print()
-
-        self.logger.debug(message)
-
-
-LOGGER = NVLogger.get_instance()
-
-
-@contextmanager
-def timed_block(prefix, value=None, logger=LOGGER, forced=False, stack_offset=2):
-    """ This function helps with timed blocks
-    ----
-    Parameters:
-    prefix - one of items from TIMED_BLOCKS; the action to be timed
-    logger - NVLogger object
-    forced - if True then the events are always logged (even if it should be skipped)
-    """
-    if logger is None:
-        pass
-    logger.log(key=prefix + "_start", value=value, forced=forced, stack_offset=stack_offset)
-    yield logger
-    logger.log(key=prefix + "_stop", forced=forced, stack_offset=stack_offset)
-
-
-def timed_function(prefix, variable=None, forced=False):
-    """ This decorator helps with timed functions
-    ----
-    Parameters:
-    prefix - one of items from TIME_BLOCK; the action to be timed
-    logger - NVLogger object
-    forced - if True then the events are always logged (even if it should be skipped)
-    """
-    def timed_function_decorator(func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            logger = kwargs.get('logger', LOGGER)
-            value = kwargs.get(variable, next(iter(args), None))
-            with timed_block(prefix=prefix, logger=logger, value=value, forced=forced, stack_offset=3):
-                func(*args, **kwargs)
-        return wrapper
-    return timed_function_decorator
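`logger.py` held the `NVLogger` singleton plus the `timed_block` context manager and `timed_function` decorator. A sketch of the intended usage per the docstrings above; the epoch value and the wrapped functions are illustrative, not from the diff:

```python
# Sketch only: emitting paired *_start / *_stop events that the
# analyzer later turns into '<prefix>_time' entries.
from logger.logger import LOGGER, timed_block, timed_function

# Context-manager form: logs 'train_epoch_start' and 'train_epoch_stop'.
with timed_block('train_epoch', value={'epoch': 0}):
    train_one_epoch()  # hypothetical work to be timed

# Decorator form: wraps a function in the same start/stop pair.
@timed_function('preproc')
def preprocess(dataset):
    ...

# Plain key/value events go through the singleton directly.
LOGGER.log(key='eval_accuracy', value={'epoch': 0, 'value': 0.92})
```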
logger/parser.py (deleted)

@@ -1,223 +0,0 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import collections
-import json
-import re
-import sys
-from collections import defaultdict
-
-from logger import tags
-import logger.logger as nvl
-
-
-LogLine = collections.namedtuple('LogLine', [
-    'full_string',  # the complete line as a string
-    'worker',       # the worker id
-    'token',        # the token, i.e. ':::NVLOG'
-    'version_str',  # the version string, e.g. 'v0.1.0'
-    'model',        # the model, e.g. 'ncf'
-    'timestamp',    # seconds as a float, e.g. 1234.567
-    'filename',     # the file which generated the log line, e.g. './convert.py'
-    'lineno',       # the line in the file which generated the log line, e.g. 119
-    'tag',          # the string tag
-    'value',        # the parsed value associated with the tag, or None if no value
-    'epoch',        # the epoch number, or -1 if none
-    'iteration',    # the iteration number, or -1 if none
-    'scope'         # run, epoch, iteration, eval_iteration
-])
-
-
-def get_dict_value(x):
-    if isinstance(x, dict):
-        return x
-    return {"value": x}
-
-
-def get_value(x):
-    if isinstance(x, dict):
-        if "value" in x:
-            return x.get("value")
-        else:
-            return x
-    return x
-
-
-def get_named_value(x, name):
-    if isinstance(x, dict):
-        if name in x:
-            return x.get(name)
-        else:
-            return None
-    return x
-
-
-class NVLogParser(object):
-
-    def __init__(self, token=nvl.NVLOGGER_TOKEN, version=nvl.NVLOGGER_VERSION):
-
-        self.epoch = defaultdict(lambda: -1)
-        self.iteration = defaultdict(lambda: -1)
-        self.scope = defaultdict(lambda: 0)
-
-        self.version = version
-        self.token = token
-        self.line_pattern = (
-            '^'
-            '([\d]?)'                    # optional worker id (0)
-            '(' + token + ')'            # mandatory token (1)
-            'v([\d]+\.[\d+]\.[\d+])[ ]'  # mandatory version (2)
-            '([A-Za-z0-9_]+)[ ]'         # mandatory model (3)
-            '([\d\.]+)[ ]'               # mandatory timestamp (4)
-            '\(([^: ]+)'                 # mandatory file (5)
-            ':(\d+)\)[ ]'                # mandatory lineno (6)
-            '([A-Za-z0-9_]+)[ ]?'        # mandatory tag (7)
-            '(:\s+(.+))?'                # optional value (8)
-            '$'
-        )
-        # print(self.line_pattern)
-
-        self.line_regex = re.compile(self.line_pattern, re.X)
-
-    def string_to_logline(self, string):
-
-        m = self.line_regex.match(string)
-
-        if m is None:
-            raise ValueError('does not match regex')
-
-        args = [
-            m.group(0),  # full string
-        ]
-
-        # by default
-        worker = m.group(1)
-        if worker == "":
-            worker = "(0)"
-
-        args.append(worker)
-
-        args.append(m.group(2))  # token
-        args.append(m.group(3))  # version
-        args.append(m.group(4))  # model
-
-        try:
-            ts = float(m.group(5))  # parse timestamp
-            args.append(ts)
-        except ValueError:
-            raise ValueError('timestamp format incorrect')
-
-        args.append(m.group(6))  # file name
-
-        try:
-            lineno = int(m.group(7))  # may raise error
-            args.append(lineno)
-        except ValueError:
-            raise ValueError('line number format incorrect')
-
-        tag = m.group(8)
-        args.append(tag)  # tag
-
-        # 9th is ignored
-
-        value = m.group(10)
-
-        if value is not None:
-            j = json.loads(value)
-            args.append(j)
-        else:
-            # no Value
-            args.append(None)
-
-        # update processing state
-        if tag == tags.TRAIN_EPOCH_START or tag == tags.TRAIN_EPOCH:
-            self.epoch[worker] = get_named_value(value, tags.VALUE_EPOCH)
-            self.scope[worker] = nvl.EPOCH_SCOPE
-            self.iteration[worker] = -1
-
-        if tag == tags.TRAIN_EPOCH_STOP:
-            self.scope[worker] = nvl.RUN_SCOPE
-
-        if tag == tags.TRAIN_ITER_START:
-            self.iteration[worker] = get_named_value(value, tags.VALUE_ITERATION)
-            self.scope[worker] = nvl.TRAIN_ITER_SCOPE
-
-        if tag == tags.TRAIN_ITER_STOP:
-            self.scope[worker] = nvl.EPOCH_SCOPE
-
-        if tag == tags.PERF_IT_PER_SEC:
-            self.scope[worker] = nvl.EPOCH_SCOPE
-
-        if tag == tags.PERF_TIME_TO_TRAIN:
-            self.scope[worker] = nvl.RUN_SCOPE
-
-        args.append(self.epoch[worker])
-        args.append(self.iteration[worker])
-        args.append(self.scope[worker])
-
-        return LogLine(*args)
-
-    def parse_generator(self, gen):
-        worker_loglines = defaultdict(list)
-        loglines = []
-        failed = []
-
-        # state init for parsing
-        self.epoch.clear()
-        self.iteration.clear()
-        self.scope.clear()
-
-        for line in gen:
-            line = line.strip()
-            if line.find(self.token) == -1:
-                continue
-            try:
-                ll = self.string_to_logline(line)
-                worker_loglines[ll.worker].append(ll)
-                loglines.append(ll)
-            except ValueError as e:
-                failed.append((line, str(e)))
-
-        return loglines, failed, worker_loglines
-
-    def parse_file(self, filename):
-        with open(filename) as f:
-            return self.parse_generator(f)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print('usage: parser.py FILENAME')
-        print('       tests parsing on the file.')
-        sys.exit(1)
-
-    filename = sys.argv[1]
-    parser = NVLogParser()
-    loglines, errors, worker_loglines = parser.parse_file(filename)
-
-    print('Parsed {} log lines with {} errors.'.format(len(loglines), len(errors)))
-    print('Found workers: {}.'.format(list(worker_loglines.keys())))
-
-    if len(errors) > 0:
-        print('Lines which failed to parse:')
-        for line, error in errors:
-            print('  Following line failed: {}'.format(error))
-            print(line)
-
-    if len(loglines) > 0:
-        print('Lines which were parsed successfully:')
-        for line in loglines:
-            print(line.full_string, " ---> ", line.epoch, line.iteration, line.scope)
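For reference, a sketch of a line in the format the regex above matches, and how `NVLogParser` would consume it; the concrete values are made up:

```python
# Sketch only. Line format:
#   [worker]:::NVLOGv<version> <model> <timestamp> (<file>:<lineno>) <tag>[: <json value>]
from logger.parser import NVLogParser

line = ':::NVLOGv0.1.0 ncf 1234.567 (./convert.py:119) preproc_hp_min_ratings: 20'

parser = NVLogParser()
ll = parser.string_to_logline(line)
print(ll.tag, ll.value, ll.scope)  # preproc_hp_min_ratings 20 0 (run scope)
```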
logger/tags.py (deleted)

@@ -1,242 +0,0 @@
-# Copyright 2018 MLBenchmark Group. All Rights Reserved.
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Common values reported
-
-VALUE_EPOCH = "epoch"
-VALUE_ITERATION = "iteration"
-VALUE_ACCURACY = "accuracy"
-VALUE_BLEU = "bleu"
-VALUE_TOP1 = "top1"
-VALUE_TOP5 = "top5"
-VALUE_BBOX_MAP = "bbox_map"
-VALUE_MASK_MAP = "mask_map"
-VALUE_BCE = "binary_cross_entropy"
-
-
-# Timed blocks (used with timed_function & timed_block)
-# For each there should be *_start and *_stop tags defined
-
-RUN_BLOCK = "run"
-SETUP_BLOCK = "setup"
-PREPROC_BLOCK = "preproc"
-
-TRAIN_BLOCK = "train"
-TRAIN_PREPROC_BLOCK = "train_preproc"
-TRAIN_EPOCH_BLOCK = "train_epoch"
-TRAIN_EPOCH_PREPROC_BLOCK = "train_epoch_preproc"
-TRAIN_CHECKPOINT_BLOCK = "train_checkpoint"
-TRAIN_ITER_BLOCK = "train_iteration"
-
-EVAL_BLOCK = "eval"
-EVAL_ITER_BLOCK = "eval_iteration"
-
-TIMED_BLOCKS = {
-    RUN_BLOCK,
-    SETUP_BLOCK,
-    PREPROC_BLOCK,
-    TRAIN_BLOCK,
-    TRAIN_PREPROC_BLOCK,
-    TRAIN_EPOCH_BLOCK,
-    TRAIN_EPOCH_PREPROC_BLOCK,
-    TRAIN_CHECKPOINT_BLOCK,
-    TRAIN_ITER_BLOCK,
-    EVAL_BLOCK,
-    EVAL_ITER_BLOCK,
-}
-
-
-# Events
-
-RUN_INIT = "run_init"
-
-SETUP_START = "setup_start"
-SETUP_STOP = "setup_stop"
-
-PREPROC_START = "preproc_start"
-PREPROC_STOP = "preproc_stop"
-
-RUN_START = "run_start"
-RUN_STOP = "run_stop"
-RUN_FINAL = "run_final"
-
-TRAIN_CHECKPOINT_START = "train_checkpoint_start"
-TRAIN_CHECKPOINT_STOP = "train_checkpoint_stop"
-
-TRAIN_PREPROC_START = "train_preproc_start"
-TRAIN_PREPROC_STOP = "train_preproc_stop"
-
-TRAIN_EPOCH_PREPROC_START = "train_epoch_preproc_start"
-TRAIN_EPOCH_PREPROC_STOP = "train_epoch_preproc_stop"
-
-TRAIN_ITER_START = "train_iter_start"
-TRAIN_ITER_STOP = "train_iter_stop"
-
-TRAIN_EPOCH_START = "train_epoch_start"
-TRAIN_EPOCH_STOP = "train_epoch_stop"
-
-
-# MLPerf specific tags
-
-RUN_CLEAR_CACHES = "run_clear_caches"
-
-PREPROC_NUM_TRAIN_EXAMPLES = "preproc_num_train_examples"
-PREPROC_NUM_EVAL_EXAMPLES = "preproc_num_eval_examples"
-PREPROC_TOKENIZE_TRAINING = "preproc_tokenize_training"
-PREPROC_TOKENIZE_EVAL = "preproc_tokenize_eval"
-PREPROC_VOCAB_SIZE = "preproc_vocab_size"
-
-RUN_SET_RANDOM_SEED = "run_set_random_seed"
-
-INPUT_SIZE = "input_size"
-INPUT_BATCH_SIZE = "input_batch_size"
-INPUT_ORDER = "input_order"
-INPUT_SHARD = "input_shard"
-INPUT_BN_SPAN = "input_bn_span"
-
-INPUT_CENTRAL_CROP = "input_central_crop"
-INPUT_CROP_USES_BBOXES = "input_crop_uses_bboxes"
-INPUT_DISTORTED_CROP_MIN_OBJ_COV = "input_distorted_crop_min_object_covered"
-INPUT_DISTORTED_CROP_RATIO_RANGE = "input_distorted_crop_aspect_ratio_range"
-INPUT_DISTORTED_CROP_AREA_RANGE = "input_distorted_crop_area_range"
-INPUT_DISTORTED_CROP_MAX_ATTEMPTS = "input_distorted_crop_max_attempts"
-INPUT_MEAN_SUBTRACTION = "input_mean_subtraction"
-INPUT_RANDOM_FLIP = "input_random_flip"
-
-INPUT_RESIZE = "input_resize"
-INPUT_RESIZE_ASPECT_PRESERVING = "input_resize_aspect_preserving"
-
-
-# Opt
-
-OPT_NAME = "opt_name"
-
-OPT_LR = "opt_learning_rate"
-OPT_MOMENTUM = "opt_momentum"
-
-OPT_WEIGHT_DECAY = "opt_weight_decay"
-
-OPT_HP_ADAM_BETA1 = "opt_hp_Adam_beta1"
-OPT_HP_ADAM_BETA2 = "opt_hp_Adam_beta2"
-OPT_HP_ADAM_EPSILON = "opt_hp_Adam_epsilon"
-
-OPT_LR_WARMUP_STEPS = "opt_learning_rate_warmup_steps"
-
-
-# Train
-
-TRAIN_LOOP = "train_loop"
-TRAIN_EPOCH = "train_epoch"
-TRAIN_CHECKPOINT = "train_checkpoint"
-TRAIN_LOSS = "train_loss"
-TRAIN_ITERATION_LOSS = "train_iteration_loss"
-
-
-# Eval
-
-EVAL_START = "eval_start"
-EVAL_SIZE = "eval_size"
-EVAL_TARGET = "eval_target"
-EVAL_ACCURACY = "eval_accuracy"
-EVAL_STOP = "eval_stop"
-
-
-# Perf
-
-PERF_IT_PER_SEC = "perf_it_per_sec"
-PERF_TIME_TO_TRAIN = "time_to_train"
-
-EVAL_ITERATION_ACCURACY = "eval_iteration_accuracy"
-
-
-# Model
-
-MODEL_HP_LOSS_FN = "model_hp_loss_fn"
-
-MODEL_HP_INITIAL_SHAPE = "model_hp_initial_shape"
-MODEL_HP_FINAL_SHAPE = "model_hp_final_shape"
-
-MODEL_L2_REGULARIZATION = "model_l2_regularization"
-MODEL_EXCLUDE_BN_FROM_L2 = "model_exclude_bn_from_l2"
-
-MODEL_HP_RELU = "model_hp_relu"
-MODEL_HP_CONV2D_FIXED_PADDING = "model_hp_conv2d_fixed_padding"
-MODEL_HP_BATCH_NORM = "model_hp_batch_norm"
-MODEL_HP_DENSE = "model_hp_dense"
-
-
-# GNMT specific
-
-MODEL_HP_LOSS_SMOOTHING = "model_hp_loss_smoothing"
-MODEL_HP_NUM_LAYERS = "model_hp_num_layers"
-MODEL_HP_HIDDEN_SIZE = "model_hp_hidden_size"
-MODEL_HP_DROPOUT = "model_hp_dropout"
-
-EVAL_HP_BEAM_SIZE = "eval_hp_beam_size"
-TRAIN_HP_MAX_SEQ_LEN = "train_hp_max_sequence_length"
-EVAL_HP_MAX_SEQ_LEN = "eval_hp_max_sequence_length"
-EVAL_HP_LEN_NORM_CONST = "eval_hp_length_normalization_constant"
-EVAL_HP_LEN_NORM_FACTOR = "eval_hp_length_normalization_factor"
-EVAL_HP_COV_PENALTY_FACTOR = "eval_hp_coverage_penalty_factor"
-
-
-# NCF specific
-
-PREPROC_HP_MIN_RATINGS = "preproc_hp_min_ratings"
-PREPROC_HP_NUM_EVAL = "preproc_hp_num_eval"
-PREPROC_HP_SAMPLE_EVAL_REPLACEMENT = "preproc_hp_sample_eval_replacement"
-
-INPUT_HP_NUM_NEG = "input_hp_num_neg"
-INPUT_HP_SAMPLE_TRAIN_REPLACEMENT = "input_hp_sample_train_replacement"
-INPUT_STEP_TRAIN_NEG_GEN = "input_step_train_neg_gen"
-INPUT_STEP_EVAL_NEG_GEN = "input_step_eval_neg_gen"
-
-EVAL_HP_NUM_USERS = "eval_hp_num_users"
-EVAL_HP_NUM_NEG = "eval_hp_num_neg"
-
-MODEL_HP_MF_DIM = "model_hp_mf_dim"
-MODEL_HP_MLP_LAYER_SIZES = "model_hp_mlp_layer_sizes"
-
-
-# RESNET specific
-
-EVAL_EPOCH_OFFSET = "eval_offset"
-
-MODEL_HP_INITIAL_MAX_POOL = "model_hp_initial_max_pool"
-MODEL_HP_BEGIN_BLOCK = "model_hp_begin_block"
-MODEL_HP_END_BLOCK = "model_hp_end_block"
-MODEL_HP_BLOCK_TYPE = "model_hp_block_type"
-MODEL_HP_PROJECTION_SHORTCUT = "model_hp_projection_shortcut"
-MODEL_HP_SHORTCUT_ADD = "model_hp_shorcut_add"
-MODEL_HP_RESNET_TOPOLOGY = "model_hp_resnet_topology"
-
-
-# Transformer specific
-
-INPUT_MAX_LENGTH = "input_max_length"
-
-MODEL_HP_INITIALIZER_GAIN = "model_hp_initializer_gain"
-MODEL_HP_VOCAB_SIZE = "model_hp_vocab_size"
-MODEL_HP_NUM_HIDDEN_LAYERS = "model_hp_hidden_layers"
-MODEL_HP_EMBEDDING_SHARED_WEIGHTS = "model_hp_embedding_shared_weights"
-MODEL_HP_ATTENTION_DENSE = "model_hp_attention_dense"
-MODEL_HP_ATTENTION_DROPOUT = "model_hp_attention_dropout"
-MODEL_HP_FFN_OUTPUT_DENSE = "model_hp_ffn_output_dense"
-MODEL_HP_FFN_FILTER_DENSE = "model_hp_ffn_filter_dense"
-MODEL_HP_RELU_DROPOUT = "model_hp_relu_dropout"
-MODEL_HP_LAYER_POSTPROCESS_DROPOUT = "model_hp_layer_postprocess_dropout"
-MODEL_HP_NORM = "model_hp_norm"
-MODEL_HP_SEQ_BEAM_SEARCH = "model_hp_sequence_beam_search"
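These constants served as `key` arguments to `LOGGER.log` and as tag comparisons in the parser. A one-line sketch, grounded in the `convert.py` hunk above:

```python
# Sketch only: how the removed tag constants were consumed.
from logger.logger import LOGGER
from logger import tags

LOGGER.log(key=tags.PREPROC_HP_MIN_RATINGS, value=20)  # MIN_RATINGS in convert.py
```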
ncf.py

@@ -44,8 +44,8 @@ from mpi4py import MPI
 from neumf import ncf_model_ops
 from input_pipeline import DataGenerator

-from logger.logger import LOGGER
-from logger.autologging import log_args
+import dllogger
+

 def parse_args():
     """
@@ -96,8 +96,7 @@ def parse_args():
     parser.add_argument('--loss-scale', default=8192, type=int,
                         help='Loss scale value to use when manually enabling mixed precision')
     parser.add_argument('--checkpoint-dir', default='/data/checkpoints/', type=str,
-                        help='Path to the store the result checkpoint file for training, \
-                              or to read from for evaluation')
+                        help='Path to the store the result checkpoint file for training')
     parser.add_argument('--load-checkpoint-path', default=None, type=str,
                         help='Path to the checkpoint for initialization. If None will initialize with random weights')
     parser.add_argument('--mode', choices=['train', 'test'], default='train', type=str,
@@ -105,11 +104,12 @@ def parse_args():
                              otherwise full training will be performed')
     parser.add_argument('--eval-after', type=int, default=8,
                         help='Perform evaluations only after this many epochs')
     parser.add_argument('--verbose', action='store_true',
                         help='Log the performance and accuracy after every epoch')
+    parser.add_argument('--log-path', default='log.json', type=str,
+                        help='Path for the JSON training log')

     return parser.parse_args()


 def hvd_init():
     """
     Initialize Horovod
@@ -124,6 +124,7 @@ def hvd_init():
     print('PY', sys.version)
     print('TF', tf.__version__)
+

 def get_local_train_data(pos_train_users, pos_train_items, negative_samples):
     """
     For distributed, split up the train data and only keep the local portion
@@ -148,6 +149,7 @@ def get_local_train_data(pos_train_users, pos_train_items, negative_samples):

     return local_train_users, local_train_items, local_train_labels
+

 def get_local_test_data(pos_test_users, pos_test_items):
     """
     For distributed, split up the test data and only keep the local portion
@@ -162,17 +164,21 @@ def get_local_test_data(pos_test_users, pos_test_items):

     return local_test_users, local_test_items


 def main():
     """
     Run training/evaluation
     """
     script_start = time.time()
     hvd_init()
     mpi_comm = MPI.COMM_WORLD
     args = parse_args()

     if hvd.rank() == 0:
-        log_args(args)
+        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
+                                                           filename=args.log_path),
+                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
+    else:
+        dllogger.init(backends=[])
+
+    dllogger.log(data=vars(args), step='PARAMETER')

     if args.seed is not None:
         tf.random.set_random_seed(args.seed)
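The replacement logging pattern in one self-contained sketch; the calls mirror the diff (JSON file plus stdout backends on the rank-0 worker, an empty backend list elsewhere), while the metric values themselves are made up:

```python
# Sketch only: the dllogger pattern this commit adopts.
import dllogger

dllogger.init(backends=[
    dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                               filename='log.json'),
    dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])

dllogger.log(step='PARAMETER', data={'batch_size': 65536})   # run parameters
dllogger.log(step=(0,), data={'train_time': 12.3,
                              'hr@10': 0.91})                # epoch 0 metrics
dllogger.log(step=tuple(), data={'best_hr': 0.96})           # run-level summary
dllogger.flush()
```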
@@ -185,11 +191,6 @@ def main():
             and os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] == "1":
         args.fp16 = False

-    # directory to store/read final checkpoint
-    if args.mode == 'train' and hvd.rank() == 0:
-        print("Saving best checkpoint to {}".format(args.checkpoint_dir))
-    elif hvd.rank() == 0:
-        print("Reading checkpoint: {}".format(args.checkpoint_dir))
     if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
         os.makedirs(args.checkpoint_dir, exist_ok=True)
     final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt')
@@ -317,11 +318,11 @@ def main():
         ndcg = ndcg_sum / ndcg_cnt

         if hvd.rank() == 0:
-            LOGGER.log("Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
-                       .format(eval_duration, hit_rate, ndcg))
-
             eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration
-            LOGGER.log('Average Eval Throughput: {:.4f}'.format(eval_throughput))
+            dllogger.log(step=tuple(), data={'eval_throughput': eval_throughput,
+                                             'eval_time': eval_duration,
+                                             'hr@10': hit_rate,
+                                             'ndcg': ndcg})
         return

     # Performance Metrics
@@ -345,8 +346,6 @@ def main():

     # Begin training
     begin_train = time.time()
-    if hvd.rank() == 0:
-        LOGGER.log("Begin Training. Setup Time: {}".format(begin_train - script_start))
     for epoch in range(args.epochs):
         # Train for one epoch
         train_start = time.time()
@@ -364,7 +363,7 @@ def main():
             }
         )
         train_duration = time.time() - train_start
-        ## Only log "warm" epochs
+        # Only log "warm" epochs
         if epoch >= 1:
             train_times.append(train_duration)
         # Evaluate
@@ -399,22 +398,16 @@ def main():
         ndcg = global_ndcg_sum[0] / global_ndcg_count[0]

         eval_duration = time.time() - eval_start
-        ## Only log "warm" epochs
+        # Only log "warm" epochs
         if epoch >= 1:
             eval_times.append(eval_duration)

         if hvd.rank() == 0:
-            if args.verbose:
-                log_string = "Epoch: {:02d}, Train Time: {:.4f}, Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
-                LOGGER.log(
-                    log_string.format(
-                        epoch,
-                        train_duration,
-                        eval_duration,
-                        hit_rate,
-                        ndcg
-                    )
-                )
+            dllogger.log(step=(epoch,), data={
+                'train_time': train_duration,
+                'eval_time': eval_duration,
+                'hr@10': hit_rate,
+                'ndcg': ndcg})

             # Update summary metrics
             if hit_rate > args.target and first_to_target is None:
@@ -424,18 +417,6 @@ def main():
                 best_hr = hit_rate
                 best_epoch = epoch
                 time_to_best = time.time() - begin_train
-                if not args.verbose:
-                    log_string = "New Best Epoch: {:02d}, Train Time: {:.4f}, Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
-                    LOGGER.log(
-                        log_string.format(
-                            epoch,
-                            train_duration,
-                            eval_duration,
-                            hit_rate,
-                            ndcg
-                        )
-                    )
-
             # Save, if meets target
             if hit_rate > args.target:
                 saver.save(sess, final_checkpoint_path)
@@ -445,26 +426,22 @@ def main():
         train_throughputs = pos_train_users.shape[0]*(args.negative_samples+1) / train_times
         eval_times = np.array(eval_times)
         eval_throughputs = pos_test_users.shape[0]*(args.valid_negative+1) / eval_times
-        LOGGER.log(' ')

-        LOGGER.log('batch_size: {}'.format(args.batch_size))
-        LOGGER.log('num_gpus: {}'.format(hvd.size()))
-        LOGGER.log('AMP: {}'.format(1 if args.amp else 0))
-        LOGGER.log('seed: {}'.format(args.seed))
-        LOGGER.log('Minimum Train Time per Epoch: {:.4f}'.format(np.min(train_times)))
-        LOGGER.log('Average Train Time per Epoch: {:.4f}'.format(np.mean(train_times)))
-        LOGGER.log('Average Train Throughput: {:.4f}'.format(np.mean(train_throughputs)))
-        LOGGER.log('Minimum Eval Time per Epoch: {:.4f}'.format(np.min(eval_times)))
-        LOGGER.log('Average Eval Time per Epoch: {:.4f}'.format(np.mean(eval_times)))
-        LOGGER.log('Average Eval Throughput: {:.4f}'.format(np.mean(eval_throughputs)))
-        LOGGER.log('First Epoch to hit: {}'.format(first_to_target))
-        LOGGER.log('Time to Train: {:.4f}'.format(time_to_train))
-        LOGGER.log('Time to Best: {:.4f}'.format(time_to_best))
-        LOGGER.log('Best HR: {:.4f}'.format(best_hr))
-        LOGGER.log('Best Epoch: {}'.format(best_epoch))
+        dllogger.log(step=tuple(), data={
+            'average_train_time_per_epoch': np.mean(train_times),
+            'average_train_throughput': np.mean(train_throughputs),
+            'average_eval_time_per_epoch': np.mean(eval_times),
+            'average_eval_throughput': np.mean(eval_throughputs),
+            'first_epoch_to_hit': first_to_target,
+            'time_to_train': time_to_train,
+            'time_to_best': time_to_best,
+            'best_hr': best_hr,
+            'best_epoch': best_epoch})
+        dllogger.flush()

     sess.close()
     return


 if __name__ == '__main__':
     main()
@@ -1,3 +1,7 @@
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
requirements.txt

@@ -1,2 +1,3 @@
 pandas
 cupy
+-e git://github.com/NVIDIA/dllogger#egg=dllogger