DeepLearningExamples/PyTorch/Classification/ConvNets/image_classification/logger.py
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from collections import OrderedDict
from numbers import Number

import dllogger
import numpy as np


def format_step(step):
    """Render a dllogger step tuple (epoch, iteration, ...) as a human-readable prefix."""
    if isinstance(step, str):
        return step
    s = ""
    if len(step) > 0:
        if isinstance(step[0], Number):
            s += "Epoch: {} ".format(step[0])
        else:
            s += "{} ".format(step[0])
    if len(step) > 1:
        s += "Iteration: {} ".format(step[1])
    if len(step) > 2:
        s += "Validation Iteration: {} ".format(step[2])
    if len(step) == 0:
        s = "Summary:"
    return s
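
# Illustrative examples (added comments, not in the original file) of the prefixes
# format_step produces for the step tuples used further down in this module:
#   format_step((0, 10))            -> "Epoch: 0 Iteration: 10 "
#   format_step((0, 10, 3))         -> "Epoch: 0 Iteration: 10 Validation Iteration: 3 "
#   format_step(("Calibration", 5)) -> "Calibration Iteration: 5 "
#   format_step(tuple())            -> "Summary:"
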
# Factory helpers pairing an iteration-, epoch- and run-level aggregator for each metric type.
PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())

LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))

class Meter(object):
    """Aggregates a metric at three granularities: iteration, epoch and whole run."""

    def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
        self.run_aggregator = run_aggregator
        self.epoch_aggregator = epoch_aggregator
        self.iteration_aggregator = iteration_aggregator

    def record(self, val, n=1):
        self.iteration_aggregator.record(val, n=n)

    def get_iteration(self):
        v, n = self.iteration_aggregator.get_val()
        return v

    def reset_iteration(self):
        # Fold the finished iteration into the epoch-level aggregate.
        v, n = self.iteration_aggregator.get_data()
        self.iteration_aggregator.reset()
        if v is not None:
            self.epoch_aggregator.record(v, n=n)

    def get_epoch(self):
        v, n = self.epoch_aggregator.get_val()
        return v

    def reset_epoch(self):
        # Fold the finished epoch into the run-level aggregate.
        v, n = self.epoch_aggregator.get_data()
        self.epoch_aggregator.reset()
        if v is not None:
            self.run_aggregator.record(v, n=n)

    def get_run(self):
        v, n = self.run_aggregator.get_val()
        return v

    def reset_run(self):
        self.run_aggregator.reset()

class QuantileMeter(object):
    def __init__(self, q):
        self.q = q
        self.reset()

    def reset(self):
        self.vals = []
        self.n = 0

    def record(self, val, n=1):
        if isinstance(val, list):
            self.vals += val
            self.n += len(val)
        else:
            self.vals += [val] * n
            self.n += n

    def get_val(self):
        if not self.vals:
            return None, self.n
        # Note: NumPy >= 1.22 renames the ``interpolation`` keyword to ``method``.
        return np.quantile(self.vals, self.q, interpolation="nearest"), self.n

    def get_data(self):
        return self.vals, self.n

class MaxMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.max = None
        self.n = 0

    def record(self, val, n=1):
        if self.max is None:
            self.max = val
        else:
            self.max = max(self.max, val)
        self.n = n

    def get_val(self):
        return self.max, self.n

    def get_data(self):
        return self.max, self.n

class MinMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.min = None
        self.n = 0

    def record(self, val, n=1):
        if self.min is None:
            self.min = val
        else:
            self.min = min(self.min, val)
        self.n = n

    def get_val(self):
        return self.min, self.n

    def get_data(self):
        return self.min, self.n

class LastMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.last = None
        self.n = 0

    def record(self, val, n=1):
        self.last = val
        self.n = n

    def get_val(self):
        return self.last, self.n

    def get_data(self):
        return self.last, self.n

class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.n = 0
        self.val = 0

    def record(self, val, n=1):
        self.n += n
        self.val += val * n

    def get_val(self):
        if self.n == 0:
            return None, 0
        return self.val / self.n, self.n

    def get_data(self):
        if self.n == 0:
            return None, 0
        return self.val / self.n, self.n
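
# Illustration (added comments, not part of the original file): how a Meter cascades
# values between its three aggregation levels. A throughput meter built with PERF_METER()
# averages per-batch values; reset_iteration() folds the finished iteration into the
# epoch aggregate, and reset_epoch() folds that into the run aggregate:
#
#     meter = PERF_METER()
#     meter.record(250.0, n=64)      # img/s for one batch of 64 images
#     meter.record(260.0, n=64)
#     meter.get_iteration()          # 255.0 (weighted average)
#     meter.reset_iteration()
#     meter.get_epoch()              # 255.0
#     meter.reset_epoch()
#     meter.get_run()                # 255.0
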

class Logger(object):
    """Thin wrapper around dllogger that tracks per-iteration, per-epoch and per-run metrics."""

    def __init__(self, print_interval, backends, start_epoch=-1, verbose=False):
        self.epoch = start_epoch
        self.iteration = -1
        self.val_iteration = -1
        self.calib_iteration = -1
        self.metrics = OrderedDict()
        self.backends = backends
        self.print_interval = print_interval
        self.verbose = verbose
        dllogger.init(backends)

    def log_parameter(self, data, verbosity=0):
        dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)

    def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
        if self.verbose:
            print("Registering metric: {}".format(metric_name))
        self.metrics[metric_name] = {"meter": meter, "level": verbosity}
        dllogger.metadata(metric_name, metadata)

    def log_metric(self, metric_name, val, n=1):
        self.metrics[metric_name]["meter"].record(val, n=n)

    def start_iteration(self, mode="train"):
        if mode == "val":
            self.val_iteration += 1
        elif mode == "train":
            self.iteration += 1
        elif mode == "calib":
            self.calib_iteration += 1

    def end_iteration(self, mode="train"):
        if mode == "val":
            it = self.val_iteration
        elif mode == "train":
            it = self.iteration
        elif mode == "calib":
            it = self.calib_iteration
        if it % self.print_interval == 0 or mode == "calib":
            metrics = {n: m for n, m in self.metrics.items() if n.startswith(mode)}
            if mode == "train":
                step = (self.epoch, self.iteration)
            elif mode == "val":
                step = (self.epoch, self.iteration, self.val_iteration)
            elif mode == "calib":
                step = ("Calibration", self.calib_iteration)
            verbositys = {m["level"] for _, m in metrics.items()}
            for ll in verbositys:
                llm = {n: m for n, m in metrics.items() if m["level"] == ll}
                dllogger.log(
                    step=step,
                    data={n: m["meter"].get_iteration() for n, m in llm.items()},
                    verbosity=ll,
                )
            for n, m in metrics.items():
                m["meter"].reset_iteration()
            dllogger.flush()
    def start_epoch(self):
        self.epoch += 1
        self.iteration = 0
        self.val_iteration = 0
        for n, m in self.metrics.items():
            if not n.startswith("calib"):
                m["meter"].reset_epoch()

    def end_epoch(self):
        for n, m in self.metrics.items():
            if not n.startswith("calib"):
                m["meter"].reset_iteration()
        verbositys = {m["level"] for _, m in self.metrics.items()}
        for ll in verbositys:
            llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
            dllogger.log(
                step=(self.epoch,),
                data={n: m["meter"].get_epoch() for n, m in llm.items()},
            )

    def start_calibration(self):
        self.calib_iteration = 0
        for n, m in self.metrics.items():
            if n.startswith("calib"):
                m["meter"].reset_epoch()

    def end_calibration(self):
        for n, m in self.metrics.items():
            if n.startswith("calib"):
                m["meter"].reset_iteration()

    def end(self):
        for n, m in self.metrics.items():
            m["meter"].reset_epoch()
        verbositys = {m["level"] for _, m in self.metrics.items()}
        for ll in verbositys:
            llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
            dllogger.log(
                step=tuple(), data={n: m["meter"].get_run() for n, m in llm.items()}
            )
        for n, m in self.metrics.items():
            m["meter"].reset_epoch()
        dllogger.flush()

    def iteration_generator_wrapper(self, gen, mode="train"):
        for g in gen:
            self.start_iteration(mode=mode)
            yield g
            self.end_iteration(mode=mode)

    def epoch_generator_wrapper(self, gen):
        for g in gen:
            self.start_epoch()
            yield g
            self.end_epoch()
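
# Typical call order (added comment, not part of the original file): one start_epoch()
# per epoch, start_iteration()/end_iteration() around every batch (metrics are flushed
# to dllogger every `print_interval` iterations), end_epoch() after the last batch,
# and a single end() at the end of the run to emit the run-level summary.
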

class Metrics:
    ACC_METADATA = {"unit": "%", "format": ":.2f"}
    IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
    TIME_METADATA = {"unit": "s", "format": ":.5f"}
    LOSS_METADATA = {"format": ":.5f"}
    LR_METADATA = {"format": ":.5f"}

    def __init__(self, logger):
        self.logger = logger
        self.map = {}

    def log(self, **kwargs):
        if self.logger is None:
            return
        for k, v in kwargs.items():
            tks = self.map.get(k, [k])
            for tk in tks:
                if isinstance(v, tuple):
                    self.logger.log_metric(tk, v[0], v[1])
                else:
                    self.logger.log_metric(tk, v)
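
# Note (added comment, not part of the original file): `self.map` translates the keyword
# names passed to Metrics.log() into registered metric names, and may fan one value out
# to several metrics; ValidationMetrics below maps "compute_time" to the mean latency
# meter and to the 100th/99th/95th percentile latency meters in a single call. Passing a
# (value, n) tuple records the value with weight n.
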

class TrainingMetrics(Metrics):
    def __init__(self, logger):
        super().__init__(logger)
        if self.logger is not None:
            self.map = {
                "loss": ["train.loss"],
                "compute_ips": ["train.compute_ips"],
                "total_ips": ["train.total_ips"],
                "data_time": ["train.data_time"],
                "compute_time": ["train.compute_time"],
                "lr": ["train.lr"],
            }
            logger.register_metric(
                "train.loss",
                LOSS_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
                metadata=Metrics.LOSS_METADATA,
            )
            logger.register_metric(
                "train.compute_ips",
                PERF_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
                metadata=Metrics.IPS_METADATA,
            )
            logger.register_metric(
                "train.total_ips",
                PERF_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
                metadata=Metrics.IPS_METADATA,
            )
            logger.register_metric(
                "train.data_time",
                PERF_METER(),
                verbosity=dllogger.Verbosity.VERBOSE,
                metadata=Metrics.TIME_METADATA,
            )
            logger.register_metric(
                "train.compute_time",
                PERF_METER(),
                verbosity=dllogger.Verbosity.VERBOSE,
                metadata=Metrics.TIME_METADATA,
            )
            logger.register_metric(
                "train.lr",
                LR_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
            )

class ValidationMetrics(Metrics):
    def __init__(self, logger, prefix):
        super().__init__(logger)
        if self.logger is not None:
            self.map = {
                "loss": [f"{prefix}.loss"],
                "top1": [f"{prefix}.top1"],
                "top5": [f"{prefix}.top5"],
                "compute_ips": [f"{prefix}.compute_ips"],
                "total_ips": [f"{prefix}.total_ips"],
                "data_time": [f"{prefix}.data_time"],
                "compute_time": [
                    f"{prefix}.compute_latency",
                    f"{prefix}.compute_latency_at100",
                    f"{prefix}.compute_latency_at99",
                    f"{prefix}.compute_latency_at95",
                ],
            }
            logger.register_metric(
                f"{prefix}.top1",
                ACC_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
                metadata=Metrics.ACC_METADATA,
            )
            logger.register_metric(
                f"{prefix}.top5",
                ACC_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
                metadata=Metrics.ACC_METADATA,
            )
            logger.register_metric(
                f"{prefix}.loss",
                LOSS_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
                metadata=Metrics.LOSS_METADATA,
            )
            logger.register_metric(
                f"{prefix}.compute_ips",
                PERF_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
                metadata=Metrics.IPS_METADATA,
            )
            logger.register_metric(
                f"{prefix}.total_ips",
                PERF_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
                metadata=Metrics.IPS_METADATA,
            )
            logger.register_metric(
                f"{prefix}.data_time",
                PERF_METER(),
                verbosity=dllogger.Verbosity.VERBOSE,
                metadata=Metrics.TIME_METADATA,
            )
            logger.register_metric(
                f"{prefix}.compute_latency",
                PERF_METER(),
                verbosity=dllogger.Verbosity.DEFAULT,
                metadata=Metrics.TIME_METADATA,
            )
            logger.register_metric(
                f"{prefix}.compute_latency_at100",
                LAT_100(),
                verbosity=dllogger.Verbosity.VERBOSE,
                metadata=Metrics.TIME_METADATA,
            )
            logger.register_metric(
                f"{prefix}.compute_latency_at99",
                LAT_99(),
                verbosity=dllogger.Verbosity.VERBOSE,
                metadata=Metrics.TIME_METADATA,
            )
            logger.register_metric(
                f"{prefix}.compute_latency_at95",
                LAT_95(),
                verbosity=dllogger.Verbosity.VERBOSE,
                metadata=Metrics.TIME_METADATA,
            )
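
if __name__ == "__main__":
    # Minimal usage sketch (an assumption for illustration, not part of the original
    # training scripts): wire the Logger to dllogger's stdout backend and push one
    # fake training iteration through the epoch/iteration lifecycle.
    backends = [dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT)]
    logger = Logger(print_interval=1, backends=backends)
    train_metrics = TrainingMetrics(logger)

    logger.start_epoch()
    logger.start_iteration(mode="train")
    train_metrics.log(
        loss=0.7,
        lr=0.1,
        data_time=0.02,
        compute_time=0.10,
        compute_ips=(2560.0, 256),  # (value, n): img/s measured over a batch of 256
        total_ips=(2133.0, 256),
    )
    logger.end_iteration(mode="train")
    logger.end_epoch()
    logger.end()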