Compute inference throughput with outliers removed; run eval only when an eval script is given

Swetha Mandava 2020-04-29 14:17:22 -07:00
parent c94b73f9ea
commit b7903f0f62
2 changed files with 16 additions and 23 deletions


@@ -1122,11 +1122,12 @@ def main(_):
             end_logits=end_logits))
     eval_time_elapsed = time.time() - eval_start_time
-    eval_time_wo_overhead = eval_hooks[-1].total_time
 
     time_list = eval_hooks[-1].time_list
     time_list.sort()
 
-    num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size
+    # Removing outliers (init/warmup) in throughput computation.
+    eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
+    num_sentences = (int(len(time_list) * 0.99)) * FLAGS.predict_batch_size
 
     avg = np.mean(time_list)
     cf_50 = max(time_list[:int(len(time_list) * 0.50)])
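The hunk above sorts the per-iteration times and keeps only the fastest 99% before summing, so one-off init/warmup iterations no longer inflate the reported time; throughput is then kept-iterations × batch size divided by the trimmed total. A minimal standalone sketch of the same computation (the helper name and example inputs are ours, not part of the commit):

import numpy as np

def trimmed_throughput(time_list, batch_size, keep=0.99):
  # Hypothetical helper mirroring the logic above: sort, keep the
  # fastest `keep` fraction, and compute sentences/sec on what remains.
  times = sorted(time_list)
  kept = int(len(times) * keep)                # e.g. 1000 iterations -> 990 kept
  eval_time_wo_overhead = sum(times[:kept])
  num_sentences = kept * batch_size            # e.g. 990 * 8 = 7920 sentences
  avg = np.mean(times)
  cf_50 = max(times[:int(len(times) * 0.50)])  # ~50th-percentile latency
  return num_sentences / eval_time_wo_overhead, avg, cf_50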
@@ -1140,7 +1141,7 @@ def main(_):
     tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
                               eval_hooks[-1].count * FLAGS.predict_batch_size)
     tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
-                              (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size)
+                              num_sentences)
     tf.compat.v1.logging.info("Summary Inference Statistics")
     tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
     tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
@@ -1164,16 +1165,17 @@ def main(_):
                       FLAGS.do_lower_case, output_prediction_file,
                       output_nbest_file, output_null_log_odds_file)
 
-    import sys
-    import subprocess
-    eval_out = subprocess.check_output([sys.executable, FLAGS.eval_script,
-                                        FLAGS.predict_file, output_prediction_file])
-    scores = str(eval_out).strip()
-    exact_match = float(scores.split(":")[1].split(",")[0])
-    f1 = float(scores.split(":")[2].split("}")[0])
-    dllogging.logger.log(step=(), data={"f1": f1}, verbosity=Verbosity.DEFAULT)
-    dllogging.logger.log(step=(), data={"exact_match": exact_match}, verbosity=Verbosity.DEFAULT)
-    print(str(eval_out))
+    if FLAGS.eval_script:
+      import sys
+      import subprocess
+      eval_out = subprocess.check_output([sys.executable, FLAGS.eval_script,
+                                          FLAGS.predict_file, output_prediction_file])
+      scores = str(eval_out).strip()
+      exact_match = float(scores.split(":")[1].split(",")[0])
+      f1 = float(scores.split(":")[2].split("}")[0])
+      dllogging.logger.log(step=(), data={"f1": f1}, verbosity=Verbosity.DEFAULT)
+      dllogging.logger.log(step=(), data={"exact_match": exact_match}, verbosity=Verbosity.DEFAULT)
+      print(str(eval_out))
 
 
 if __name__ == "__main__":
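The split-based parsing in this hunk assumes the eval script prints a single dict-shaped line such as {"exact_match": 81.0, "f1": 88.6}; note that in Python 3, str(eval_out) on the bytes output yields a b'...' prefix, which the colon-based splits happen to tolerate. Where the script emits valid JSON, as the stock SQuAD evaluate script does via json.dumps, a less brittle variant would look like this sketch (the helper name is ours):

import json
import subprocess
import sys

def run_eval_script(eval_script, predict_file, prediction_file):
  # Hypothetical helper: run the eval script and parse its JSON output
  # instead of splitting on ':' and '}'.
  out = subprocess.check_output(
      [sys.executable, eval_script, predict_file, prediction_file])
  scores = json.loads(out.decode("utf-8"))
  return scores["exact_match"], scores["f1"]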


@@ -19,9 +19,7 @@ class LogEvalRunHook(tf.estimator.SessionRunHook):
   def __init__(self, global_batch_size, hvd_rank=-1):
     self.global_batch_size = global_batch_size
     self.hvd_rank = hvd_rank
-    self.total_time = 0.0
     self.count = 0
-    self.skipped = 0
     self.time_list = []
 
   def before_run(self, run_context):
@@ -30,14 +28,7 @@ class LogEvalRunHook(tf.estimator.SessionRunHook):
   def after_run(self, run_context, run_values):
     elapsed_secs = time.time() - self.t0
     self.count += 1
-
-    # Removing first 2 (arbitrary) number of startup iterations from perf evaluations
-    if self.count <= 2:
-      print("Skipping time record for ", self.count, " due to overhead")
-      self.skipped += 1
-    else:
-      self.time_list.append(elapsed_secs)
-      self.total_time += elapsed_secs
+    self.time_list.append(elapsed_secs)
 
 # report throughput during training
 class LogTrainRunHook(tf.estimator.SessionRunHook):
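After this change the eval hook only records raw per-iteration times, and all skipping/trimming happens at reporting time. Reassembled from the hunks above (the before_run body is inferred from the elapsed-time computation in after_run), the simplified hook behaves like this sketch:

import time
import tensorflow as tf

class LogEvalRunHook(tf.estimator.SessionRunHook):
  # Records wall-clock seconds for every eval iteration; warmup
  # iterations are no longer skipped here but trimmed by the caller.

  def __init__(self, global_batch_size, hvd_rank=-1):
    self.global_batch_size = global_batch_size
    self.hvd_rank = hvd_rank
    self.count = 0
    self.time_list = []

  def before_run(self, run_context):
    self.t0 = time.time()

  def after_run(self, run_context, run_values):
    elapsed_secs = time.time() - self.t0
    self.count += 1
    self.time_list.append(elapsed_secs)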