# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Estimator functions supporting running on TPU."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import os
import subprocess
import time

import numpy as np
import tensorflow as tf

from tensorflow.contrib.compiler import xla
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python.framework import ops as tf_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.util import tf_contextlib

import gnmt_model
import model_helper
from utils import iterator_utils
from utils import misc_utils
from utils import nmt_utils
from utils import vocab_utils
from variable_mgr import variable_mgr
from variable_mgr import variable_mgr_util
from benchmark_hooks import BenchmarkHook


def _get_custom_getter():
  """Returns a custom getter that fp16 graphs must be built under.

  Wrap model construction in a variable scope that uses this getter, e.g.:

    with tf.variable_scope("gnmt", custom_getter=_get_custom_getter()):
      ...  # build the model here

  When a variable is requested in tf.float16, the getter creates it as
  tf.float32 instead (so no precision is lost in the master copy) and casts
  the result to tf.float16 for the caller, rather than storing the variable
  directly in the requested dtype.
  """

  def inner_custom_getter(getter, *args, **kwargs):
    """Custom getter that stores fp16-requested variables as fp32."""
    cast_to_float16 = False
    requested_dtype = kwargs["dtype"]
    if requested_dtype == tf.float16:
      # Only change the variable dtype if doing so does not decrease variable
      # precision.
      kwargs["dtype"] = tf.float32
      cast_to_float16 = True
    var = getter(*args, **kwargs)
    with tf_ops.init_scope():
      # This if statement is needed to guard the cast, because batch norm
      # assigns directly to the return value of this custom getter. The cast
      # makes the return value not a variable so it cannot be assigned. Batch
      # norm variables are always in fp32 so this if statement is never
      # triggered for them.
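      # Note: tf_ops.init_scope() lifts the ops created here out of any
      # function-building graph (such as the one built by xla.compile), so
      # the cast is placed in the outer graph next to the variable itself.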
      if cast_to_float16:
        var = tf.cast(var, tf.float16)
    return var

  return inner_custom_getter


@tf_contextlib.contextmanager
def mixed_precision_scope():
  with tf.variable_scope("", custom_getter=_get_custom_getter()) as varscope:
    yield varscope


def maybe_xla_compile(hparams, fn, *args):
  pure_fn = lambda: fn(*args)
  if hparams and hparams.xla_compile:
    return xla.compile(pure_fn)
  else:
    return pure_fn()


class ModelFnFactory(object):
  """Factory that builds GNMT model functions for tf.estimator."""

  def __init__(self, hparams):
    self.hparams = hparams

  def build_graph_dist_strategy(self, features, labels, mode, params):
    """Model function for the DistributionStrategy path."""
    del labels, params
    misc_utils.print_out("Running dist_strategy mode_fn")

    hparams = self.hparams

    # Create a GNMT model for training.
    # assert (hparams.encoder_type == "gnmt" or
    #         hparams.attention_architecture in ["gnmt", "gnmt_v2"])
    with mixed_precision_scope():
      model = gnmt_model.GNMTModel(hparams, mode=mode, features=features)
      if mode == tf.contrib.learn.ModeKeys.INFER:
        sample_ids = model.sample_id
        reverse_target_vocab_table = (
            lookup_ops.index_to_string_table_from_file(
                hparams.tgt_vocab_file, default_value=vocab_utils.UNK))
        sample_words = reverse_target_vocab_table.lookup(
            tf.to_int64(sample_ids))
        # Make sure outputs are of shape [batch_size, time] or [beam_width,
        # batch_size, time] when using beam search.
        if hparams.time_major:
          sample_words = tf.transpose(sample_words)
        elif sample_words.shape.ndims == 3:
          # Beam search output is in [batch_size, time, beam_width] shape.
          sample_words = tf.transpose(sample_words, [2, 0, 1])
        predictions = {"predictions": sample_words}
        # return loss, vars, grads, predictions, train_op, scaffold
        return None, None, None, predictions, None, None
      elif mode == tf.contrib.learn.ModeKeys.TRAIN:
        loss = model.train_loss
        train_op = model.update
        return loss, model.params, model.grads, None, train_op, None
      else:
        raise ValueError("Unknown mode in model_fn: %s" % mode)

  def _create_loss_scale_vars(self):
    """Creates loss-scale variables, needed only for fp16 training."""
    hparams = self.hparams
    loss_scale, loss_scale_normal_steps = None, None
    if hparams.use_fp16:
      loss_scale = tf.get_variable(
          "loss_scale",
          initializer=float(hparams.fp16_loss_scale),
          dtype=tf.float32,
          trainable=False)
      if hparams.enable_auto_loss_scale:
        loss_scale_normal_steps = tf.get_variable(
            "loss_scale_normal_steps", initializer=0, trainable=False)
    return loss_scale, loss_scale_normal_steps

  def _shard_inputs(self, features, num_towers):
    """Splits a global batch of features into per-tower shards."""
    if num_towers == 1:
      return [features]

    source = features["source"]
    target_input = features["target_input"]
    target_output = features["target_output"]
    source_seq_length = features["source_sequence_length"]
    target_seq_length = features["target_sequence_length"]

    # Compute each split's size.
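    # The first (num_towers - 1) shards each get floor(batch / num_towers)
    # examples and the last shard absorbs the remainder, so the shard sizes
    # always sum to the global batch size. E.g. a batch of 13 over 4 towers
    # yields split_sizes == [3, 3, 3, 4].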
    global_batch_size = tf.size(source_seq_length)
    tower_batch_size = tf.cast(global_batch_size / num_towers, dtype=tf.int32)
    split_sizes = [tower_batch_size] * (num_towers - 1)
    split_sizes.append(global_batch_size -
                       (num_towers - 1) * tower_batch_size)

    sources = tf.split(source, split_sizes, axis=0)
    target_inputs = tf.split(target_input, split_sizes, axis=0)
    target_outputs = tf.split(target_output, split_sizes, axis=0)
    source_sequence_lengths = tf.split(source_seq_length, split_sizes)
    target_sequence_lengths = tf.split(target_seq_length, split_sizes)

    tower_features = []
    for i in range(num_towers):
      tower_features.append({
          "source": sources[i],
          "target_input": target_inputs[i],
          "target_output": target_outputs[i],
          "source_sequence_length": source_sequence_lengths[i],
          "target_sequence_length": target_sequence_lengths[i]
      })
    return tower_features

  def get_optimizer(self, hparams, learning_rate):
    """Returns the optimizer selected by hparams.optimizer."""
    if hparams.optimizer == "sgd":
      opt = tf.train.GradientDescentOptimizer(learning_rate)
    elif hparams.optimizer == "adam":
      opt = tf.train.AdamOptimizer(learning_rate)
    else:
      raise ValueError("Unknown optimizer type %s" % hparams.optimizer)
    return opt

  def _compute_tower_grads(self,
                           tower_loss,
                           tower_params,
                           learning_rate,
                           use_fp16=False,
                           loss_scale=None,
                           colocate_gradients_with_ops=True):
    """Computes gradients of the (scaled) tower loss w.r.t. tower params."""
    if use_fp16:
      assert loss_scale is not None
      scaled_loss = tf.multiply(
          tower_loss,
          tf.convert_to_tensor(loss_scale, dtype=tower_loss.dtype),
          name="scaling_loss")
    else:
      scaled_loss = tower_loss

    opt = self.get_optimizer(self.hparams, learning_rate)
    grads_and_vars = opt.compute_gradients(
        scaled_loss,
        tower_params,
        colocate_gradients_with_ops=colocate_gradients_with_ops)
    grads = [x for (x, _) in grads_and_vars]
    assert grads
    for g in grads:
      assert g.dtype == tf.float32, "grad.dtype isn't fp32: %s" % g.name

    for var, grad in zip(tower_params, grads):
      if grad is None:
        misc_utils.print_out("%s gradient is None!" % var.name)

    # Downscale grads to undo the loss scaling.
    if use_fp16:
      grads = [grad * tf.reciprocal(loss_scale) for grad in grads]
    return tower_params, grads, opt

  def _get_variable_mgr(self, hparams):
    """Builds a VariableMgr for manually replicated multi-tower training."""
    assert not hparams.use_dist_strategy

    # A hack to create a container object that later gets passed to
    # VariableMgr.__init__() as the ill-designed `benchmark_cnn` argument.
    class Config(object):
      pass

    config = Config()
    config.params = Config()
    params = config.params
    # This is num_gpus per worker, a.k.a. the number of towers.
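    # The fields below mirror the tf_cnn_benchmarks flags that VariableMgr
    # reads off its `benchmark_cnn` argument; only the subset consumed by
    # VariableMgrLocalReplicated is populated here.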
    params.num_gpus = hparams.num_gpus
    # TODO(jamesqin): make more robust
    params.use_resource_vars = hparams.use_resource_vars
    params.use_fp16 = hparams.use_fp16
    params.compact_gradient_transfer = hparams.compact_gradient_transfer
    # For nmt, only strong consistency is supported.
    params.variable_consistency = "strong"
    params.all_reduce_spec = hparams.all_reduce_spec
    params.gpu_indices = hparams.gpu_indices
    params.agg_small_grads_max_bytes = hparams.agg_small_grads_max_bytes
    params.agg_small_grads_max_group = hparams.agg_small_grads_max_group
    params.hierarchical_copy = hparams.hierarchical_copy
    params.network_topology = hparams.network_topology
    params.local_parameter_device = hparams.local_parameter_device
    params.gradient_repacking = hparams.gradient_repacking
    params.allreduce_merge_scope = hparams.allreduce_merge_scope

    config.enable_auto_loss_scale = hparams.enable_auto_loss_scale
    if hparams.num_gpus > 0:
      config.raw_devices = ["gpu:%i" % i for i in range(hparams.num_gpus)]
    else:
      config.raw_devices = ["cpu:0"]
    config.devices = config.raw_devices

    return variable_mgr.VariableMgrLocalReplicated(
        config, config.params.all_reduce_spec,
        config.params.agg_small_grads_max_bytes,
        config.params.agg_small_grads_max_group,
        config.params.allreduce_merge_scope)

  def _print_varinfo(self, var_params, tower_id):
    """Prints the trainable variables of one tower."""
    misc_utils.print_out("# Trainable variables for tower: %d" % tower_id)
    misc_utils.print_out(
        "Format: <name>, <shape>, <dtype>, <(soft) device placement>")
    for param in var_params:
      misc_utils.print_out(
          "  %s, %s, %s, %s" % (param.name, str(param.get_shape()),
                                param.dtype.name, param.op.device))
    # Assumes 4 bytes per element (fp32 master weights).
    misc_utils.print_out("Total params size: %.2f GB" % (4. * np.sum([
        p.get_shape().num_elements()
        for p in var_params
        if p.get_shape().is_fully_defined()
    ]) / 2**30))

  def build_graph(self, features, labels, mode, params):
    """Model function for the manually replicated (fast) path."""
    del labels, params
    misc_utils.print_out("Running fast mode_fn")

    hparams = self.hparams

    # Create global_step
    tf.train.get_or_create_global_step()

    if mode == tf.contrib.learn.ModeKeys.INFER:
      # Inference runs on a single GPU.
      inf_hparams = tf.contrib.training.HParams(**hparams.values())
      inf_hparams.set_hparam("num_gpus", 1)
      # Inference is done in fp32 and in the same way as that of
      # dist_strategy.
      inf_hparams.set_hparam("use_fp16", False)

      misc_utils.print_out("inference hparams:")
      misc_utils.print_hparams(inf_hparams)

      # Create variable_mgr
      var_mgr = self._get_variable_mgr(inf_hparams)

      with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
          "tower_0"), var_mgr.create_outer_variable_scope(0):
        model = gnmt_model.GNMTModel(
            inf_hparams, mode=mode, features=features)
        sample_ids = model.sample_id
        reverse_target_vocab_table = (
            lookup_ops.index_to_string_table_from_file(
                inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK))
        sample_words = reverse_target_vocab_table.lookup(
            tf.to_int64(sample_ids))
        # Make sure outputs are of shape [batch_size, time] or [beam_width,
        # batch_size, time] when using beam search.
        if inf_hparams.time_major:
          sample_words = tf.transpose(sample_words)
        elif sample_words.shape.ndims == 3:
          # Beam search output is in [batch_size, time, beam_width] shape.
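          # Transpose to [beam_width, batch_size, time] so that elements
          # yielded by Estimator.predict() come out beam-major, with beam 0
          # (the best hypothesis) first.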
          sample_words = tf.transpose(sample_words, [2, 0, 1])
        predictions = {"predictions": sample_words}

      # return loss, vars, grads, predictions, train_op, scaffold
      return None, None, None, predictions, None, None
    elif mode == tf.contrib.learn.ModeKeys.TRAIN:
      num_towers = hparams.num_gpus

      # Shard inputs
      tower_features = self._shard_inputs(features, num_towers)
      # Create loss scale vars if necessary
      loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars()

      # Create variable_mgr
      var_mgr = self._get_variable_mgr(hparams)

      # Build per-tower fprop and bprop
      devices = var_mgr.get_devices()
      tower_gradvars = []
      tower_scopes = []
      var_scopes = []
      train_losses = []
      learning_rates = []
      batch_sizes = []
      opts = []

      def fprop_and_bprop(tid):
        """Builds the forward and backward pass for one tower."""
        model = gnmt_model.GNMTModel(
            hparams, mode=mode, features=tower_features[tid])
        # Sync training.
        assert model.learning_rate is not None
        # The following handles shouldn't be built when doing manual
        # gradient aggregation.
        assert model.grad_norm is None
        assert model.update is None

        tower_loss = model.train_loss
        # Only check loss numerics if in fp16
        if hparams.use_fp16 and hparams.check_tower_loss_numerics:
          tower_loss = tf.check_numerics(
              tower_loss, "tower_%d has Inf/NaN loss" % tid)
        # Cast to fp32, otherwise it would easily overflow.
        tower_loss = tf.to_float(tower_loss)

        var_params, grads, opt = self._compute_tower_grads(
            tower_loss,
            var_mgr.trainable_variables_on_device(tid, tid),
            model.learning_rate,
            use_fp16=hparams.use_fp16,
            loss_scale=loss_scale,
            colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
        self._print_varinfo(var_params, tid)
        res = [model.train_loss, model.learning_rate, model.batch_size]
        res.extend(grads)
        opts.append(opt)
        return res

      def unpack_fprop_and_bprop_output(output):
        train_loss = output[0]
        learning_rate = output[1]
        batch_size = output[2]
        grads = output[3:]
        return train_loss, learning_rate, batch_size, grads

      with mixed_precision_scope():
        for tid in range(num_towers):
          with tf.device(devices[tid % len(devices)]), tf.name_scope(
              "tower_%s" % tid) as scope:
            tower_scopes.append(scope)
            with var_mgr.create_outer_variable_scope(tid) as var_scope:
              var_scopes.append(var_scope)

              outputs = maybe_xla_compile(hparams, fprop_and_bprop, tid)
              (train_loss, learning_rate, batch_size,
               grads) = unpack_fprop_and_bprop_output(outputs)
              train_losses.append(train_loss)
              learning_rates.append(learning_rate)
              batch_sizes.append(batch_size)
              var_params = var_mgr.trainable_variables_on_device(tid, tid)
              tower_gradvars.append(list(zip(grads, var_params)))

      # Add summaries
      if hparams.show_metrics:
        tf.summary.scalar("learning_rate", learning_rates[0])
        if loss_scale is not None:
          tf.summary.scalar("loss_scale", loss_scale)
          if hparams.enable_auto_loss_scale:
            tf.summary.scalar("loss_scale_normal_steps",
                              loss_scale_normal_steps)
      misc_utils.print_out("Finish building fprop and per-tower bprop.")

      # Aggregate gradients
      # The following computes the aggregated grads for each tower, stored in
      # an opaque grad_states structure.
      apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
          tower_gradvars)
      master_grads = None
      master_params = None
      update_ops = []
      for i, device in enumerate(apply_grads_devices):
        with tf.device(device), tf.name_scope(tower_scopes[i]):
          # Get per-tower grads.
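          # get_gradients_to_apply() returns this device's share of the
          # all-reduced gradients, paired with the corresponding tower-local
          # variables.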
          with tf.name_scope("get_gradients_to_apply"):
            avg_gradvars = var_mgr.get_gradients_to_apply(i, grad_states)
          avg_grads = [gv[0] for gv in avg_gradvars]

          # Gradient post-processing
          with tf.name_scope("clip_gradients"):
            if hparams.clip_grads:
              clipped_grads, grad_norm = model_helper.gradient_clip(
                  avg_grads, max_gradient_norm=hparams.max_gradient_norm)
              # Summarize the grad norms on the first tower only.
              if i == 0 and hparams.show_metrics:
                tf.summary.scalar("grad_norm", grad_norm)
                tf.summary.scalar("clipped_grad_norm",
                                  tf.global_norm(clipped_grads))
            else:
              clipped_grads = avg_grads
            if i == 0:
              master_grads = clipped_grads

          # Build apply-gradients ops.
          clipped_gradvars = list(
              zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
          if i == 0:
            master_params = [gv[1] for gv in avg_gradvars]
          with tf.name_scope("append_gradient_ops"):
            loss_scale_params = variable_mgr_util.AutoLossScaleParams(
                enable_auto_loss_scale=hparams.enable_auto_loss_scale,
                loss_scale=loss_scale,
                loss_scale_normal_steps=loss_scale_normal_steps,
                inc_loss_scale_every_n=hparams.fp16_inc_loss_scale_every_n,
                is_chief=True)
            opt = opts[i]
            var_mgr.append_apply_gradients_ops(grad_states, opt,
                                               clipped_gradvars, update_ops,
                                               loss_scale_params)
      misc_utils.print_out("Finish building grad aggregation.")

      assert len(update_ops) == num_towers
      train_op = tf.group(update_ops)
      with tf.control_dependencies([train_op]):
        global_step = tf.train.get_global_step()
        train_op = global_step.assign_add(1)

      # Compute loss on the first gpu
      # TODO(jamesqin): optimize it?
      with tf.device("gpu:0"):
        loss = misc_utils.weighted_avg(train_losses, batch_sizes)

      # Create local init_ops
      # TODO(jamesqin): handle resource variables!
      # At present, if not using mirror strategy, resource vars are not used.
      local_init_ops = []
      local_init_op = tf.local_variables_initializer()
      with tf.control_dependencies([local_init_op]):
        local_init_ops.append(var_mgr.get_post_init_ops())
      local_init_ops.extend([local_init_op, tf.tables_initializer()])

      saveable_vars = var_mgr.savable_variables()
      # Add saveables for cudnn vars in the master tower.
      saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
      saveable_objects = [x for x in saveable_objects if "v0" in x.name]

      misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
      for mv in saveable_vars:
        misc_utils.print_out(mv.name)

      misc_utils.print_out(
          "All global trainable vars(%d): " % len(tf.trainable_variables()))
      for tv in tf.trainable_variables():
        misc_utils.print_out(tv.name)

      misc_utils.print_out(
          "All global vars(%d): " % len(tf.global_variables()))
      for gv in tf.global_variables():
        misc_utils.print_out(gv.name)

      misc_utils.print_out(
          "master backproped params(%d): " % len(master_params))
      for mp in master_params:
        misc_utils.print_out(mp.name)

      # Note that the cudnn vars are skipped in the init check. :(
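      # The Scaffold below only checks and saves `saveable_vars` (plus the
      # cudnn opaque-param saveables); the remaining per-tower copies are
      # re-derived from tower 0 by var_mgr.get_post_init_ops() at local init.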
      scaffold = tf.train.Scaffold(
          ready_op=tf.report_uninitialized_variables(saveable_vars),
          ready_for_local_init_op=tf.report_uninitialized_variables(
              saveable_vars),
          local_init_op=tf.group(*local_init_ops),
          saver=tf.train.Saver(
              saveable_vars + saveable_objects, save_relative_paths=True))

      misc_utils.print_out("Finish building model_fn")
      # return loss, vars, grads, predictions, train_op, scaffold
      return loss, master_params, master_grads, None, train_op, scaffold


def make_model_fn(hparams):
  """Constructs a GNMT model function for training."""
  factory = ModelFnFactory(hparams)

  if hparams.use_dist_strategy:

    def fn(features, labels, mode, params):
      """Model function for the DistributionStrategy path."""
      (loss, _, _, predictions, train_op,
       _) = factory.build_graph_dist_strategy(features, labels, mode, params)
      if mode == tf.contrib.learn.ModeKeys.INFER:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
      else:
        if hparams.use_tpu:
          return tf.contrib.tpu.TPUEstimatorSpec(
              mode=mode, loss=loss, train_op=train_op)
        else:
          return tf.estimator.EstimatorSpec(
              mode=mode, loss=loss, train_op=train_op)

    return fn
  else:
    build_fn = factory.build_graph

    def fn(features, labels, mode, params):
      """Model function for the manually replicated path."""
      (loss, _, _, predictions, train_op,
       scaffold) = build_fn(features, labels, mode, params)
      if mode == tf.contrib.learn.ModeKeys.INFER:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
      else:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, scaffold=scaffold, train_op=train_op)

    return fn


def make_input_fn(hparams, mode):
  """Constructs an input function for training or inference."""

  def _input_fn(params):
    """Input function."""
    del params
    if mode == tf.contrib.learn.ModeKeys.TRAIN:
      src_file = "%s.%s" % (hparams.train_prefix, hparams.src)
      tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt)
    else:
      if hparams.mode == "translate":
        src_file = hparams.translate_file + ".tok"
        tgt_file = hparams.translate_file + ".tok"
      else:
        src_file = "%s.%s" % (hparams.test_prefix, hparams.src)
        tgt_file = "%s.%s" % (hparams.test_prefix, hparams.tgt)

    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(
        src_vocab_file, tgt_vocab_file, hparams.share_vocab)
    src_dataset = tf.data.TextLineDataset(src_file)
    tgt_dataset = tf.data.TextLineDataset(tgt_file)

    if mode == tf.contrib.learn.ModeKeys.TRAIN:
      # Run one epoch and stop if running train_and_eval.
      if hparams.mode == "train_and_eval":
        # In this mode the input pipeline is restarted every epoch, so choose
        # a different random_seed.
        num_repeat = 1
        random_seed = hparams.random_seed + int(time.time()) % 100
      else:
        num_repeat = 8
        random_seed = hparams.random_seed
      return iterator_utils.get_iterator(
          src_dataset,
          tgt_dataset,
          src_vocab_table,
          tgt_vocab_table,
          batch_size=hparams.batch_size,
          sos=hparams.sos,
          eos=hparams.eos,
          random_seed=random_seed,
          num_buckets=hparams.num_buckets,
          src_max_len=hparams.src_max_len,
          tgt_max_len=hparams.tgt_max_len,
          output_buffer_size=None,
          skip_count=None,
          num_shards=1,  # flags.num_workers
          shard_index=0,  # flags.jobid
          reshuffle_each_iteration=True,
          use_char_encode=hparams.use_char_encode,
          num_repeat=num_repeat,
          filter_oversized_sequences=True
      )  # need to update get_effective_train_epoch_size() if this flag flips.
    else:
      return iterator_utils.get_infer_iterator(
          src_dataset,
          src_vocab_table,
          batch_size=hparams.infer_batch_size,
          eos=hparams.eos,
          src_max_len=hparams.src_max_len,
          use_char_encode=hparams.use_char_encode)

  def _synthetic_input_fn(params):
    """Fake inputs for debugging and benchmarking."""
    del params
    batch_size = hparams.batch_size
    src_max_len = hparams.src_max_len
    tgt_max_len = hparams.tgt_max_len
    features = {
        "source": tf.random_uniform(
            dtype=tf.int32,
            minval=1,
            maxval=10,
            seed=1,
            shape=(batch_size, src_max_len)),
        "target_input": tf.random_uniform(
            dtype=tf.int32,
            minval=1,
            maxval=10,
            seed=2,
            shape=(batch_size, tgt_max_len)),
        "target_output": tf.random_uniform(
            dtype=tf.int32,
            minval=1,
            maxval=10,
            seed=3,
            shape=(batch_size, tgt_max_len)),
        "source_sequence_length": tf.constant([src_max_len] * batch_size),
        "target_sequence_length": tf.constant([tgt_max_len] * batch_size)
    }
    return features

  if hparams.use_synthetic_data:
    return _synthetic_input_fn
  else:
    return _input_fn


def get_distribution_strategy(num_gpus):
  """Returns a DistributionStrategy suitable for num_gpus devices."""
  if num_gpus == 0:
    return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
  elif num_gpus == 1:
    return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
  else:
    return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)


def get_sacrebleu(trans_file, detokenizer_file):
  """Detokenizes trans_file and computes its sacreBLEU score."""
  assert tf.gfile.Exists(detokenizer_file)
  local_detokenizer_file = "/tmp/detokenizer.perl"
  if tf.gfile.Exists(local_detokenizer_file):
    tf.gfile.Remove(local_detokenizer_file)
  tf.gfile.Copy(detokenizer_file, local_detokenizer_file, overwrite=True)

  assert tf.gfile.Exists(trans_file)
  local_trans_file = "/tmp/newstest2014_out.tok.de"
  if tf.gfile.Exists(local_trans_file):
    tf.gfile.Remove(local_trans_file)
  tf.gfile.Copy(trans_file, local_trans_file, overwrite=True)

  detok_trans_path = "/tmp/newstest2014_out.detok.de"
  if tf.gfile.Exists(detok_trans_path):
    tf.gfile.Remove(detok_trans_path)

  # Detokenize the trans_file.
  cmd = "cat %s | perl %s -l de | cat > %s" % (
      local_trans_file, local_detokenizer_file, detok_trans_path)
  subprocess.run(cmd, shell=True)
  assert tf.gfile.Exists(detok_trans_path)

  # Run sacrebleu.
  cmd = ("cat %s | sacrebleu -t wmt14/full -l en-de --score-only -lc "
         "--tokenize intl") % detok_trans_path
  sacrebleu = subprocess.run(cmd, stdout=subprocess.PIPE, shell=True)
  # stdout is bytes; decode before parsing the score.
  score = sacrebleu.stdout.decode("utf-8").strip()
  return float(score)


def get_metrics(hparams, model_fn, ckpt=None, only_translate=False):
  """Runs inference and computes metrics."""
  pred_estimator = tf.estimator.Estimator(
      model_fn=model_fn, model_dir=hparams.output_dir)

  benchmark_hook = BenchmarkHook(hparams.infer_batch_size)
  predictions = pred_estimator.predict(
      make_input_fn(hparams, tf.contrib.learn.ModeKeys.INFER),
      checkpoint_path=ckpt,
      hooks=[benchmark_hook])
  translations = []
  output_tokens = []
  beam_id = 0
  for prediction in predictions:
    # Get the top translation.
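    # With beam search, predict() yields beam_width consecutive elements per
    # batch in beam-major order (see the [beam_width, batch_size, time]
    # transpose in the INFER graph), so only beam 0 (the top hypothesis)
    # is kept.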
    if beam_id == 0:
      for sent_id in range(hparams.infer_batch_size):
        # The last batch may hold fewer than infer_batch_size sentences.
        if sent_id >= prediction["predictions"].shape[0]:
          break
        trans, output_length = nmt_utils.get_translation(
            prediction["predictions"],
            sent_id=sent_id,
            tgt_eos=hparams.eos,
            subword_option=hparams.subword_option)
        translations.append(trans)
        output_tokens.append(output_length)
    beam_id += 1
    if beam_id == hparams.beam_width:
      beam_id = 0

  if only_translate:
    trans_file = hparams.translate_file + ".trans.tok"
  else:
    trans_file = os.path.join(
        hparams.output_dir, "newstest2014_out_{}.tok.de".format(
            pred_estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP)))
  trans_dir = os.path.dirname(trans_file)
  if not tf.gfile.Exists(trans_dir):
    tf.gfile.MakeDirs(trans_dir)
  tf.logging.info("Writing to file %s" % trans_file)
  with codecs.getwriter("utf-8")(
      tf.gfile.GFile(trans_file, mode="wb")) as trans_f:
    trans_f.write("")  # Write empty string to ensure file is created.
    for translation in translations:
      trans_f.write((translation + b"\n").decode("utf-8"))

  if only_translate:
    return None, benchmark_hook.get_average_speed_and_latencies(), sum(
        output_tokens)

  # Evaluation
  output_dir = os.path.join(pred_estimator.model_dir, "eval")
  tf.gfile.MakeDirs(output_dir)
  summary_writer = tf.summary.FileWriter(output_dir)

  ref_file = "%s.%s" % (hparams.test_prefix, hparams.tgt)
  # Hardcoded.
  metric = "bleu"
  score = get_sacrebleu(trans_file, hparams.detokenizer_file)
  misc_utils.print_out("bleu is %.5f" % score)
  with tf.Graph().as_default():
    summaries = []
    summaries.append(tf.Summary.Value(tag=metric, simple_value=score))
  tf_summary = tf.Summary(value=list(summaries))
  summary_writer.add_summary(
      tf_summary, pred_estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP))
  summary_writer.close()
  return (score, benchmark_hook.get_average_speed_and_latencies(),
          sum(output_tokens))


def train_fn(hparams):
  """Train function."""
  model_fn = make_model_fn(hparams)
  input_fn = make_input_fn(hparams, tf.contrib.learn.ModeKeys.TRAIN)

  log_step_count_steps = hparams.log_step_count_steps
  save_checkpoints_steps = hparams.save_checkpoints_steps
  if hparams.use_dist_strategy:
    distribution_strategy = get_distribution_strategy(hparams.num_gpus)
    config = tf.estimator.RunConfig(
        train_distribute=distribution_strategy,
        log_step_count_steps=log_step_count_steps,
        keep_checkpoint_max=None,
        save_checkpoints_steps=save_checkpoints_steps)
  else:
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    if hparams.use_autojit_xla:
      sess_config.graph_options.optimizer_options.global_jit_level = (
          tf.OptimizerOptions.ON_1)
    if not hparams.use_pintohost_optimizer:
      sess_config.graph_options.rewrite_options.pin_to_host_optimization = (
          rewriter_config_pb2.RewriterConfig.OFF)
    config = tf.estimator.RunConfig(
        log_step_count_steps=log_step_count_steps,
        session_config=sess_config,
        keep_checkpoint_max=None,
        save_checkpoints_steps=save_checkpoints_steps)

  misc_utils.print_out("sess master is %s" % config.master)
  estimator = tf.estimator.Estimator(
      model_fn=model_fn, model_dir=hparams.output_dir, config=config)

  benchmark_hook = BenchmarkHook(hparams.batch_size, hparams.warmup_steps + 5)
  train_hooks = [benchmark_hook]
  if hparams.profile:
    train_hooks.append(
        tf.train.ProfilerHook(
            output_dir=hparams.output_dir,
            save_steps=hparams.profile_save_steps,
            show_dataflow=True,
            show_memory=True))

  max_steps = hparams.debug_num_train_steps
  estimator.train(
      input_fn=input_fn,
      max_steps=max_steps,
      hooks=train_hooks,
  )

  return benchmark_hook.get_average_speed_and_latencies()


def eval_fn(hparams, ckpt=None, only_translate=False):
  """Runs inference on the test set and returns the computed metrics."""
  model_fn = make_model_fn(hparams)
  return get_metrics(hparams, model_fn, ckpt, only_translate=only_translate)
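

# Typical call pattern (an illustrative sketch, not part of the module API;
# it assumes `hparams` is built by this repo's flag parsing and carries the
# fields referenced above, e.g. output_dir, num_gpus, batch_size):
#
#   speed_and_latencies = train_fn(hparams)
#   bleu, speed_and_latencies, num_output_tokens = eval_fn(hparams)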