# Copyright 2017 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Utility functions for building models.""" from __future__ import print_function import collections import os import time import numpy as np import six import tensorflow as tf from utils import math_utils from utils import misc_utils as utils from utils import vocab_utils __all__ = [ "get_initializer", "create_emb_for_encoder_and_decoder", "create_rnn_cell", "gradient_clip", "create_or_load_model", "load_model", "avg_checkpoints", ] # If a vocab size is greater than this value, put the embedding on cpu instead VOCAB_SIZE_THRESHOLD_CPU = 50000 def get_initializer(init_op, seed=None, init_weight=0): """Create an initializer. init_weight is only for uniform.""" if init_op == "uniform": assert init_weight return tf.random_uniform_initializer( -init_weight, init_weight, seed=seed) elif init_op == "glorot_normal": return tf.keras.initializers.glorot_normal( seed=seed) elif init_op == "glorot_uniform": return tf.keras.initializers.glorot_uniform( seed=seed) elif init_op.isdigit(): # dtype is default fp32 for variables. val = int(init_op) return tf.constant_initializer(val) else: raise ValueError("Unknown init_op %s" % init_op) class ExtraArgs(collections.namedtuple( "ExtraArgs", ("single_cell_fn", "model_device_fn", "attention_mechanism_fn", "encoder_emb_lookup_fn"))): pass class TrainModel( collections.namedtuple("TrainModel", ("graph", "model", "iterator", "skip_count_placeholder"))): pass def _get_embed_device(vocab_size): """Decide on which device to place an embed matrix given its vocab size.""" if vocab_size > VOCAB_SIZE_THRESHOLD_CPU: return "/cpu:0" else: return "/gpu:0" def _create_pretrained_emb_from_txt( vocab_file, embed_file, num_trainable_tokens=3, dtype=tf.float32, scope=None): """Load pretrain embeding from embed_file, and return an embedding matrix. Args: vocab_file: Path to vocab file. embed_file: Path to a Glove formmated embedding txt file. num_trainable_tokens: Make the first n tokens in the vocab file as trainable variables. Default is 3, which is "", "" and "". dtype: data type. scope: tf scope name. Returns: pretrained embedding table variable. """ vocab, _ = vocab_utils.load_vocab(vocab_file) trainable_tokens = vocab[:num_trainable_tokens] utils.print_out("# Using pretrained embedding: %s." % embed_file) utils.print_out(" with trainable tokens: ") emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file) for token in trainable_tokens: utils.print_out(" %s" % token) if token not in emb_dict: emb_dict[token] = [0.0] * emb_size emb_mat = np.array( [emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype()) emb_mat = tf.constant(emb_mat) emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1]) with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope: emb_mat_var = tf.get_variable( "emb_mat_var", [num_trainable_tokens, emb_size]) return tf.concat([emb_mat_var, emb_mat_const], 0) def _create_or_load_embed(embed_name, vocab_file, embed_file, vocab_size, embed_size, dtype): """Create a new or load an existing embedding matrix.""" if vocab_file and embed_file: embedding = _create_pretrained_emb_from_txt(vocab_file, embed_file) else: embedding = tf.get_variable( embed_name, [vocab_size, embed_size], dtype) return embedding def create_emb_for_encoder_and_decoder(share_vocab, src_vocab_size, tgt_vocab_size, src_embed_size, tgt_embed_size, dtype=tf.float32, num_enc_partitions=0, num_dec_partitions=0, src_vocab_file=None, tgt_vocab_file=None, src_embed_file=None, tgt_embed_file=None, use_char_encode=False, scope=None): """Create embedding matrix for both encoder and decoder. Args: share_vocab: A boolean. Whether to share embedding matrix for both encoder and decoder. src_vocab_size: An integer. The source vocab size. tgt_vocab_size: An integer. The target vocab size. src_embed_size: An integer. The embedding dimension for the encoder's embedding. tgt_embed_size: An integer. The embedding dimension for the decoder's embedding. dtype: dtype of the embedding matrix. Default to float32. num_enc_partitions: number of partitions used for the encoder's embedding vars. num_dec_partitions: number of partitions used for the decoder's embedding vars. src_vocab_file: A string. The source vocabulary file. tgt_vocab_file: A string. The target vocabulary file. src_embed_file: A string. The source embedding file. tgt_embed_file: A string. The target embedding file. use_char_encode: A boolean. If true, use char encoder. scope: VariableScope for the created subgraph. Default to "embedding". Returns: embedding_encoder: Encoder's embedding matrix. embedding_decoder: Decoder's embedding matrix. Raises: ValueError: if use share_vocab but source and target have different vocab size. """ if num_enc_partitions <= 1: enc_partitioner = None else: # Note: num_partitions > 1 is required for distributed training due to # embedding_lookup tries to colocate single partition-ed embedding variable # with lookup ops. This may cause embedding variables being placed on worker # jobs. enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions) if num_dec_partitions <= 1: dec_partitioner = None else: # Note: num_partitions > 1 is required for distributed training due to # embedding_lookup tries to colocate single partition-ed embedding variable # with lookup ops. This may cause embedding variables being placed on worker # jobs. dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions) if src_embed_file and enc_partitioner: raise ValueError( "Can't set num_enc_partitions > 1 when using pretrained encoder " "embedding") if tgt_embed_file and dec_partitioner: raise ValueError( "Can't set num_dec_partitions > 1 when using pretrained decdoer " "embedding") with tf.variable_scope( scope or "embeddings", dtype=dtype, partitioner=enc_partitioner) as scope: # Share embedding if share_vocab: if src_vocab_size != tgt_vocab_size: raise ValueError("Share embedding but different src/tgt vocab sizes" " %d vs. %d" % (src_vocab_size, tgt_vocab_size)) assert src_embed_size == tgt_embed_size utils.print_out("# Use the same embedding for source and target") vocab_file = src_vocab_file or tgt_vocab_file embed_file = src_embed_file or tgt_embed_file embedding_encoder = _create_or_load_embed( "embedding_share", vocab_file, embed_file, src_vocab_size, src_embed_size, dtype) embedding_decoder = embedding_encoder else: if not use_char_encode: with tf.variable_scope("encoder", partitioner=enc_partitioner): embedding_encoder = _create_or_load_embed( "embedding_encoder", src_vocab_file, src_embed_file, src_vocab_size, src_embed_size, dtype) else: embedding_encoder = None with tf.variable_scope("decoder", partitioner=dec_partitioner): embedding_decoder = _create_or_load_embed( "embedding_decoder", tgt_vocab_file, tgt_embed_file, tgt_vocab_size, tgt_embed_size, dtype) return embedding_encoder, embedding_decoder def build_cell(cell, input_shape): if isinstance(cell, tf.contrib.rnn.MultiRNNCell): assert isinstance(input_shape, collections.Sequence) for i, c in enumerate(cell._cells): if i == 0: c.build((None, input_shape)) else: c.build((None, c.num_units)) return if isinstance(cell, tf.nn.rnn_cell.DropoutWrapper): build_cell(cell._cell, input_shape) elif isinstance(cell, tf.nn.rnn_cell.ResidualWrapper): build_cell(cell._cell, input_shape) elif isinstance(cell, tf.nn.rnn_cell.LSTMCell): cell.build(input_shape) else: raise ValueError("%s not supported" % type(cell)) def _single_cell(unit_type, num_units, forget_bias, dropout, mode, dtype=None, residual_connection=False, residual_fn=None, use_block_lstm=False): """Create an instance of a single RNN cell.""" # dropout (= 1 - keep_prob) is set to 0 during eval and infer dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0 # Cell Type if unit_type == "lstm": utils.print_out(" LSTM, forget_bias=%g" % forget_bias, new_line=False) if not use_block_lstm: single_cell = tf.nn.rnn_cell.LSTMCell( num_units, dtype=dtype, forget_bias=forget_bias) else: single_cell = tf.contrib.rnn.LSTMBlockCell( num_units, forget_bias=forget_bias) elif unit_type == "gru": utils.print_out(" GRU", new_line=False) single_cell = tf.contrib.rnn.GRUCell(num_units) elif unit_type == "layer_norm_lstm": utils.print_out(" Layer Normalized LSTM, forget_bias=%g" % forget_bias, new_line=False) single_cell = tf.contrib.rnn.LayerNormBasicLSTMCell( num_units, forget_bias=forget_bias, layer_norm=True) elif unit_type == "nas": utils.print_out(" NASCell", new_line=False) single_cell = tf.contrib.rnn.NASCell(num_units) else: raise ValueError("Unknown unit type %s!" % unit_type) # Dropout (= 1 - keep_prob) if dropout > 0.0: single_cell = tf.contrib.rnn.DropoutWrapper( cell=single_cell, input_keep_prob=(1.0 - dropout)) utils.print_out(" %s, dropout=%g " %(type(single_cell).__name__, dropout), new_line=False) # Residual if residual_connection: single_cell = tf.contrib.rnn.ResidualWrapper( single_cell, residual_fn=residual_fn) utils.print_out(" %s" % type(single_cell).__name__, new_line=False) return single_cell def _cell_list(unit_type, num_units, num_layers, num_residual_layers, forget_bias, dropout, mode, dtype=None, single_cell_fn=None, residual_fn=None, use_block_lstm=False): """Create a list of RNN cells.""" if not single_cell_fn: single_cell_fn = _single_cell # Multi-GPU cell_list = [] for i in range(num_layers): utils.print_out(" cell %d" % i, new_line=False) single_cell = single_cell_fn( unit_type=unit_type, num_units=num_units, forget_bias=forget_bias, dropout=dropout, mode=mode, dtype=dtype, residual_connection=(i >= num_layers - num_residual_layers), residual_fn=residual_fn, use_block_lstm=use_block_lstm ) utils.print_out("") cell_list.append(single_cell) return cell_list def create_rnn_cell(unit_type, num_units, num_layers, num_residual_layers, forget_bias, dropout, mode, dtype=None, single_cell_fn=None, use_block_lstm=False): """Create multi-layer RNN cell. Args: unit_type: string representing the unit type, i.e. "lstm". num_units: the depth of each unit. num_layers: number of cells. num_residual_layers: Number of residual layers from top to bottom. For example, if `num_layers=4` and `num_residual_layers=2`, the last 2 RNN cells in the returned list will be wrapped with `ResidualWrapper`. forget_bias: the initial forget bias of the RNNCell(s). dropout: floating point value between 0.0 and 1.0: the probability of dropout. this is ignored if `mode != TRAIN`. mode: either tf.contrib.learn.TRAIN/EVAL/INFER single_cell_fn: allow for adding customized cell. When not specified, we default to model_helper._single_cell Returns: An `RNNCell` instance. """ cell_list = _cell_list(unit_type=unit_type, num_units=num_units, num_layers=num_layers, num_residual_layers=num_residual_layers, forget_bias=forget_bias, dropout=dropout, mode=mode, dtype=dtype, single_cell_fn=single_cell_fn, use_block_lstm=use_block_lstm) if len(cell_list) == 1: # Single layer. return cell_list[0] else: # Multi layers return tf.contrib.rnn.MultiRNNCell(cell_list) def gradient_clip(gradients, max_gradient_norm): """Clipping gradients of a model.""" clipped_gradients, gradient_norm = math_utils.clip_by_global_norm( gradients, max_gradient_norm) return clipped_gradients, gradient_norm def print_variables_in_ckpt(ckpt_path): """Print a list of variables in a checkpoint together with their shapes.""" utils.print_out("# Variables in ckpt %s" % ckpt_path) reader = tf.train.NewCheckpointReader(ckpt_path) variable_map = reader.get_variable_to_shape_map() for key in sorted(variable_map.keys()): utils.print_out(" %s: %s" % (key, variable_map[key])) def load_model(model, ckpt_path, session, name): """Load model from a checkpoint.""" start_time = time.time() try: model.saver.restore(session, ckpt_path) except tf.errors.NotFoundError as e: utils.print_out("Can't load checkpoint") print_variables_in_ckpt(ckpt_path) utils.print_out("%s" % str(e)) session.run(tf.tables_initializer()) utils.print_out( " loaded %s model parameters from %s, time %.2fs" % (name, ckpt_path, time.time() - start_time)) return model def avg_checkpoints(model_dir, num_last_checkpoints, global_step_name): """Average the last N checkpoints in the model_dir.""" checkpoint_state = tf.train.get_checkpoint_state(model_dir) if not checkpoint_state: utils.print_out("# No checkpoint file found in directory: %s" % model_dir) return None # Checkpoints are ordered from oldest to newest. checkpoints = ( checkpoint_state.all_model_checkpoint_paths[-num_last_checkpoints:]) if len(checkpoints) < num_last_checkpoints: utils.print_out( "# Skipping averaging checkpoints because not enough checkpoints is " "available.") return None avg_model_dir = os.path.join(model_dir, "avg_checkpoints") if not tf.gfile.Exists(avg_model_dir): utils.print_out( "# Creating new directory %s for saving averaged checkpoints." % avg_model_dir) tf.gfile.MakeDirs(avg_model_dir) utils.print_out("# Reading and averaging variables in checkpoints:") var_list = tf.contrib.framework.list_variables(checkpoints[0]) var_values, var_dtypes = {}, {} for (name, shape) in var_list: if name != global_step_name: var_values[name] = np.zeros(shape) for checkpoint in checkpoints: utils.print_out(" %s" % checkpoint) reader = tf.contrib.framework.load_checkpoint(checkpoint) for name in var_values: tensor = reader.get_tensor(name) var_dtypes[name] = tensor.dtype var_values[name] += tensor for name in var_values: var_values[name] /= len(checkpoints) # Build a graph with same variables in the checkpoints, and save the averaged # variables into the avg_model_dir. with tf.Graph().as_default(): tf_vars = [ tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[name]) for v in var_values ] placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars] assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] saver = tf.train.Saver(tf.all_variables(), save_relative_paths=True) with tf.Session() as sess: sess.run(tf.initialize_all_variables()) for p, assign_op, (name, value) in zip(placeholders, assign_ops, six.iteritems(var_values)): sess.run(assign_op, {p: value}) # Use the built saver to save the averaged checkpoint. Only keep 1 # checkpoint and the best checkpoint will be moved to avg_best_metric_dir. saver.save( sess, os.path.join(avg_model_dir, "translate.ckpt")) return avg_model_dir def create_or_load_model(model, model_dir, session, name): """Create translation model and initialize or load parameters in session.""" latest_ckpt = tf.train.latest_checkpoint(model_dir) if latest_ckpt: model = load_model(model, latest_ckpt, session, name) else: start_time = time.time() session.run(tf.global_variables_initializer()) session.run(tf.tables_initializer()) utils.print_out(" created %s model with fresh parameters, time %.2fs" % (name, time.time() - start_time)) global_step = model.global_step.eval(session=session) return model, global_step