# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import tensorflow as tf import numpy as np import math import six import os from utils.common import create_initializer from utils.position import SinusoidalPositionEncoder def gelu(x): cdf = 0.5 * (1.0 + tf.tanh( (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf def layer_norm(input_tensor, name=None): return tf.contrib.layers.layer_norm( inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) def attention_layer(from_tensor, to_tensor, attention_mask=None, num_attention_heads=1, size_per_head=512, query_act=None, key_act=None, value_act=None, attention_probs_dropout_prob=0.0, initializer_range=0.02, do_return_2d_tensor=False, batch_size=None, from_seq_length=None, to_seq_length=None, tf_datatype=tf.float32): def transpose_for_scores(input_tensor, batch_size, num_attention_heads, seq_length, width): output_tensor = tf.reshape( input_tensor, [batch_size, seq_length, num_attention_heads, width]) output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) return output_tensor from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) if len(from_shape) != len(to_shape): raise ValueError( "The rank of `from_tensor` must match the rank of `to_tensor`.") if len(from_shape) == 3: batch_size = from_shape[0] from_seq_length = from_shape[1] to_seq_length = to_shape[1] elif len(from_shape) == 2: if (batch_size is None or from_seq_length is None or to_seq_length is None): raise ValueError( "When passing in rank 2 tensors to attention_layer, the values " "for `batch_size`, `from_seq_length`, and `to_seq_length` " "must all be specified.") from_tensor_2d = reshape_to_matrix(from_tensor) to_tensor_2d = reshape_to_matrix(to_tensor) # `query_layer` = [B*F, N*H] query_layer = tf.layers.dense( from_tensor_2d, num_attention_heads * size_per_head, activation=query_act, name="query", use_bias=True, bias_initializer=create_initializer(initializer_range, tf_datatype), kernel_initializer=create_initializer(initializer_range, tf_datatype)) # `key_layer` = [B*T, N*H] key_layer = tf.layers.dense( to_tensor_2d, num_attention_heads * size_per_head, activation=key_act, name="key", use_bias=True, bias_initializer=create_initializer(initializer_range, tf_datatype), kernel_initializer=create_initializer(initializer_range, tf_datatype)) # `value_layer` = [B*T, N*H] value_layer = tf.layers.dense( to_tensor_2d, num_attention_heads * size_per_head, activation=value_act, name="value", use_bias=True, bias_initializer=create_initializer(initializer_range, tf_datatype), kernel_initializer=create_initializer(initializer_range, tf_datatype)) # `query_layer` = [B, N, F, H] query_layer = transpose_for_scores(query_layer, batch_size, num_attention_heads, from_seq_length, size_per_head) # `key_layer` = [B, N, T, H] key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, to_seq_length, size_per_head) attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) attention_scores = tf.multiply(attention_scores, 1.0 / math.sqrt(float(size_per_head))) if attention_mask is not None: # `attention_mask` = [B, 1, F, T] if tf.rank(attention_mask) == 3: attention_mask = tf.expand_dims(attention_mask, axis=[1]) adder = (1.0 - tf.cast(attention_mask, tf_datatype)) * -10000.0 attention_scores += adder attention_probs = tf.nn.softmax(attention_scores) value_layer = tf.reshape( value_layer, [batch_size, to_seq_length, num_attention_heads, size_per_head]) value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) if do_return_2d_tensor: context_layer = tf.reshape( context_layer, [batch_size * from_seq_length, num_attention_heads * size_per_head]) else: context_layer = tf.reshape( context_layer, [batch_size, from_seq_length, num_attention_heads * size_per_head]) return context_layer def tf_encoder(input_tensor, encoder_args, attention_mask=None, intermediate_act_fn=gelu, initializer_range=0.02): ''' Run the bert transformer layer by TensorFlow. Args: inputs: A tf.Tensor with shape [batch_size, seq_len, hidden_dimension]. The inputs tensor of encoder. The rank must be 3. encoder_args: The arguments for encoder. The details are in the class "TransformerArgument" of common.py attention_mask: A tf.Tensor. The attention mask for self attention. intermediate_act_fn: A callable function. The activation function in the FFN. It is gelu in BERT. initializer_range: A float value. The range of initializer for all weights. Outputs: outputs: A tf.Tensor with shape [batch_size, seq_len, hidden_dimension]. The results of encoder. ''' intermediate_size = encoder_args.hidden_dim * 4 if encoder_args.hidden_dim % encoder_args.head_num != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (encoder_args.hidden_dim, encoder_args.head_num)) attention_head_size = int(encoder_args.hidden_dim / encoder_args.head_num) input_shape = get_shape_list(input_tensor, expected_rank=3) batch_size = input_shape[0] seq_length = input_shape[1] prev_output = reshape_to_matrix(input_tensor) for layer_idx in range(encoder_args.num_layer): with tf.variable_scope("layer_%d" % layer_idx, reuse=tf.AUTO_REUSE): layer_input = prev_output with tf.variable_scope("attention"): with tf.variable_scope("self"): attention_head = attention_layer( from_tensor=layer_input, to_tensor=layer_input, attention_mask=attention_mask, num_attention_heads=encoder_args.head_num, size_per_head=encoder_args.size_per_head, initializer_range=initializer_range, do_return_2d_tensor=True, batch_size=batch_size, from_seq_length=seq_length, to_seq_length=seq_length, tf_datatype=encoder_args.dtype) attention_output = attention_head with tf.variable_scope("output"): attention_output = tf.layers.dense( attention_output, encoder_args.hidden_dim, use_bias=True, bias_initializer=create_initializer( initializer_range, encoder_args.dtype), kernel_initializer=create_initializer(initializer_range, encoder_args.dtype)) attention_output = layer_norm( attention_output + layer_input) # The activation is only applied to the "intermediate" hidden layer. with tf.variable_scope("intermediate"): intermediate_output = tf.layers.dense( attention_output, intermediate_size, activation=intermediate_act_fn, use_bias=True, bias_initializer=create_initializer( initializer_range, encoder_args.dtype), kernel_initializer=create_initializer(initializer_range, encoder_args.dtype)) # Down-project back to `hidden_size` then add the residual. with tf.variable_scope("output"): layer_output = tf.layers.dense( intermediate_output, encoder_args.hidden_dim, use_bias=True, bias_initializer=create_initializer( initializer_range, encoder_args.dtype), kernel_initializer=create_initializer(initializer_range, encoder_args.dtype)) layer_output = layer_norm(layer_output + attention_output) prev_output = layer_output # amaxList for int8 quantization if encoder_args.int8_mode != 0: amaxList = tf.get_variable(name="amaxList", shape=[80 + 9*encoder_args.hidden_dim], dtype=tf.float32) prev_output = tf.reshape(prev_output, shape=tf.shape(input_tensor)) return prev_output def build_sequence_mask(sequence_length, num_heads=None, maximum_length=None, dtype=tf.float32): """Builds the dot product mask. Args: sequence_length: The sequence length. num_heads: The number of heads. maximum_length: Optional size of the returned time dimension. Otherwise it is the maximum of :obj:`sequence_length`. dtype: The type of the mask tensor. Returns: A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape ``[batch_size, 1, max_length, max_length]``. """ mask = tf.sequence_mask(sequence_length, maxlen=maximum_length, dtype=dtype) # [batch_size, maximum_length] mask = tf.reshape(mask, [-1, 1, 1, maximum_length]) m_2 = tf.transpose(mask, [0, 1, 3, 2]) mask = mask * m_2 return mask def tf_encoder_opennmt(input_tensor, encoder_args, initializer_range=0.02, sequence_length=None): ''' Run the bert transformer layer by TensorFlow. Args: input_tensor: A tf.Tensor with shape [batch_size, seq_len, hidden_dimension]. The inputs tensor of encoder. The rank must be 3. encoder_args: The arguments for encoder. The details are in the class "TransformerArgument" of common.py initializer_range: A float value. The range of initializer for all weights. sequence_length: A tf.Tensor with shape [batch_size], with tf.int type. The sequence length of each sentence in input_tensor. Outputs: output: A tf.Tensor with shape [batch_size, max(sequence_length), hidden_dimension]. The results of encoder. ''' data_type = encoder_args.dtype input_shape = get_shape_list(input_tensor, expected_rank=3) batch_size = input_shape[0] seq_length = input_shape[1] input_tensor *= encoder_args.hidden_dim**0.5 position_encoder = SinusoidalPositionEncoder() input_tensor = position_encoder(input_tensor, position=tf.range(seq_length)) mask = build_sequence_mask( sequence_length, encoder_args.head_num, maximum_length=tf.shape(input_tensor)[1], dtype=data_type) intermediate_size = encoder_args.hidden_dim * 4 if encoder_args.hidden_dim % encoder_args.head_num != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (encoder_args.hidden_dim, encoder_args.head_num)) layer_input = input_tensor for layer_idx in range(encoder_args.num_layer): with tf.variable_scope("layer_%d" % layer_idx, reuse=tf.AUTO_REUSE): with tf.variable_scope("multi_head"): normed_input = tf.cast(layer_norm(tf.cast(layer_input, tf.float32)), data_type) queries, keys, values = tf.split(tf.layers.conv1d(normed_input, encoder_args.hidden_dim * 3, 1), 3, axis=2) # split head queries = tf.reshape(queries, [batch_size, seq_length, encoder_args.head_num, encoder_args.size_per_head]) queries = tf.transpose(queries, [0, 2, 1, 3]) keys = tf.reshape(keys, [batch_size, seq_length, encoder_args.head_num, encoder_args.size_per_head]) keys = tf.transpose(keys, [0, 2, 1, 3]) values = tf.reshape(values, [batch_size, seq_length, encoder_args.head_num, encoder_args.size_per_head]) values = tf.transpose(values, [0, 2, 1, 3]) queries *= (encoder_args.size_per_head)**-0.5 dot = tf.matmul(queries, keys, transpose_b=True) if mask is not None: dot = tf.cast(tf.cast(dot, data_type) * mask + ((1.0 - mask) * data_type.min), dot.dtype) attn = tf.cast(tf.nn.softmax(tf.cast(dot, data_type)), dot.dtype) context_1 = tf.matmul(attn, values) context_1 = tf.transpose(context_1, [0, 2, 1, 3]) context_1 = tf.reshape(context_1, [batch_size, seq_length, encoder_args.hidden_dim]) attention_output = tf.layers.conv1d(context_1, encoder_args.hidden_dim, 1) context_2 = attention_output + layer_input with tf.variable_scope("ffn"): normed_context_2 = tf.cast(layer_norm(tf.cast(context_2, tf.float32)), data_type) intermediate_output = tf.layers.conv1d(normed_context_2, intermediate_size, 1, activation=tf.nn.relu) layer_output_1 = tf.layers.conv1d(intermediate_output, encoder_args.hidden_dim, 1) layer_output_2 = layer_output_1 + context_2 layer_input = layer_output_2 layer_input = tf.cast(layer_input, tf.float32) output = layer_norm(layer_input, name="LayerNorm") return output def get_shape_list(tensor, expected_rank=None, name=None): if name is None: name = tensor.name if expected_rank is not None: assert_rank(tensor, expected_rank, name) shape = tensor.shape.as_list() non_static_indexes = [] for (index, dim) in enumerate(shape): if dim is None: non_static_indexes.append(index) if not non_static_indexes: return shape dyn_shape = tf.shape(tensor) for index in non_static_indexes: shape[index] = dyn_shape[index] return shape def reshape_to_matrix(input_tensor): """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" ndims = input_tensor.shape.ndims if ndims < 2: raise ValueError("Input tensor must have at least rank 2. Shape = %s" % (input_tensor.shape)) if ndims == 2: return input_tensor width = input_tensor.shape[-1] output_tensor = tf.reshape(input_tensor, [-1, width]) return output_tensor def reshape_from_matrix(output_tensor, orig_shape_list): if len(orig_shape_list) == 2: return output_tensor output_shape = get_shape_list(output_tensor) orig_dims = orig_shape_list[0:-1] width = output_shape[-1] return tf.reshape(output_tensor, orig_dims + [width]) def assert_rank(tensor, expected_rank, name=None): if name is None: name = tensor.name expected_rank_dict = {} if isinstance(expected_rank, six.integer_types): expected_rank_dict[expected_rank] = True else: for x in expected_rank: expected_rank_dict[x] = True actual_rank = tensor.shape.ndims if actual_rank not in expected_rank_dict: scope_name = tf.get_variable_scope().name raise ValueError( "For the tensor `%s` in scope `%s`, the actual rank " "`%d` (shape = %s) is not equal to the expected rank `%s`" % (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) def op_encoder(inputs, encoder_args, attention_mask, encoder_vars_dict, sequence_length): ''' Run the bert transformer layer by FasterTransformer. Args: inputs: A tf.Tensor with shape [batch_size, seq_len, hidden_dimension]. The inputs tensor of encoder. The rank must be 3. encoder_args: The arguments for encoder. The details are in the class "TransformerArgument" of common.py attention_mask: A tf.Tensor. The attention mask for self attention. encoder_vars_dict: A dict of tf.Tensor or numpy array. The variables for encoder. They can be either some tensor or some numpy array. The key is the name of the tensor, like 'layer_0/attention/self/query/kernel:0'. Teh value is the corresponding tensor or numpy array sequence_length: A tf.Tensor or numpy array with shape [batch_size]. The sequence length of the sentences Outputs: outputs: A tensor with shape [batch_size, seq_len, hidden_dimension]. The results of encoder. ''' remove_padding = encoder_args.remove_padding transformer_op_module = tf.load_op_library(os.path.join('./lib/libtf_fastertransformer.so')) if remove_padding == True: inputs, sequence_id_offset = transformer_op_module.build_mask_remove_padding(inputs, sequence_length) else: sequence_id_offset = [] for layer_idx in range(encoder_args.num_layer): if encoder_args.int8_mode != 0: amaxList = encoder_vars_dict['layer_%d/amaxList:0' % layer_idx] else: amaxList = [] outputs = transformer_op_module.bert_transformer( inputs, inputs, encoder_vars_dict['layer_%d/attention/self/query/kernel:0' % layer_idx], encoder_vars_dict['layer_%d/attention/self/query/bias:0' % layer_idx], encoder_vars_dict['layer_%d/attention/self/key/kernel:0' % layer_idx], encoder_vars_dict['layer_%d/attention/self/key/bias:0' % layer_idx], encoder_vars_dict['layer_%d/attention/self/value/kernel:0' % layer_idx], encoder_vars_dict['layer_%d/attention/self/value/bias:0' % layer_idx], attention_mask, encoder_vars_dict['layer_%d/attention/output/dense/kernel:0' % layer_idx], encoder_vars_dict['layer_%d/attention/output/dense/bias:0' % layer_idx], encoder_vars_dict['layer_%d/attention/output/LayerNorm/beta:0' % layer_idx], encoder_vars_dict['layer_%d/attention/output/LayerNorm/gamma:0' % layer_idx], encoder_vars_dict['layer_%d/intermediate/dense/kernel:0' % layer_idx], encoder_vars_dict['layer_%d/intermediate/dense/bias:0' % layer_idx], encoder_vars_dict['layer_%d/output/dense/kernel:0' % layer_idx], encoder_vars_dict['layer_%d/output/dense/bias:0' % layer_idx], encoder_vars_dict['layer_%d/output/LayerNorm/beta:0' % layer_idx], encoder_vars_dict['layer_%d/output/LayerNorm/gamma:0' % layer_idx], sequence_id_offset, amaxList, head_num=encoder_args.head_num, size_per_head=encoder_args.size_per_head, remove_padding=remove_padding, int8_mode=encoder_args.int8_mode, layer_idx=layer_idx, layer_num=encoder_args.num_layer) inputs = outputs if remove_padding == True: outputs = transformer_op_module.rebuild_padding(outputs, sequence_id_offset, attention_mask) return outputs