496 lines
20 KiB
Python
496 lines
20 KiB
Python
|
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
|
||
|
import tensorflow as tf
|
||
|
import numpy as np
|
||
|
import math
|
||
|
import six
|
||
|
import os
|
||
|
from utils.common import create_initializer
|
||
|
from utils.position import SinusoidalPositionEncoder
|
||
|
|
||
|
def gelu(x):
    """Gaussian Error Linear Unit, tanh approximation (as used in BERT).

    Computes ``x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``.

    Args:
        x: A float tf.Tensor.

    Returns:
        A tf.Tensor of the same shape and dtype as `x`.
    """
    inner = np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))
    cdf = 0.5 * (1.0 + tf.tanh(inner))
    return x * cdf
|
||
|
|
||
|
|
||
|
def layer_norm(input_tensor, name=None):
    """Apply layer normalization over the last axis of `input_tensor`.

    Args:
        input_tensor: A tf.Tensor to normalize.
        name: Optional variable scope name for the norm's beta/gamma.

    Returns:
        A tf.Tensor of the same shape, normalized along the last axis.
    """
    normalized = tf.contrib.layers.layer_norm(
        inputs=input_tensor,
        begin_norm_axis=-1,
        begin_params_axis=-1,
        scope=name)
    return normalized
|
||
|
|
||
|
|
||
|
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    tf_datatype=tf.float32):
    """Multi-head scaled dot-product attention (BERT style).

    Projects `from_tensor` to queries and `to_tensor` to keys/values,
    computes per-head scaled dot-product attention, and returns the
    concatenated head outputs.

    Shape abbreviations: B = batch size, F = `from` sequence length,
    T = `to` sequence length, N = number of heads, H = size per head.

    Args:
        from_tensor: float Tensor of shape [B, F, N*H] or [B*F, N*H].
        to_tensor: float Tensor of shape [B, T, N*H] or [B*T, N*H].
        attention_mask: Optional Tensor of shape [B, F, T] (expanded here to
            [B, 1, F, T]) with 1 for attendable positions, 0 for masked.
        num_attention_heads: N, the number of attention heads.
        size_per_head: H, the width of each head.
        query_act: Optional activation for the query projection.
        key_act: Optional activation for the key projection.
        value_act: Optional activation for the value projection.
        attention_probs_dropout_prob: Accepted for API compatibility only;
            NOTE(review): dropout is NOT applied in this implementation.
        initializer_range: Range passed to `create_initializer` for all
            projection weights and biases.
        do_return_2d_tensor: If True, return [B*F, N*H]; else [B, F, N*H].
        batch_size: Required when inputs are rank 2 (shape not inferable).
        from_seq_length: Required when inputs are rank 2.
        to_seq_length: Required when inputs are rank 2.
        tf_datatype: Compute dtype for the initializers and mask arithmetic.

    Returns:
        float Tensor of shape [B*F, N*H] or [B, F, N*H] depending on
        `do_return_2d_tensor`.

    Raises:
        ValueError: If `from_tensor` and `to_tensor` ranks differ, or rank-2
            inputs are given without explicit batch/sequence sizes.
    """

    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        # [B*S, N*H] -> [B, S, N, H] -> [B, N, S, H]
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # All projections operate on flattened [B*S, N*H] matrices.
    from_tensor_2d = reshape_to_matrix(from_tensor)
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # `query_layer` = [B*F, N*H]
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        use_bias=True,
        bias_initializer=create_initializer(initializer_range, tf_datatype),
        kernel_initializer=create_initializer(initializer_range, tf_datatype))

    # `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        use_bias=True,
        bias_initializer=create_initializer(initializer_range, tf_datatype),
        kernel_initializer=create_initializer(initializer_range, tf_datatype))

    # `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        use_bias=True,
        bias_initializer=create_initializer(initializer_range, tf_datatype),
        kernel_initializer=create_initializer(initializer_range, tf_datatype))

    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)

    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                     to_seq_length, size_per_head)

    # Raw scores scaled by 1/sqrt(H): [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # BUGFIX: the original compared `tf.rank(attention_mask) == 3`, but
        # `tf.rank` returns a 0-D Tensor, so the Python `==` against an int is
        # always False at graph-construction time and the expansion below
        # never ran. Use the statically known rank instead.
        if attention_mask.shape.ndims == 3:
            # `attention_mask` = [B, 1, F, T]
            attention_mask = tf.expand_dims(attention_mask, axis=1)

        # Turn the 0/1 mask into an additive bias: 0 to keep, -10000 to mask,
        # which drives masked positions to ~0 probability after softmax.
        adder = (1.0 - tf.cast(attention_mask, tf_datatype)) * -10000.0
        attention_scores += adder

    # Normalize over the `to` positions. Note: no dropout is applied even
    # though `attention_probs_dropout_prob` is accepted.
    attention_probs = tf.nn.softmax(attention_scores)

    # `value_layer` = [B, N, T, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # `context_layer` = [B, N, F, H] -> [B, F, N, H]
    context_layer = tf.matmul(attention_probs, value_layer)
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        context_layer = tf.reshape(
            context_layer,
            [batch_size * from_seq_length, num_attention_heads * size_per_head])
    else:
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])

    return context_layer
|
||
|
|
||
|
|
||
|
def tf_encoder(input_tensor,
               encoder_args,
               attention_mask=None,
               intermediate_act_fn=gelu,
               initializer_range=0.02):
    '''
    Run the bert transformer layer by TensorFlow.

    Args:
        input_tensor: A tf.Tensor with shape [batch_size, seq_len, hidden_dimension].
            The inputs tensor of encoder. The rank must be 3.
        encoder_args: The arguments for encoder. The details are in the class
            "TransformerArgument" of common.py
        attention_mask: A tf.Tensor. The attention mask for self attention.
        intermediate_act_fn: A callable function.
            The activation function in the FFN. It is gelu in BERT.
        initializer_range: A float value.
            The range of initializer for all weights.

    Outputs:
        outputs: A tf.Tensor with shape [batch_size, seq_len, hidden_dimension].
            The results of encoder.

    Note:
        Variable-scope names ("layer_%d/attention/self/...", etc.) define the
        checkpoint variable names consumed by op_encoder's encoder_vars_dict,
        so they must not change.
    '''
    # BERT convention: FFN inner width is 4x the model hidden dimension.
    intermediate_size = encoder_args.hidden_dim * 4
    if encoder_args.hidden_dim % encoder_args.head_num != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (encoder_args.hidden_dim, encoder_args.head_num))

    attention_head_size = int(encoder_args.hidden_dim / encoder_args.head_num)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    # The whole stack works on the flattened [batch*seq, hidden] matrix; the
    # 3-D shape is restored only at the end.
    prev_output = reshape_to_matrix(input_tensor)

    for layer_idx in range(encoder_args.num_layer):
        with tf.variable_scope("layer_%d" % layer_idx, reuse=tf.AUTO_REUSE):
            layer_input = prev_output
            with tf.variable_scope("attention"):
                with tf.variable_scope("self"):
                    # Multi-head self-attention over this layer's input.
                    attention_head = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=encoder_args.head_num,
                        size_per_head=encoder_args.size_per_head,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length,
                        tf_datatype=encoder_args.dtype)
                attention_output = attention_head

                with tf.variable_scope("output"):
                    # Output projection, then residual + post-layer-norm.
                    attention_output = tf.layers.dense(
                        attention_output,
                        encoder_args.hidden_dim,
                        use_bias=True,
                        bias_initializer=create_initializer(
                            initializer_range, encoder_args.dtype),
                        kernel_initializer=create_initializer(initializer_range, encoder_args.dtype))
                    attention_output = layer_norm(
                        attention_output + layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    use_bias=True,
                    bias_initializer=create_initializer(
                        initializer_range, encoder_args.dtype),
                    kernel_initializer=create_initializer(initializer_range, encoder_args.dtype))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    encoder_args.hidden_dim,
                    use_bias=True,
                    bias_initializer=create_initializer(
                        initializer_range, encoder_args.dtype),
                    kernel_initializer=create_initializer(initializer_range, encoder_args.dtype))
                layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output

            # amaxList for int8 quantization: declared (not computed) here so
            # the variable 'layer_%d/amaxList' exists for op_encoder to read.
            # NOTE(review): the 80 + 9*hidden_dim size presumably matches the
            # FasterTransformer kernel's scale-table layout — confirm there.
            if encoder_args.int8_mode != 0:
                amaxList = tf.get_variable(name="amaxList", shape=[80 + 9*encoder_args.hidden_dim], dtype=tf.float32)

    # Restore the original [batch, seq, hidden] shape.
    prev_output = tf.reshape(prev_output, shape=tf.shape(input_tensor))
    return prev_output
|
||
|
|
||
|
def build_sequence_mask(sequence_length,
                        num_heads=None,
                        maximum_length=None,
                        dtype=tf.float32):
    """Builds the dot product mask.

    The result is the outer product of the per-sequence length mask with
    itself, so position (i, j) is 1 only when both i and j are inside the
    sequence.

    Args:
        sequence_length: The sequence length of each batch entry.
        num_heads: The number of heads (accepted for API compatibility;
            not used here).
        maximum_length: Optional size of the returned time dimension.
            Otherwise it is the maximum of :obj:`sequence_length`.
        dtype: The type of the mask tensor.

    Returns:
        A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
        ``[batch_size, 1, max_length, max_length]``.
    """
    # [batch_size, maximum_length], 1 inside each sequence, 0 in the padding.
    row_mask = tf.sequence_mask(sequence_length, maxlen=maximum_length, dtype=dtype)
    row_mask = tf.reshape(row_mask, [-1, 1, 1, maximum_length])
    col_mask = tf.transpose(row_mask, [0, 1, 3, 2])
    return row_mask * col_mask
|
||
|
|
||
|
def tf_encoder_opennmt(input_tensor,
                       encoder_args,
                       initializer_range=0.02,
                       sequence_length=None):
    '''
    Run the bert transformer layer by TensorFlow.

    OpenNMT-style encoder: pre-layer-norm residual blocks, fused QKV conv1d
    projection, ReLU FFN, and a final layer norm.

    Args:
        input_tensor: A tf.Tensor with shape [batch_size, seq_len, hidden_dimension].
            The inputs tensor of encoder. The rank must be 3.
        encoder_args: The arguments for encoder. The details are in the class
            "TransformerArgument" of common.py
        initializer_range: A float value.
            The range of initializer for all weights.
        sequence_length: A tf.Tensor with shape [batch_size], with tf.int type.
            The sequence length of each sentence in input_tensor.

    Outputs:
        output: A tf.Tensor with shape [batch_size, max(sequence_length), hidden_dimension].
            The results of encoder.
    '''
    data_type = encoder_args.dtype
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    # Scale embeddings by sqrt(hidden_dim), then add sinusoidal positions
    # (standard Transformer input preparation).
    input_tensor *= encoder_args.hidden_dim**0.5
    position_encoder = SinusoidalPositionEncoder()
    input_tensor = position_encoder(input_tensor, position=tf.range(seq_length))

    # [batch, 1, max_len, max_len] multiplicative padding mask.
    mask = build_sequence_mask(
        sequence_length,
        encoder_args.head_num,
        maximum_length=tf.shape(input_tensor)[1],
        dtype=data_type)

    intermediate_size = encoder_args.hidden_dim * 4
    if encoder_args.hidden_dim % encoder_args.head_num != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (encoder_args.hidden_dim, encoder_args.head_num))

    layer_input = input_tensor
    for layer_idx in range(encoder_args.num_layer):
        with tf.variable_scope("layer_%d" % layer_idx, reuse=tf.AUTO_REUSE):
            with tf.variable_scope("multi_head"):
                # Pre-norm: layer norm is computed in float32 for numerical
                # stability, then cast back to the compute dtype.
                normed_input = tf.cast(layer_norm(tf.cast(layer_input, tf.float32)), data_type)

                # One fused conv1d produces Q, K and V (3x hidden), split on
                # the channel axis.
                queries, keys, values = tf.split(tf.layers.conv1d(normed_input, encoder_args.hidden_dim * 3, 1), 3, axis=2)

                # split head: [B, S, N*H] -> [B, N, S, H]
                queries = tf.reshape(queries, [batch_size, seq_length, encoder_args.head_num, encoder_args.size_per_head])
                queries = tf.transpose(queries, [0, 2, 1, 3])

                keys = tf.reshape(keys, [batch_size, seq_length, encoder_args.head_num, encoder_args.size_per_head])
                keys = tf.transpose(keys, [0, 2, 1, 3])

                values = tf.reshape(values, [batch_size, seq_length, encoder_args.head_num, encoder_args.size_per_head])
                values = tf.transpose(values, [0, 2, 1, 3])

                # Scale queries by 1/sqrt(H) before the dot product.
                queries *= (encoder_args.size_per_head)**-0.5

                dot = tf.matmul(queries, keys, transpose_b=True)

                # Apply the padding mask: keep scores where mask==1, push
                # masked positions to the dtype's minimum before softmax.
                if mask is not None:
                    dot = tf.cast(tf.cast(dot, data_type) * mask + ((1.0 - mask) * data_type.min), dot.dtype)

                attn = tf.cast(tf.nn.softmax(tf.cast(dot, data_type)), dot.dtype)

                # Combine heads back: [B, N, S, H] -> [B, S, N*H].
                context_1 = tf.matmul(attn, values)
                context_1 = tf.transpose(context_1, [0, 2, 1, 3])
                context_1 = tf.reshape(context_1, [batch_size, seq_length, encoder_args.hidden_dim])
                attention_output = tf.layers.conv1d(context_1, encoder_args.hidden_dim, 1)
                # Residual connection around the attention sub-block.
                context_2 = attention_output + layer_input

            with tf.variable_scope("ffn"):
                # Pre-norm FFN with ReLU, then residual.
                normed_context_2 = tf.cast(layer_norm(tf.cast(context_2, tf.float32)), data_type)
                intermediate_output = tf.layers.conv1d(normed_context_2, intermediate_size, 1, activation=tf.nn.relu)
                layer_output_1 = tf.layers.conv1d(intermediate_output, encoder_args.hidden_dim, 1)
                layer_output_2 = layer_output_1 + context_2
                layer_input = layer_output_2

    # Final layer norm, always in float32.
    layer_input = tf.cast(layer_input, tf.float32)
    output = layer_norm(layer_input, name="LayerNorm")
    return output
|
||
|
|
||
|
|
||
|
def get_shape_list(tensor, expected_rank=None, name=None):
    """Return the shape of `tensor` as a list, preferring static dimensions.

    Statically known dimensions come back as Python ints; unknown dimensions
    are replaced by scalar tf.Tensor entries from the dynamic shape.

    Args:
        tensor: The tf.Tensor whose shape is queried.
        expected_rank: Optional int or list of ints. If given, the tensor's
            rank is validated with `assert_rank` first.
        name: Optional name used in error messages; defaults to `tensor.name`.

    Returns:
        A list of ints and/or scalar tf.Tensor values, one per dimension.
    """
    name = tensor.name if name is None else name

    if expected_rank is not None:
        assert_rank(tensor, expected_rank, name)

    static_shape = tensor.shape.as_list()
    dynamic_indexes = [i for i, dim in enumerate(static_shape) if dim is None]

    # Fully static shape: nothing to resolve at runtime.
    if not dynamic_indexes:
        return static_shape

    dynamic_shape = tf.shape(tensor)
    for i in dynamic_indexes:
        static_shape[i] = dynamic_shape[i]
    return static_shape
|
||
|
|
||
|
|
||
|
def reshape_to_matrix(input_tensor):
    """Collapse a rank >= 2 tensor to rank 2, keeping the last dimension.

    A rank-2 tensor is returned unchanged; a higher-rank tensor is flattened
    to ``[-1, width]`` where ``width`` is the size of its last axis.

    Args:
        input_tensor: A tf.Tensor of rank >= 2.

    Returns:
        A rank-2 tf.Tensor.

    Raises:
        ValueError: If `input_tensor` has rank < 2.
    """
    rank = input_tensor.shape.ndims
    if rank < 2:
        raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                         (input_tensor.shape))
    if rank == 2:
        # Already a matrix; no reshape needed.
        return input_tensor

    last_dim = input_tensor.shape[-1]
    return tf.reshape(input_tensor, [-1, last_dim])
|
||
|
|
||
|
|
||
|
def reshape_from_matrix(output_tensor, orig_shape_list):
    """Undo `reshape_to_matrix`, restoring the original leading dimensions.

    Args:
        output_tensor: A rank-2 tf.Tensor.
        orig_shape_list: The shape list of the tensor before flattening.

    Returns:
        `output_tensor` reshaped to ``orig_shape_list[:-1] + [width]``, or the
        tensor unchanged when the original shape was already rank 2.
    """
    if len(orig_shape_list) == 2:
        # Was never flattened beyond rank 2; return as-is.
        return output_tensor

    leading_dims = orig_shape_list[0:-1]
    width = get_shape_list(output_tensor)[-1]
    return tf.reshape(output_tensor, leading_dims + [width])
|
||
|
|
||
|
|
||
|
def assert_rank(tensor, expected_rank, name=None):
    """Raise ValueError unless the static rank of `tensor` is expected.

    Args:
        tensor: A tf.Tensor whose static rank (`tensor.shape.ndims`) is
            checked.
        expected_rank: An int, or an iterable of ints, of acceptable ranks.
        name: Optional tensor name for the error message; defaults to
            `tensor.name`.

    Raises:
        ValueError: If the tensor's static rank is not among the expected
            ranks.
    """
    if name is None:
        name = tensor.name

    # The original routed this through `six.integer_types` — a Python 2
    # compatibility shim that is unnecessary on Python 3. A direct isinstance
    # check is equivalent; a set gives the same membership semantics as the
    # original dict-of-True.
    if isinstance(expected_rank, int):
        expected_ranks = {expected_rank}
    else:
        expected_ranks = set(expected_rank)

    actual_rank = tensor.shape.ndims
    if actual_rank not in expected_ranks:
        scope_name = tf.get_variable_scope().name
        raise ValueError(
            "For the tensor `%s` in scope `%s`, the actual rank "
            "`%d` (shape = %s) is not equal to the expected rank `%s`" %
            (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
|
||
|
|
||
|
|
||
|
def op_encoder(inputs,
               encoder_args,
               attention_mask,
               encoder_vars_dict,
               sequence_length):
    '''
    Run the bert transformer layer by FasterTransformer.

    Args:
        inputs: A tf.Tensor with shape [batch_size, seq_len, hidden_dimension].
            The inputs tensor of encoder. The rank must be 3.
        encoder_args: The arguments for encoder. The details are in the class "TransformerArgument" of common.py
        attention_mask: A tf.Tensor. The attention mask for self attention.
        encoder_vars_dict: A dict of tf.Tensor or numpy array.
            The variables for encoder. They can be either some tensor or some numpy array.
            The key is the name of the tensor, like 'layer_0/attention/self/query/kernel:0'.
            The value is the corresponding tensor or numpy array.
        sequence_length: A tf.Tensor or numpy array with shape [batch_size].
            The sequence length of the sentences
    Outputs:
        outputs: A tensor with shape [batch_size, seq_len, hidden_dimension].
            The results of encoder.
    '''
    remove_padding = encoder_args.remove_padding
    # Load the compiled FasterTransformer custom-op library; the .so is
    # expected under ./lib relative to the working directory.
    transformer_op_module = tf.load_op_library(os.path.join('./lib/libtf_fastertransformer.so'))
    if remove_padding == True:
        # Compact the batch by dropping padded tokens; `sequence_id_offset`
        # records each token's original position so padding can be rebuilt.
        inputs, sequence_id_offset = transformer_op_module.build_mask_remove_padding(inputs, sequence_length)
    else:
        sequence_id_offset = []
    for layer_idx in range(encoder_args.num_layer):
        # int8 mode reads the per-layer quantization scale table created by
        # tf_encoder under the same variable name.
        if encoder_args.int8_mode != 0:
            amaxList = encoder_vars_dict['layer_%d/amaxList:0' % layer_idx]
        else:
            amaxList = []
        # One fused custom op executes the entire transformer layer using the
        # TF-trained weights; dict keys must match tf_encoder's scope names.
        outputs = transformer_op_module.bert_transformer(
            inputs,
            inputs,
            encoder_vars_dict['layer_%d/attention/self/query/kernel:0' % layer_idx],
            encoder_vars_dict['layer_%d/attention/self/query/bias:0' % layer_idx],
            encoder_vars_dict['layer_%d/attention/self/key/kernel:0' % layer_idx],
            encoder_vars_dict['layer_%d/attention/self/key/bias:0' % layer_idx],
            encoder_vars_dict['layer_%d/attention/self/value/kernel:0' % layer_idx],
            encoder_vars_dict['layer_%d/attention/self/value/bias:0' % layer_idx],
            attention_mask,
            encoder_vars_dict['layer_%d/attention/output/dense/kernel:0' % layer_idx],
            encoder_vars_dict['layer_%d/attention/output/dense/bias:0' % layer_idx],
            encoder_vars_dict['layer_%d/attention/output/LayerNorm/beta:0' % layer_idx],
            encoder_vars_dict['layer_%d/attention/output/LayerNorm/gamma:0' % layer_idx],
            encoder_vars_dict['layer_%d/intermediate/dense/kernel:0' % layer_idx],
            encoder_vars_dict['layer_%d/intermediate/dense/bias:0' % layer_idx],
            encoder_vars_dict['layer_%d/output/dense/kernel:0' % layer_idx],
            encoder_vars_dict['layer_%d/output/dense/bias:0' % layer_idx],
            encoder_vars_dict['layer_%d/output/LayerNorm/beta:0' % layer_idx],
            encoder_vars_dict['layer_%d/output/LayerNorm/gamma:0' % layer_idx],
            sequence_id_offset,
            amaxList,
            head_num=encoder_args.head_num, size_per_head=encoder_args.size_per_head,
            remove_padding=remove_padding,
            int8_mode=encoder_args.int8_mode, layer_idx=layer_idx, layer_num=encoder_args.num_layer)
        # Chain: this layer's output is the next layer's input.
        inputs = outputs

    if remove_padding == True:
        # Scatter the compacted tokens back into padded [batch, seq] layout.
        outputs = transformer_op_module.rebuild_padding(outputs, sequence_id_offset, attention_mask)

    return outputs
|