DeepLearningExamples/FasterTransformer/v3.0/sample/tensorflow/utils/common.py
byshiue b2e89e6e80
[FT] FasterTransformer 3.0 Release (#696)
[FT] feat: Add FasterTransformer v3.0

1. Add supporting of INT8 quantization of cpp and TensorFlow op.
2. Provide the tools to quantize the model.
3. Fix the bugs that cmake 3.15 and 3.16 cannot build this project. 
4. Deprecate the FasterTransformer v1
2020-09-23 10:03:37 +08:00

234 lines
9.1 KiB
Python

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datetime import datetime
import tensorflow as tf
import numpy as np
import ctypes
from utils.beam_search import BeamSearch
from utils.beam_search import DiverseSiblingSearch
class TransformerArgument:
    def __init__(self,
                 beam_width,
                 head_num,
                 size_per_head,
                 num_layer,
                 dtype=tf.float32,
                 kernel_init_range=0.02,
                 bias_init_range=0.02,
                 fuse_qkv=True,
                 remove_padding=False,
                 int8_mode=0):
        '''
        The arguments of Transformer layer (for both encoder and decoder).

        Args:
            beam_width: The beam_width size for beam search. This argument is always one for encoder.
            head_num: The head number of self attention in transformer layer.
            size_per_head: The size of hidden dimension for each head of self attention in transformer layer.
            num_layer: The number of transformer layer. For example, BERT-base uses 12 layers.
            dtype: The data type of weights initializer and inputs.
            kernel_init_range: The initializer range of kernel for all convolution layer and fully-connected layer.
            bias_init_range: The initializer range of bias for all convolution layer and fully-connected layer.
            fuse_qkv: bool. Whether to fuse the q, k, v gemm or not.
            remove_padding: bool. Remove the padding of sentences of encoder.
            int8_mode: Mode of int8 quantization. 0 means not using int8 quantization, 1 means using
                int8 quantization without quantizing residuals, 2 means using int8 quantization with
                quantizing residuals.
        '''
        self.beam_width = beam_width
        self.head_num = head_num
        self.size_per_head = size_per_head
        self.num_layer = num_layer
        self.dtype = dtype
        # Total hidden width of the attention layer.
        self.hidden_dim = self.head_num * self.size_per_head
        self.kernel_init_range = kernel_init_range
        self.bias_init_range = bias_init_range
        self.int8_mode = int8_mode
        # Absolute tolerance used when cross-checking TF results against the
        # fused op; fp16 accumulates more rounding error, so its bound is looser.
        if self.dtype == tf.float32:
            self.check_threshold = 2e-5
        elif self.dtype == tf.float16:
            self.check_threshold = 2e-2
        else:
            # BUGFIX: previously check_threshold was never set for any other
            # dtype, producing an AttributeError at first use. Fall back to the
            # looser (fp16) bound for unrecognized dtypes.
            self.check_threshold = 2e-2
        self.fuse_qkv = fuse_qkv
        self.remove_padding = remove_padding
class DecodingArgument(object):
    def __init__(self,
                 vocab_size,
                 start_id,
                 end_id,
                 max_seq_len,
                 decoder_args):
        '''
        The arguments of Decoding.

        Decoding is the function which contains the whole translation process:
        the embedding lookup, position encoding, decoder, and beam search or
        sampling to choose the token.

        Args:
            vocab_size: The size of vocabulary of Decoding.
            start_id: The id of start token in vocabulary.
            end_id: The id of end token in vocabulary.
            max_seq_len: The maximum length of sentence in translation.
            decoder_args: The arguments of decoder layer.
        '''
        # Plain value holder: every constructor argument is kept verbatim.
        self.decoder_args = decoder_args
        self.max_seq_len = max_seq_len
        self.end_id = end_id
        self.start_id = start_id
        self.vocab_size = vocab_size
class DecodingBeamsearchArgument(DecodingArgument):
    def __init__(self,
                 vocab_size,
                 start_id,
                 end_id,
                 max_seq_len,
                 decoder_args,
                 beam_search_diversity_rate=-0.0):
        '''
        The arguments of Decoding with beam search.

        Most arguments are similar to DecodingArgument except the
        beam_search_diversity_rate.

        Args:
            vocab_size: The size of vocabulary of Decoding.
            start_id: The id of start token in vocabulary.
            end_id: The id of end token in vocabulary.
            max_seq_len: The maximum length of sentence in translation.
            decoder_args: The arguments of decoder layer.
            beam_search_diversity_rate: The diversity rate of beam search. When it is 0,
                                        it is equivalent to naive beam search.
        '''
        super(DecodingBeamsearchArgument, self).__init__(
            vocab_size, start_id, end_id, max_seq_len, decoder_args)
        self.beam_search_diversity_rate = beam_search_diversity_rate
        # A (signed) zero rate selects plain beam search; any non-zero rate
        # enables diverse sibling search with that rate as the penalty.
        if self.beam_search_diversity_rate == 0.0:
            self.search_method = BeamSearch()
        else:
            self.search_method = DiverseSiblingSearch(beam_search_diversity_rate)
class DecodingSamplingArgument(DecodingArgument):
    def __init__(self,
                 vocab_size,
                 start_id,
                 end_id,
                 max_seq_len,
                 decoder_args,
                 top_k=0,
                 top_p=0.0):
        '''
        The arguments of Decoding with sampling.

        Most arguments are similar to DecodingArgument except the top_k and top_p.

        Args:
            vocab_size: The size of vocabulary of Decoding.
            start_id: The id of start token in vocabulary.
            end_id: The id of end token in vocabulary.
            max_seq_len: The maximum length of sentence in translation.
            decoder_args: The arguments of decoder layer.
            top_k: A int value. The value of k for top k sampling.
            top_p: A float value. The value of p for top p sampling.

        Note that having top_k and top_p both zero, or both non-zero, at the
        same time is invalid.
        If top_k is non-zero, the Decoding function will use the top k sampling.
        If top_p is non-zero, the Decoding function will use the top p sampling.
        '''
        super(DecodingSamplingArgument, self).__init__(vocab_size,
                                                       start_id,
                                                       end_id,
                                                       max_seq_len,
                                                       decoder_args)
        self.top_k = top_k
        self.top_p = top_p
        # Exactly one of top_k / top_p must be active; abort otherwise.
        if self.top_k == 0 and self.top_p == 0.0:
            print("[ERROR] top_k and top_p cannot both be 0.")
            exit(-1)
        elif self.top_k != 0 and self.top_p != 0.0:
            print("[ERROR] top_k and top_p cannot both be non-zero.")
            exit(-1)
def create_initializer(initializer_range=0.02, data_type=tf.float32):
    """Build a truncated-normal weight initializer with the given stddev and dtype."""
    return tf.truncated_normal_initializer(dtype=data_type, stddev=initializer_range)
def _get_shape_invariants(tensor):
    """Returns the shape of the tensor but sets middle dims to None."""
    # TensorArrays carry no static shape; a fully-unknown TensorShape is used.
    if isinstance(tensor, tf.TensorArray):
        return tf.TensorShape(None)
    dims = tensor.shape.as_list()
    last = len(dims) - 1
    # Keep only the first and last dims static; relax everything in between.
    relaxed = [d if (i == 0 or i >= last) else None for i, d in enumerate(dims)]
    return tf.TensorShape(relaxed)
def time_test(sess, tensor, iterations=100, warmup=True):
    """Measure the average wall-clock latency of sess.run(tensor).

    Args:
        sess: A session-like object exposing run() (e.g. a tf.Session).
        tensor: The fetch to run.
        iterations: Number of timed runs; also the number of warmup runs.
        warmup: If truthy, perform `iterations` untimed runs first so one-time
            costs (graph setup, autotuning) don't pollute the measurement.

    Returns:
        Average time per run in milliseconds.
    """
    if warmup:  # idiomatic truthiness check (was `warmup == True`)
        for _ in range(iterations):
            sess.run(tensor)
    t1 = datetime.now()
    for _ in range(iterations):
        sess.run(tensor)
    t2 = datetime.now()
    return (t2 - t1).total_seconds() * 1000 / iterations
def cross_check(name, tf_val, op_val, atol_threshold):
    """Print whether two arrays agree within atol, plus their diff range."""
    matched = np.allclose(tf_val, op_val, atol=atol_threshold)
    diff = np.fabs(tf_val - op_val)
    print("[INFO] {} Cross check {}".format(name, matched))
    print("[INFO] Max diff {}".format(diff.max()))
    print("[INFO] min diff {}".format(diff.min()))
def int_result_cross_check(name, tf_result, op_result, shape):
    """Exact (elementwise) comparison of two integer results; on mismatch,
    reshape to `shape` and report which leading-dim steps disagree."""
    print(" ")
    all_equal = (tf_result.flatten() == op_result.flatten()).all()
    print(" {} cross-check: {}".format(name, all_equal))
    if all_equal:
        return
    tf_steps = np.reshape(tf_result, shape)
    op_steps = np.reshape(op_result, shape)
    for step in range(tf_steps.shape[0]):
        step_equal = (tf_steps[step] == op_steps[step]).all()
        print(" Cross-Check on step-{} {}".format(step, step_equal))
        if not step_equal:
            print("TF result: {}".format(tf_steps[step]))
            print("OP result: {}".format(op_steps[step]))
class cudaProfiler:
    """Thin ctypes wrapper around the CUDA runtime profiler start/stop calls."""

    def __init__(self):
        # Requires libcudart.so to be discoverable on the dynamic loader path.
        self.profiler = ctypes.CDLL("libcudart.so")

    def start(self):
        """Begin profiler data collection; raises if CUDA reports an error."""
        status = self.profiler.cudaProfilerStart()
        if status != 0:
            raise Exception("cudaProfilerStart() return %d " % status)

    def stop(self):
        """End profiler data collection; raises if CUDA reports an error."""
        status = self.profiler.cudaProfilerStop()
        if status != 0:
            raise Exception("cudaProfilerStop() return %d " % status)