[FasterTransformer] Fix the bug of encoder trt plugin.

bhsueh 2020-04-29 08:42:43 +00:00
parent 90ce2a9923
commit 5ee9b2ec03
2 changed files with 56 additions and 38 deletions

README.md

@@ -125,7 +125,9 @@ The following section lists the requirements in order to use FasterTransformer.
 - CUDA 10.1
 - Python 2.7
 - Tensorflow 1.14
-These components are readily available within the NGC TensorFlow Docker image below.
+- TensorRT 5.1.5.0
+These components are readily available within the NGC TensorFlow Docker image below, except TensorRT.
 Ensure you have the following components:
 - [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
@@ -169,17 +171,20 @@ cd build
 cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release .. # C++ only
 cmake -DSM=xx -DCMAKE_BUILD_TYPE=Debug .. # C++ debug only
 cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TF=ON -DTF_PATH=/usr/local/lib/python2.7/dist-packages/tensorflow .. # Tensorflow mode
-cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TRT=ON -DTRT_PATH=/usr/include/x86_64-linux-gnu .. # TensorRT mode
+cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TRT=ON -DTRT_PATH=<TensorRT_dir> .. # TensorRT mode, if you put TensorRT in <TensorRT_dir>
 make
 ```
 Note: `xx` is the compute capability of your GPU. For example, 60 (P40), 61 (P4), 70 (V100), or 75 (T4).
+Note: If you use the image we recommend, the TensorRT-related libraries are in `/usr/include/x86_64-linux-gnu`.
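Putting the pieces together, here is a worked example assembled from the options above (an illustration, not an additional documented mode): building with TensorRT support for a V100 inside the recommended image.

```bash
# V100 has compute capability 70, and the recommended image keeps the
# TensorRT headers and libraries under /usr/include/x86_64-linux-gnu.
cd build
cmake -DSM=70 -DCMAKE_BUILD_TYPE=Release -DBUILD_TRT=ON \
      -DTRT_PATH=/usr/include/x86_64-linux-gnu ..
make
```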
 ### Execute the encoder demos
 1. Generate the `gemm_config.in` file.
 ```bash
 ./bin/encoder_gemm <batch_size> <sequence_length> <head_number> <size_per_head> <is_use_fp16>
 ./bin/encoder_gemm 1 32 12 64 0
 ```
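The example arguments correspond to a BERT-base-like layer: 12 heads of size 64 give a hidden size of 768. A minimal sketch of the presumed FP16 variant, assuming the trailing `<is_use_fp16>` flag simply switches the generated `gemm_config.in` to the FP16 kernels:

```bash
# Presumed FP16 variant: same shape, trailing flag set to 1.
./bin/encoder_gemm 1 32 12 64 1
```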
@@ -221,6 +226,15 @@ python encoder_sample.py \
     --data_type fp16 \
     --test_time 1
 ```
+d. Run the encoder in TensorRT with the TensorRT sample.
+```bash
+./bin/encoder_gemm 1 32 12 64 0
+./bin/transformer_trt <batch_size> <num_layers> <seq_len> <head_num> <size_per_head> <fp16/fp32>
+./bin/transformer_trt 1 12 32 12 64 fp32
+```
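A presumed FP16 counterpart of the run above, under the assumption that the GEMM config must first be regenerated with `<is_use_fp16>` set to 1 and that `fp16` is accepted wherever `fp32` is shown:

```bash
# Regenerate the GEMM config for FP16, then run the TensorRT sample in FP16.
./bin/encoder_gemm 1 32 12 64 1
./bin/transformer_trt 1 12 32 12 64 fp16
```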
 3. Run the FasterTransformer in BERT.
 The following script demonstrates how to integrate the FasterTransformer into a BERT model. This requires the [BERT](https://github.com/google-research/bert) repository.
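Fetching that repository is a single clone; the exact integration steps are outside this excerpt.

```bash
git clone https://github.com/google-research/bert.git
```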
@@ -427,6 +441,7 @@ The `sample/` folder contains useful sample codes for FasterTransformer:
 * `sample/tensorflow/encoder_decoder_sample.py` - TensorFlow `encoder_decoder` sample codes
 * `sample/tensorflow/encoder_decoding_sample.py` - TensorFlow `encoder_decoding` sample codes
 * `sample/tensorflow/translate_sample.py` - TensorFlow translation sample codes
+* `sample/tensorRT/transformer_trt.cc` - TensorRT Transformer layer sample codes
 ### Command-line options
@@ -853,6 +868,9 @@ bash scripts/profile_decoding_op_performance.sh
 ### Changelog
+April 2020
+- Fix the bug of the encoder TensorRT plugin.
 March 2020
 - Add feature in FasterTransformer 2.0
 - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf.

sample/tensorRT/transformer_trt.cc

@@ -49,7 +49,7 @@ class TransformerTrtTraits<half>
 {
 public:
   static const OperationType OpType = OperationType::FP16;
-  static const nvinfer1::DataType DataType = nvinfer1::DataType::kFP16;
+  static const nvinfer1::DataType DataType = nvinfer1::DataType::kHALF;
 };
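The bug this hunk fixes is that `kFP16` is not a member of `nvinfer1::DataType`; TensorRT's half-precision enumerator is `kHALF` (alongside `kFLOAT`, `kINT8`, and `kINT32`). For contrast, a sketch of the assumed FP32 counterpart, inferred from the FP16 specialization above rather than copied from the source:

```cpp
// Assumed FP32 counterpart of the traits specialization above.
// nvinfer1::DataType provides kFLOAT for 32-bit and kHALF for 16-bit floats.
template <>
class TransformerTrtTraits<float>
{
public:
  static const OperationType OpType = OperationType::FP32;
  static const nvinfer1::DataType DataType = nvinfer1::DataType::kFLOAT;
};
```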
 class Logger : public nvinfer1::ILogger
@@ -127,25 +127,25 @@ class TransformerPlugin: public IPluginV2
 BertEncoderTransformer<EncoderTraits_>(*allocator_, max_batch_size, seq_len, seq_len, head_num, hidden_dim / head_num);
 EncoderInitParam<T> encoder_param; //init param here
-encoder_param.attr_kernel_Q = d_attr_kernel_Q_;
-encoder_param.attr_kernel_K = d_attr_kernel_K_;
-encoder_param.attr_kernel_V = d_attr_kernel_V_;
-encoder_param.attr_bias_Q = d_attr_bias_Q_;
-encoder_param.attr_bias_K = d_attr_bias_K_;
-encoder_param.attr_bias_V = d_attr_bias_V_;
-encoder_param.attr_output_kernel = d_attr_output_kernel_;
-encoder_param.attr_output_bias = d_attr_output_bias_;
-encoder_param.attr_output_layernorm_beta = d_attr_output_layernorm_beta_;
-encoder_param.attr_output_layernorm_gamma = d_attr_output_layernorm_gamma_;
-encoder_param.inter_kernel = d_inter_kernel_;
-encoder_param.inter_bias = d_inter_bias_;
-encoder_param.output_kernel = d_output_kernel_;
-encoder_param.output_bias = d_output_bias_;
-encoder_param.output_layernorm_beta = d_output_layernorm_beta_;
-encoder_param.output_layernorm_gamma = d_output_layernorm_gamma_;
-encoder_param.cublas_handle = cublas_handle_;
+encoder_param.self_attention.query_weight.kernel = d_attr_kernel_Q_;
+encoder_param.self_attention.key_weight.kernel = d_attr_kernel_K_;
+encoder_param.self_attention.value_weight.kernel = d_attr_kernel_V_;
+encoder_param.self_attention.query_weight.bias = d_attr_bias_Q_;
+encoder_param.self_attention.key_weight.bias = d_attr_bias_K_;
+encoder_param.self_attention.value_weight.bias = d_attr_bias_V_;
+encoder_param.self_attention.attention_output_weight.kernel = d_attr_output_kernel_;
+encoder_param.self_attention.attention_output_weight.bias = d_attr_output_bias_;
+encoder_param.self_layernorm.beta = d_attr_output_layernorm_beta_;
+encoder_param.self_layernorm.gamma = d_attr_output_layernorm_gamma_;
+encoder_param.ffn.intermediate_weight.kernel = d_inter_kernel_;
+encoder_param.ffn.intermediate_weight.bias = d_inter_bias_;
+encoder_param.ffn.output_weight.kernel = d_output_kernel_;
+encoder_param.ffn.output_weight.bias = d_output_bias_;
+encoder_param.ffn_layernorm.beta = d_output_layernorm_beta_;
+encoder_param.ffn_layernorm.gamma = d_output_layernorm_gamma_;
+encoder_param.cublas_handle = cublas_handle_;
 encoder_transformer_->initialize(encoder_param);
 }
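The rewiring above suggests `EncoderInitParam` switched from flat `attr_*` members to nested weight structs grouped by sub-layer. A sketch of the layout implied by the new member accesses (the struct names and fields here are inferred from the diff, not copied from FasterTransformer's headers):

```cpp
#include <cublas_v2.h>

// Layout implied by the assignments above; inferred, not verbatim.
template <typename T> struct DenseWeight     { const T* kernel; const T* bias; };
template <typename T> struct LayerNormWeight { const T* gamma;  const T* beta; };

template <typename T> struct AttentionWeight
{
  DenseWeight<T> query_weight, key_weight, value_weight;
  DenseWeight<T> attention_output_weight;  // projection after attention
};

template <typename T> struct FFNWeight
{
  DenseWeight<T> intermediate_weight, output_weight;
};

template <typename T> struct EncoderInitParam
{
  AttentionWeight<T> self_attention;
  LayerNormWeight<T> self_layernorm;  // layernorm after self-attention
  FFNWeight<T>       ffn;
  LayerNormWeight<T> ffn_layernorm;   // layernorm after the FFN
  cublasHandle_t     cublas_handle;
};
```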
catch(std::runtime_error& error)
@@ -201,23 +201,23 @@ class TransformerPlugin: public IPluginV2
 BertEncoderTransformer<EncoderTraits_>(*allocator_, max_batch_size, seq_len, seq_len, head_num, hidden_dim / head_num);
 EncoderInitParam<T> encoder_param; //init param here
-encoder_param.attr_kernel_Q = d_attr_kernel_Q_;
-encoder_param.attr_kernel_K = d_attr_kernel_K_;
-encoder_param.attr_kernel_V = d_attr_kernel_V_;
-encoder_param.attr_bias_Q = d_attr_bias_Q_;
-encoder_param.attr_bias_K = d_attr_bias_K_;
-encoder_param.attr_bias_V = d_attr_bias_V_;
-encoder_param.attr_output_kernel = d_attr_output_kernel_;
-encoder_param.attr_output_bias = d_attr_output_bias_;
-encoder_param.attr_output_layernorm_beta = d_attr_output_layernorm_beta_;
-encoder_param.attr_output_layernorm_gamma = d_attr_output_layernorm_gamma_;
-encoder_param.inter_kernel = d_inter_kernel_;
-encoder_param.inter_bias = d_inter_bias_;
-encoder_param.output_kernel = d_output_kernel_;
-encoder_param.output_bias = d_output_bias_;
-encoder_param.output_layernorm_beta = d_output_layernorm_beta_;
-encoder_param.output_layernorm_gamma = d_output_layernorm_gamma_;
+encoder_param.self_attention.query_weight.kernel = d_attr_kernel_Q_;
+encoder_param.self_attention.key_weight.kernel = d_attr_kernel_K_;
+encoder_param.self_attention.value_weight.kernel = d_attr_kernel_V_;
+encoder_param.self_attention.query_weight.bias = d_attr_bias_Q_;
+encoder_param.self_attention.key_weight.bias = d_attr_bias_K_;
+encoder_param.self_attention.value_weight.bias = d_attr_bias_V_;
+encoder_param.self_attention.attention_output_weight.kernel = d_attr_output_kernel_;
+encoder_param.self_attention.attention_output_weight.bias = d_attr_output_bias_;
+encoder_param.self_layernorm.beta = d_attr_output_layernorm_beta_;
+encoder_param.self_layernorm.gamma = d_attr_output_layernorm_gamma_;
+encoder_param.ffn.intermediate_weight.kernel = d_inter_kernel_;
+encoder_param.ffn.intermediate_weight.bias = d_inter_bias_;
+encoder_param.ffn.output_weight.kernel = d_output_kernel_;
+encoder_param.ffn.output_weight.bias = d_output_bias_;
+encoder_param.ffn_layernorm.beta = d_output_layernorm_beta_;
+encoder_param.ffn_layernorm.gamma = d_output_layernorm_gamma_;
 encoder_param.cublas_handle = cublas_handle_;
 encoder_transformer_->initialize(encoder_param);
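For orientation, a minimal sketch (assumptions throughout, not the sample's verbatim code) of how an `IPluginV2` such as `TransformerPlugin` is typically attached to a TensorRT 5 network definition; `transformer_trt.cc` presumably follows a similar pattern:

```cpp
#include <NvInfer.h>

// Sketch only: attach a custom IPluginV2 layer to a TensorRT network.
// The input name and dimensions are illustrative assumptions.
nvinfer1::INetworkDefinition* buildNetwork(nvinfer1::IBuilder* builder,
                                           nvinfer1::IPluginV2& plugin,
                                           int seq_len, int hidden_dim)
{
  nvinfer1::INetworkDefinition* network = builder->createNetwork();
  // One input tensor holding the encoder hidden states.
  nvinfer1::ITensor* input = network->addInput(
      "from_tensor", nvinfer1::DataType::kFLOAT,
      nvinfer1::Dims3{seq_len, hidden_dim, 1});
  nvinfer1::ITensor* inputs[] = {input};
  // addPluginV2 inserts the custom layer; its output becomes the network output.
  nvinfer1::IPluginV2Layer* layer = network->addPluginV2(inputs, 1, plugin);
  network->markOutput(*layer->getOutput(0));
  return network;
}
```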