[FasterTransformer] Fix the bug of encoder trt plugin.

bhsueh 2020-04-29 08:42:43 +00:00
parent 90ce2a9923
commit 5ee9b2ec03
2 changed files with 56 additions and 38 deletions

README.md

@@ -125,7 +125,9 @@ The following section lists the requirements in order to use FasterTransformer.
 - CUDA 10.1
 - Python 2.7
 - Tensorflow 1.14
-These components are readily available within the NGC TensorFlow Docker image below.
+- TensorRT 5.1.5.0
+These components are readily available within the NGC TensorFlow Docker image below, except TensorRT.
 Ensure you have the following components:
 - [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
@@ -169,17 +171,20 @@ cd build
 cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release .. # C++ only
 cmake -DSM=xx -DCMAKE_BUILD_TYPE=Debug .. # C++ debug only
 cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TF=ON -DTF_PATH=/usr/local/lib/python2.7/dist-packages/tensorflow .. # Tensorflow mode
-cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TRT=ON -DTRT_PATH=/usr/include/x86_64-linux-gnu .. # TensorRT mode
+cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TRT=ON -DTRT_PATH=<TensorRT_dir> .. # TensorRT mode, if you put TensorRT in <TensorRT_dir>
 make
 ```
 Note: `xx` is the compute capability of your GPU. For example, 60 (P40), 61 (P4), 70 (V100), or 75 (T4).
+Note: If you use the image we recommend, the TensorRT-related libraries are in `/usr/include/x86_64-linux-gnu`.
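Putting the pieces together, here is a worked example assembled from the options above (an illustration, not an additional documented mode): building with TensorRT support for a V100 inside the recommended image.

```bash
# V100 has compute capability 70, and the recommended image keeps the
# TensorRT headers and libraries under /usr/include/x86_64-linux-gnu.
cd build
cmake -DSM=70 -DCMAKE_BUILD_TYPE=Release -DBUILD_TRT=ON \
      -DTRT_PATH=/usr/include/x86_64-linux-gnu ..
make
```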
 ### Execute the encoder demos
 1. Generate the `gemm_config.in` file.
 ```bash
 ./bin/encoder_gemm <batch_size> <sequence_length> <head_number> <size_per_head> <is_use_fp16>
 ./bin/encoder_gemm 1 32 12 64 0
 ```
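The example arguments correspond to a BERT-base-like layer: 12 heads of size 64 give a hidden size of 768. A minimal sketch of the presumed FP16 variant, assuming the trailing `<is_use_fp16>` flag simply switches the generated `gemm_config.in` to the FP16 kernels:

```bash
# Presumed FP16 variant: same shape, trailing flag set to 1.
./bin/encoder_gemm 1 32 12 64 1
```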
@@ -221,6 +226,15 @@ python encoder_sample.py \
     --data_type fp16 \
     --test_time 1
 ```
+d. Run the encoder in TensorRT with the TensorRT sample.
+```bash
+./bin/encoder_gemm 1 32 12 64 0
+./bin/transformer_trt <batch_size> <num_layers> <seq_len> <head_num> <size_per_head> <fp16/fp32>
+./bin/transformer_trt 1 12 32 12 64 fp32
+```
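A presumed FP16 counterpart of the run above, under the assumption that the GEMM config must first be regenerated with `<is_use_fp16>` set to 1 and that `fp16` is accepted wherever `fp32` is shown:

```bash
# Regenerate the GEMM config for FP16, then run the TensorRT sample in FP16.
./bin/encoder_gemm 1 32 12 64 1
./bin/transformer_trt 1 12 32 12 64 fp16
```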
 3. Run the FasterTransformer in BERT.
 The following script demonstrates how to integrate the FasterTransformer into a BERT model. This requires the [BERT](https://github.com/google-research/bert) repository.
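Fetching that repository is a single clone; the exact integration steps are outside this excerpt.

```bash
git clone https://github.com/google-research/bert.git
```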
@@ -427,6 +441,7 @@ The `sample/` folder contains useful sample codes for FasterTransformer:
 * `sample/tensorflow/encoder_decoder_sample.py` - TensorFlow `encoder_decoder` sample codes
 * `sample/tensorflow/encoder_decoding_sample.py` - TensorFlow `encoder_decoding` sample codes
 * `sample/tensorflow/translate_sample.py` - TensorFlow translation sample codes
+* `sample/tensorRT/transformer_trt.cc` - TensorRT Transformer layer sample codes
 ### Command-line options
@@ -853,6 +868,9 @@ bash scripts/profile_decoding_op_performance.sh
 ### Changelog
+April 2020
+- Fix the bug of the encoder TensorRT plugin.
 March 2020
 - Add feature in FasterTransformer 2.0
 - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf.

sample/tensorRT/transformer_trt.cc

@@ -49,7 +49,7 @@ class TransformerTrtTraits<half>
 {
 public:
   static const OperationType OpType = OperationType::FP16;
-  static const nvinfer1::DataType DataType = nvinfer1::DataType::kFP16;
+  static const nvinfer1::DataType DataType = nvinfer1::DataType::kHALF;
 };
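The bug this hunk fixes is that `kFP16` is not a member of `nvinfer1::DataType`; TensorRT's half-precision enumerator is `kHALF` (alongside `kFLOAT`, `kINT8`, and `kINT32`). For contrast, a sketch of the assumed FP32 counterpart, inferred from the FP16 specialization above rather than copied from the source:

```cpp
// Assumed FP32 counterpart of the traits specialization above.
// nvinfer1::DataType provides kFLOAT for 32-bit and kHALF for 16-bit floats.
template <>
class TransformerTrtTraits<float>
{
public:
  static const OperationType OpType = OperationType::FP32;
  static const nvinfer1::DataType DataType = nvinfer1::DataType::kFLOAT;
};
```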
 class Logger : public nvinfer1::ILogger
@@ -127,25 +127,25 @@ class TransformerPlugin: public IPluginV2
 BertEncoderTransformer<EncoderTraits_>(*allocator_, max_batch_size, seq_len, seq_len, head_num, hidden_dim / head_num);
 EncoderInitParam<T> encoder_param; //init param here
-encoder_param.attr_kernel_Q = d_attr_kernel_Q_;
-encoder_param.attr_kernel_K = d_attr_kernel_K_;
-encoder_param.attr_kernel_V = d_attr_kernel_V_;
-encoder_param.attr_bias_Q = d_attr_bias_Q_;
-encoder_param.attr_bias_K = d_attr_bias_K_;
-encoder_param.attr_bias_V = d_attr_bias_V_;
-encoder_param.attr_output_kernel = d_attr_output_kernel_;
-encoder_param.attr_output_bias = d_attr_output_bias_;
-encoder_param.attr_output_layernorm_beta = d_attr_output_layernorm_beta_;
-encoder_param.attr_output_layernorm_gamma = d_attr_output_layernorm_gamma_;
-encoder_param.inter_kernel = d_inter_kernel_;
-encoder_param.inter_bias = d_inter_bias_;
-encoder_param.output_kernel = d_output_kernel_;
-encoder_param.output_bias = d_output_bias_;
-encoder_param.output_layernorm_beta = d_output_layernorm_beta_;
-encoder_param.output_layernorm_gamma = d_output_layernorm_gamma_;
-encoder_param.cublas_handle = cublas_handle_;
+encoder_param.self_attention.query_weight.kernel = d_attr_kernel_Q_;
+encoder_param.self_attention.key_weight.kernel = d_attr_kernel_K_;
+encoder_param.self_attention.value_weight.kernel = d_attr_kernel_V_;
+encoder_param.self_attention.query_weight.bias = d_attr_bias_Q_;
+encoder_param.self_attention.key_weight.bias = d_attr_bias_K_;
+encoder_param.self_attention.value_weight.bias = d_attr_bias_V_;
+encoder_param.self_attention.attention_output_weight.kernel = d_attr_output_kernel_;
+encoder_param.self_attention.attention_output_weight.bias = d_attr_output_bias_;
+encoder_param.self_layernorm.beta = d_attr_output_layernorm_beta_;
+encoder_param.self_layernorm.gamma = d_attr_output_layernorm_gamma_;
+encoder_param.ffn.intermediate_weight.kernel = d_inter_kernel_;
+encoder_param.ffn.intermediate_weight.bias = d_inter_bias_;
+encoder_param.ffn.output_weight.kernel = d_output_kernel_;
+encoder_param.ffn.output_weight.bias = d_output_bias_;
+encoder_param.ffn_layernorm.beta = d_output_layernorm_beta_;
+encoder_param.ffn_layernorm.gamma = d_output_layernorm_gamma_;
+encoder_param.cublas_handle = cublas_handle_;
 encoder_transformer_->initialize(encoder_param);
 }
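The rewiring above suggests `EncoderInitParam` switched from flat `attr_*` members to nested weight structs grouped by sub-layer. A sketch of the layout implied by the new member accesses (the struct names and fields here are inferred from the diff, not copied from FasterTransformer's headers):

```cpp
#include <cublas_v2.h>

// Layout implied by the assignments above; inferred, not verbatim.
template <typename T> struct DenseWeight     { const T* kernel; const T* bias; };
template <typename T> struct LayerNormWeight { const T* gamma;  const T* beta; };

template <typename T> struct AttentionWeight
{
  DenseWeight<T> query_weight, key_weight, value_weight;
  DenseWeight<T> attention_output_weight;  // projection after attention
};

template <typename T> struct FFNWeight
{
  DenseWeight<T> intermediate_weight, output_weight;
};

template <typename T> struct EncoderInitParam
{
  AttentionWeight<T> self_attention;
  LayerNormWeight<T> self_layernorm;  // layernorm after self-attention
  FFNWeight<T>       ffn;
  LayerNormWeight<T> ffn_layernorm;   // layernorm after the FFN
  cublasHandle_t     cublas_handle;
};
```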
catch(std::runtime_error& error)
@@ -201,23 +201,23 @@ class TransformerPlugin: public IPluginV2
 BertEncoderTransformer<EncoderTraits_>(*allocator_, max_batch_size, seq_len, seq_len, head_num, hidden_dim / head_num);
 EncoderInitParam<T> encoder_param; //init param here
-encoder_param.attr_kernel_Q = d_attr_kernel_Q_;
-encoder_param.attr_kernel_K = d_attr_kernel_K_;
-encoder_param.attr_kernel_V = d_attr_kernel_V_;
-encoder_param.attr_bias_Q = d_attr_bias_Q_;
-encoder_param.attr_bias_K = d_attr_bias_K_;
-encoder_param.attr_bias_V = d_attr_bias_V_;
-encoder_param.attr_output_kernel = d_attr_output_kernel_;
-encoder_param.attr_output_bias = d_attr_output_bias_;
-encoder_param.attr_output_layernorm_beta = d_attr_output_layernorm_beta_;
-encoder_param.attr_output_layernorm_gamma = d_attr_output_layernorm_gamma_;
-encoder_param.inter_kernel = d_inter_kernel_;
-encoder_param.inter_bias = d_inter_bias_;
-encoder_param.output_kernel = d_output_kernel_;
-encoder_param.output_bias = d_output_bias_;
-encoder_param.output_layernorm_beta = d_output_layernorm_beta_;
-encoder_param.output_layernorm_gamma = d_output_layernorm_gamma_;
+encoder_param.self_attention.query_weight.kernel = d_attr_kernel_Q_;
+encoder_param.self_attention.key_weight.kernel = d_attr_kernel_K_;
+encoder_param.self_attention.value_weight.kernel = d_attr_kernel_V_;
+encoder_param.self_attention.query_weight.bias = d_attr_bias_Q_;
+encoder_param.self_attention.key_weight.bias = d_attr_bias_K_;
+encoder_param.self_attention.value_weight.bias = d_attr_bias_V_;
+encoder_param.self_attention.attention_output_weight.kernel = d_attr_output_kernel_;
+encoder_param.self_attention.attention_output_weight.bias = d_attr_output_bias_;
+encoder_param.self_layernorm.beta = d_attr_output_layernorm_beta_;
+encoder_param.self_layernorm.gamma = d_attr_output_layernorm_gamma_;
+encoder_param.ffn.intermediate_weight.kernel = d_inter_kernel_;
+encoder_param.ffn.intermediate_weight.bias = d_inter_bias_;
+encoder_param.ffn.output_weight.kernel = d_output_kernel_;
+encoder_param.ffn.output_weight.bias = d_output_bias_;
+encoder_param.ffn_layernorm.beta = d_output_layernorm_beta_;
+encoder_param.ffn_layernorm.gamma = d_output_layernorm_gamma_;
 encoder_param.cublas_handle = cublas_handle_;
 encoder_transformer_->initialize(encoder_param);
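For orientation, a minimal sketch (assumptions throughout, not the sample's verbatim code) of how an `IPluginV2` such as `TransformerPlugin` is typically attached to a TensorRT 5 network definition; `transformer_trt.cc` presumably follows a similar pattern:

```cpp
#include <NvInfer.h>

// Sketch only: attach a custom IPluginV2 layer to a TensorRT network.
// The input name and dimensions are illustrative assumptions.
nvinfer1::INetworkDefinition* buildNetwork(nvinfer1::IBuilder* builder,
                                           nvinfer1::IPluginV2& plugin,
                                           int seq_len, int hidden_dim)
{
  nvinfer1::INetworkDefinition* network = builder->createNetwork();
  // One input tensor holding the encoder hidden states.
  nvinfer1::ITensor* input = network->addInput(
      "from_tensor", nvinfer1::DataType::kFLOAT,
      nvinfer1::Dims3{seq_len, hidden_dim, 1});
  nvinfer1::ITensor* inputs[] = {input};
  // addPluginV2 inserts the custom layer; its output becomes the network output.
  nvinfer1::IPluginV2Layer* layer = network->addPluginV2(inputs, 1, plugin);
  network->markOutput(*layer->getOutput(0));
  return network;
}
```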