[ResNet50/PyT] Triton perf fix

* ResNet50/PyT Triton ONNX Runtime fix with env flag

Deployment scripts were modified to set the previously missing
ORT_TENSORRT_FP16_ENABLE environment variable for Triton Inference Server
running ONNX Runtime with the TensorRT execution provider.
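
To make the effect of that flag concrete, here is a minimal sketch (not part
of this commit) of the per-session equivalent: ONNX Runtime's Python API
accepts the same switch as the trt_fp16_enable TensorRT execution provider
option. The model path is a placeholder.

    import onnxruntime as ort

    # Per-session equivalent of ORT_TENSORRT_FP16_ENABLE=1: enable FP16
    # kernels in the TensorRT execution provider via provider options.
    providers = [
        ("TensorrtExecutionProvider", {"trt_fp16_enable": True}),
        "CUDAExecutionProvider",  # fallback for ops TensorRT cannot handle
    ]
    session = ort.InferenceSession("model.onnx", providers=providers)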

* ResNet50/PyT TensorRT FP16 support fixed

The ONNX-to-TensorRT converter was fixed to force FP16 precision on the
network's first input and output tensors; the ONNX parser otherwise produces
an FP32 engine for an FP16 network.
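
A self-contained sketch of the fixed conversion flow, assuming the
TensorRT 7.x-era Python API used at the time (explicit-batch network,
builder.build_engine); build_fp16_engine and its scaffolding here are
illustrative, the authoritative change is in the diff below.

    import tensorrt as trt

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    def build_fp16_engine(onnx_path):
        builder = trt.Builder(TRT_LOGGER)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        )
        parser = trt.OnnxParser(network, TRT_LOGGER)
        with open(onnx_path, "rb") as model:
            if not parser.parse(model.read()):
                raise RuntimeError("Error during parsing ONNX model")
        # The parser leaves the I/O tensors in FP32 even for an FP16
        # network, so force the first input/output to HALF (the fix).
        network.get_input(0).dtype = trt.DataType.HALF
        network.get_output(0).dtype = trt.DataType.HALF
        config = builder.create_builder_config()
        config.set_flag(trt.BuilderFlag.FP16)
        return builder.build_engine(network, config)
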
Piotr Marcinkiewicz, 2021-06-16 16:04:22 +02:00, committed by GitHub
parent e22cfdd617
commit 5c33a8289b
3 changed files with 9 additions and 2 deletions

@@ -96,6 +96,12 @@ def onnx2trt(
             LOGGER.error(f"OnnxParser error {i}/{parser.num_errors}: {parser.get_error(i)}")
         raise RuntimeError("Error during parsing ONNX model (see logs for details)")
 
+    # OnnxParser produces here FP32 TensorRT engine for FP16 network
+    # so we force FP16 here for first input/output
+    if fp16_mode:
+        network.get_input(0).dtype = trt.DataType.HALF
+        network.get_output(0).dtype = trt.DataType.HALF
+
     # optimization
     config = builder.create_builder_config()
     config.flags |= bool(fp16_mode) << int(trt.BuilderFlag.FP16)
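
A note on the pre-existing last line: config.flags |= bool(fp16_mode) <<
int(trt.BuilderFlag.FP16) sets the FP16 bit of the builder config (a
branch-free equivalent of config.set_flag(trt.BuilderFlag.FP16) when
fp16_mode is truthy); per the new comment, that flag alone still left the
parsed network's input/output tensors in FP32, hence the added block.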

@@ -21,6 +21,7 @@ docker run --rm -d \
     -p 8002:8002 \
     --runtime=nvidia \
     -e NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES} \
+    -e ORT_TENSORRT_FP16_ENABLE=1 \
     -v ${MODEL_REPOSITORY_PATH}:${MODEL_REPOSITORY_PATH} \
     --shm-size=1g \
     --ulimit memlock=-1 \

@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 export PRECISION="fp16"
-export FORMAT="trt"
+export FORMAT="onnx"
 export BATCH_SIZE="1,2,4,8,16,32,64,128"
-export BACKEND_ACCELERATOR="cuda"
+export BACKEND_ACCELERATOR="trt"
 export MAX_BATCH_SIZE="128"
 export NUMBER_OF_MODEL_INSTANCES="1"
 export TRITON_MAX_QUEUE_DELAY="1"
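
With these values the runner exports the model to ONNX (FORMAT="onnx") and
lets Triton's ONNX Runtime backend use TensorRT as the backend accelerator
(BACKEND_ACCELERATOR="trt"), the configuration that the
ORT_TENSORRT_FP16_ENABLE flag above applies to; the previous "trt"/"cuda"
pair presumably targeted a standalone TensorRT engine instead.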