[ResNet50/PyT] Triton perf fix
* ResNet50/PyT Triton ONNXRuntime fix with env flag: scripts were modified to add the missing ORT_TENSORRT_FP16_ENABLE flag for Triton Inference Server with the ONNXRuntime backend and TensorRT execution provider. * ResNet50/PyT TensorRT FP16 support fixed: the ONNX-to-TensorRT converter was fixed to force FP16 precision for TensorRT networks.
This commit is contained in:
parent
e22cfdd617
commit
5c33a8289b
|
@ -96,6 +96,12 @@ def onnx2trt(
|
||||||
LOGGER.error(f"OnnxParser error {i}/{parser.num_errors}: {parser.get_error(i)}")
|
LOGGER.error(f"OnnxParser error {i}/{parser.num_errors}: {parser.get_error(i)}")
|
||||||
raise RuntimeError("Error during parsing ONNX model (see logs for details)")
|
raise RuntimeError("Error during parsing ONNX model (see logs for details)")
|
||||||
|
|
||||||
|
# OnnxParser produces an FP32 TensorRT engine here even for an FP16 network
|
||||||
|
# so we force FP16 here for the first input/output
|
||||||
|
if fp16_mode:
|
||||||
|
network.get_input(0).dtype = trt.DataType.HALF
|
||||||
|
network.get_output(0).dtype = trt.DataType.HALF
|
||||||
|
|
||||||
# optimization
|
# optimization
|
||||||
config = builder.create_builder_config()
|
config = builder.create_builder_config()
|
||||||
config.flags |= bool(fp16_mode) << int(trt.BuilderFlag.FP16)
|
config.flags |= bool(fp16_mode) << int(trt.BuilderFlag.FP16)
|
||||||
|
|
|
@ -21,6 +21,7 @@ docker run --rm -d \
|
||||||
-p 8002:8002 \
|
-p 8002:8002 \
|
||||||
--runtime=nvidia \
|
--runtime=nvidia \
|
||||||
-e NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES} \
|
-e NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES} \
|
||||||
|
-e ORT_TENSORRT_FP16_ENABLE=1 \
|
||||||
-v ${MODEL_REPOSITORY_PATH}:${MODEL_REPOSITORY_PATH} \
|
-v ${MODEL_REPOSITORY_PATH}:${MODEL_REPOSITORY_PATH} \
|
||||||
--shm-size=1g \
|
--shm-size=1g \
|
||||||
--ulimit memlock=-1 \
|
--ulimit memlock=-1 \
|
||||||
|
|
|
@ -13,9 +13,9 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
export PRECISION="fp16"
|
export PRECISION="fp16"
|
||||||
export FORMAT="trt"
|
export FORMAT="onnx"
|
||||||
export BATCH_SIZE="1,2,4,8,16,32,64,128"
|
export BATCH_SIZE="1,2,4,8,16,32,64,128"
|
||||||
export BACKEND_ACCELERATOR="cuda"
|
export BACKEND_ACCELERATOR="trt"
|
||||||
export MAX_BATCH_SIZE="128"
|
export MAX_BATCH_SIZE="128"
|
||||||
export NUMBER_OF_MODEL_INSTANCES="1"
|
export NUMBER_OF_MODEL_INSTANCES="1"
|
||||||
export TRITON_MAX_QUEUE_DELAY="1"
|
export TRITON_MAX_QUEUE_DELAY="1"
|
||||||
|
|
Loading…
Reference in a new issue