[ResNet50/PyT] Triton perf fix
* ResNet50/PyT Triton ONNX Runtime fix with env flag: scripts were modified to add the missing ORT_TENSORRT_FP16_ENABLE flag for Triton Inference Server with the ONNX Runtime backend and TensorRT execution provider.
* ResNet50/PyT TensorRT FP16 support fixed: the ONNX-to-TensorRT converter was fixed to force FP16 precision for TensorRT networks.
parent e22cfdd617
commit 5c33a8289b
@@ -96,6 +96,12 @@ def onnx2trt(
             LOGGER.error(f"OnnxParser error {i}/{parser.num_errors}: {parser.get_error(i)}")
         raise RuntimeError("Error during parsing ONNX model (see logs for details)")
 
+    # OnnxParser produces an FP32 TensorRT engine here even for an FP16 network,
+    # so we force FP16 on the first input/output
+    if fp16_mode:
+        network.get_input(0).dtype = trt.DataType.HALF
+        network.get_output(0).dtype = trt.DataType.HALF
+
     # optimization
     config = builder.create_builder_config()
     config.flags |= bool(fp16_mode) << int(trt.BuilderFlag.FP16)
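For orientation, here is a minimal sketch of the conversion flow this hunk patches, written against the TensorRT 7.x-era Python API (the same `config.flags` bitmask and `build_engine` call as the diff). Everything outside the lines visible in the diff, including the function signature, the explicit-batch flag, and the workspace size, is an assumption for illustration, not code from the repository.

import logging

import tensorrt as trt

LOGGER = logging.getLogger(__name__)
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def onnx2trt_sketch(onnx_path, fp16_mode=True, max_workspace_size=1 << 30):
    # Build an explicit-batch network and parse the ONNX file into it.
    builder = trt.Builder(TRT_LOGGER)
    flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(flags)
    parser = trt.OnnxParser(network, TRT_LOGGER)

    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                LOGGER.error(f"OnnxParser error {i}/{parser.num_errors}: {parser.get_error(i)}")
            raise RuntimeError("Error during parsing ONNX model (see logs for details)")

    # The parser yields an FP32 engine even for an FP16 network, so the
    # patch forces HALF on the first input/output tensors.
    if fp16_mode:
        network.get_input(0).dtype = trt.DataType.HALF
        network.get_output(0).dtype = trt.DataType.HALF

    # Enable the FP16 builder flag with the same bitmask trick as the diff.
    config = builder.create_builder_config()
    config.max_workspace_size = max_workspace_size
    config.flags |= bool(fp16_mode) << int(trt.BuilderFlag.FP16)
    return builder.build_engine(network, config)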
@@ -21,6 +21,7 @@ docker run --rm -d \
     -p 8002:8002 \
     --runtime=nvidia \
     -e NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES} \
+    -e ORT_TENSORRT_FP16_ENABLE=1 \
     -v ${MODEL_REPOSITORY_PATH}:${MODEL_REPOSITORY_PATH} \
     --shm-size=1g \
     --ulimit memlock=-1 \
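ORT_TENSORRT_FP16_ENABLE=1 is the environment variable that ONNX Runtime's TensorRT execution provider checks to enable FP16 mode, so without it the server would run the model in FP32 even when an FP16 deployment was requested. As a quick smoke test once the container is up, the server and model can be probed with the Triton HTTP client; this sketch assumes the full command (not shown in the hunk) also publishes HTTP port 8000, and the model name "resnet50" is illustrative since the actual name comes from the model repository.

import tritonclient.http as httpclient

# Assumes Triton's HTTP endpoint at localhost:8000 and a model named
# "resnet50" inside ${MODEL_REPOSITORY_PATH}; both are illustrative.
client = httpclient.InferenceServerClient(url="localhost:8000")

assert client.is_server_ready(), "Triton server is not ready"
assert client.is_model_ready("resnet50"), "model failed to load"
print(client.get_model_metadata("resnet50"))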
@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 export PRECISION="fp16"
-export FORMAT="trt"
+export FORMAT="onnx"
 export BATCH_SIZE="1,2,4,8,16,32,64,128"
-export BACKEND_ACCELERATOR="cuda"
+export BACKEND_ACCELERATOR="trt"
 export MAX_BATCH_SIZE="128"
 export NUMBER_OF_MODEL_INSTANCES="1"
 export TRITON_MAX_QUEUE_DELAY="1"
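Taken together with the previous hunks, this switches the benchmark from a standalone TensorRT plan (FORMAT="trt") to the ONNX model served through Triton's ONNX Runtime backend with TensorRT as the accelerator (FORMAT="onnx", BACKEND_ACCELERATOR="trt"), which is presumably the path on which ORT_TENSORRT_FP16_ENABLE takes effect. A minimal sketch of how a driver script might consume these exports; only the environment variable names come from the script above, the Python names on the left are illustrative.

import os

# Illustrative parsing of the exported configuration.
precision = os.environ["PRECISION"]                  # "fp16"
model_format = os.environ["FORMAT"]                  # "onnx"
accelerator = os.environ["BACKEND_ACCELERATOR"]      # "trt"
max_batch_size = int(os.environ["MAX_BATCH_SIZE"])   # 128
batch_sizes = [int(b) for b in os.environ["BATCH_SIZE"].split(",")]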