DeepLearningExamples/TensorFlow/Classification/ConvNets/triton/run_benchmark.py
#!/usr/bin/env python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
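"""Run the Triton Inference Server benchmark pipeline for the ResNet50 model.

The pipeline downloads a checkpoint, converts the model to the requested
serving format, deploys it to Triton, and measures offline and online
inference performance inside Docker containers.

Example (assumed invocation, from the repository root, using the defaults
defined below):

    python3 triton/run_benchmark.py --model-format tf-savedmodel --precision fp16 --max-batch-size 32 --backend-accelerator cuda --verbose
"""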
import argparse
import logging
from pathlib import Path
from typing import List

# method from PEP-366 to support relative import in executed modules
if __name__ == "__main__" and __package__ is None:
    __package__ = Path(__file__).parent.name

from .benchmark.benchmark import Benchmark
from .benchmark.checkpoints import HttpCheckpoint
from .benchmark.core import LOGGER
from .benchmark.executor import DockerExecutor
from .deployment_toolkit.core import Accelerator, Format, Precision

AVAILABLE_MODEL_FORMATS = [f.value for f in Format]
AVAILABLE_MODEL_PRECISIONS = [p.value for p in Precision]
AVAILABLE_MODEL_ACCELERATORS = [a.value for a in Accelerator]


def run_benchmark(
    devices: List[str],
    model_name: str,
    model_version: int,
    model_format: str,
    container_version: str,
    checkpoint: str,
    max_batch_size: int,
    precision: str,
    number_of_model_instances: int,
    preferred_batch_sizes: List[int],
    max_queue_delay_us: int,
    backend_accelerator: str,
    verbose: bool,
    **kwargs
):
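    # The benchmark is a pipeline: fetch the checkpoint, convert the model,
    # deploy it to Triton, then measure offline and online performance.
    # The ${...} shell variables in the command templates below are presumably
    # filled in by the benchmark framework from the configuration registered
    # at the end of this function.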
    benchmark = Benchmark(
        devices=devices,
        model_name=model_name,
        model_version=model_version,
        framework="TensorFlow1",
        container_version=container_version,
        checkpoint=HttpCheckpoint(checkpoint),
        verbose=verbose,
    )
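
    # Stage 1: convert the TF-Estimator checkpoint into the requested serving
    # format; --ignore-unknown-parameters presumably lets each converter skip
    # flags it does not consume (e.g. the ONNX options for non-ONNX targets).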
    benchmark.model_conversion(
        cmds=(
            r"""
            python3 triton/convert_model.py \
                --input-path triton/rn50_model.py \
                --input-type tf-estimator \
                --output-path ${SHARED_DIR}/model \
                --output-type ${FORMAT} \
                --onnx-opset 12 \
                --onnx-optimized 1 \
                --max-batch-size ${MAX_BATCH_SIZE} \
                --max-workspace-size 4294967296 \
                --ignore-unknown-parameters \
                \
                --model-dir ${CHECKPOINT_DIR} \
                --precision ${PRECISION} \
                --dataloader triton/dataloader.py \
                --data-dir ${DATASETS_DIR}/imagenet
            """,
        )
    )
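
    # Stage 2: write the converted model and its configuration into the Triton
    # model repository and load it into the server.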
    benchmark.model_deploy(
        cmds=(
            r"""
            python3 triton/deploy_model.py \
                --model-repository ${MODEL_REPOSITORY_PATH} \
                --model-path ${SHARED_DIR}/model \
                --model-format ${FORMAT} \
                --model-name ${MODEL_NAME} \
                --model-version 1 \
                --max-batch-size ${MAX_BATCH_SIZE} \
                --precision ${PRECISION} \
                --number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
                --max-queue-delay-us ${TRITON_MAX_QUEUE_DELAY} \
                --preferred-batch-sizes ${TRITON_PREFERRED_BATCH_SIZES} \
                --capture-cuda-graph 0 \
                --backend-accelerator ${BACKEND_ACCELERATOR} \
                --load-model ${TRITON_LOAD_MODEL_METHOD}
            """,
        )
    )
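
    # Stage 3: offline performance test: throughput and latency at fixed
    # batch sizes, using randomly generated input data.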
    benchmark.triton_performance_offline_tests(
        cmds=(
            r"""
            python triton/run_offline_performance_test_on_triton.py \
                --server-url ${TRITON_SERVER_URL} \
                --model-name ${MODEL_NAME} \
                --number-of-warmup-iterations 5 \
                --input-data random \
                --batch-sizes ${BATCH_SIZE} \
                --triton-instances ${TRITON_INSTANCES} \
                --result-path ${SHARED_DIR}/triton_performance_offline.csv
            """,
        ),
        result_path="${SHARED_DIR}/triton_performance_offline.csv",
    )
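
    # Stage 4: online performance test against the deployed model, again with
    # random input data (typically exercising concurrent client load).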
    benchmark.triton_performance_online_tests(
        cmds=(
            r"""
            python triton/run_online_performance_test_on_triton.py \
                --server-url ${TRITON_SERVER_URL} \
                --model-name ${MODEL_NAME} \
                --number-of-warmup-iterations 5 \
                --input-data random \
                --batch-sizes ${BATCH_SIZE} \
                --triton-instances ${TRITON_INSTANCES} \
                --number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
                --result-path ${SHARED_DIR}/triton_performance_online.csv
            """,
        ),
        result_path="${SHARED_DIR}/triton_performance_online.csv",
    )
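
    # Register the run configuration; these values presumably become the
    # environment variables referenced in the command templates above.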
    benchmark.configuration(
        precision=precision,
        max_batch_size=max_batch_size,
        format=model_format,
        accelerator=backend_accelerator,
        triton_gpu_engine_count=number_of_model_instances,
        triton_preferred_batch_sizes=preferred_batch_sizes,
        triton_max_queue_delay_us=max_queue_delay_us,
        **kwargs
    )
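
    # Run every stage of the pipeline inside Docker containers.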
    executor = DockerExecutor()
    executor.run(benchmark)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run a benchmark for the model.")
    parser.add_argument("--devices", help="NVIDIA GPU device IDs on which the Triton Inference Server is run. Accepts multiple values", nargs="*", required=False)
    parser.add_argument("--model-name", help="Model name. Default: ResNet50", default="ResNet50", required=False)
    parser.add_argument("--model-version", default="1", help="Version of the model. Default: 1", required=False)
    parser.add_argument("--checkpoint", default="https://api.ngc.nvidia.com/v2/models/nvidia/rn50_tf_amp_ckpt/versions/20.06.0/zip", help="Checkpoint URL. Default: https://api.ngc.nvidia.com/v2/models/nvidia/rn50_tf_amp_ckpt/versions/20.06.0/zip", required=False)
    parser.add_argument("--container-version", help="Version of the Triton Inference Server container. Default: 20.12", default="20.12", required=False)
    parser.add_argument(
        "--model-format",
        choices=AVAILABLE_MODEL_FORMATS,
        help="Format of exported model. Default: tf-savedmodel",
        default="tf-savedmodel",
        required=False,
    )
    parser.add_argument(
        "--precision",
        type=str,
        default="fp16",
        choices=AVAILABLE_MODEL_PRECISIONS,
        help="Model precision (used only by the TensorFlow backend with TensorRT optimization). Default: fp16",
        required=False,
    )
    parser.add_argument(
        "--max-batch-size",
        type=int,
        default=32,
        help="Maximum batch size used for benchmarking and for model conversion. Default: 32",
        required=False,
    )
    parser.add_argument(
        "--number-of-model-instances",
        type=int,
        default=2,
        help="Number of model instances per GPU. Default: 2",
        required=False,
    )
    parser.add_argument(
        "--preferred-batch-sizes",
        type=int,
        nargs="*",
        help="Batch sizes that dynamic batching should attempt to create. "
        "If --max-queue-delay-us is set and this parameter is not, the default is derived from --max-batch-size",
        required=False,
    )
    parser.add_argument(
        "--max-queue-delay-us",
        type=int,
        default=100,
        help="Maximum time, in microseconds, that the dynamic batcher waits to form a batch. Default: 100",
        required=False,
    )
    parser.add_argument(
        "--backend-accelerator",
        choices=AVAILABLE_MODEL_ACCELERATORS,
        type=str,
        default="cuda",
        help="Backend accelerator used for the model. Default: cuda",
        required=False,
    )
parser.add_argument("--verbose", action="store_true", default=False, help="Provide verbose output")
args = parser.parse_args()
log_level = logging.INFO if not args.verbose else logging.DEBUG
LOGGER.setLevel(log_level)
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
run_benchmark(**vars(args))