[nnUnet/PyT] Add support for Triton
|
@ -6,6 +6,7 @@ WORKDIR /workspace/nnunet_pyt
|
|||
|
||||
RUN pip install --upgrade pip
|
||||
RUN pip install --disable-pip-version-check -r requirements.txt
|
||||
RUN pip install --disable-pip-version-check -r triton/requirements.txt
|
||||
RUN pip install pytorch-lightning==1.0.0 --no-dependencies
|
||||
RUN pip install monai==0.4.0 --no-dependencies
|
||||
RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/ nvidia-dali-cuda110==0.30.0
|
||||
|
@ -14,3 +15,10 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2
|
|||
RUN unzip -qq awscliv2.zip
|
||||
RUN ./aws/install
|
||||
RUN rm -rf awscliv2.zip aws
|
||||
|
||||
# Install Perf Client required library
|
||||
RUN apt-get update && apt-get install -y libb64-dev libb64-0d
|
||||
|
||||
# Install Triton Client Python API and copy Perf Client
|
||||
#COPY --from=triton-client /workspace/install/ /workspace/install/
|
||||
#RUN pip install /workspace/install/python/triton*.whl
|
||||
|
|
|
@ -134,10 +134,6 @@ TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by defaul
|
|||
|
||||
Test time augmentation is an inference technique that averages the prediction for the original image with the predictions for its augmented versions. As a result, predictions are more accurate, at the cost of a slower inference process. For nnU-Net, we use all possible flip combinations for image augmentation. Test time augmentation can be enabled by adding the `--tta` flag.
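A minimal sketch of this flip-based test time augmentation is shown below (`model` and `image` are generic placeholders, not the repository's exact implementation):

```
import itertools
import torch

def tta_predict(model, image):
    # image has shape (N, C, D, H, W); axes 2, 3, 4 are the spatial dimensions.
    spatial_dims = (2, 3, 4)
    # All subsets of the spatial axes, including the empty set (no flip).
    flips = [axes for r in range(len(spatial_dims) + 1)
             for axes in itertools.combinations(spatial_dims, r)]
    pred = None
    for axes in flips:
        flipped = torch.flip(image, axes) if axes else image
        out = model(flipped)
        out = torch.flip(out, axes) if axes else out  # undo the flip on the prediction
        pred = out if pred is None else pred + out
    return pred / len(flips)  # average over all flip combinations
```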
|
||||
|
||||
**Deep supervision**
|
||||
|
||||
Deep supervision is a technique which adds auxiliary losses in the U-Net decoder. For nnU-Net, we add auxiliary losses to all but the two lowest decoder levels. The final loss is the weighted average of these losses. Deep supervision can be enabled by adding the `--deep_supervision` flag.
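A minimal sketch of this weighted auxiliary loss (mirroring the `compute_loss` method that appears later in this diff; `loss_fn` stands in for the combined Dice and cross-entropy loss):

```
import torch.nn as nn

def deep_supervision_loss(loss_fn, preds, label):
    # preds[0] is the full-resolution output; preds[1:] are the auxiliary decoder heads.
    loss = loss_fn(preds[0], label)
    for i, pred in enumerate(preds[1:]):
        # Downsample the label to the auxiliary output resolution and halve the weight per level.
        downsampled_label = nn.functional.interpolate(label, pred.shape[2:])
        loss += 0.5 ** (i + 1) * loss_fn(pred, downsampled_label)
    # Normalization constant used in the repository's compute_loss.
    return loss / (2 - 2 ** (-len(preds)))
```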
|
||||
|
||||
## Setup
|
||||
|
||||
The following section lists the requirements that you need to meet in order to start training the nnU-Net model.
|
||||
|
@ -308,7 +304,7 @@ To see the full list of available options and their descriptions, use the `-h` o
|
|||
The following example output is printed when running the model:
|
||||
|
||||
```
|
||||
usage: main.py [-h] [--exec_mode {train,evaluate,predict}] [--data DATA] [--results RESULTS] [--logname LOGNAME] [--task TASK] [--gpus GPUS] [--learning_rate LEARNING_RATE] [--gradient_clip_val GRADIENT_CLIP_VAL] [--negative_slope NEGATIVE_SLOPE] [--tta] [--amp] [--benchmark] [--deep_supervision] [--drop_block] [--attention] [--residual] [--focal] [--sync_batchnorm] [--save_ckpt] [--nfolds NFOLDS] [--seed SEED] [--skip_first_n_eval SKIP_FIRST_N_EVAL] [--ckpt_path CKPT_PATH] [--fold FOLD] [--patience PATIENCE] [--lr_patience LR_PATIENCE] [--batch_size BATCH_SIZE] [--val_batch_size VAL_BATCH_SIZE] [--steps STEPS [STEPS ...]] [--profile] [--momentum MOMENTUM] [--weight_decay WEIGHT_DECAY] [--save_preds] [--dim {2,3}] [--resume_training] [--factor FACTOR] [--num_workers NUM_WORKERS] [--min_epochs MIN_EPOCHS] [--max_epochs MAX_EPOCHS] [--warmup WARMUP] [--norm {instance,batch,group}] [--nvol NVOL] [--data2d_dim {2,3}] [--oversampling OVERSAMPLING] [--overlap OVERLAP] [--affinity {socket,single,single_unique,socket_unique_interleaved,socket_unique_continuous,disabled}] [--scheduler {none,multistep,cosine,plateau}] [--optimizer {sgd,radam,adam}] [--blend {gaussian,constant}] [--train_batches TRAIN_BATCHES] [--test_batches TEST_BATCHES]
|
||||
usage: main.py [-h] [--exec_mode {train,evaluate,predict}] [--data DATA] [--results RESULTS] [--logname LOGNAME] [--task TASK] [--gpus GPUS] [--learning_rate LEARNING_RATE] [--gradient_clip_val GRADIENT_CLIP_VAL] [--negative_slope NEGATIVE_SLOPE] [--tta] [--amp] [--benchmark] [--residual] [--focal] [--sync_batchnorm] [--save_ckpt] [--nfolds NFOLDS] [--seed SEED] [--skip_first_n_eval SKIP_FIRST_N_EVAL] [--ckpt_path CKPT_PATH] [--fold FOLD] [--patience PATIENCE] [--lr_patience LR_PATIENCE] [--batch_size BATCH_SIZE] [--val_batch_size VAL_BATCH_SIZE] [--steps STEPS [STEPS ...]] [--profile] [--momentum MOMENTUM] [--weight_decay WEIGHT_DECAY] [--save_preds] [--dim {2,3}] [--resume_training] [--factor FACTOR] [--num_workers NUM_WORKERS] [--min_epochs MIN_EPOCHS] [--max_epochs MAX_EPOCHS] [--warmup WARMUP] [--norm {instance,batch,group}] [--nvol NVOL] [--data2d_dim {2,3}] [--oversampling OVERSAMPLING] [--overlap OVERLAP] [--affinity {socket,single,single_unique,socket_unique_interleaved,socket_unique_continuous,disabled}] [--scheduler {none,multistep,cosine,plateau}] [--optimizer {sgd,radam,adam}] [--blend {gaussian,constant}] [--train_batches TRAIN_BATCHES] [--test_batches TEST_BATCHES]
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
|
@ -328,9 +324,6 @@ optional arguments:
|
|||
--tta Enable test time augmentation (default: False)
|
||||
--amp Enable automatic mixed precision (default: False)
|
||||
--benchmark Run model benchmarking (default: False)
|
||||
--deep_supervision Enable deep supervision (default: False)
|
||||
--drop_block Enable drop block (default: False)
|
||||
--attention Enable attention in decoder (default: False)
|
||||
--residual Enable residual block in encoder (default: False)
|
||||
--focal Use focal loss instead of cross entropy (default: False)
|
||||
--sync_batchnorm Enable synchronized batchnorm (default: False)
|
||||
|
@ -435,7 +428,7 @@ The default configuration minimizes a function `L = (1 - dice_coefficient) + cro
|
|||
The training can be run directly without using the predefined scripts. The name of the training script is `main.py`. For example:
|
||||
|
||||
```
|
||||
python main.py --exec_mode train --task 01 --fold 0 --gpus 1 --amp --deep_supervision
|
||||
python main.py --exec_mode train --task 01 --fold 0 --gpus 1 --amp
|
||||
```
|
||||
|
||||
Training artifacts will be saved to `/results` in the container. Some important artifacts are:
|
||||
|
@ -612,7 +605,7 @@ Our results were obtained by running the `python scripts/benchmark.py --mode pre
|
|||
|
||||
FP16
|
||||
|
||||
| Dimension | Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
|
||||
| Dimension | Batch size |Resolution| Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
|
||||
|:----------:|:---------:|:-------------:|:----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
|
||||
| 2 | 64 | 4x192x160 | 1866.52 | 34.29 | 34.7 | 48.87 | 52.44 |
|
||||
| 2 | 128 | 4x192x160 | 2032.74 | 62.97 | 63.21 | 63.25 | 63.32 |
|
||||
|
@ -622,7 +615,7 @@ FP16
|
|||
|
||||
FP32
|
||||
|
||||
| Dimension | Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
|
||||
| Dimension | Batch size |Resolution| Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
|
||||
|:----------:|:---------:|:-------------:|:----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
|
||||
| 2 | 64 | 4x192x160 | 1051.46 | 60.87 | 61.21 | 61.48 | 62.87 |
|
||||
| 2 | 128 | 4x192x160 | 1051.68 | 121.71 | 122.29 | 122.44 | 122.6 |
|
||||
|
@ -638,6 +631,10 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
|
|||
|
||||
### Changelog
|
||||
|
||||
May 2021
|
||||
- Added Triton Inference Server support
|
||||
- Removed deep supervision, attention and drop block
|
||||
|
||||
March 2021
|
||||
- Container updated to 21.02
|
||||
- Changed the data format from tfrecord to npy and updated 2D data loading
|
||||
|
|
|
@ -160,6 +160,37 @@ class EvalPipeline(Pipeline):
|
|||
return img, lbl
|
||||
|
||||
|
||||
class BermudaPipeline(Pipeline):
|
||||
def __init__(self, batch_size, num_threads, device_id, **kwargs):
|
||||
super(BermudaPipeline, self).__init__(batch_size, num_threads, device_id)
|
||||
self.input_x = get_numpy_reader(
|
||||
files=kwargs["imgs"],
|
||||
shard_id=device_id,
|
||||
num_shards=kwargs["gpus"],
|
||||
seed=kwargs["seed"],
|
||||
shuffle=False,
|
||||
)
|
||||
self.input_y = get_numpy_reader(
|
||||
files=kwargs["lbls"],
|
||||
shard_id=device_id,
|
||||
num_shards=kwargs["gpus"],
|
||||
seed=kwargs["seed"],
|
||||
shuffle=False,
|
||||
)
|
||||
self.patch_size = kwargs["patch_size"]
|
||||
|
||||
def crop_fn(self, img, lbl):
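# Crop (or pad, when the volume is smaller) both image and label to the fixed patch size.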
|
||||
img = fn.crop(img, crop=self.patch_size, out_of_bounds_policy="pad")
|
||||
lbl = fn.crop(lbl, crop=self.patch_size, out_of_bounds_policy="pad")
|
||||
return img, lbl
|
||||
|
||||
def define_graph(self):
|
||||
img, lbl = self.input_x(name="ReaderX"), self.input_y(name="ReaderY")
|
||||
img, lbl = fn.reshape(img, layout="CDHW"), fn.reshape(lbl, layout="CDHW")
|
||||
img, lbl = self.crop_fn(img, lbl)
|
||||
return img, lbl
|
||||
|
||||
|
||||
class TestPipeline(Pipeline):
|
||||
def __init__(self, batch_size, num_threads, device_id, **kwargs):
|
||||
super(TestPipeline, self).__init__(batch_size, num_threads, device_id)
|
||||
|
@ -249,11 +280,6 @@ def fetch_dali_loader(imgs, lbls, batch_size, mode, **kwargs):
|
|||
nbs *= batch_size
|
||||
imgs = list(itertools.chain(*(100 * [imgs])))[: nbs * kwargs["gpus"]]
|
||||
lbls = list(itertools.chain(*(100 * [lbls])))[: nbs * kwargs["gpus"]]
|
||||
if mode == "eval":
|
||||
reminder = len(imgs) % kwargs["gpus"]
|
||||
if reminder != 0:
|
||||
imgs = imgs[:-reminder]
|
||||
lbls = lbls[:-reminder]
|
||||
|
||||
pipe_kwargs = {
|
||||
"imgs": imgs,
|
||||
|
@ -284,6 +310,10 @@ def fetch_dali_loader(imgs, lbls, batch_size, mode, **kwargs):
|
|||
pipeline = EvalPipeline
|
||||
output_map = ["image", "label"]
|
||||
dynamic_shape = True
|
||||
elif mode == "bermuda":
|
||||
pipeline = BermudaPipeline
|
||||
output_map = ["image", "label"]
|
||||
dynamic_shape = False
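# BermudaPipeline crops/pads every sample to a fixed patch size, so output shapes are static.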
|
||||
else:
|
||||
pipeline = TestPipeline
|
||||
output_map = ["image", "meta"]
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from dropblock import DropBlock3D, LinearScheduler
|
||||
|
||||
normalizations = {
|
||||
"instancenorm3d": nn.InstanceNorm3d,
|
||||
|
@ -68,14 +67,6 @@ def get_output_padding(kernel_size, stride, padding):
|
|||
return out_padding if len(out_padding) > 1 else out_padding[0]
|
||||
|
||||
|
||||
def get_drop_block():
|
||||
return LinearScheduler(
|
||||
DropBlock3D(block_size=5, drop_prob=0.0),
|
||||
start_value=0.0,
|
||||
stop_value=0.1,
|
||||
nr_steps=10000,
|
||||
)
|
||||
|
||||
|
||||
class ConvLayer(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs):
|
||||
|
@ -83,15 +74,9 @@ class ConvLayer(nn.Module):
|
|||
self.conv = get_conv(in_channels, out_channels, kernel_size, stride, kwargs["dim"])
|
||||
self.norm = get_norm(kwargs["norm"], out_channels)
|
||||
self.lrelu = nn.LeakyReLU(negative_slope=kwargs["negative_slope"], inplace=True)
|
||||
self.use_drop_block = kwargs["drop_block"]
|
||||
if self.use_drop_block:
|
||||
self.drop_block = get_drop_block()
|
||||
|
||||
def forward(self, data):
|
||||
out = self.conv(data)
|
||||
if self.use_drop_block:
|
||||
self.drop_block.step()
|
||||
out = self.drop_block(out)
|
||||
out = self.norm(out)
|
||||
out = self.lrelu(out)
|
||||
return out
|
||||
|
@ -116,10 +101,6 @@ class ResidBlock(nn.Module):
|
|||
self.conv2 = get_conv(out_channels, out_channels, kernel_size, 1, kwargs["dim"])
|
||||
self.norm = get_norm(kwargs["norm"], out_channels)
|
||||
self.lrelu = nn.LeakyReLU(negative_slope=kwargs["negative_slope"], inplace=True)
|
||||
self.use_drop_block = kwargs["drop_block"]
|
||||
if self.use_drop_block:
|
||||
self.drop_block = get_drop_block()
|
||||
self.skip_drop_block = get_drop_block()
|
||||
self.downsample = None
|
||||
if max(stride) > 1 or in_channels != out_channels:
|
||||
self.downsample = get_conv(in_channels, out_channels, kernel_size, stride, kwargs["dim"])
|
||||
|
@ -129,52 +110,22 @@ class ResidBlock(nn.Module):
|
|||
residual = input_data
|
||||
out = self.conv1(input_data)
|
||||
out = self.conv2(out)
|
||||
if self.use_drop_block:
|
||||
out = self.drop_block(out)
|
||||
out = self.norm(out)
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(residual)
|
||||
if self.use_drop_block:
|
||||
residual = self.skip_drop_block(residual)
|
||||
residual = self.norm_res(residual)
|
||||
out = self.lrelu(out + residual)
|
||||
return out
|
||||
|
||||
|
||||
class AttentionLayer(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, norm, dim):
|
||||
super(AttentionLayer, self).__init__()
|
||||
self.conv = get_conv(in_channels, out_channels, kernel_size=3, stride=1, dim=dim)
|
||||
self.norm = get_norm(norm, out_channels)
|
||||
|
||||
def forward(self, inputs):
|
||||
out = self.conv(inputs)
|
||||
out = self.norm(out)
|
||||
return out
|
||||
|
||||
|
||||
class UpsampleBlock(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs):
|
||||
super(UpsampleBlock, self).__init__()
|
||||
self.transp_conv = get_transp_conv(in_channels, out_channels, stride, stride, kwargs["dim"])
|
||||
self.conv_block = ConvBlock(2 * out_channels, out_channels, kernel_size, 1, **kwargs)
|
||||
self.attention = kwargs["attention"]
|
||||
if self.attention:
|
||||
att_out, norm, dim = out_channels // 2, kwargs["norm"], kwargs["dim"]
|
||||
self.conv_o = AttentionLayer(out_channels, att_out, norm, dim)
|
||||
self.conv_s = AttentionLayer(out_channels, att_out, norm, dim)
|
||||
self.psi = AttentionLayer(att_out, 1, norm, dim)
|
||||
self.sigmoid = nn.Sigmoid()
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
|
||||
def forward(self, input_data, skip_data):
|
||||
out = self.transp_conv(input_data)
|
||||
if self.attention:
|
||||
out_a = self.conv_o(out)
|
||||
skip_a = self.conv_s(skip_data)
|
||||
psi_a = self.psi(self.relu(out_a + skip_a))
|
||||
attention = self.sigmoid(psi_a)
|
||||
skip_data = skip_data * attention
|
||||
out = torch.cat((out, skip_data), dim=1)
|
||||
out = self.conv_block(out)
|
||||
return out
|
||||
|
|
|
@ -39,28 +39,33 @@ from models.unet import UNet
|
|||
|
||||
|
||||
class NNUnet(pl.LightningModule):
|
||||
def __init__(self, args):
|
||||
def __init__(self, args, bermuda=False, data_dir=None):
|
||||
super(NNUnet, self).__init__()
|
||||
self.args = args
|
||||
if not hasattr(self.args, "drop_block"):  # For backward compatibility
|
||||
self.args.drop_block = False
|
||||
self.bermuda = bermuda
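# When bermuda=True only the network itself is built (e.g. for Triton export);
# training-only components such as the loss, metrics, and dllogger are skipped below.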
|
||||
if data_dir is not None:
|
||||
self.args.data = data_dir
|
||||
self.save_hyperparameters()
|
||||
self.build_nnunet()
|
||||
self.loss = Loss(self.args.focal)
|
||||
self.dice = Dice(self.n_class)
|
||||
self.best_sum = 0
|
||||
self.best_sum_epoch = 0
|
||||
self.best_dice = self.n_class * [0]
|
||||
self.best_epoch = self.n_class * [0]
|
||||
self.best_sum_dice = self.n_class * [0]
|
||||
self.learning_rate = args.learning_rate
|
||||
self.tta_flips = get_tta_flips(args.dim)
|
||||
self.test_idx = 0
|
||||
self.test_imgs = []
|
||||
if self.args.exec_mode in ["train", "evaluate"]:
|
||||
self.dllogger = get_dllogger(args.results)
|
||||
if not self.bermuda:
|
||||
self.learning_rate = args.learning_rate
|
||||
self.loss = Loss(self.args.focal)
|
||||
self.tta_flips = get_tta_flips(args.dim)
|
||||
self.dice = Dice(self.n_class)
|
||||
if self.args.exec_mode in ["train", "evaluate"]:
|
||||
self.dllogger = get_dllogger(args.results)
|
||||
|
||||
def forward(self, img):
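# Inference entry point: returns per-voxel class indices (argmax over the channel dimension).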
|
||||
return torch.argmax(self.model(img), 1)
|
||||
|
||||
def _forward(self, img):
|
||||
if self.args.benchmark:
|
||||
if self.args.dim == 2 and self.args.data2d_dim == 3:
|
||||
img = layout_2d(img, None)
|
||||
|
@ -70,14 +75,14 @@ class NNUnet(pl.LightningModule):
|
|||
def training_step(self, batch, batch_idx):
|
||||
img, lbl = self.get_train_data(batch)
|
||||
pred = self.model(img)
|
||||
loss = self.compute_loss(pred, lbl)
|
||||
loss = self.loss(pred, lbl)
|
||||
return loss
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
if self.current_epoch < self.args.skip_first_n_eval:
|
||||
return None
|
||||
img, lbl = batch["image"], batch["label"]
|
||||
pred = self.forward(img)
|
||||
pred = self._forward(img)
|
||||
loss = self.loss(pred, lbl)
|
||||
self.dice.update(pred, lbl[:, 0])
|
||||
return {"val_loss": loss}
|
||||
|
@ -86,7 +91,7 @@ class NNUnet(pl.LightningModule):
|
|||
if self.args.exec_mode == "evaluate":
|
||||
return self.validation_step(batch, batch_idx)
|
||||
img = batch["image"]
|
||||
pred = self.forward(img)
|
||||
pred = self._forward(img)
|
||||
if self.args.save_preds:
|
||||
meta = batch["meta"][0].cpu().detach().numpy()
|
||||
original_shape = meta[2]
|
||||
|
@ -120,25 +125,12 @@ class NNUnet(pl.LightningModule):
|
|||
strides=strides,
|
||||
dimension=self.args.dim,
|
||||
residual=self.args.residual,
|
||||
attention=self.args.attention,
|
||||
drop_block=self.args.drop_block,
|
||||
normalization_layer=self.args.norm,
|
||||
negative_slope=self.args.negative_slope,
|
||||
deep_supervision=self.args.deep_supervision,
|
||||
)
|
||||
if is_main_process():
|
||||
print(f"Filters: {self.model.filters},\nKernels: {kernels}\nStrides: {strides}")
|
||||
|
||||
def compute_loss(self, preds, label):
|
||||
if self.args.deep_supervision:
|
||||
loss = self.loss(preds[0], label)
|
||||
for i, pred in enumerate(preds[1:]):
|
||||
downsampled_label = nn.functional.interpolate(label, pred.shape[2:])
|
||||
loss += 0.5 ** (i + 1) * self.loss(pred, downsampled_label)
|
||||
c_norm = 1 / (2 - 2 ** (-len(preds)))
|
||||
return c_norm * loss
|
||||
return self.loss(preds, label)
|
||||
|
||||
def do_inference(self, image):
|
||||
if self.args.dim == 3:
|
||||
return self.sliding_window_inference(image)
|
||||
|
|
|
@ -11,7 +11,6 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
from models.layers import ConvBlock, OutputBlock, ResidBlock, UpsampleBlock
|
||||
|
@ -26,19 +25,14 @@ class UNet(nn.Module):
|
|||
strides,
|
||||
normalization_layer,
|
||||
negative_slope,
|
||||
deep_supervision,
|
||||
attention,
|
||||
drop_block,
|
||||
residual,
|
||||
dimension,
|
||||
):
|
||||
super(UNet, self).__init__()
|
||||
self.dim = dimension
|
||||
self.n_class = n_class
|
||||
self.attention = attention
|
||||
self.residual = residual
|
||||
self.negative_slope = negative_slope
|
||||
self.deep_supervision = deep_supervision
|
||||
self.norm = normalization_layer + f"norm{dimension}d"
|
||||
self.filters = [min(2 ** (5 + i), 320 if dimension == 3 else 512) for i in range(len(strides))]
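# Assuming 6 resolution levels (len(strides) == 6) in 3D, this yields [32, 64, 128, 256, 320, 320].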
|
||||
|
||||
|
@ -56,7 +50,6 @@ class UNet(nn.Module):
|
|||
out_channels=self.filters[1:],
|
||||
kernels=kernels[1:-1],
|
||||
strides=strides[1:-1],
|
||||
drop_block=drop_block,
|
||||
)
|
||||
self.bottleneck = self.get_conv_block(
|
||||
conv_block=down_block,
|
||||
|
@ -64,7 +57,6 @@ class UNet(nn.Module):
|
|||
out_channels=self.filters[-1],
|
||||
kernel_size=kernels[-1],
|
||||
stride=strides[-1],
|
||||
drop_block=drop_block,
|
||||
)
|
||||
self.upsamples = self.get_module_list(
|
||||
conv_block=UpsampleBlock,
|
||||
|
@ -74,8 +66,8 @@ class UNet(nn.Module):
|
|||
strides=strides[1:][::-1],
|
||||
)
|
||||
self.output_block = self.get_output_block(decoder_level=0)
|
||||
self.deep_supervision_heads = self.get_deep_supervision_heads()
|
||||
self.apply(self.initialize_weights)
|
||||
self.n_layers = len(self.upsamples) - 1
|
||||
|
||||
def forward(self, input_data):
|
||||
out = self.input_block(input_data)
|
||||
|
@ -84,26 +76,18 @@ class UNet(nn.Module):
|
|||
out = downsample(out)
|
||||
encoder_outputs.append(out)
|
||||
out = self.bottleneck(out)
|
||||
decoder_outputs = []
|
||||
for upsample, skip in zip(self.upsamples, reversed(encoder_outputs)):
|
||||
out = upsample(out, skip)
|
||||
decoder_outputs.append(out)
|
||||
for idx, upsample in enumerate(self.upsamples):
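# Consume encoder skip connections in reverse order (deepest feature map first).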
|
||||
out = upsample(out, encoder_outputs[self.n_layers - idx])
|
||||
out = self.output_block(out)
|
||||
if self.training and self.deep_supervision:
|
||||
out = [out]
|
||||
for i, decoder_out in enumerate(decoder_outputs[2:-1][::-1]):
|
||||
out.append(self.deep_supervision_heads[i](decoder_out))
|
||||
return out
|
||||
|
||||
def get_conv_block(self, conv_block, in_channels, out_channels, kernel_size, stride, drop_block=False):
|
||||
def get_conv_block(self, conv_block, in_channels, out_channels, kernel_size, stride):
|
||||
return conv_block(
|
||||
dim=self.dim,
|
||||
stride=stride,
|
||||
norm=self.norm,
|
||||
drop_block=drop_block,
|
||||
kernel_size=kernel_size,
|
||||
in_channels=in_channels,
|
||||
attention=self.attention,
|
||||
out_channels=out_channels,
|
||||
negative_slope=self.negative_slope,
|
||||
)
|
||||
|
@ -111,14 +95,10 @@ class UNet(nn.Module):
|
|||
def get_output_block(self, decoder_level):
|
||||
return OutputBlock(in_channels=self.filters[decoder_level], out_channels=self.n_class, dim=self.dim)
|
||||
|
||||
def get_deep_supervision_heads(self):
|
||||
return nn.ModuleList([self.get_output_block(i + 1) for i in range(len(self.upsamples) - 1)])
|
||||
|
||||
def get_module_list(self, in_channels, out_channels, kernels, strides, conv_block, drop_block=False):
|
||||
def get_module_list(self, in_channels, out_channels, kernels, strides, conv_block):
|
||||
layers = []
|
||||
for i, (in_channel, out_channel, kernel, stride) in enumerate(zip(in_channels, out_channels, kernels, strides)):
|
||||
use_drop_block = drop_block and len(in_channels) - i <= 2
|
||||
conv_layer = self.get_conv_block(conv_block, in_channel, out_channel, kernel, stride, use_drop_block)
|
||||
for in_channel, out_channel, kernel, stride in zip(in_channels, out_channels, kernels, strides):
|
||||
conv_layer = self.get_conv_block(conv_block, in_channel, out_channel, kernel, stride)
|
||||
layers.append(conv_layer)
|
||||
return nn.ModuleList(layers)
|
||||
|
||||
|
|
|
@ -5,5 +5,4 @@ scikit-learn==0.23.2
|
|||
pynvml==8.0.4
|
||||
pillow==6.2.0
|
||||
fsspec==0.8.0
|
||||
pytorch_ranger==0.1.1
|
||||
dropblock
|
||||
pytorch_ranger==0.1.1
|
|
@ -30,7 +30,7 @@ parser.add_argument("--logname", type=str, default="log", help="Name of dlloger
|
|||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
path_to_main = os.path.join(dirname(dirname(os.path.realpath(__file__))), "main.py")
|
||||
cmd = f"python {path_to_main} --exec_mode train --task {args.task} --deep_supervision --save_ckpt "
|
||||
cmd = f"python {path_to_main} --exec_mode train --task {args.task} --save_ckpt "
|
||||
cmd += f"--results {args.results} "
|
||||
cmd += f"--logname {args.logname} "
|
||||
cmd += f"--dim {args.dim} "
|
||||
|
|
PyTorch/Segmentation/nnUNet/triton/README.md (new file, 893 lines)
|
@ -0,0 +1,893 @@
|
|||
# Deploying the nnUNet model on Triton Inference Server
|
||||
|
||||
This folder contains instructions for deployment to run inference
|
||||
on the Triton Inference Server and a detailed performance analysis.
|
||||
The purpose of this document is to help you achieve
|
||||
the best inference performance.
|
||||
|
||||
## Table of contents
|
||||
|
||||
- [Solution overview](#solution-overview)
|
||||
- [Introduction](#introduction)
|
||||
- [Deployment process](#deployment-process)
|
||||
- [Setup](#setup)
|
||||
- [Quick Start Guide](#quick-start-guide)
|
||||
- [Advanced](#advanced)
|
||||
- [Triton embedded deployment](#triton-embedded-deployment)
|
||||
- [Prepare configuration](#prepare-configuration)
|
||||
- [Latency explanation](#latency-explanation)
|
||||
- [Performance](#performance)
|
||||
- [Offline scenario](#offline-scenario)
|
||||
- [Offline: NVIDIA DGX-1 (1x V100 32GB) with FP16](#offline-nvidia-dgx-1-1x-v100-32gb-with-fp16)
|
||||
- [Offline: NVIDIA DGX-1 (1x V100 32GB) with FP32](#offline-nvidia-dgx-1-1x-v100-32gb-with-fp32)
|
||||
- [Offline: NVIDIA A40 with FP16](#offline-nvidia-a40-with-fp16)
|
||||
- [Offline: NVIDIA A40 with FP32](#offline-nvidia-a40-with-fp32)
|
||||
- [Offline: NVIDIA T4 with FP16](#offline-nvidia-t4-with-fp16)
|
||||
- [Offline: NVIDIA T4 with FP32](#offline-nvidia-t4-with-fp32)
|
||||
- [Offline: NVIDIA DGX A100 (1x A100 80GB) with FP16](#offline-nvidia-dgx-a100-1x-a100-80gb-with-fp16)
|
||||
- [Offline: NVIDIA DGX A100 (1x A100 80GB) with FP32](#offline-nvidia-dgx-a100-1x-a100-80gb-with-fp32)
|
||||
- [Online scenario](#online-scenario)
|
||||
- [Online: NVIDIA DGX A100 (1x A100 80GB) with FP16](#online-nvidia-dgx-a100-1x-a100-80gb-with-fp16)
|
||||
- [Online: NVIDIA DGX A100 (1x A100 80GB) with FP32](#online-nvidia-dgx-a100-1x-a100-80gb-with-fp32)
|
||||
- [Online: NVIDIA A40 with FP16](#online-nvidia-a40-with-fp16)
|
||||
- [Online: NVIDIA A40 with FP32](#online-nvidia-a40-with-fp32)
|
||||
- [Online: NVIDIA T4 with FP16](#online-nvidia-t4-with-fp16)
|
||||
- [Online: NVIDIA T4 with FP32](#online-nvidia-t4-with-fp32)
|
||||
- [Online: NVIDIA DGX-1 (1x V100 32GB) with FP16](#online-nvidia-dgx-1-1x-v100-32gb-with-fp16)
|
||||
- [Online: NVIDIA DGX-1 (1x V100 32GB) with FP32](#online-nvidia-dgx-1-1x-v100-32gb-with-fp32)
|
||||
- [Release Notes](#release-notes)
|
||||
- [Changelog](#changelog)
|
||||
- [Known issues](#known-issues)
|
||||
|
||||
|
||||
|
||||
|
||||
## Solution overview
|
||||
|
||||
|
||||
### Introduction
|
||||
The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server)
|
||||
provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs.
|
||||
The server provides an inference service via an HTTP or gRPC endpoint that allows remote clients to request inferencing for any number of GPU
|
||||
or CPU models being managed by the server.
|
||||
|
||||
This README provides step-by-step deployment instructions for models generated
|
||||
during training (as described in the [model README](../README.md)).
|
||||
Additionally, this README provides the corresponding deployment scripts that
|
||||
ensure optimal GPU utilization during inferencing on the Triton Inference Server.
|
||||
|
||||
### Deployment process
|
||||
The deployment process consists of two steps:
|
||||
|
||||
1. Conversion. The purpose of conversion is to find the best performing model
|
||||
format supported by the Triton Inference Server.
|
||||
Triton Inference Server uses a number of runtime backends such as
|
||||
[TensorRT](https://developer.nvidia.com/tensorrt),
|
||||
[LibTorch](https://github.com/triton-inference-server/pytorch_backend) and
|
||||
[ONNX Runtime](https://github.com/triton-inference-server/onnxruntime_backend)
|
||||
to support various model types. Refer to
|
||||
[Triton documentation](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton)
|
||||
for the list of available backends.
|
||||
2. Configuration. Configuring the model on the Triton Inference Server, which generates the
necessary [configuration files](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md).
|
||||
|
||||
To run benchmarks measuring the model performance in inference,
|
||||
perform the following steps:
|
||||
|
||||
1. Start the Triton Inference Server.
|
||||
|
||||
The Triton Inference Server is started in a separate (possibly remote) container,
and the ports for the gRPC and REST APIs are exposed.
|
||||
|
||||
2. Run accuracy tests.
|
||||
|
||||
Produce results that are tested against given accuracy thresholds.
|
||||
Refer to step 8 in the [Quick Start Guide](#quick-start-guide).
|
||||
|
||||
3. Run performance tests.
|
||||
|
||||
Produce latency and throughput results for offline (static batching)
|
||||
and online (dynamic batching) scenarios.
|
||||
Refer to step 10 in the [Quick Start Guide](#quick-start-guide).
|
||||
|
||||
|
||||
## Setup
|
||||
|
||||
|
||||
|
||||
Ensure you have the following components:
|
||||
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
|
||||
* [PyTorch NGC container 21.02](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
|
||||
* [Triton Inference Server NGC container 21.02](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver)
|
||||
* [NVIDIA CUDA repository](https://docs.nvidia.com/cuda/archive/11.2.0/index.html)
|
||||
* [NVIDIA Ampere](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/), [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
|
||||
|
||||
|
||||
|
||||
## Quick Start Guide
|
||||
|
||||
|
||||
|
||||
To deploy your model on the Triton Inference Server, perform the following steps using the default parameters of the nnU-Net model on the [Medical Segmentation Decathlon](http://medicaldecathlon.com/) dataset. For the specifics concerning inference, see the [Advanced](#advanced) section.
|
||||
|
||||
1. Clone the repository.
|
||||
IMPORTANT: This step is executed on the host computer.
|
||||
|
||||
```
|
||||
git clone https://github.com/NVIDIA/DeepLearningExamples.git
|
||||
cd DeepLearningExamples/PyTorch/Segmentation/nnUNet
|
||||
```
|
||||
|
||||
2. Set up the environment on the host computer and start the Triton Inference Server.
|
||||
|
||||
```
|
||||
source triton/scripts/setup_environment.sh
|
||||
bash triton/scripts/docker/triton_inference_server.sh
|
||||
```
|
||||
|
||||
3. Build and run a container that extends the NGC PyTorch container with the Triton Inference Server client libraries and dependencies.
|
||||
|
||||
```
|
||||
bash triton/scripts/docker/build.sh
|
||||
bash triton/scripts/docker/interactive.sh
|
||||
```
|
||||
|
||||
|
||||
4. Prepare the deployment configuration and create folders in Docker.
|
||||
|
||||
IMPORTANT: These and the following commands must be executed in the PyTorch NGC container.
|
||||
|
||||
|
||||
```
|
||||
source triton/scripts/setup_environment.sh
|
||||
```
|
||||
|
||||
5. Download and pre-process the dataset.
|
||||
|
||||
|
||||
```
|
||||
bash triton/scripts/download_data.sh
|
||||
bash triton/scripts/process_dataset.sh
|
||||
```
|
||||
|
||||
6. Set up the parameters for deployment.
|
||||
|
||||
```
|
||||
source triton/scripts/setup_parameters.sh
|
||||
```
|
||||
|
||||
7. Convert the model from training to inference format (for example TensorRT).
|
||||
|
||||
|
||||
```
|
||||
python3 triton/convert_model.py \
|
||||
--input-path triton/model.py \
|
||||
--input-type pyt \
|
||||
--output-path ${SHARED_DIR}/model \
|
||||
--output-type ${FORMAT} \
|
||||
--onnx-opset 12 \
|
||||
--onnx-optimized 1 \
|
||||
--max-batch-size ${MAX_BATCH_SIZE} \
|
||||
--max-workspace-size 4294967296 \
|
||||
--ignore-unknown-parameters \
|
||||
--checkpoint-dir ${CHECKPOINT_DIR}/nvidia_nnunet_pyt_ckpt_amp_3d_fold2.ckpt \
|
||||
--precision ${PRECISION} \
|
||||
--dataloader triton/dataloader.py \
|
||||
--data-dir ${DATASETS_DIR}/01_3d/ \
|
||||
--batch-size 1 \
|
||||
|
||||
```
|
||||
|
||||
|
||||
8. Configure the model on the Triton Inference Server.
|
||||
|
||||
Generate the configuration from your model repository.
|
||||
|
||||
```
|
||||
python3 triton/config_model_on_triton.py \
|
||||
--model-repository ${MODEL_REPOSITORY_PATH} \
|
||||
--model-path ${SHARED_DIR}/model \
|
||||
--model-format ${FORMAT} \
|
||||
--model-name ${MODEL_NAME} \
|
||||
--model-version 1 \
|
||||
--max-batch-size ${MAX_BATCH_SIZE} \
|
||||
--precision ${PRECISION} \
|
||||
--number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
|
||||
--preferred-batch-sizes ${TRITON_PREFERRED_BATCH_SIZES} \
|
||||
--max-queue-delay-us ${TRITON_MAX_QUEUE_DELAY} \
|
||||
--capture-cuda-graph 0 \
|
||||
--backend-accelerator ${BACKEND_ACCELERATOR} \
|
||||
--load-model ${TRITON_LOAD_MODEL_METHOD}
|
||||
```
|
||||
|
||||
9. Run the Triton Inference Server accuracy tests.
|
||||
|
||||
```
|
||||
python3 triton/run_inference_on_triton.py \
|
||||
--server-url ${TRITON_SERVER_URL}:8001 \
|
||||
--model-name ${MODEL_NAME} \
|
||||
--model-version 1 \
|
||||
--output-dir ${SHARED_DIR}/accuracy_dump \
|
||||
\
|
||||
--dataloader triton/dataloader.py \
|
||||
--data-dir ${DATASETS_DIR}/01_3d \
|
||||
--batch-size ${MAX_BATCH_SIZE} \
|
||||
--precision ${PRECISION} \
|
||||
--dump-labels
|
||||
|
||||
python3 triton/calculate_metrics.py \
|
||||
--metrics triton/metrics.py \
|
||||
--dump-dir ${SHARED_DIR}/accuracy_dump \
|
||||
--csv ${SHARED_DIR}/accuracy_metrics.csv
|
||||
|
||||
cat ${SHARED_DIR}/accuracy_metrics.csv
|
||||
```
|
||||
|
||||
|
||||
10. Run the Triton Inference Server performance online tests.
|
||||
|
||||
We want to maximize throughput within latency budget constraints.
Dynamic batching is a feature of the Triton Inference Server that allows
inference requests to be combined by the server, so that a batch is
created dynamically, resulting in reduced average latency.
In the Triton Inference Server model configuration, you can set the Dynamic Batcher
parameter `max_queue_delay_microseconds` to indicate the maximum amount of time
you are willing to wait, and `preferred_batch_size` to indicate the batch sizes
the server should try to build. The measurements presented below set the
maximum queue delay close to zero to achieve the best possible latency
while keeping good throughput.
|
||||
|
||||
|
||||
```
|
||||
python triton/run_online_performance_test_on_triton.py \
|
||||
--server-url ${TRITON_SERVER_URL} \
|
||||
--model-name ${MODEL_NAME} \
|
||||
--input-data random \
|
||||
--batch-sizes ${BATCH_SIZE} \
|
||||
--triton-instances ${TRITON_INSTANCES} \
|
||||
--number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
|
||||
--shared-memory \
|
||||
--result-path ${SHARED_DIR}/triton_performance_online.csv
|
||||
```
|
||||
|
||||
|
||||
11. Run the Triton Inference Server performance offline tests.
|
||||
|
||||
We want to maximize throughput. This scenario assumes that the data is already
available for inference, or that incoming requests fill the maximum batch size quickly.
Triton Inference Server supports offline scenarios with static batching.
Static batching allows inference requests to be served as they are received.
The largest improvements to throughput come from increasing the batch size,
due to efficiency gains in the GPU with larger batches. This example uses shared memory.
|
||||
|
||||
```
|
||||
python triton/run_offline_performance_test_on_triton.py \
|
||||
--server-url ${TRITON_SERVER_URL} \
|
||||
--model-name ${MODEL_NAME} \
|
||||
--input-data random \
|
||||
--batch-sizes ${BATCH_SIZE} \
|
||||
--triton-instances ${TRITON_INSTANCES} \
|
||||
--shared-memory \
|
||||
--result-path ${SHARED_DIR}/triton_performance_offline.csv
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Advanced
|
||||
|
||||
### Triton embedded deployment
|
||||
|
||||
Triton embedded deployment means that the client and the server run on the same machine (for example, directly on an MRI scanner).
|
||||
|
||||
The shared-memory extensions allow a client to communicate input and output
|
||||
tensors by system or CUDA shared memory. Using shared memory instead of sending
|
||||
the tensor data over the GRPC or REST interface can provide significant
|
||||
performance improvement for some use cases. Because both of these extensions
|
||||
are supported, Triton reports "system_shared_memory" and "cuda_shared_memory"
|
||||
in the extensions field of its Server Metadata.
|
||||
|
||||
More information about shared memory can be found in the [shared memory extension documentation](https://github.com/triton-inference-server/server/blob/master/docs/protocol/extension_shared_memory.md).
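Below is a minimal, hypothetical sketch of system shared-memory inference with the `tritonclient` Python API. The model name (`nnunet`), input name (`INPUT__0`), and tensor shape are illustrative assumptions, and the exact calls may differ between client versions; treat this as a starting point rather than the deployment scripts' implementation.

```
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm
from tritonclient.utils import np_to_triton_dtype

client = httpclient.InferenceServerClient("localhost:8000")

# One nnU-Net style input volume: 4 channels, 128x128x128 voxels, FP16 (shape assumed).
image = np.random.rand(1, 4, 128, 128, 128).astype(np.float16)
byte_size = image.nbytes

# Create a system shared-memory region, copy the input into it, and register it with the server.
handle = shm.create_shared_memory_region("input_region", "/input_shm", byte_size)
shm.set_shared_memory_region(handle, [image])
client.register_system_shared_memory("input_region", "/input_shm", byte_size)

# Point the inference input at the shared-memory region instead of sending the bytes over HTTP.
infer_input = httpclient.InferInput("INPUT__0", list(image.shape), np_to_triton_dtype(image.dtype))
infer_input.set_shared_memory("input_region", byte_size)
result = client.infer(model_name="nnunet", inputs=[infer_input])

# Clean up the region after use.
client.unregister_system_shared_memory("input_region")
shm.destroy_shared_memory_region(handle)
```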
|
||||
|
||||
### Prepare configuration
|
||||
You can use the environment variables to set the parameters of your inference
|
||||
configuration.
|
||||
|
||||
Triton deployment scripts support several inference runtimes listed in the table below:
|
||||
| Inference runtime | Mnemonic used in scripts |
|
||||
|-------------------|--------------------------|
|
||||
| [TorchScript Tracing](https://pytorch.org/docs/stable/jit.html) | `ts-trace` |
|
||||
| [TorchScript Scripting](https://pytorch.org/docs/stable/jit.html) | `ts-script` |
|
||||
| [ONNX](https://onnx.ai) | `onnx` |
|
||||
| [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) | `trt` |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Example values of some key variables in one configuration:
|
||||
```
|
||||
PRECISION="fp16"
|
||||
FORMAT="ts-script"
|
||||
BATCH_SIZE="1, 2, 4"
|
||||
BACKEND_ACCELERATOR="cuda"
|
||||
MAX_BATCH_SIZE="4"
|
||||
NUMBER_OF_MODEL_INSTANCES="1"
|
||||
TRITON_MAX_QUEUE_DELAY="1"
|
||||
TRITON_PREFERRED_BATCH_SIZES="2 4"
|
||||
|
||||
```
|
||||
|
||||
### Latency explanation
|
||||
A typical Triton Inference Server pipeline can be broken down into the following steps:
|
||||
|
||||
1. The client serializes the inference request into a message and sends it to
|
||||
the server (Client Send).
|
||||
2. The message travels over the network from the client to the server (Network).
|
||||
3. The message arrives at the server and is deserialized (Server Receive).
|
||||
4. The request is placed on the queue (Server Queue).
|
||||
5. The request is removed from the queue and computed (Server Compute).
|
||||
6. The completed request is serialized in a message and sent back to
|
||||
the client (Server Send).
|
||||
7. The completed message then travels over the network from the server
|
||||
to the client (Network).
|
||||
8. The completed message is deserialized by the client and processed as
|
||||
a completed inference request (Client Receive).
|
||||
|
||||
Generally, for local clients, steps 1-4 and 6-8 occupy
a small fraction of time compared to step 5. Since backend deep learning
systems like nnU-Net are rarely exposed directly to end users, and instead
only interface with local front-end servers, we can consider
all clients to be local.
|
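As a quick sanity check on the online tables below, the reported average latency is approximately the sum of the per-stage components. For example, using the first row of the NVIDIA DGX A100 FP16 online results:

```
# Per-stage latencies in ms, copied from the DGX A100 FP16 online table (1 concurrent request).
stages = {
    "client_send": 0.021,
    "network_and_server_send_recv": 0.081,
    "server_queue": 0.012,
    "server_compute_input": 0.037,
    "server_compute_infer": 3.582,
    "server_compute_output": 34.551,
    "client_recv": 0.0,
}
print(sum(stages.values()))  # ~38.284 ms, matching the reported average latency
```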
||||
|
||||
|
||||
|
||||
## Performance
|
||||
|
||||
|
||||
### Offline scenario
|
||||
This table lists the common variable parameters for all performance measurements:
|
||||
|
||||
| Parameter Name | Parameter Value |
|
||||
|:-----------------------------|:----------------------|
|
||||
| Model Format | TorchScript Scripting |
|
||||
| Backend Accelerator | CUDA |
|
||||
| Max Batch Size | 4 |
|
||||
| Number of model instances | 1 |
|
||||
| Triton Max Queue Delay | 1 |
|
||||
| Triton Preferred Batch Sizes | 2 4 |
|
||||
|
||||
|
||||
|
||||
## **GPU:** NVIDIA DGX-1 (1x V100 32GB)
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="plots/graph_TeslaV10032GB_left.svg"></td>
|
||||
<td><img src="plots/graph_TeslaV10032GB_right.svg"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
### Offline: NVIDIA DGX-1 (1x V100 32GB) with FP16
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA DGX-1 (1x V100 32GB)
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP16
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Precision | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|:------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| FP16 | 1 | 20.3 | 49.295 | 49.329 | 49.386 | 49.188 |
|
||||
| FP16 | 2 | 25.2 | 79.464 | 79.529 | 79.611 | 79.247 |
|
||||
| FP16 | 4 | 28.4 | 140.378 | 140.639 | 140.844 | 139.634 |
|
||||
|
||||
</details>
|
||||
|
||||
### Offline: NVIDIA DGX-1 (1x V100 32GB) with FP32
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA DGX-1 (1x V100 32GB)
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP32
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Precision | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|:------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| FP32 | 1 | 10.3 | 97.262 | 97.335 | 97.56 | 96.908 |
|
||||
| FP32 | 2 | 10.6 | 186.551 | 186.839 | 187.05 | 185.747 |
|
||||
| FP32 | 4 | 11.2 | 368.61 | 368.982 | 370.119 | 366.781 |
|
||||
|
||||
</details>
|
||||
|
||||
## **GPU:** NVIDIA A40
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="plots/graph_A40_left.svg"></td>
|
||||
<td><img src="plots/graph_A40_right.svg"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
### Offline: NVIDIA A40 with FP16
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA A40
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP16
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Precision | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|:------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| FP16 | 1 | 22.2 | 44.997 | 45.001 | 45.011 | 44.977 |
|
||||
| FP16 | 2 | 28.2 | 70.697 | 70.701 | 70.711 | 70.667 |
|
||||
| FP16 | 4 | 32 | 126.1 | 126.111 | 126.13 | 126.061 |
|
||||
|
||||
</details>
|
||||
|
||||
### Offline: NVIDIA A40 with FP32
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA A40
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP32
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Precision | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|:------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| FP32 | 1 | 11.1 | 90.236 | 90.35 | 90.438 | 89.503 |
|
||||
| FP32 | 2 | 11.4 | 176.345 | 176.521 | 176.561 | 176.063 |
|
||||
| FP32 | 4 | 10.8 | 360.355 | 360.631 | 360.668 | 359.839 |
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
## **GPU:** NVIDIA T4
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="plots/graph_TeslaT4_left.svg"></td>
|
||||
<td><img src="plots/graph_TeslaT4_right.svg"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
### Offline: NVIDIA T4 with FP16
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA T4
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP16
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Precision | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|:------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| FP16 | 1 | 9.1 | 110.197 | 110.598 | 111.201 | 109.417 |
|
||||
| FP16 | 2 | 9.8 | 209.083 | 209.347 | 209.9 | 208.026 |
|
||||
| FP16 | 4 | 9.6 | 411.128 | 411.216 | 411.711 | 409.599 |
|
||||
|
||||
</details>
|
||||
|
||||
### Offline: NVIDIA T4 with FP32
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA T4
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP32
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Precision | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|:------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| FP32 | 1 | 3.3 | 298.003 | 298.23 | 298.585 | 295.594 |
|
||||
| FP32 | 2 | 3.4 | 592.412 | 592.505 | 592.881 | 591.209 |
|
||||
| FP32 | 4 | 3.6 | 1188.76 | 1189.1 | 1189.1 | 1187.24 |
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
## **GPU:** NVIDIA DGX A100 (1x A100 80GB)
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="plots/graph_A10080GB_left.svg"></td>
|
||||
<td><img src="plots/graph_A10080GB_right.svg"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
### Offline: NVIDIA DGX A100 (1x A100 80GB) with FP16
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA DGX A100 (1x A100 80GB)
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP16
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Precision | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|:------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| FP16 | 1 | 26.1 | 38.326 | 38.353 | 38.463 | 38.29 |
|
||||
| FP16 | 2 | 38 | 52.893 | 52.912 | 52.95 | 52.859 |
|
||||
| FP16 | 4 | 48.8 | 81.778 | 81.787 | 81.8 | 81.738 |
|
||||
|
||||
</details>
|
||||
|
||||
### Offline: NVIDIA DGX A100 (1x A100 80GB) with FP32
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA DGX A100 (1x A100 80GB)
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP32
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Precision | Client Batch Size | Inferences/second | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|:------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| FP32 | 1 | 34.6 | 29.043 | 29.088 | 29.159 | 28.918 |
|
||||
| FP32 | 2 | 39.4 | 50.942 | 50.991 | 51.118 | 50.835 |
|
||||
| FP32 | 4 | 21.2 | 299.924 | 322.953 | 354.473 | 191.724 |
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
### Online scenario
|
||||
This table lists the common variable parameters for all performance measurements:
|
||||
| Parameter Name | Parameter Value |
|
||||
|:-----------------------------|:----------------------|
|
||||
| Model Format | TorchScript Scripting |
|
||||
| Backend Accelerator | CUDA |
|
||||
| Max Batch Size | 4 |
|
||||
| Number of model instances | 1 |
|
||||
| Triton Max Queue Delay | 1 |
|
||||
| Triton Preferred Batch Sizes | 2 4 |
|
||||
|
||||
|
||||
|
||||
## **GPU:** NVIDIA DGX A100 (1x A100 80GB)
|
||||
|
||||
### Online: NVIDIA DGX A100 (1x A100 80GB) with FP16
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA DGX A100 (1x A100 80GB)
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP16
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
|
||||
![](plots/graph_performance_online_1.svg)
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| 1 | 26.1 | 0.021 | 0.081 | 0.012 | 0.037 | 3.582 | 34.551 | 0 | 38.287 | 38.318 | 38.328 | 38.356 | 38.284 |
|
||||
| 2 | 26.2 | 0.022 | 0.078 | 38.109 | 0.036 | 3.582 | 34.552 | 0 | 76.381 | 76.414 | 76.423 | 76.433 | 76.379 |
|
||||
| 3 | 33 | 0.021 | 0.095 | 42.958 | 0.05 | 3.55 | 44.282 | 0 | 90.956 | 90.992 | 91.013 | 91.107 | 90.956 |
|
||||
| 4 | 38.4 | 0.031 | 0.112 | 45.07 | 0.069 | 3.527 | 55.545 | 0 | 104.352 | 104.399 | 104.419 | 104.486 | 104.354 |
|
||||
| 5 | 41.6 | 0.027 | 0.131 | 46.829 | 0.089 | 3.522 | 69.262 | 0 | 119.861 | 119.903 | 119.91 | 119.935 | 119.86 |
|
||||
| 6 | 44.4 | 0.031 | 0.127 | 62.269 | 0.085 | 3.493 | 68.42 | 0 | 134.425 | 134.467 | 134.488 | 134.608 | 134.425 |
|
||||
| 7 | 47.6 | 0.028 | 0.146 | 72.667 | 0.091 | 3.473 | 71.421 | 0 | 147.828 | 147.868 | 147.883 | 147.912 | 147.826 |
|
||||
| 8 | 49.2 | 0.031 | 0.147 | 81.538 | 0.101 | 3.46 | 78.08 | 0 | 163.351 | 163.406 | 163.435 | 163.607 | 163.357 |
|
||||
|
||||
</details>
|
||||
|
||||
### Online: NVIDIA DGX A100 (1x A100 80GB) with FP32
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA DGX A100 (1x A100 80GB)
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP32
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
![](plots/graph_performance_online_2.svg)
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| 1 | 34.6 | 0.022 | 0.085 | 0.012 | 0.057 | 3.54 | 25.197 | 0 | 28.889 | 29.044 | 29.07 | 29.126 | 28.913 |
|
||||
| 2 | 34.7 | 0.03 | 0.101 | 28.707 | 0.056 | 3.55 | 25.185 | 0 | 57.585 | 57.755 | 57.787 | 58.012 | 57.629 |
|
||||
| 3 | 37.8 | 0.027 | 0.105 | 36.011 | 0.085 | 3.482 | 39.84 | 0 | 79.502 | 79.656 | 79.688 | 79.771 | 79.55 |
|
||||
| 4 | 39.6 | 0.026 | 0.135 | 50.617 | 0.097 | 3.424 | 47.198 | 0 | 101.463 | 101.683 | 101.718 | 101.818 | 101.497 |
|
||||
| 5 | 40 | 0.033 | 0.112 | 59.913 | 0.461 | 3.556 | 60.649 | 0 | 124.66 | 124.832 | 125.114 | 126.906 | 124.724 |
|
||||
| 6 | 37.2 | 0.03 | 0 | 83.268 | 1.142 | 3.545 | 78.663 | 0 | 148.762 | 149.446 | 150.996 | 411.775 | 166.648 |
|
||||
| 7 | 28.7 | 0.039 | 0.252 | 115 | 1.132 | 65.661 | 61.857 | 0 | 243.459 | 245.291 | 246.747 | 247.342 | 243.941 |
|
||||
| 8 | 23.6 | 0.039 | 0.199 | 168.972 | 1.052 | 112.231 | 55.827 | 0 | 338.232 | 339.188 | 339.275 | 340.472 | 338.32 |
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
## **GPU:** NVIDIA A40
|
||||
|
||||
### Online: NVIDIA A40 with FP16
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA A40
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP16
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
![](plots/graph_performance_online_3.svg)
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| 1 | 22.2 | 0.073 | 0.304 | 0.019 | 0.07 | 4.844 | 39.599 | 0 | 44.912 | 44.93 | 44.938 | 44.951 | 44.909 |
|
||||
| 2 | 22.4 | 0.075 | 0.299 | 44.198 | 0.069 | 4.844 | 39.598 | 0 | 89.083 | 89.107 | 89.12 | 89.22 | 89.083 |
|
||||
| 3 | 25.9 | 0.073 | 0.335 | 52.735 | 0.106 | 4.814 | 56.894 | 0 | 114.959 | 114.987 | 114.996 | 115.006 | 114.957 |
|
||||
| 4 | 28 | 0.073 | 0.364 | 57.54 | 0.152 | 4.798 | 79.237 | 0 | 142.167 | 142.205 | 142.215 | 142.226 | 142.164 |
|
||||
| 5 | 29.8 | 0.074 | 0.373 | 80.998 | 0.158 | 4.765 | 81.681 | 0 | 168.052 | 168.103 | 168.114 | 168.147 | 168.049 |
|
||||
| 6 | 30.9 | 0.074 | 0.386 | 97.176 | 0.181 | 4.756 | 92.607 | 0 | 195.172 | 195.235 | 195.252 | 195.666 | 195.18 |
|
||||
| 7 | 31.5 | 0.077 | 0.357 | 109.266 | 0.213 | 4.774 | 108.641 | 0 | 223.325 | 223.389 | 223.4 | 223.473 | 223.328 |
|
||||
| 8 | 32 | 0.074 | 0.359 | 125.387 | 0.237 | 4.783 | 120.746 | 0 | 251.573 | 252.62 | 252.698 | 252.857 | 251.586 |
|
||||
|
||||
</details>
|
||||
|
||||
### Online: NVIDIA A40 with FP32
|
||||
Our results were obtained using the following configuration:
|
||||
* **GPU:** NVIDIA A40
|
||||
* **Backend:** PyTorch
|
||||
* **Backend accelerator:** CUDA
|
||||
* **Precision:** FP32
|
||||
* **Model Format:** TorchScript
|
||||
* **Conversion variant:** Script
|
||||
* **Image resolution:** 4x128x128x128
|
||||
|
||||
![](plots/graph_performance_online_4.svg)
|
||||
|
||||
<details>
|
||||
|
||||
<summary>
|
||||
Full tabular data
|
||||
</summary>
|
||||
|
||||
| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|
||||
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
|
||||
| 1 | 11.1 | 0.08 | 0.286 | 0.019 | 0.124 | 4.467 | 84.525 | 0 | 89.588 | 90.336 | 90.375 | 90.553 | 89.501 |
|
||||
| 2 | 11.2 | 0.077 | 0.348 | 88.89 | 0.123 | 4.467 | 84.637 | 0 | 178.634 | 179.887 | 179.99 | 180.176 | 178.542 |
|
||||
| 3 | 11.4 | 0.078 | 0.3 | 117.917 | 0.194 | 4.391 | 142.344 | 0 | 265.26 | 265.901 | 265.941 | 266.351 | 265.224 |
|
||||
| 4 | 11.2 | 0.078 | 0.321 | 175.491 | 0.231 | 4.355 | 171.23 | 0 | 351.697 | 352.266 | 352.337 | 352.512 | 351.706 |
|
||||
| 5 | 11.5 | 0.078 | 0.353 | 210.898 | 0.671 | 4.372 | 222.115 | 0 | 438.481 | 439.348 | 439.379 | 439.805 | 438.487 |
|
||||
| 6 | 11.1 | 0.078 | 0.389 | 263.225 | 2.16 | 4.413 | 256.974 | 0 | 527.101 | 528.705 | 528.849 | 528.966 | 527.239 |
|
||||
| 7 | 11.2 | 0.076 | 0.204 | 304.798 | 2.216 | 138.105 | 178.66 | 0 | 624.066 | 625.626 | 625.732 | 625.977 | 624.059 |
|
||||
| 8 | 10.8 | 0.074 | 0.459 | 359.748 | 2.213 | 238.331 | 119.62 | 0 | 720.475 | 721.2 | 721.206 | 721.513 | 720.445 |
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
## **GPU:** NVIDIA T4

### Online: NVIDIA T4 with FP16

Our results were obtained using the following configuration:

* **GPU:** NVIDIA T4
* **Backend:** PyTorch
* **Backend accelerator:** CUDA
* **Precision:** FP16
* **Model Format:** TorchScript
* **Conversion variant:** Script
* **Image resolution:** 4x128x128x128

![](plots/graph_performance_online_5.svg)

<details>
<summary>
Full tabular data
</summary>

| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 1 | 9.1 | 0.109 | 0.388 | 0.015 | 0.151 | 3.082 | 105.624 | 0 | 109.31 | 110.144 | 110.413 | 110.505 | 109.369 |
| 2 | 9.2 | 0.116 | 0.399 | 108.562 | 0.154 | 3.094 | 105.774 | 0 | 218.195 | 219.242 | 219.55 | 219.902 | 218.099 |
| 3 | 9.3 | 0.116 | 0.5 | 141.682 | 0.244 | 3.043 | 171.276 | 0 | 316.812 | 319.269 | 319.839 | 320.185 | 316.861 |
| 4 | 9.8 | 0.116 | 0.397 | 207.308 | 0.288 | 3.053 | 204.455 | 0 | 415.558 | 416.726 | 416.902 | 417.25 | 415.617 |
| 5 | 9.7 | 0.115 | 0.263 | 252.215 | 0.372 | 3.06 | 268.918 | 0 | 525.233 | 526.928 | 527.007 | 527.18 | 524.943 |
| 6 | 9.6 | 0.114 | 0.431 | 316.091 | 0.43 | 3.087 | 313.056 | 0 | 633.186 | 634.815 | 634.871 | 634.899 | 633.209 |
| 7 | 9.4 | 0.115 | 0.385 | 356.97 | 0.507 | 3.106 | 364.103 | 0 | 725.346 | 726.226 | 726.345 | 727.387 | 725.186 |
| 8 | 10 | 0.116 | 0.425 | 408.406 | 0.57 | 3.122 | 405.21 | 0 | 818.009 | 819.843 | 819.911 | 820.552 | 817.849 |

</details>

### Online: NVIDIA T4 with FP32

Our results were obtained using the following configuration:

* **GPU:** NVIDIA T4
* **Backend:** PyTorch
* **Backend accelerator:** CUDA
* **Precision:** FP32
* **Model Format:** TorchScript
* **Conversion variant:** Script
* **Image resolution:** 4x128x128x128

![](plots/graph_performance_online_6.svg)

<details>
<summary>
Full tabular data
</summary>

| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 1 | 3.3 | 0.12 | 0.359 | 0.016 | 0.286 | 2.823 | 292.021 | 0 | 296.31 | 298.223 | 298.333 | 299.091 | 295.625 |
| 2 | 3.4 | 0.121 | 0.482 | 295.028 | 0.285 | 2.821 | 292.411 | 0 | 590.8 | 593.113 | 593.181 | 593.506 | 591.148 |
| 3 | 3.3 | 0.118 | 0.364 | 398.407 | 0.462 | 2.827 | 484.536 | 0 | 887.21 | 888.227 | 888.444 | 889.069 | 886.714 |
| 4 | 3.2 | 0.117 | 0.359 | 591.981 | 0.559 | 2.819 | 589.073 | 0 | 1185.4 | 1187.74 | 1187.74 | 1188.02 | 1184.91 |
| 5 | 3.5 | 0.13 | 0.54 | 711.986 | 1.026 | 2.816 | 768.727 | 0 | 1485.15 | 1488.09 | 1488.09 | 1488.8 | 1485.22 |
| 6 | 3.3 | 0.137 | 0.263 | 891.924 | 2.513 | 2.816 | 887.156 | 0 | 1784.96 | 1786.4 | 1786.65 | 1786.65 | 1784.81 |
| 7 | 3.5 | 0.134 | 0.61 | 1024 | 3.064 | 2.783 | 1061.49 | 0 | 2092.74 | 2094.77 | 2094.77 | 2094.77 | 2092.08 |
| 8 | 3.2 | 0.135 | 0.858 | 1195.84 | 3.696 | 2.769 | 1189.92 | 0 | 2393.93 | 2394.67 | 2394.67 | 2394.67 | 2393.22 |

</details>

## **GPU:** NVIDIA DGX-1 (1x V100 32GB)

### Online: NVIDIA DGX-1 (1x V100 32GB) with FP16

Our results were obtained using the following configuration:

* **GPU:** NVIDIA DGX-1 (1x V100 32GB)
* **Backend:** PyTorch
* **Backend accelerator:** CUDA
* **Precision:** FP16
* **Model Format:** TorchScript
* **Conversion variant:** Script
* **Image resolution:** 4x128x128x128

![](plots/graph_performance_online_7.svg)

<details>
<summary>
Full tabular data
</summary>

| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 1 | 20.4 | 0.054 | 0.21 | 0.022 | 0.07 | 5.813 | 43.068 | 0 | 49.227 | 49.347 | 49.374 | 49.481 | 49.237 |
| 2 | 20.5 | 0.058 | 0.259 | 48.734 | 0.075 | 5.8 | 43.081 | 0 | 97.959 | 98.151 | 98.226 | 98.817 | 98.007 |
| 3 | 23.4 | 0.068 | 0.31 | 58.668 | 0.105 | 5.88 | 62.955 | 0 | 127.949 | 128.335 | 128.59 | 128.9 | 127.986 |
| 4 | 25.2 | 0.068 | 0.282 | 78.717 | 0.123 | 5.779 | 73.061 | 0 | 157.991 | 158.398 | 158.599 | 158.762 | 158.03 |
| 5 | 26.5 | 0.063 | 0.303 | 90.872 | 0.15 | 5.866 | 91.174 | 0 | 188.376 | 188.815 | 189.039 | 189.349 | 188.428 |
| 6 | 27.6 | 0.067 | 0.344 | 98.88 | 0.192 | 6.017 | 112.827 | 0 | 218.299 | 219.14 | 219.271 | 219.443 | 218.327 |
| 7 | 28.3 | 0.065 | 0.285 | 121.672 | 0.194 | 5.721 | 120.488 | 0 | 248.344 | 249.172 | 249.232 | 249.367 | 248.425 |
| 8 | 28.8 | 0.056 | 0.251 | 138.819 | 0.209 | 4.977 | 133.895 | 0 | 277.678 | 279.799 | 280 | 280.367 | 278.207 |

</details>

### Online: NVIDIA DGX-1 (1x V100 32GB) with FP32

Our results were obtained using the following configuration:

* **GPU:** NVIDIA DGX-1 (1x V100 32GB)
* **Backend:** PyTorch
* **Backend accelerator:** CUDA
* **Precision:** FP32
* **Model Format:** TorchScript
* **Conversion variant:** Script
* **Image resolution:** 4x128x128x128

![](plots/graph_performance_online_8.svg)

<details>
<summary>
Full tabular data
</summary>

| Concurrent client requests | Inferences/second | Client Send | Network+server Send/recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | P50 Latency | P90 Latency | P95 Latency | P99 Latency | Avg Latency |
|-----------------------------:|--------------------:|--------------:|---------------------------:|---------------:|-----------------------:|-----------------------:|------------------------:|--------------:|--------------:|--------------:|--------------:|--------------:|--------------:|
| 1 | 10.3 | 0.05 | 0.194 | 0.016 | 0.109 | 4.508 | 91.96 | 0 | 96.843 | 97.226 | 97.299 | 97.443 | 96.837 |
| 2 | 10.4 | 0.05 | 0.206 | 96.365 | 0.106 | 4.591 | 91.863 | 0 | 193.236 | 193.883 | 193.988 | 194.156 | 193.181 |
| 3 | 10.6 | 0.052 | 0.154 | 126.753 | 0.169 | 4.543 | 150.365 | 0 | 282.048 | 282.865 | 283.024 | 283.756 | 282.036 |
| 4 | 10.8 | 0.053 | 0.178 | 185.119 | 0.201 | 4.485 | 180.649 | 0 | 370.513 | 372.052 | 372.606 | 373.333 | 370.685 |
| 5 | 11 | 0.056 | 0.261 | 222.045 | 0.759 | 4.419 | 235.089 | 0 | 462.821 | 464.299 | 464.792 | 464.954 | 462.629 |
| 6 | 11.2 | 0.056 | 0.329 | 244.152 | 0.889 | 4.44 | 302.491 | 0 | 552.087 | 553.883 | 554.899 | 556.337 | 552.357 |
| 7 | 10.9 | 0.054 | 0 | 315.268 | 1.297 | 4.412 | 325.279 | 0 | 643.661 | 645.478 | 646.317 | 699.413 | 646.31 |
| 8 | 10.8 | 0.057 | 0.237 | 366.332 | 1.247 | 4.472 | 360.891 | 0 | 733.164 | 735.221 | 735.813 | 736.436 | 733.236 |

</details>

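The per-concurrency numbers in the tables above come from an online (concurrency sweep) scenario. A sweep of this kind can be reproduced with Triton's Perf Client; the command below is only a minimal sketch under assumed settings, not the exact command used for this report: the model name `nnunet`, the default gRPC endpoint, and the output file are illustrative.

```
# Minimal sketch only - model name, endpoint, and output file are assumptions.
perf_client \
    -m nnunet \
    -u localhost:8001 -i grpc \
    -b 1 \
    --concurrency-range 1:8 \
    -f results_online.csv
```
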
# Release Notes

We’re constantly refining and improving our performance on AI and HPC workloads
with frequent updates to our software stack. For our latest performance data, refer to these pages for
[AI](https://developer.nvidia.com/deep-learning-performance-training-inference)
and [HPC](https://developer.nvidia.com/hpc-application-performance) benchmarks.

## Changelog

April 2021
- Initial release

## Known issues

- There are no known issues with this model.

PyTorch/Segmentation/nnUNet/triton/calculate_metrics.py
Executable file
@ -0,0 +1,133 @@
#!/usr/bin/env python3

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""
Using the `calculate_metrics.py` script, you can obtain model accuracy/error metrics through a user-defined `MetricsCalculator` class.

The data provided to the `MetricsCalculator` is read from npz dump files
stored in the directory pointed to by the `--dump-dir` argument.
These files are prepared by the `run_inference_on_fw.py` and `run_inference_on_triton.py` scripts.

The output data is stored in the csv file pointed to by the `--csv` argument.

Example call:

```shell script
python ./triton/calculate_metrics.py \
    --dump-dir /results/dump_triton \
    --csv /results/accuracy_results.csv \
    --metrics metrics.py \
    --metric-class-param1 value
```
"""

import argparse
import csv
import logging
import string
from pathlib import Path

import numpy as np

# method from PEP-366 to support relative import in executed modules

if __package__ is None:
    __package__ = Path(__file__).parent.name

from .deployment_toolkit.args import ArgParserGenerator
from .deployment_toolkit.core import BaseMetricsCalculator, load_from_file
from .deployment_toolkit.dump import pad_except_batch_axis

LOGGER = logging.getLogger("calculate_metrics")
TOTAL_COLUMN_NAME = "_total_"


def get_data(dump_dir, prefix):
    """Loads and concatenates dump files for given prefix (ex. inputs, outputs, labels, ids)"""
    dump_dir = Path(dump_dir)
    npz_files = sorted(dump_dir.glob(f"{prefix}*.npz"))
    data = None
    if npz_files:
        # assume that all npz files with given prefix contain same set of names
        names = list(np.load(npz_files[0].as_posix()).keys())
        # calculate target shape
        target_shape = {
            name: tuple(np.max([np.load(npz_file.as_posix())[name].shape for npz_file in npz_files], axis=0))
            for name in names
        }
        # pad and concatenate data
        data = {
            name: np.concatenate(
                [pad_except_batch_axis(np.load(npz_file.as_posix())[name], target_shape[name]) for npz_file in npz_files]
            )
            for name in names
        }
    return data


def main():
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description="Run models with given dataloader", allow_abbrev=False)
    parser.add_argument("--metrics", help="Path to python module containing metrics calculator", required=True)
    parser.add_argument("--csv", help="Path to csv file", required=True)
    parser.add_argument("--dump-dir", help="Path to directory with dumped outputs (and labels)", required=True)

    args, *_ = parser.parse_known_args()

    MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator")
    ArgParserGenerator(MetricsCalculator).update_argparser(parser)

    args = parser.parse_args()

    LOGGER.info("args:")
    for key, value in vars(args).items():
        LOGGER.info(f"  {key} = {value}")

    MetricsCalculator = load_from_file(args.metrics, "metrics", "MetricsCalculator")
    metrics_calculator: BaseMetricsCalculator = ArgParserGenerator(MetricsCalculator).from_args(args)

    ids = get_data(args.dump_dir, "ids")["ids"]
    x = get_data(args.dump_dir, "inputs")
    y_true = get_data(args.dump_dir, "labels")
    y_pred = get_data(args.dump_dir, "outputs")

    common_keys = list({k for k in (y_true or [])} & {k for k in (y_pred or [])})
    for key in common_keys:
        if y_true[key].shape != y_pred[key].shape:
            LOGGER.warning(
                f"Model predictions and labels shall have equal shapes. "
                f"y_pred[{key}].shape={y_pred[key].shape} != "
                f"y_true[{key}].shape={y_true[key].shape}"
            )

    metrics = metrics_calculator.calc(ids=ids, x=x, y_pred=y_pred, y_real=y_true)
    metrics = {TOTAL_COLUMN_NAME: len(ids), **metrics}

    metric_names_with_space = [name for name in metrics if any([c in string.whitespace for c in name])]
    if metric_names_with_space:
        raise ValueError(f"Metric names shall have no spaces; Incorrect names: {', '.join(metric_names_with_space)}")

    csv_path = Path(args.csv)
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    with csv_path.open("w") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(metrics.keys()))
        writer.writeheader()
        writer.writerow(metrics)


if __name__ == "__main__":
    main()
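
# For reference, the resulting CSV has one header row and one data row, e.g.
# (the metric name and values are purely illustrative):
#
#     _total_,dice
#     484,0.8765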

PyTorch/Segmentation/nnUNet/triton/config_model_on_triton.py
Executable file
@ -0,0 +1,202 @@
#!/usr/bin/env python3

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""
To configure a model on Triton, use the `config_model_on_triton.py` script.
It prepares the layout of the Model Repository, including the Model Configuration.

```shell script
python ./triton/config_model_on_triton.py \
    --model-repository /model_repository \
    --model-path /models/exported/model.onnx \
    --model-format onnx \
    --model-name ResNet50 \
    --model-version 1 \
    --max-batch-size 32 \
    --precision fp16 \
    --backend-accelerator trt \
    --load-model explicit \
    --timeout 120 \
    --verbose
```

If the Triton server for which the model repository is prepared runs in **explicit model control mode**,
use the `--load-model` argument to send a load_model request to the Triton Inference Server.
If the server listens on a non-default address or port, use the `--server-url` argument to point to the server control endpoint.
If the HTTP protocol must be used to communicate with the Triton server, use the `--http` argument.

To improve inference throughput, you can enable
[dynamic batching](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#dynamic-batcher)
for your model by providing the `--preferred-batch-sizes` and `--max-queue-delay-us` parameters.

For models which do not support batching, set `--max-batch-size` to 0.

By default, Triton will [automatically obtain the inputs and outputs definitions](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#auto-generated-model-configuration),
but for TorchScript and TF GraphDef models the script uses a file with I/O specs. This file is generated automatically
when the model is converted to a ScriptModule (either traced or scripted).
If a non-default path to the I/O spec file is needed, pass it with the `--io-spec` CLI argument.

The I/O spec file is a YAML file with the following structure:

```yaml
- inputs:
  - name: input
    dtype: float32  # np.dtype name
    shape: [None, 224, 224, 3]
- outputs:
  - name: probabilities
    dtype: float32
    shape: [None, 1001]
  - name: classes
    dtype: int32
    shape: [None, 1]
```

"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
|
||||
from model_navigator import Accelerator, Format, Precision
|
||||
from model_navigator.args import str2bool
|
||||
from model_navigator.log import set_logger, log_dict
|
||||
from model_navigator.triton import ModelConfig, TritonClient, TritonModelStore
|
||||
|
||||
LOGGER = logging.getLogger("config_model")
|
||||
|
||||
|
||||
def _available_enum_values(my_enum):
|
||||
return [item.value for item in my_enum]
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Create Triton model repository and model configuration", allow_abbrev=False
|
||||
)
|
||||
parser.add_argument("--model-repository", required=True, help="Path to Triton model repository.")
|
||||
parser.add_argument("--model-path", required=True, help="Path to model to configure")
|
||||
|
||||
# TODO: automation
|
||||
parser.add_argument(
|
||||
"--model-format",
|
||||
required=True,
|
||||
choices=_available_enum_values(Format),
|
||||
help="Format of model to deploy",
|
||||
)
|
||||
parser.add_argument("--model-name", required=True, help="Model name")
|
||||
parser.add_argument("--model-version", default="1", help="Version of model (default 1)")
|
||||
parser.add_argument(
|
||||
"--max-batch-size",
|
||||
type=int,
|
||||
default=32,
|
||||
help="Maximum batch size allowed for inference. "
|
||||
"A max_batch_size value of 0 indicates that batching is not allowed for the model",
|
||||
)
|
||||
# TODO: automation
|
||||
parser.add_argument(
|
||||
"--precision",
|
||||
type=str,
|
||||
default=Precision.FP16.value,
|
||||
choices=_available_enum_values(Precision),
|
||||
help="Model precision (parameter used only by Tensorflow backend with TensorRT optimization)",
|
||||
)
|
||||
|
||||
# Triton Inference Server endpoint
|
||||
parser.add_argument(
|
||||
"--server-url",
|
||||
type=str,
|
||||
default="grpc://localhost:8001",
|
||||
help="Inference server URL in format protocol://host[:port] (default grpc://localhost:8001)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--load-model",
|
||||
choices=["none", "poll", "explicit"],
|
||||
help="Loading model while Triton Server is in given model control mode",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timeout", default=120, help="Timeout in seconds to wait till model load (default=120)", type=int
|
||||
)
|
||||
|
||||
# optimization related
|
||||
parser.add_argument(
|
||||
"--backend-accelerator",
|
||||
type=str,
|
||||
choices=_available_enum_values(Accelerator),
|
||||
default=Accelerator.TRT.value,
|
||||
help="Select Backend Accelerator used to serve model",
|
||||
)
|
||||
parser.add_argument("--number-of-model-instances", type=int, default=1, help="Number of model instances per GPU")
|
||||
parser.add_argument(
|
||||
"--preferred-batch-sizes",
|
||||
type=int,
|
||||
nargs="*",
|
||||
help="Batch sizes that the dynamic batcher should attempt to create. "
|
||||
"In case --max-queue-delay-us is set and this parameter is not, default value will be --max-batch-size",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-queue-delay-us",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Max delay time which dynamic batcher shall wait to form a batch (default 0)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--capture-cuda-graph",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Use cuda capture graph (used only by TensorRT platform)",
|
||||
)
|
||||
|
||||
parser.add_argument("-v", "--verbose", help="Provide verbose logs", type=str2bool, default=False)
|
||||
args = parser.parse_args()
|
||||
|
||||
set_logger(verbose=args.verbose)
|
||||
log_dict("args", vars(args))
|
||||
|
||||
config = ModelConfig.create(
|
||||
model_path=args.model_path,
|
||||
# model definition
|
||||
model_name=args.model_name,
|
||||
model_version=args.model_version,
|
||||
model_format=args.model_format,
|
||||
precision=args.precision,
|
||||
max_batch_size=args.max_batch_size,
|
||||
# optimization
|
||||
accelerator=args.backend_accelerator,
|
||||
gpu_engine_count=args.number_of_model_instances,
|
||||
preferred_batch_sizes=args.preferred_batch_sizes or [],
|
||||
max_queue_delay_us=args.max_queue_delay_us,
|
||||
capture_cuda_graph=args.capture_cuda_graph,
|
||||
)
|
||||
|
||||
model_store = TritonModelStore(args.model_repository)
|
||||
model_store.deploy_model(model_config=config, model_path=args.model_path)
|
||||
|
||||
if args.load_model != "none":
|
||||
client = TritonClient(server_url=args.server_url, verbose=args.verbose)
|
||||
client.wait_for_server_ready(timeout=args.timeout)
|
||||
|
||||
if args.load_model == "explicit":
|
||||
client.load_model(model_name=args.model_name)
|
||||
|
||||
if args.load_model == "poll":
|
||||
time.sleep(15)
|
||||
|
||||
client.wait_for_model(model_name=args.model_name, model_version=args.model_version, timeout_s=args.timeout)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
PyTorch/Segmentation/nnUNet/triton/convert_model.py
Executable file
@ -0,0 +1,166 @@
#!/usr/bin/env python3

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
The `convert_model.py` script converts a model between formats, applying additional model optimizations
for faster inference.
It converts the model returned by a `get_model` function.

Currently supported input and output formats are:

- inputs
  - `tf-estimator` - `get_model` function returning a Tensorflow Estimator
  - `tf-keras` - `get_model` function returning a Tensorflow Keras Model
  - `tf-savedmodel` - Tensorflow SavedModel binary
  - `pyt` - `get_model` function returning a PyTorch Module
- outputs
  - `tf-savedmodel` - Tensorflow SavedModel
  - `tf-trt` - TF-TRT SavedModel
  - `ts-trace` - PyTorch traced ScriptModule
  - `ts-script` - PyTorch scripted ScriptModule
  - `onnx` - ONNX
  - `trt` - TensorRT plan file

For tf-keras input you can use:
- the `--large-model` flag - helps load models which exceed the maximum protobuf size of 2GB
- the `--tf-allow-growth` flag - controls the GPU memory growth limiting feature
  (https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth). By default it is disabled.
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
|
||||
os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "1"
|
||||
|
||||
# method from PEP-366 to support relative import in executed modules
|
||||
if __name__ == "__main__" and __package__ is None:
|
||||
__package__ = Path(__file__).parent.name
|
||||
|
||||
from .deployment_toolkit.args import ArgParserGenerator
|
||||
from .deployment_toolkit.core import (
|
||||
DATALOADER_FN_NAME,
|
||||
BaseConverter,
|
||||
BaseLoader,
|
||||
BaseSaver,
|
||||
Format,
|
||||
Precision,
|
||||
load_from_file,
|
||||
)
|
||||
from .deployment_toolkit.extensions import converters, loaders, savers
|
||||
|
||||
LOGGER = logging.getLogger("convert_model")
|
||||
|
||||
INPUT_MODEL_TYPES = [Format.TF_ESTIMATOR, Format.TF_KERAS, Format.TF_SAVEDMODEL, Format.PYT]
|
||||
OUTPUT_MODEL_TYPES = [Format.TF_SAVEDMODEL, Format.TF_TRT, Format.ONNX, Format.TRT, Format.TS_TRACE, Format.TS_SCRIPT]
|
||||
|
||||
|
||||
def _get_args():
|
||||
parser = argparse.ArgumentParser(description="Script for conversion between model formats.", allow_abbrev=False)
|
||||
parser.add_argument("--input-path", help="Path to input model file (python module or binary file)", required=True)
|
||||
parser.add_argument(
|
||||
"--input-type", help="Input model type", choices=[f.value for f in INPUT_MODEL_TYPES], required=True
|
||||
)
|
||||
parser.add_argument("--output-path", help="Path to output model file", required=True)
|
||||
parser.add_argument(
|
||||
"--output-type", help="Output model type", choices=[f.value for f in OUTPUT_MODEL_TYPES], required=True
|
||||
)
|
||||
parser.add_argument("--dataloader", help="Path to python module containing data loader")
|
||||
parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
|
||||
parser.add_argument(
|
||||
"--ignore-unknown-parameters",
|
||||
help="Ignore unknown parameters (argument often used in CI where set of arguments is constant)",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
|
||||
args, unparsed_args = parser.parse_known_args()
|
||||
|
||||
Loader: BaseLoader = loaders.get(args.input_type)
|
||||
ArgParserGenerator(Loader, module_path=args.input_path).update_argparser(parser)
|
||||
|
||||
converter_name = f"{args.input_type}--{args.output_type}"
|
||||
Converter: BaseConverter = converters.get(converter_name)
|
||||
if Converter is not None:
|
||||
ArgParserGenerator(Converter).update_argparser(parser)
|
||||
|
||||
Saver: BaseSaver = savers.get(args.output_type)
|
||||
ArgParserGenerator(Saver).update_argparser(parser)
|
||||
|
||||
if args.dataloader is not None:
|
||||
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
|
||||
ArgParserGenerator(get_dataloader_fn).update_argparser(parser)
|
||||
|
||||
if args.ignore_unknown_parameters:
|
||||
args, unknown_args = parser.parse_known_args()
|
||||
LOGGER.warning(f"Got additional args {unknown_args}")
|
||||
else:
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = _get_args()
|
||||
|
||||
log_level = logging.INFO if not args.verbose else logging.DEBUG
|
||||
log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
|
||||
logging.basicConfig(level=log_level, format=log_format)
|
||||
|
||||
LOGGER.info(f"args:")
|
||||
for key, value in vars(args).items():
|
||||
LOGGER.info(f" {key} = {value}")
|
||||
|
||||
requested_model_precision = Precision(args.precision)
|
||||
dataloader_fn = None
|
||||
|
||||
# if conversion is required, temporary change model load precision to that required by converter
|
||||
# it is for TensorRT converters which require fp32 models for all requested precisions
|
||||
converter_name = f"{args.input_type}--{args.output_type}"
|
||||
Converter: BaseConverter = converters.get(converter_name)
|
||||
if Converter:
|
||||
args.precision = Converter.required_source_model_precision(requested_model_precision).value
|
||||
|
||||
Loader: BaseLoader = loaders.get(args.input_type)
|
||||
loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args)
|
||||
model = loader.load(args.input_path)
|
||||
|
||||
|
||||
LOGGER.info("inputs: %s", model.inputs)
|
||||
LOGGER.info("outputs: %s", model.outputs)
|
||||
|
||||
if Converter: # if conversion is needed
|
||||
# dataloader must match source model precision - so not recovering it yet
|
||||
if args.dataloader is not None:
|
||||
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
|
||||
dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)
|
||||
|
||||
# recover precision to that requested by user
|
||||
args.precision = requested_model_precision.value
|
||||
|
||||
if Converter:
|
||||
converter = ArgParserGenerator(Converter).from_args(args)
|
||||
model = converter.convert(model, dataloader_fn=dataloader_fn)
|
||||
|
||||
Saver: BaseSaver = savers.get(args.output_type)
|
||||
saver = ArgParserGenerator(Saver).from_args(args)
|
||||
saver.save(model, args.output_path)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
PyTorch/Segmentation/nnUNet/triton/dataloader.py
Normal file
@ -0,0 +1,35 @@
import numpy as np
from data_loading.dali_loader import fetch_dali_loader
from sklearn.model_selection import KFold
from utils.utils import get_split, load_data


def get_dataloader_fn(*, data_dir: str, batch_size: int, precision: str):
    kwargs = {
        "dim": 3,
        "gpus": 1,
        "seed": 0,
        "num_workers": 8,
        "meta": None,
        "oversampling": 0,
        "benchmark": False,
        "patch_size": [128, 128, 128],
    }

    imgs, lbls = load_data(data_dir, "*_x.npy"), load_data(data_dir, "*_y.npy")
    kfold = KFold(n_splits=5, shuffle=True, random_state=12345)
    _, val_idx = list(kfold.split(imgs))[2]
    imgs, lbls = get_split(imgs, val_idx), get_split(lbls, val_idx)
    dataloader = fetch_dali_loader(imgs, lbls, batch_size, "bermuda", **kwargs)

    def _dataloader_fn():
        for i, batch in enumerate(dataloader):
            fname = [f"{i}_{j}" for j in range(batch_size)]
            img = batch["image"].numpy()
            if "fp16" in precision:
                img = img.astype(np.half)
            img = {"INPUT__0": img}
            lbl = {"OUTPUT__0": batch["label"].squeeze(1).numpy().astype(int)}
            yield fname, img, lbl

    return _dataloader_fn
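
# Illustrative usage (not part of this module): each iteration of the returned
# callable yields (ids, x, y), the triple consumed by the run_inference_on_*
# scripts; shapes below assume the 4x128x128x128 configuration described in the
# README and a hypothetical /data directory.
#
#     fn = get_dataloader_fn(data_dir="/data", batch_size=1, precision="fp16")
#     ids, x, y = next(iter(fn()))
#     # ids -> ["0_0"], x["INPUT__0"].shape -> (1, 4, 128, 128, 128),
#     # y["OUTPUT__0"] -> integer label volume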

@ -0,0 +1 @@
0.5.0-3-g23aa76a3
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
124
PyTorch/Segmentation/nnUNet/triton/deployment_toolkit/args.py
Normal file
|
@ -0,0 +1,124 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import inspect
|
||||
import logging
|
||||
from typing import Any, Callable, Dict, Optional, Union
|
||||
|
||||
from .core import GET_ARGPARSER_FN_NAME, load_from_file
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def str2bool(v):
|
||||
if isinstance(v, bool):
|
||||
return v
|
||||
if v.lower() in ("yes", "true", "t", "y", "1"):
|
||||
return True
|
||||
elif v.lower() in ("no", "false", "f", "n", "0"):
|
||||
return False
|
||||
else:
|
||||
raise argparse.ArgumentTypeError("Boolean value expected.")
|
||||
|
||||
|
||||
def filter_fn_args(args: Union[dict, argparse.Namespace], fn: Callable) -> dict:
|
||||
signature = inspect.signature(fn)
|
||||
parameters_names = list(signature.parameters)
|
||||
if isinstance(args, argparse.Namespace):
|
||||
args = vars(args)
|
||||
args = {k: v for k, v in args.items() if k in parameters_names}
|
||||
return args
|
||||
|
||||
|
||||
def add_args_for_fn_signature(parser, fn) -> argparse.ArgumentParser:
|
||||
parser.conflict_handler = "resolve"
|
||||
signature = inspect.signature(fn)
|
||||
for parameter in signature.parameters.values():
|
||||
if parameter.name in ["self", "args", "kwargs"]:
|
||||
continue
|
||||
argument_kwargs = {}
|
||||
if parameter.annotation != inspect.Parameter.empty:
|
||||
if parameter.annotation == bool:
|
||||
argument_kwargs["type"] = str2bool
|
||||
argument_kwargs["choices"] = [0, 1]
|
||||
elif isinstance(parameter.annotation, type(Optional[Any])):
|
||||
types = [type_ for type_ in parameter.annotation.__args__ if not isinstance(None, type_)]
|
||||
if len(types) != 1:
|
||||
raise RuntimeError(
|
||||
f"Could not prepare argument parser for {parameter.name}: {parameter.annotation} in {fn}"
|
||||
)
|
||||
argument_kwargs["type"] = types[0]
|
||||
else:
|
||||
argument_kwargs["type"] = parameter.annotation
|
||||
|
||||
if parameter.default != inspect.Parameter.empty:
|
||||
if parameter.annotation == bool:
|
||||
argument_kwargs["default"] = str2bool(parameter.default)
|
||||
else:
|
||||
argument_kwargs["default"] = parameter.default
|
||||
else:
|
||||
argument_kwargs["required"] = True
|
||||
name = parameter.name.replace("_", "-")
|
||||
LOGGER.debug(f"Adding argument {name} with {argument_kwargs}")
|
||||
parser.add_argument(f"--{name}", **argument_kwargs)
|
||||
return parser
|
||||
|
||||
|
||||
class ArgParserGenerator:
|
||||
def __init__(self, cls_or_fn, module_path: Optional[str] = None):
|
||||
self._cls_or_fn = cls_or_fn
|
||||
|
||||
self._handle = cls_or_fn if inspect.isfunction(cls_or_fn) else getattr(cls_or_fn, "__init__")
|
||||
input_is_python_file = module_path and module_path.endswith(".py")
|
||||
self._input_path = module_path if input_is_python_file else None
|
||||
self._required_fn_name_for_signature_parsing = getattr(
|
||||
cls_or_fn, "required_fn_name_for_signature_parsing", None
|
||||
)
|
||||
|
||||
def update_argparser(self, parser):
|
||||
name = self._handle.__name__
|
||||
group_parser = parser.add_argument_group(name)
|
||||
add_args_for_fn_signature(group_parser, fn=self._handle)
|
||||
self._update_argparser(group_parser)
|
||||
|
||||
def get_args(self, args: argparse.Namespace):
|
||||
filtered_args = filter_fn_args(args, fn=self._handle)
|
||||
|
||||
tmp_parser = argparse.ArgumentParser(allow_abbrev=False)
|
||||
self._update_argparser(tmp_parser)
|
||||
custom_names = [
|
||||
p.dest.replace("-", "_") for p in tmp_parser._actions if not isinstance(p, argparse._HelpAction)
|
||||
]
|
||||
custom_params = {n: getattr(args, n) for n in custom_names}
|
||||
filtered_args = {**filtered_args, **custom_params}
|
||||
return filtered_args
|
||||
|
||||
def from_args(self, args: Union[argparse.Namespace, Dict]):
|
||||
args = self.get_args(args)
|
||||
LOGGER.info(f"Initializing {self._cls_or_fn.__name__}({args})")
|
||||
return self._cls_or_fn(**args)
|
||||
|
||||
def _update_argparser(self, parser):
|
||||
label = "argparser_update"
|
||||
if self._input_path:
|
||||
update_argparser_handle = load_from_file(self._input_path, label=label, target=GET_ARGPARSER_FN_NAME)
|
||||
if update_argparser_handle:
|
||||
update_argparser_handle(parser)
|
||||
elif self._required_fn_name_for_signature_parsing:
|
||||
fn_handle = load_from_file(
|
||||
self._input_path, label=label, target=self._required_fn_name_for_signature_parsing
|
||||
)
|
||||
if fn_handle:
|
||||
add_args_for_fn_signature(parser, fn_handle)
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -0,0 +1,237 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
# pytype: disable=import-error
|
||||
import onnx
|
||||
import onnx.optimizer
|
||||
import onnx.shape_inference
|
||||
import onnxruntime
|
||||
from google.protobuf import text_format
|
||||
from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
|
||||
|
||||
# pytype: enable=import-error
|
||||
|
||||
from ..core import BaseLoader, BaseRunner, BaseRunnerSession, BaseSaver, Format, Model, Precision, TensorSpec
|
||||
from ..extensions import loaders, runners, savers
|
||||
from .utils import infer_precision
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _value_info2tensor_spec(value_info: onnx.ValueInfoProto):
|
||||
onnx_data_type_map = {"float": "float32", "double": "float64"}
|
||||
|
||||
elem_type_name = onnx.TensorProto.DataType.Name(value_info.type.tensor_type.elem_type).lower()
|
||||
dtype = onnx_data_type_map.get(elem_type_name, elem_type_name)
|
||||
|
||||
def _get_dim(dim):
|
||||
which = dim.WhichOneof("value")
|
||||
if which is not None: # which is None when dim is None
|
||||
dim = getattr(dim, which)
|
||||
return None if isinstance(dim, (str, bytes)) else dim
|
||||
|
||||
shape = value_info.type.tensor_type.shape
|
||||
shape = tuple([_get_dim(d) for d in shape.dim])
|
||||
return TensorSpec(value_info.name, dtype=dtype, shape=shape)
|
||||
|
||||
|
||||
def _infer_graph_precision(onnx_graph: onnx.GraphProto) -> Optional[Precision]:
|
||||
import networkx as nx
|
||||
|
||||
# build directed graph
|
||||
nx_graph = nx.DiGraph()
|
||||
|
||||
def _get_dtype(vi):
|
||||
t = vi.type
|
||||
if hasattr(t, "tensor_type"):
|
||||
type_id = t.tensor_type.elem_type
|
||||
else:
|
||||
raise NotImplementedError("Not implemented yet")
|
||||
return TENSOR_TYPE_TO_NP_TYPE[type_id]
|
||||
|
||||
node_output2type = {vi.name: _get_dtype(vi) for vi in onnx_graph.value_info}
|
||||
|
||||
node_outputs2node = {output_name: node for node in onnx_graph.node for output_name in node.output}
|
||||
node_inputs2node = {input_name: node for node in onnx_graph.node for input_name in node.input}
|
||||
|
||||
for node in onnx_graph.node:
|
||||
node_dtype = node_output2type.get("+".join(node.output), None)
|
||||
nx_graph.add_node(
|
||||
node.name,
|
||||
op=node.op_type,
|
||||
attr={a.name: a for a in node.attribute},
|
||||
dtype=node_dtype,
|
||||
)
|
||||
for input_name in node.input:
|
||||
prev_node = node_outputs2node.get(input_name, None)
|
||||
if prev_node:
|
||||
nx_graph.add_edge(prev_node.name, node.name)
|
||||
|
||||
for input_node in onnx_graph.input:
|
||||
input_name = input_node.name
|
||||
nx_graph.add_node(input_name, op="input", dtype=_get_dtype(input_node))
|
||||
next_node = node_inputs2node.get(input_name, None)
|
||||
if next_node:
|
||||
nx_graph.add_edge(input_name, next_node.name)
|
||||
|
||||
for output in onnx_graph.output:
|
||||
output_name = output.name
|
||||
nx_graph.add_node(output_name, op="output", dtype=_get_dtype(output))
|
||||
prev_node = node_outputs2node.get(output_name, None)
|
||||
if prev_node:
|
||||
nx_graph.add_edge(prev_node.name, output_name)
|
||||
else:
|
||||
LOGGER.warning(f"Could not find previous node for {output_name}")
|
||||
|
||||
input_names = [n.name for n in onnx_graph.input]
|
||||
output_names = [n.name for n in onnx_graph.output]
|
||||
most_common_dtype = infer_precision(nx_graph, input_names, output_names, lambda node: node.get("dtype", None))
|
||||
if most_common_dtype is not None:
|
||||
precision = {np.dtype("float32"): Precision.FP32, np.dtype("float16"): Precision.FP16}[most_common_dtype]
|
||||
else:
|
||||
precision = None
|
||||
return precision
|
||||
|
||||
|
||||
class OnnxLoader(BaseLoader):
|
||||
def load(self, model_path: Union[str, Path], **_) -> Model:
|
||||
if isinstance(model_path, Path):
|
||||
model_path = model_path.as_posix()
|
||||
|
||||
model = onnx.load(model_path)
|
||||
onnx.checker.check_model(model)
|
||||
onnx.helper.strip_doc_string(model)
|
||||
model = onnx.shape_inference.infer_shapes(model)
|
||||
|
||||
# TODO: modification of the ONNX model's I/O probably causes an error in optimize
|
||||
# from onnx.utils import polish_model
|
||||
# model = polish_model(model) # run checker, docs strip, optimizer and shape inference
|
||||
|
||||
inputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.input}
|
||||
outputs = {vi.name: _value_info2tensor_spec(vi) for vi in model.graph.output}
|
||||
|
||||
precision = _infer_graph_precision(model.graph)
|
||||
|
||||
return Model(model, precision, inputs, outputs)
|
||||
|
||||
|
||||
class OnnxSaver(BaseSaver):
|
||||
def __init__(self, as_text: bool = False):
|
||||
self._as_text = as_text
|
||||
|
||||
def save(self, model: Model, model_path: Union[str, Path]) -> None:
|
||||
model_path = Path(model_path)
|
||||
LOGGER.debug(f"Saving ONNX model to {model_path.as_posix()}")
|
||||
model_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
onnx_model: onnx.ModelProto = model.handle
|
||||
if self._as_text:
|
||||
with model_path.open("w") as f:
|
||||
f.write(text_format.MessageToString(onnx_model))
|
||||
else:
|
||||
with model_path.open("wb") as f:
|
||||
f.write(onnx_model.SerializeToString())
|
||||
|
||||
|
||||
"""
|
||||
ExecutionProviders on onnxruntime 1.4.0
|
||||
['TensorrtExecutionProvider',
|
||||
'CUDAExecutionProvider',
|
||||
'MIGraphXExecutionProvider',
|
||||
'NGRAPHExecutionProvider',
|
||||
'OpenVINOExecutionProvider',
|
||||
'DnnlExecutionProvider',
|
||||
'NupharExecutionProvider',
|
||||
'VitisAIExecutionProvider',
|
||||
'ArmNNExecutionProvider',
|
||||
'ACLExecutionProvider',
|
||||
'CPUExecutionProvider']
|
||||
"""
|
||||
|
||||
|
||||
def _check_providers(providers):
|
||||
providers = providers or []
|
||||
if not isinstance(providers, (list, tuple)):
|
||||
providers = [providers]
|
||||
available_providers = onnxruntime.get_available_providers()
|
||||
unavailable = set(providers) - set(available_providers)
|
||||
if unavailable:
|
||||
raise RuntimeError(f"Unavailable providers {unavailable}")
|
||||
return providers
|
||||
|
||||
|
||||
class OnnxRunner(BaseRunner):
|
||||
def __init__(self, verbose_runtime_logs: bool = False):
|
||||
self._providers = None
|
||||
self._verbose_runtime_logs = verbose_runtime_logs
|
||||
|
||||
def init_inference(self, model: Model):
|
||||
assert isinstance(model.handle, onnx.ModelProto)
|
||||
return OnnxRunnerSession(
|
||||
model=model, providers=self._providers, verbose_runtime_logs=self._verbose_runtime_logs
|
||||
)
|
||||
|
||||
|
||||
class OnnxRunnerSession(BaseRunnerSession):
|
||||
def __init__(self, model: Model, providers, verbose_runtime_logs: bool = False):
|
||||
super().__init__(model)
|
||||
self._input_names = None
|
||||
self._output_names = None
|
||||
self._session = None
|
||||
self._providers = providers
|
||||
self._verbose_runtime_logs = verbose_runtime_logs
|
||||
self._old_env_values = {}
|
||||
|
||||
def __enter__(self):
|
||||
self._old_env_values = self._set_env_variables()
|
||||
sess_options = onnxruntime.SessionOptions() # default session options
|
||||
if self._verbose_runtime_logs:
|
||||
sess_options.log_severity_level = 0
|
||||
sess_options.log_verbosity_level = 1
|
||||
LOGGER.info(
|
||||
f"Starting inference session for onnx model providers={self._providers} sess_options={sess_options}"
|
||||
)
|
||||
|
||||
self._input_names = list(self._model.inputs)
|
||||
self._output_names = list(self._model.outputs)
|
||||
|
||||
model_payload = self._model.handle.SerializeToString()
|
||||
self._session = onnxruntime.InferenceSession(
|
||||
model_payload, providers=self._providers, sess_options=sess_options
|
||||
)
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self._input_names = None
|
||||
self._output_names = None
|
||||
self._session = None
|
||||
self._recover_env_variables(self._old_env_values)
|
||||
|
||||
def __call__(self, x: Dict[str, object]):
|
||||
feed_dict = {k: x[k] for k in self._input_names}
|
||||
y_pred = self._session.run(self._output_names, feed_dict)
|
||||
y_pred = dict(zip(self._output_names, y_pred))
|
||||
|
||||
return y_pred
|
||||
|
||||
|
||||
loaders.register_extension(Format.ONNX.value, OnnxLoader)
|
||||
runners.register_extension(Format.ONNX.value, OnnxRunner)
|
||||
savers.register_extension(Format.ONNX.value, OnnxSaver)
|
|
@ -0,0 +1,114 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from typing import Dict, Iterable, Optional
|
||||
|
||||
# pytype: disable=import-error
|
||||
import onnx
|
||||
import tensorrt as trt
|
||||
|
||||
from ..core import BaseConverter, Format, Model, Precision, ShapeSpec
|
||||
from ..extensions import converters
|
||||
from .utils import get_input_shapes
|
||||
|
||||
# pytype: enable=import-error
|
||||
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
|
||||
|
||||
|
||||
class Onnx2TRTConverter(BaseConverter):
|
||||
def __init__(self, *, max_batch_size: int, max_workspace_size: int, precision: str):
|
||||
self._max_batch_size = max_batch_size
|
||||
self._max_workspace_size = max_workspace_size
|
||||
self._precision = Precision(precision)
|
||||
|
||||
def convert(self, model: Model, dataloader_fn) -> Model:
|
||||
input_shapes = get_input_shapes(dataloader_fn(), self._max_batch_size)
|
||||
cuda_engine = onnx2trt(
|
||||
model.handle,
|
||||
shapes=input_shapes,
|
||||
max_workspace_size=self._max_workspace_size,
|
||||
max_batch_size=self._max_batch_size,
|
||||
model_precision=self._precision.value,
|
||||
)
|
||||
return model._replace(handle=cuda_engine)
|
||||
|
||||
@staticmethod
|
||||
def required_source_model_precision(requested_model_precision: Precision) -> Precision:
|
||||
# TensorRT requires source models to be in FP32 precision
|
||||
return Precision.FP32
|
||||
|
||||
|
||||
def onnx2trt(
|
||||
onnx_model: onnx.ModelProto,
|
||||
*,
|
||||
shapes: Dict[str, ShapeSpec],
|
||||
max_workspace_size: int,
|
||||
max_batch_size: int,
|
||||
model_precision: str,
|
||||
) -> "trt.ICudaEngine":
|
||||
"""
|
||||
Converts onnx model to TensorRT ICudaEngine
|
||||
Args:
|
||||
onnx_model: onnx.Model to convert
|
||||
shapes: dictionary containing min shape, max shape, opt shape for each input name
|
||||
max_workspace_size: The maximum GPU temporary memory which the CudaEngine can use at execution time.
|
||||
max_batch_size: The maximum batch size which can be used at execution time,
|
||||
and also the batch size for which the CudaEngine will be optimized.
|
||||
model_precision: precision of kernels (possible values: fp16, fp32)
|
||||
|
||||
Returns: TensorRT ICudaEngine
|
||||
"""
|
||||
# Whether or not 16-bit kernels are permitted.
|
||||
# During :class:`ICudaEngine` build fp16 kernels will also be tried when this mode is enabled.
|
||||
fp16_mode = "16" in model_precision
|
||||
|
||||
builder = trt.Builder(TRT_LOGGER)
|
||||
builder.fp16_mode = fp16_mode
|
||||
builder.max_batch_size = max_batch_size
|
||||
builder.max_workspace_size = max_workspace_size
|
||||
|
||||
# In TensorRT 7.0, the ONNX parser only supports full-dimensions mode,
|
||||
# meaning that your network definition must be created with the explicitBatch flag set.
|
||||
# For more information, see
|
||||
# https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work_dynamic_shapes
|
||||
flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
|
||||
network = builder.create_network(flags)
|
||||
|
||||
with trt.OnnxParser(network, TRT_LOGGER) as parser:
|
||||
# onnx model parsing
|
||||
if not parser.parse(onnx_model.SerializeToString()):
|
||||
for i in range(parser.num_errors):
|
||||
LOGGER.error(f"OnnxParser error {i}/{parser.num_errors}: {parser.get_error(i)}")
|
||||
raise RuntimeError("Error during parsing ONNX model (see logs for details)")
|
||||
|
||||
# optimization
|
||||
config = builder.create_builder_config()
|
||||
config.flags |= bool(fp16_mode) << int(trt.BuilderFlag.FP16)
|
||||
config.max_workspace_size = max_workspace_size
|
||||
|
||||
profile = builder.create_optimization_profile()
|
||||
for name, spec in shapes.items():
|
||||
profile.set_shape(name, **spec._asdict())
|
||||
|
||||
config.add_optimization_profile(profile)
|
||||
engine = builder.build_engine(network, config=config)
|
||||
|
||||
return engine
|
||||
|
||||
|
||||
converters.register_extension(f"{Format.ONNX.value}--{Format.TRT.value}", Onnx2TRTConverter)
|
|
@ -0,0 +1,358 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import os
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, NamedTuple, Optional, Union
|
||||
|
||||
import torch # pytype: disable=import-error
|
||||
import yaml
|
||||
|
||||
from ..core import (
|
||||
GET_MODEL_FN_NAME,
|
||||
BaseConverter,
|
||||
BaseLoader,
|
||||
BaseRunner,
|
||||
BaseRunnerSession,
|
||||
BaseSaver,
|
||||
Format,
|
||||
Model,
|
||||
Precision,
|
||||
TensorSpec,
|
||||
load_from_file,
|
||||
)
|
||||
from ..extensions import converters, loaders, runners, savers
|
||||
from .utils import get_dynamic_axes, get_input_shapes, get_shapes_with_dynamic_axes
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class InputOutputSpec(NamedTuple):
|
||||
inputs: Dict[str, TensorSpec]
|
||||
outputs: Dict[str, TensorSpec]
|
||||
|
||||
|
||||
def get_sample_input(dataloader, device):
|
||||
for batch in dataloader:
|
||||
_, x, _ = batch
|
||||
break
|
||||
if isinstance(x, dict):
|
||||
sample_input = list(x.values())
|
||||
elif isinstance(x, list):
|
||||
sample_input = x
|
||||
else:
|
||||
raise TypeError("The first element (x) of batch returned by dataloader must be a list or a dict")
|
||||
|
||||
for idx, s in enumerate(sample_input):
|
||||
        sample_input[idx] = torch.from_numpy(s).to(device)

    return tuple(sample_input)


def get_model_device(torch_model):
    if next(torch_model.parameters()).is_cuda:
        return "cuda"
    else:
        return "cpu"


def infer_model_precision(model):
    counter = Counter()
    for param in model.parameters():
        counter[param.dtype] += 1
    if counter[torch.float16] > 0:
        return Precision.FP16
    else:
        return Precision.FP32


def _get_tensor_dtypes(dataloader, precision):
    def _get_dtypes(t):
        dtypes = {}
        for k, v in t.items():
            dtype = str(v.dtype)
            if dtype == "float64":
                dtype = "float32"
            if precision == Precision.FP16 and dtype == "float32":
                dtype = "float16"
            dtypes[k] = dtype
        return dtypes

    input_dtypes = {}
    output_dtypes = {}

    for batch in dataloader:
        _, x, y = batch
        input_dtypes = _get_dtypes(x)
        output_dtypes = _get_dtypes(y)
        break

    return input_dtypes, output_dtypes


### TODO assumption: floating point input
### type has same precision as the model
def _get_io_spec(model, dataloader_fn):
    precision = model.precision

    dataloader = dataloader_fn()
    input_dtypes, output_dtypes = _get_tensor_dtypes(dataloader, precision)
    input_shapes, output_shapes = get_shapes_with_dynamic_axes(dataloader)

    inputs = {
        name: TensorSpec(name=name, dtype=input_dtypes[name], shape=tuple(input_shapes[name])) for name in model.inputs
    }
    outputs = {
        name: TensorSpec(name=name, dtype=output_dtypes[name], shape=tuple(output_shapes[name]))
        for name in model.outputs
    }

    return InputOutputSpec(inputs, outputs)


class PyTorchModelLoader(BaseLoader):
    required_fn_name_for_signature_parsing: Optional[str] = GET_MODEL_FN_NAME

    def __init__(self, **kwargs):
        self._model_args = kwargs

    def load(self, model_path: Union[str, Path], **_) -> Model:
        if isinstance(model_path, Path):
            model_path = model_path.as_posix()
        get_model = load_from_file(model_path, "model", GET_MODEL_FN_NAME)
        model, tensor_infos = get_model(**self._model_args)
        io_spec = InputOutputSpec(tensor_infos["inputs"], tensor_infos["outputs"])
        precision = infer_model_precision(model)
        return Model(handle=model, precision=precision, inputs=io_spec.inputs, outputs=io_spec.outputs)


class TorchScriptLoader(BaseLoader):
    def __init__(self, tensor_names_path: str = None, **kwargs):
        self._model_args = kwargs
        self._io_spec = None
        if tensor_names_path is not None:
            with Path(tensor_names_path).open("r") as fh:
                tensor_infos = yaml.load(fh, Loader=yaml.SafeLoader)
                self._io_spec = InputOutputSpec(tensor_infos["inputs"], tensor_infos["outputs"])

    def load(self, model_path: Union[str, Path], **_) -> Model:
        if not isinstance(model_path, Path):
            model_path = Path(model_path)
        model = torch.jit.load(model_path.as_posix())
        precision = infer_model_precision(model)

        io_spec = self._io_spec
        if not io_spec:
            yaml_path = model_path.parent / f"{model_path.stem}.yaml"
            if not yaml_path.is_file():
                raise ValueError(
                    f"If `--tensor-names-path is not provided, "
                    f"TorchScript model loader expects file {yaml_path} with tensor information."
                )
            with yaml_path.open("r") as fh:
                tensor_info = yaml.load(fh, Loader=yaml.SafeLoader)
                io_spec = InputOutputSpec(tensor_info["inputs"], tensor_info["outputs"])

        return Model(handle=model, precision=precision, inputs=io_spec.inputs, outputs=io_spec.outputs)


class TorchScriptTraceConverter(BaseConverter):
    def __init__(self):
        pass

    def convert(self, model: Model, dataloader_fn) -> Model:
        device = get_model_device(model.handle)
        dummy_input = get_sample_input(dataloader_fn(), device)
        converted_model = torch.jit.trace_module(model.handle, {"forward": dummy_input})
        io_spec = _get_io_spec(model, dataloader_fn)
        return Model(converted_model, precision=model.precision, inputs=io_spec.inputs, outputs=io_spec.outputs)


class TorchScriptScriptConverter(BaseConverter):
    def __init__(self):
        pass

    def convert(self, model: Model, dataloader_fn) -> Model:
        converted_model = torch.jit.script(model.handle)
        io_spec = _get_io_spec(model, dataloader_fn)
        return Model(converted_model, precision=model.precision, inputs=io_spec.inputs, outputs=io_spec.outputs)


class PYT2ONNXConverter(BaseConverter):
    def __init__(self, onnx_opset: int = None):
        self._onnx_opset = onnx_opset

    def convert(self, model: Model, dataloader_fn) -> Model:
        import tempfile

        import onnx  # pytype: disable=import-error

        assert isinstance(model.handle, torch.jit.ScriptModule) or isinstance(
            model.handle, torch.nn.Module
        ), "The model must be of type 'torch.jit.ScriptModule' or 'torch.nn.Module'. Converter aborted."

        dynamic_axes = get_dynamic_axes(dataloader_fn())

        device = get_model_device(model.handle)
        dummy_input = get_sample_input(dataloader_fn(), device)

        with tempfile.TemporaryDirectory() as tmpdirname:
            export_path = os.path.join(tmpdirname, "model.onnx")
            with torch.no_grad():
                torch.onnx.export(
                    model.handle,
                    dummy_input,
                    export_path,
                    do_constant_folding=True,
                    input_names=list(model.inputs),
                    output_names=list(model.outputs),
                    dynamic_axes=dynamic_axes,
                    opset_version=self._onnx_opset,
                    enable_onnx_checker=True,
                )

            onnx_model = onnx.load(export_path)
            onnx.checker.check_model(onnx_model)
            onnx.helper.strip_doc_string(onnx_model)
            onnx_model = onnx.shape_inference.infer_shapes(onnx_model)

        return Model(
            handle=onnx_model,
            precision=model.precision,
            inputs=model.inputs,
            outputs=model.outputs,
        )


class PYT2TensorRTConverter(BaseConverter):
    def __init__(self, max_batch_size: int, max_workspace_size: int, onnx_opset: int, precision: str):
        self._max_batch_size = max_batch_size
        self._max_workspace_size = max_workspace_size
        self._onnx_opset = onnx_opset
        self._precision = Precision(precision)

    def convert(self, model: Model, dataloader_fn) -> Model:
        from .onnx import _infer_graph_precision
        from .onnx2trt_conv import onnx2trt

        pyt2onnx_converter = PYT2ONNXConverter(self._onnx_opset)
        onnx_model = pyt2onnx_converter.convert(model, dataloader_fn).handle
        precision = _infer_graph_precision(onnx_model.graph)

        input_shapes = get_input_shapes(dataloader_fn(), self._max_batch_size)

        cuda_engine = onnx2trt(
            onnx_model,
            shapes=input_shapes,
            max_workspace_size=self._max_workspace_size,
            max_batch_size=self._max_batch_size,
            model_precision=self._precision.value,
        )

        return Model(
            handle=cuda_engine,
            precision=model.precision,
            inputs=model.inputs,
            outputs=model.outputs,
        )

    @staticmethod
    def required_source_model_precision(requested_model_precision: Precision) -> Precision:
        # TensorRT requires source models to be in FP32 precision
        return Precision.FP32


class TorchScriptSaver(BaseSaver):
    def save(self, model: Model, model_path: Union[str, Path]) -> None:
        if not isinstance(model_path, Path):
            model_path = Path(model_path)
        if isinstance(model.handle, torch.jit.ScriptModule):
            torch.jit.save(model.handle, model_path.as_posix())
        else:
            print("The model must be of type 'torch.jit.ScriptModule'. Saving aborted.")
            assert False  # temporary error handling

        def _format_tensor_spec(tensor_spec):
            # wrapping shape with list and whole tensor_spec with dict() is required for correct yaml dump
            tensor_spec = tensor_spec._replace(shape=list(tensor_spec.shape))
            tensor_spec = dict(tensor_spec._asdict())
            return tensor_spec

        # store TensorSpecs from inputs and outputs in a yaml file
        tensor_specs = {
            "inputs": {k: _format_tensor_spec(v) for k, v in model.inputs.items()},
            "outputs": {k: _format_tensor_spec(v) for k, v in model.outputs.items()},
        }

        yaml_path = model_path.parent / f"{model_path.stem}.yaml"
        with Path(yaml_path).open("w") as fh:
            yaml.dump(tensor_specs, fh, indent=4)


class PyTorchRunner(BaseRunner):
    def __init__(self):
        pass

    def init_inference(self, model: Model):
        return PyTorchRunnerSession(model=model)


class PyTorchRunnerSession(BaseRunnerSession):
    def __init__(self, model: Model):
        super().__init__(model)

        assert isinstance(model.handle, torch.jit.ScriptModule) or isinstance(
            model.handle, torch.nn.Module
        ), "The model must be of type 'torch.jit.ScriptModule' or 'torch.nn.Module'. Runner aborted."

        self._model = model
        self._output_names = None

    def __enter__(self):
        self._output_names = list(self._model.outputs)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._output_names = None
        self._model = None

    def __call__(self, x: Dict[str, object]):
        with torch.no_grad():
            feed_list = [torch.from_numpy(v).cuda() for k, v in x.items()]
            y_pred = self._model.handle(*feed_list)
            if isinstance(y_pred, torch.Tensor):
                y_pred = (y_pred,)
            y_pred = [t.cpu().numpy() for t in y_pred]
            y_pred = dict(zip(self._output_names, y_pred))

        return y_pred


loaders.register_extension(Format.PYT.value, PyTorchModelLoader)
loaders.register_extension(Format.TS_TRACE.value, TorchScriptLoader)
loaders.register_extension(Format.TS_SCRIPT.value, TorchScriptLoader)

converters.register_extension(f"{Format.PYT.value}--{Format.TS_SCRIPT.value}", TorchScriptScriptConverter)
converters.register_extension(f"{Format.PYT.value}--{Format.TS_TRACE.value}", TorchScriptTraceConverter)
converters.register_extension(f"{Format.PYT.value}--{Format.ONNX.value}", PYT2ONNXConverter)
converters.register_extension(f"{Format.PYT.value}--{Format.TRT.value}", PYT2TensorRTConverter)

savers.register_extension(Format.TS_SCRIPT.value, TorchScriptSaver)
savers.register_extension(Format.TS_TRACE.value, TorchScriptSaver)

runners.register_extension(Format.PYT.value, PyTorchRunner)
runners.register_extension(Format.TS_SCRIPT.value, PyTorchRunner)
runners.register_extension(Format.TS_TRACE.value, PyTorchRunner)
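The registrations above wire each format string to a loader, converter, saver, and runner class. A minimal end-to-end sketch of how these registries could be driven together is shown below; the checkpoint and data paths, tensor shapes, and the toy re-iterable dataloader are assumptions for illustration only and are not part of this PR.

```python
# Hypothetical usage sketch of the extension registries (paths/shapes are assumptions).
import numpy as np

from triton.deployment_toolkit.extensions import converters, loaders, runners, savers


def dataloader_fn(batch_size: int = 1):
    # Assumed batch layout: (ids, inputs_dict, labels_dict) with numpy arrays;
    # returning a list keeps the dataloader re-iterable, which the converters rely on.
    x = {"INPUT__0": np.zeros((batch_size, 4, 128, 128, 128), dtype=np.float32)}
    y = {"OUTPUT__0": np.zeros((batch_size, 4, 128, 128, 128), dtype=np.float32)}
    return [(["0"], x, y)]


# "pyt" loader runs get_model() from the pointed python file with the given kwargs.
loader = loaders.get("pyt")(checkpoint_dir="/checkpoints/last.ckpt", precision="fp16", data_dir="/data/01_3d")
model = loader.load("triton/model.py")

# Convert PyTorch -> TorchScript trace, save it, and run a quick sanity inference (GPU required).
converted = converters.get("pyt--ts-trace")().convert(model, dataloader_fn)
savers.get("ts-trace")().save(converted, "/results/model.pt")
with runners.get("ts-trace")().init_inference(model=converted) as session:
    _, x, _ = dataloader_fn()[0]
    outputs = session(x)  # dict of numpy arrays keyed by output name
```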
@ -0,0 +1,216 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, NamedTuple, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
# pytype: disable=import-error
|
||||
try:
|
||||
import pycuda.autoinit
|
||||
import pycuda.driver as cuda
|
||||
except (ImportError, Exception) as e:
|
||||
logging.getLogger(__name__).warning(f"Problems with importing pycuda package; {e}")
|
||||
# pytype: enable=import-error
|
||||
|
||||
import tensorrt as trt # pytype: disable=import-error
|
||||
|
||||
from ..core import BaseLoader, BaseRunner, BaseRunnerSession, BaseSaver, Format, Model, Precision, TensorSpec
|
||||
from ..extensions import loaders, runners, savers
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
|
||||
|
||||
"""
|
||||
documentation:
|
||||
https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html
|
||||
https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_samples_section
|
||||
"""
|
||||
|
||||
|
||||
class TensorRTLoader(BaseLoader):
|
||||
def load(self, model_path: Union[str, Path], **_) -> Model:
|
||||
model_path = Path(model_path)
|
||||
LOGGER.debug(f"Loading TensorRT engine from {model_path}")
|
||||
|
||||
with model_path.open("rb") as fh, trt.Runtime(TRT_LOGGER) as runtime:
|
||||
engine = runtime.deserialize_cuda_engine(fh.read())
|
||||
|
||||
if engine is None:
|
||||
raise RuntimeError(f"Could not load ICudaEngine from {model_path}")
|
||||
|
||||
inputs = {}
|
||||
outputs = {}
|
||||
for binding_idx in range(engine.num_bindings):
|
||||
name = engine.get_binding_name(binding_idx)
|
||||
is_input = engine.binding_is_input(binding_idx)
|
||||
dtype = engine.get_binding_dtype(binding_idx)
|
||||
shape = engine.get_binding_shape(binding_idx)
|
||||
if is_input:
|
||||
inputs[name] = TensorSpec(name, dtype, shape)
|
||||
else:
|
||||
outputs[name] = TensorSpec(name, dtype, shape)
|
||||
|
||||
return Model(engine, None, inputs, outputs)
|
||||
|
||||
|
||||
class TensorRTSaver(BaseSaver):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def save(self, model: Model, model_path: Union[str, Path]) -> None:
|
||||
model_path = Path(model_path)
|
||||
LOGGER.debug(f"Saving TensorRT engine to {model_path.as_posix()}")
|
||||
model_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
engine: "trt.ICudaEngine" = model.handle
|
||||
with model_path.open("wb") as fh:
|
||||
fh.write(engine.serialize())
|
||||
|
||||
|
||||
class TRTBuffers(NamedTuple):
|
||||
x_host: Optional[Dict[str, object]]
|
||||
x_dev: Dict[str, object]
|
||||
y_pred_host: Dict[str, object]
|
||||
y_pred_dev: Dict[str, object]
|
||||
|
||||
|
||||
class TensorRTRunner(BaseRunner):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def init_inference(self, model: Model):
|
||||
return TensorRTRunnerSession(model=model)
|
||||
|
||||
|
||||
class TensorRTRunnerSession(BaseRunnerSession):
|
||||
def __init__(self, model: Model):
|
||||
super().__init__(model)
|
||||
assert isinstance(model.handle, trt.ICudaEngine)
|
||||
self._model = model
|
||||
self._has_dynamic_shapes = None
|
||||
|
||||
self._context = None
|
||||
self._engine: trt.ICudaEngine = self._model.handle
|
||||
self._cuda_context = pycuda.autoinit.context
|
||||
|
||||
self._input_names = None
|
||||
self._output_names = None
|
||||
self._buffers = None
|
||||
|
||||
def __enter__(self):
|
||||
self._context = self._engine.create_execution_context()
|
||||
self._context.__enter__()
|
||||
|
||||
self._input_names = [
|
||||
self._engine[idx] for idx in range(self._engine.num_bindings) if self._engine.binding_is_input(idx)
|
||||
]
|
||||
self._output_names = [
|
||||
self._engine[idx] for idx in range(self._engine.num_bindings) if not self._engine.binding_is_input(idx)
|
||||
]
|
||||
# all_binding_shapes_specified is True for models without dynamic shapes
|
||||
# so initially this variable is False for models with dynamic shapes
|
||||
self._has_dynamic_shapes = not self._context.all_binding_shapes_specified
|
||||
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self._context.__exit__(exc_type, exc_value, traceback)
|
||||
self._input_names = None
|
||||
self._output_names = None
|
||||
|
||||
# TODO: are cuda buffers dealloc automatically?
|
||||
self._buffers = None
|
||||
|
||||
def __call__(self, x):
|
||||
buffers = self._prepare_buffers_if_needed(x)
|
||||
bindings = self._update_bindings(buffers)
|
||||
|
||||
for name in self._input_names:
|
||||
cuda.memcpy_htod(buffers.x_dev[name], buffers.x_host[name])
|
||||
self._cuda_context.push()
|
||||
self._context.execute_v2(bindings=bindings)
|
||||
self._cuda_context.pop()
|
||||
for name in self._output_names:
|
||||
cuda.memcpy_dtoh(buffers.y_pred_host[name], buffers.y_pred_dev[name])
|
||||
|
||||
return buffers.y_pred_host
|
||||
|
||||
def _update_bindings(self, buffers: TRTBuffers):
|
||||
bindings = [None] * self._engine.num_bindings
|
||||
for name in buffers.y_pred_dev:
|
||||
binding_idx: int = self._engine[name]
|
||||
bindings[binding_idx] = buffers.y_pred_dev[name]
|
||||
|
||||
for name in buffers.x_dev:
|
||||
binding_idx: int = self._engine[name]
|
||||
bindings[binding_idx] = buffers.x_dev[name]
|
||||
|
||||
return bindings
|
||||
|
||||
def _set_dynamic_input_shapes(self, x_host):
|
||||
def _is_shape_dynamic(input_shape):
|
||||
return any([dim is None or dim == -1 for dim in input_shape])
|
||||
|
||||
for name in self._input_names:
|
||||
bindings_idx = self._engine[name]
|
||||
data_shape = x_host[name].shape # pytype: disable=attribute-error
|
||||
if self._engine.is_shape_binding(bindings_idx):
|
||||
input_shape = self._context.get_shape(bindings_idx)
|
||||
if _is_shape_dynamic(input_shape):
|
||||
self._context.set_shape_input(bindings_idx, data_shape)
|
||||
else:
|
||||
input_shape = self._engine.get_binding_shape(bindings_idx)
|
||||
if _is_shape_dynamic(input_shape):
|
||||
self._context.set_binding_shape(bindings_idx, data_shape)
|
||||
|
||||
assert self._context.all_binding_shapes_specified and self._context.all_shape_inputs_specified
|
||||
|
||||
def _prepare_buffers_if_needed(self, x_host: Dict[str, object]):
|
||||
# pytype: disable=attribute-error
|
||||
new_batch_size = list(x_host.values())[0].shape[0]
|
||||
current_batch_size = list(self._buffers.y_pred_host.values())[0].shape[0] if self._buffers else 0
|
||||
# pytype: enable=attribute-error
|
||||
|
||||
if self._has_dynamic_shapes or new_batch_size != current_batch_size:
|
||||
# TODO: are CUDA buffers dealloc automatically?
|
||||
|
||||
self._set_dynamic_input_shapes(x_host)
|
||||
|
||||
y_pred_host = {}
|
||||
for name in self._output_names:
|
||||
shape = self._context.get_binding_shape(self._engine[name])
|
||||
y_pred_host[name] = np.zeros(shape, dtype=trt.nptype(self._model.outputs[name].dtype))
|
||||
|
||||
y_pred_dev = {name: cuda.mem_alloc(data.nbytes) for name, data in y_pred_host.items()}
|
||||
|
||||
x_dev = {
|
||||
name: cuda.mem_alloc(host_input.nbytes)
|
||||
for name, host_input in x_host.items()
|
||||
if name in self._input_names # pytype: disable=attribute-error
|
||||
}
|
||||
|
||||
self._buffers = TRTBuffers(None, x_dev, y_pred_host, y_pred_dev)
|
||||
|
||||
return self._buffers._replace(x_host=x_host)
|
||||
|
||||
|
||||
if "pycuda.driver" in sys.modules:
|
||||
loaders.register_extension(Format.TRT.value, TensorRTLoader)
|
||||
runners.register_extension(Format.TRT.value, TensorRTRunner)
|
||||
savers.register_extension(Format.TRT.value, TensorRTSaver)
|
||||
else:
|
||||
LOGGER.warning("Do not register TensorRT extension due problems with importing pycuda.driver package.")
@ -0,0 +1,121 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import Counter
from typing import Callable, Dict, List

import networkx as nx

from ..core import ShapeSpec


def infer_precision(
    nx_graph: nx.Graph,
    input_names: List[str],
    output_names: List[str],
    get_node_dtype_fn: Callable,
):
    node_dtypes = [nx_graph.nodes[node_name].get("dtype", None) for node_name in nx_graph.nodes]
    node_dtypes = [dt for dt in node_dtypes if dt is None or dt.kind not in ["i", "b"]]
    dtypes_counter = Counter(node_dtypes)
    return dtypes_counter.most_common()[0][0]


def get_shapes_with_dynamic_axes(dataloader, batch_size_dim=0):
    def _set_dynamic_shapes(t, shapes):
        for k, v in t.items():
            shape = list(v.shape)
            for dim, s in enumerate(shape):
                if shapes[k][dim] != -1 and shapes[k][dim] != s:
                    shapes[k][dim] = -1

    ## get all shapes from input and output tensors
    input_shapes = {}
    output_shapes = {}
    for batch in dataloader:
        _, x, y = batch
        for k, v in x.items():
            input_shapes[k] = list(v.shape)
        for k, v in y.items():
            output_shapes[k] = list(v.shape)
        break

    # based on max <max_num_iters> iterations, check which
    # dimensions differ to determine dynamic_axes
    max_num_iters = 100
    for idx, batch in enumerate(dataloader):
        if idx >= max_num_iters:
            break

        _, x, y = batch

        _set_dynamic_shapes(x, input_shapes)
        _set_dynamic_shapes(y, output_shapes)

    return input_shapes, output_shapes


def get_dynamic_axes(dataloader, batch_size_dim=0):
    input_shapes, output_shapes = get_shapes_with_dynamic_axes(dataloader, batch_size_dim)
    all_shapes = {**input_shapes, **output_shapes}
    dynamic_axes = {}

    for k, shape in all_shapes.items():
        for idx, s in enumerate(shape):
            if s == -1:
                dynamic_axes[k] = {idx: k + "_" + str(idx)}

    for k, v in all_shapes.items():
        if k in dynamic_axes:
            dynamic_axes[k].update({batch_size_dim: "batch_size_" + str(batch_size_dim)})
        else:
            dynamic_axes[k] = {batch_size_dim: "batch_size_" + str(batch_size_dim)}

    return dynamic_axes


def get_input_shapes(dataloader, max_batch_size=1) -> Dict[str, ShapeSpec]:
    def init_counters_and_shapes(x, counters, min_shapes, max_shapes):
        for k, v in x.items():
            counters[k] = Counter()
            min_shapes[k] = [float("inf")] * v.ndim
            max_shapes[k] = [float("-inf")] * v.ndim

    counters = {}
    min_shapes: Dict[str, tuple] = {}
    max_shapes: Dict[str, tuple] = {}
    for idx, batch in enumerate(dataloader):
        ids, x, y = batch

        if idx == 0:
            init_counters_and_shapes(x, counters, min_shapes, max_shapes)

        for k, v in x.items():
            shape = v.shape
            counters[k][shape] += 1
            min_shapes[k] = tuple([min(a, b) for a, b in zip(min_shapes[k], shape)])
            max_shapes[k] = tuple([max(a, b) for a, b in zip(max_shapes[k], shape)])

    opt_shapes: Dict[str, tuple] = {}
    for k, v in counters.items():
        opt_shapes[k] = v.most_common(1)[0][0]

    shapes = {}
    for k in opt_shapes.keys():  # same keys in min_shapes and max_shapes
        shapes[k] = ShapeSpec(
            min=(1,) + min_shapes[k][1:],
            max=(max_batch_size,) + max_shapes[k][1:],
            opt=(max_batch_size,) + opt_shapes[k][1:],
        )
    return shapes
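For TensorRT conversion, `get_input_shapes` summarizes the dataloader into per-input min/opt/max profiles. A toy illustration of what it produces (the batches below are invented for this sketch and are not the nnU-Net dataloader):

```python
# Illustrative only: min/opt/max ShapeSpec profiles computed from made-up batches.
import numpy as np

batches = [
    (None, {"INPUT__0": np.zeros((2, 4, 128, 128, 128), dtype=np.float32)}, {}),
    (None, {"INPUT__0": np.zeros((2, 4, 160, 128, 128), dtype=np.float32)}, {}),
    (None, {"INPUT__0": np.zeros((2, 4, 128, 128, 128), dtype=np.float32)}, {}),
]
shapes = get_input_shapes(batches, max_batch_size=8)
# min: smallest observed non-batch dims with batch=1        -> (1, 4, 128, 128, 128)
# opt: most frequent shape with batch=max_batch_size        -> (8, 4, 128, 128, 128)
# max: largest observed non-batch dims with batch=max_batch_size -> (8, 4, 160, 128, 128)
print(shapes["INPUT__0"])
```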
PyTorch/Segmentation/nnUNet/triton/deployment_toolkit/core.py
@ -0,0 +1,183 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
import importlib
import logging
import os
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union

import numpy as np

LOGGER = logging.getLogger(__name__)
DATALOADER_FN_NAME = "get_dataloader_fn"
GET_MODEL_FN_NAME = "get_model"
GET_SERVING_INPUT_RECEIVER_FN = "get_serving_input_receiver_fn"
GET_ARGPARSER_FN_NAME = "update_argparser"


class TensorSpec(NamedTuple):
    name: str
    dtype: str
    shape: Tuple


class Parameter(Enum):
    def __lt__(self, other: "Parameter") -> bool:
        return self.value < other.value


class Accelerator(Parameter):
    AMP = "amp"
    CUDA = "cuda"
    TRT = "trt"


class Precision(Parameter):
    FP16 = "fp16"
    FP32 = "fp32"
    TF32 = "tf32"  # Deprecated


class Format(Parameter):
    TF_GRAPHDEF = "tf-graphdef"
    TF_SAVEDMODEL = "tf-savedmodel"
    TF_TRT = "tf-trt"
    TF_ESTIMATOR = "tf-estimator"
    TF_KERAS = "tf-keras"
    ONNX = "onnx"
    TRT = "trt"
    TS_SCRIPT = "ts-script"
    TS_TRACE = "ts-trace"
    PYT = "pyt"


class Model(NamedTuple):
    handle: object
    precision: Optional[Precision]
    inputs: Dict[str, TensorSpec]
    outputs: Dict[str, TensorSpec]


def load_from_file(file_path, label, target):
    spec = importlib.util.spec_from_file_location(name=label, location=file_path)
    my_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(my_module)  # pytype: disable=attribute-error
    return getattr(my_module, target, None)


class BaseLoader(abc.ABC):
    required_fn_name_for_signature_parsing: Optional[str] = None

    @abc.abstractmethod
    def load(self, model_path: Union[str, Path], **kwargs) -> Model:
        """
        Loads and processes a model from file based on the given set of args
        """
        pass


class BaseSaver(abc.ABC):
    required_fn_name_for_signature_parsing: Optional[str] = None

    @abc.abstractmethod
    def save(self, model: Model, model_path: Union[str, Path]) -> None:
        """
        Save model to file
        """
        pass


class BaseRunner(abc.ABC):
    required_fn_name_for_signature_parsing: Optional[str] = None

    @abc.abstractmethod
    def init_inference(self, model: Model):
        raise NotImplementedError


class BaseRunnerSession(abc.ABC):
    def __init__(self, model: Model):
        self._model = model

    @abc.abstractmethod
    def __enter__(self):
        raise NotImplementedError()

    @abc.abstractmethod
    def __exit__(self, exc_type, exc_value, traceback):
        raise NotImplementedError()

    @abc.abstractmethod
    def __call__(self, x: Dict[str, object]):
        raise NotImplementedError()

    def _set_env_variables(self) -> Dict[str, object]:
        """this method does not remove values; fix it if needed"""
        to_set = {}
        old_values = {k: os.environ.pop(k, None) for k in to_set}
        os.environ.update(to_set)
        return old_values

    def _recover_env_variables(self, old_envs: Dict[str, object]):
        for name, value in old_envs.items():
            if value is None:
                del os.environ[name]
            else:
                os.environ[name] = str(value)


class BaseConverter(abc.ABC):
    required_fn_name_for_signature_parsing: Optional[str] = None

    @abc.abstractmethod
    def convert(self, model: Model, dataloader_fn) -> Model:
        raise NotImplementedError()

    @staticmethod
    def required_source_model_precision(requested_model_precision: Precision) -> Precision:
        return requested_model_precision


class BaseMetricsCalculator(abc.ABC):
    required_fn_name_for_signature_parsing: Optional[str] = None

    @abc.abstractmethod
    def calc(
        self,
        *,
        ids: List[Any],
        y_pred: Dict[str, np.ndarray],
        x: Optional[Dict[str, np.ndarray]],
        y_real: Optional[Dict[str, np.ndarray]],
    ) -> Dict[str, float]:
        """
        Calculates error/accuracy metrics
        Args:
            ids: List of ids identifying each sample in the batch
            y_pred: model output as dict where key is output name and value is output value
            x: model input as dict where key is input name and value is input value
            y_real: input ground truth as dict where key is output name and value is output value
        Returns:
            dictionary where key is metric name and value is its value
        """
        pass


class ShapeSpec(NamedTuple):
    min: Tuple
    opt: Tuple
    max: Tuple
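The `*_FN_NAME` constants above encode a naming contract: a user-provided script exposes functions with exactly those names, and the toolkit imports them by file path via `load_from_file`. A brief sketch of that contract follows; the concrete paths and keyword arguments are assumptions for illustration, not part of the PR.

```python
# Illustrative sketch of the naming contract (paths and kwargs are assumptions).
get_model = load_from_file("triton/model.py", label="model", target=GET_MODEL_FN_NAME)
model_handle, tensor_infos = get_model(
    checkpoint_dir="/checkpoints/last.ckpt", precision="fp16", data_dir="/data/01_3d"
)

# The dataloader script exposes get_dataloader_fn; its parameters depend on that script.
get_dataloader_fn = load_from_file("triton/dataloader.py", label="dataloader", target=DATALOADER_FN_NAME)
```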
PyTorch/Segmentation/nnUNet/triton/deployment_toolkit/dump.py
@ -0,0 +1,147 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable
|
||||
|
||||
import numpy as np
|
||||
|
||||
MB2B = 2 ** 20
|
||||
B2MB = 1 / MB2B
|
||||
FLUSH_THRESHOLD_B = 256 * MB2B
|
||||
|
||||
|
||||
def pad_except_batch_axis(data: np.ndarray, target_shape_with_batch_axis: Iterable[int]):
|
||||
assert all(
|
||||
[current_size <= target_size for target_size, current_size in zip(target_shape_with_batch_axis, data.shape)]
|
||||
), "target_shape should have equal or greater all dimensions comparing to data.shape"
|
||||
padding = [(0, 0)] + [ # (0, 0) - do not pad on batch_axis (with index 0)
|
||||
(0, target_size - current_size)
|
||||
for target_size, current_size in zip(target_shape_with_batch_axis[1:], data.shape[1:])
|
||||
]
|
||||
return np.pad(data, padding, "constant", constant_values=np.nan)
|
||||
|
||||
|
||||
class NpzWriter:
|
||||
"""
|
||||
Dumps dicts of numpy arrays into npz files
|
||||
|
||||
It can/shall be used as context manager:
|
||||
```
|
||||
with OutputWriter('mydir') as writer:
|
||||
writer.write(outputs={'classes': np.zeros(8), 'probs': np.zeros((8, 4))},
|
||||
labels={'classes': np.zeros(8)},
|
||||
inputs={'input': np.zeros((8, 240, 240, 3)})
|
||||
```
|
||||
|
||||
## Variable size data
|
||||
|
||||
Only dynamic of last axis is handled. Data is padded with np.nan value.
|
||||
Also each generated file may have different size of dynamic axis.
|
||||
"""
|
||||
|
||||
def __init__(self, output_dir, compress=False):
|
||||
self._output_dir = Path(output_dir)
|
||||
self._items_cache: Dict[str, Dict[str, np.ndarray]] = {}
|
||||
self._items_counters: Dict[str, int] = {}
|
||||
self._flush_threshold_b = FLUSH_THRESHOLD_B
|
||||
self._compress = compress
|
||||
|
||||
@property
|
||||
def cache_size(self):
|
||||
return {name: sum([a.nbytes for a in data.values()]) for name, data in self._items_cache.items()}
|
||||
|
||||
def _append_to_cache(self, prefix, data):
|
||||
if data is None:
|
||||
return
|
||||
|
||||
if not isinstance(data, dict):
|
||||
raise ValueError(f"{prefix} data to store shall be dict")
|
||||
|
||||
cached_data = self._items_cache.get(prefix, {})
|
||||
for name, value in data.items():
|
||||
assert isinstance(
|
||||
value, (list, np.ndarray)
|
||||
), f"Values shall be lists or np.ndarrays; current type {type(value)}"
|
||||
if not isinstance(value, np.ndarray):
|
||||
value = np.array(value)
|
||||
|
||||
assert value.dtype.kind in ["S", "U"] or not np.any(
|
||||
np.isnan(value)
|
||||
), f"Values with np.nan is not supported; {name}={value}"
|
||||
cached_value = cached_data.get(name, None)
|
||||
if cached_value is not None:
|
||||
target_shape = np.max([cached_value.shape, value.shape], axis=0)
|
||||
cached_value = pad_except_batch_axis(cached_value, target_shape)
|
||||
value = pad_except_batch_axis(value, target_shape)
|
||||
value = np.concatenate((cached_value, value))
|
||||
cached_data[name] = value
|
||||
self._items_cache[prefix] = cached_data
|
||||
|
||||
def write(self, **kwargs):
|
||||
"""
|
||||
Writes named list of dictionaries of np.ndarrays.
|
||||
Finally keyword names will be later prefixes of npz files where those dictionaries will be stored.
|
||||
|
||||
ex. writer.write(inputs={'input': np.zeros((2, 10))},
|
||||
outputs={'classes': np.zeros((2,)), 'probabilities': np.zeros((2, 32))},
|
||||
labels={'classes': np.zeros((2,))})
|
||||
Args:
|
||||
**kwargs: named list of dictionaries of np.ndarrays to store
|
||||
"""
|
||||
|
||||
for prefix, data in kwargs.items():
|
||||
self._append_to_cache(prefix, data)
|
||||
|
||||
biggest_item_size = max(self.cache_size.values())
|
||||
if biggest_item_size > self._flush_threshold_b:
|
||||
self.flush()
|
||||
|
||||
def flush(self):
|
||||
for prefix, data in self._items_cache.items():
|
||||
self._dump(prefix, data)
|
||||
self._items_cache = {}
|
||||
|
||||
def _dump(self, prefix, data):
|
||||
idx = self._items_counters.setdefault(prefix, 0)
|
||||
filename = f"{prefix}-{idx:012d}.npz"
|
||||
output_path = self._output_dir / filename
|
||||
if self._compress:
|
||||
np.savez_compressed(output_path, **data)
|
||||
else:
|
||||
np.savez(output_path, **data)
|
||||
|
||||
nitems = len(list(data.values())[0])
|
||||
|
||||
msg_for_labels = (
|
||||
"If these are correct shapes - consider moving loading of them into metrics.py."
|
||||
if prefix == "labels"
|
||||
else ""
|
||||
)
|
||||
shapes = {name: value.shape if isinstance(value, np.ndarray) else (len(value),) for name, value in data.items()}
|
||||
|
||||
assert all(len(v) == nitems for v in data.values()), (
|
||||
f'All items in "{prefix}" shall have same size on 0 axis equal to batch size. {msg_for_labels}'
|
||||
f'{", ".join(f"{name}: {shape}" for name, shape in shapes.items())}'
|
||||
)
|
||||
self._items_counters[prefix] += nitems
|
||||
|
||||
def __enter__(self):
|
||||
if self._output_dir.exists() and len(list(self._output_dir.iterdir())):
|
||||
raise ValueError(f"{self._output_dir.as_posix()} is not empty")
|
||||
self._output_dir.mkdir(parents=True, exist_ok=True)
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.flush()
@ -0,0 +1,83 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import logging
import os
import re
from pathlib import Path
from typing import List

LOGGER = logging.getLogger(__name__)


class ExtensionManager:
    def __init__(self, name: str):
        self._name = name
        self._registry = {}

    def register_extension(self, extension: str, clazz):
        already_registered_class = self._registry.get(extension, None)
        if already_registered_class and already_registered_class.__module__ != clazz.__module__:
            raise RuntimeError(
                f"Conflicting extension {self._name}/{extension}; "
                f"{already_registered_class.__module__}.{already_registered_class.__name__} "
                f"and "
                f"{clazz.__module__}.{clazz.__name__}"
            )
        elif already_registered_class is None:
            clazz_full_name = f"{clazz.__module__}.{clazz.__name__}" if clazz is not None else "None"
            LOGGER.debug(f"Registering extension {self._name}/{extension}: {clazz_full_name}")
            self._registry[extension] = clazz

    def get(self, extension):
        if extension not in self._registry:
            raise RuntimeError(f"Missing extension {self._name}/{extension}")
        return self._registry[extension]

    @property
    def supported_extensions(self):
        return list(self._registry)

    @staticmethod
    def scan_for_extensions(extension_dirs: List[Path]):
        register_pattern = r".*\.register_extension\(.*"

        for extension_dir in extension_dirs:
            for python_path in extension_dir.rglob("*.py"):
                if not python_path.is_file():
                    continue
                payload = python_path.read_text()
                if re.findall(register_pattern, payload):
                    import_path = python_path.relative_to(toolkit_root_dir.parent)
                    package = import_path.parent.as_posix().replace(os.sep, ".")
                    package_with_module = f"{package}.{import_path.stem}"
                    spec = importlib.util.spec_from_file_location(name=package_with_module, location=python_path)
                    my_module = importlib.util.module_from_spec(spec)
                    my_module.__package__ = package

                    try:
                        spec.loader.exec_module(my_module)  # pytype: disable=attribute-error
                    except ModuleNotFoundError as e:
                        LOGGER.error(
                            f"Could not load extensions from {import_path} due to missing python packages; {e}"
                        )


runners = ExtensionManager("runners")
loaders = ExtensionManager("loaders")
savers = ExtensionManager("savers")
converters = ExtensionManager("converters")
toolkit_root_dir = (Path(__file__).parent / "..").resolve()
ExtensionManager.scan_for_extensions([toolkit_root_dir])
@ -0,0 +1,61 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import re
from typing import Dict, List

from natsort import natsorted
from tabulate import tabulate


def sort_results(results: List):
    results = natsorted(results, key=lambda item: [item[key] for key in item.keys()])
    return results


def save_results(filename: str, data: List, formatted: bool = False):
    data = format_data(data=data) if formatted else data
    with open(filename, "a") as csvfile:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for row in data:
            writer.writerow(row)


def format_data(data: List[Dict]) -> List[Dict]:
    formatted_data = list()
    for item in data:
        formatted_item = format_keys(data=item)
        formatted_data.append(formatted_item)

    return formatted_data


def format_keys(data: Dict) -> Dict:
    keys = {format_key(key=key): value for key, value in data.items()}
    return keys


def format_key(key: str) -> str:
    key = " ".join([k.capitalize() for k in re.split("_| ", key)])
    return key


def show_results(results: List[Dict]):
    headers = list(results[0].keys())
    summary = map(lambda x: list(map(lambda item: item[1], x.items())), results)
    print(tabulate(summary, headers=headers))
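A toy illustration of how these reporting helpers fit together; the result rows below are invented and stand in for the CSVs produced by the performance scripts:

```python
# Illustrative only: invented benchmark rows fed through the helpers above.
rows = [
    {"batch_size": 2, "throughput_infer_per_sec": 31.7, "latency_ms": 63.0},
    {"batch_size": 1, "throughput_infer_per_sec": 23.1, "latency_ms": 43.2},
]
rows = sort_results(rows)                     # natural sort over row values
save_results("results.csv", rows, formatted=True)  # header keys become "Batch Size", ...
show_results(rows)                            # tabulated summary on stdout
```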
@ -0,0 +1,67 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
from typing import List, Optional


def warmup(
    model_name: str,
    batch_sizes: List[int],
    triton_gpu_engine_count: int = 1,
    triton_instances: int = 1,
    profiling_data: str = "random",
    input_shapes: Optional[List[str]] = None,
    server_url: str = "localhost",
    measurement_window: int = 10000,
    shared_memory: bool = False
):
    print("\n")
    print(f"==== Warmup start ====")
    print("\n")

    input_shapes = " ".join(map(lambda shape: f" --shape {shape}", input_shapes)) if input_shapes else ""

    measurement_window = 6 * measurement_window

    max_batch_size = max(batch_sizes)
    max_total_requests = 2 * max_batch_size * triton_instances * triton_gpu_engine_count
    max_concurrency = min(256, max_total_requests)
    batch_size = max(1, max_total_requests // 256)

    step = max(1, max_concurrency // 2)
    min_concurrency = step

    exec_args = f"""-m {model_name} \
        -x 1 \
        -p {measurement_window} \
        -v \
        -i http \
        -u {server_url}:8000 \
        -b {batch_size} \
        --concurrency-range {min_concurrency}:{max_concurrency}:{step} \
        --input-data {profiling_data} {input_shapes}"""

    if shared_memory:
        exec_args += " --shared-memory=cuda"

    result = os.system(f"perf_client {exec_args}")
    if result != 0:
        print(f"Failed running performance tests. Perf client failed with exit code {result}")
        sys.exit(1)

    print("\n")
    print(f"==== Warmup done ====")
    print("\n")
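To make the perf_client parameter arithmetic above concrete, here is a small worked example (values chosen for illustration only):

```python
# Worked example for batch_sizes=[1, 2, 4], triton_instances=1, triton_gpu_engine_count=1.
max_batch_size = 4                  # max(batch_sizes)
max_total_requests = 2 * 4 * 1 * 1  # = 8
max_concurrency = min(256, 8)       # = 8
batch_size = max(1, 8 // 256)       # = 1
step = max(1, 8 // 2)               # = 4
min_concurrency = step              # = 4
# resulting flag: --concurrency-range 4:8:4  (concurrency levels 4 and 8 are exercised)
```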
PyTorch/Segmentation/nnUNet/triton/metrics.py
@ -0,0 +1,42 @@
from typing import Any, Dict, List, Optional

import numpy as np

from triton.deployment_toolkit.core import BaseMetricsCalculator


class MetricsCalculator(BaseMetricsCalculator):
    def calc(
        self,
        *,
        ids: List[Any],
        x: Optional[Dict[str, np.ndarray]],
        y_real: Optional[Dict[str, np.ndarray]],
        y_pred: Dict[str, np.ndarray],
    ) -> Dict[str, float]:
        y_pred = y_pred["OUTPUT__0"]
        y_true = y_real["OUTPUT__0"]

        n_examples = y_pred.shape[0]
        nclass = max(np.max(y_pred), np.max(y_true))
        dice = np.zeros((nclass,))
        for i in range(n_examples):
            for c in range(nclass):
                if not (y_true[i] == c).any():
                    # no foreground class
                    dice[c] += 1 if not (y_pred[i] == c).any() else 0
                    continue
                true_pos, false_neg, false_pos = self.get_stats(y_pred[i], y_true[i], c + 1)
                denom = 2 * true_pos + false_neg + false_pos
                dice[c] += 2 * true_pos / denom if denom != 0 else 0.0

        dice /= n_examples
        dice = np.mean(dice)
        return {"dice": dice}

    @staticmethod
    def get_stats(pred, targ, class_idx):
        true_pos = np.logical_and(pred == class_idx, targ == class_idx).sum()
        false_neg = np.logical_and(pred != class_idx, targ == class_idx).sum()
        false_pos = np.logical_and(pred == class_idx, targ != class_idx).sum()
        return true_pos, false_neg, false_pos
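A small self-contained check of the per-class statistics used above, applying the Dice formula 2*TP / (2*TP + FN + FP) for one class index; the arrays are toy data for illustration only:

```python
# Illustrative check of get_stats() and the Dice formula on toy masks.
import numpy as np

pred = np.array([[1, 1, 0], [0, 1, 0]])
targ = np.array([[1, 0, 0], [0, 1, 1]])
tp, fn, fp = MetricsCalculator.get_stats(pred, targ, class_idx=1)
# tp=2 (both are class 1), fn=1 (target 1, pred 0), fp=1 (pred 1, target 0)
dice = 2 * tp / (2 * tp + fn + fp)  # = 4 / 6 ~= 0.667
```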
PyTorch/Segmentation/nnUNet/triton/model.py
@ -0,0 +1,11 @@
from models.nn_unet import NNUnet


def get_model(*, checkpoint_dir: str, precision: str, data_dir: str):
    model = NNUnet.load_from_checkpoint(checkpoint_dir, data_dir=data_dir, bermuda=True, strict=False)
    model = model.cuda()
    if "fp16" in precision:
        model = model.half()
    model.eval()
    tensor_names = {"inputs": ["INPUT__0"], "outputs": ["OUTPUT__0"]}
    return model, tensor_names
New SVG plot images added under PyTorch/Segmentation/nnUNet/triton/plots/ (binary content not shown): graph_A10080GB_left.svg, graph_A10080GB_right.svg, graph_A40_left.svg, graph_A40_right.svg, graph_TeslaT4_left.svg, graph_TeslaT4_right.svg, plus several additional plot image files.
PyTorch/Segmentation/nnUNet/triton/preprocess.py
@ -0,0 +1,47 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

from data_preprocessing.preprocessor import Preprocessor
from utils.utils import get_task_code

parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument("--data", type=str, default="/data", help="Path to data directory")
parser.add_argument("--results", type=str, default="/data", help="Path for saving results directory")
parser.add_argument(
    "--exec_mode",
    type=str,
    default="training",
    choices=["training", "val", "test"],
    help="Mode for data preprocessing",
)
parser.add_argument("--dilation", action="store_true", help="Perform morphological label dilation")
parser.add_argument("--task", type=str, help="Number of task to be run. MSD uses numbers 01-10")
parser.add_argument("--dim", type=int, default=3, choices=[2, 3], help="Data dimension to prepare")
parser.add_argument("--n_jobs", type=int, default=-1, help="Number of parallel jobs for data preprocessing")


if __name__ == "__main__":
    args = parser.parse_args()
    start = time.time()
    Preprocessor(args).run()
    task_code = get_task_code(args)
    path = os.path.join(args.data, task_code)
    if args.exec_mode == "test":
        path = os.path.join(path, "test")
    end = time.time()
    print(f"Preprocessing time: {(end - start):.2f}")
PyTorch/Segmentation/nnUNet/triton/requirements.txt
@ -0,0 +1,24 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
networkx==2.5
numpy<1.20.0,>=1.19.1  # numpy 1.20+ requires py37
onnx==1.8.0
onnxruntime==1.5.2
pycuda>=2019.1.2
PyYAML>=5.2
tqdm>=4.44.1
tabulate>=0.8.7
natsort>=7.0.0
# use tags instead of branch names - because there might be docker cache hit causing not fetching most recent changes on branch
model_navigator @ git+https://github.com/triton-inference-server/model_navigator.git@v0.1.0#egg=model_navigator
PyTorch/Segmentation/nnUNet/triton/run_inference_on_fw.py
@ -0,0 +1,134 @@
#!/usr/bin/env python3

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""
To run inference on the model with the framework runtime, you can use the `run_inference_on_fw.py` script.
It runs inference locally on data obtained from the pointed data loader and saves the received outputs into npz files.
Those files are stored in the directory pointed to by the `--output-dir` argument.

Example call:

```shell script
python ./triton/run_inference_on_fw.py \
    --input-path /models/exported/model.onnx \
    --input-type onnx \
    --dataloader triton/dataloader.py \
    --data-dir /data/imagenet \
    --batch-size 32 \
    --output-dir /results/dump_local \
    --dump-labels
```
"""

import argparse
import logging
import os
from pathlib import Path

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["TF_ENABLE_DEPRECATION_WARNINGS"] = "0"

from tqdm import tqdm

# method from PEP-366 to support relative import in executed modules
if __package__ is None:
    __package__ = Path(__file__).parent.name

from .deployment_toolkit.args import ArgParserGenerator
from .deployment_toolkit.core import DATALOADER_FN_NAME, BaseLoader, BaseRunner, Format, load_from_file
from .deployment_toolkit.dump import NpzWriter
from .deployment_toolkit.extensions import loaders, runners

LOGGER = logging.getLogger("run_inference_on_fw")


def _verify_and_format_dump(args, ids, x, y_pred, y_real):
    data = {"outputs": y_pred, "ids": {"ids": ids}}
    if args.dump_inputs:
        data["inputs"] = x
    if args.dump_labels:
        if not y_real:
            raise ValueError(
                "Found empty label values. Please provide labels in dataloader_fn or do not use --dump-labels argument"
            )
        data["labels"] = y_real
    return data


def _parse_and_validate_args():
    supported_inputs = set(runners.supported_extensions) & set(loaders.supported_extensions)

    parser = argparse.ArgumentParser(description="Dump local inference output of given model", allow_abbrev=False)
    parser.add_argument("--input-path", help="Path to input model", required=True)
    parser.add_argument("--input-type", help="Input model type", choices=supported_inputs, required=True)
    parser.add_argument("--dataloader", help="Path to python file containing dataloader.", required=True)
    parser.add_argument("--output-dir", help="Path to dir where output files will be stored", required=True)
    parser.add_argument("--dump-labels", help="Dump labels to output dir", action="store_true", default=False)
    parser.add_argument("--dump-inputs", help="Dump inputs to output dir", action="store_true", default=False)
    parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)

    args, *_ = parser.parse_known_args()

    get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
    ArgParserGenerator(get_dataloader_fn).update_argparser(parser)

    Loader: BaseLoader = loaders.get(args.input_type)
    ArgParserGenerator(Loader, module_path=args.input_path).update_argparser(parser)

    Runner: BaseRunner = runners.get(args.input_type)
    ArgParserGenerator(Runner).update_argparser(parser)

    args = parser.parse_args()

    types_requiring_io_params = []

    if args.input_type in types_requiring_io_params and not all(p for p in [args.inputs, args.outputs]):
        parser.error(f"For {args.input_type} input provide --inputs and --outputs parameters")

    return args


def main():
    args = _parse_and_validate_args()

    log_level = logging.INFO if not args.verbose else logging.DEBUG
    log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
    logging.basicConfig(level=log_level, format=log_format)

    LOGGER.info(f"args:")
    for key, value in vars(args).items():
        LOGGER.info(f"    {key} = {value}")

    Loader: BaseLoader = loaders.get(args.input_type)
    Runner: BaseRunner = runners.get(args.input_type)

    loader = ArgParserGenerator(Loader, module_path=args.input_path).from_args(args)
    runner = ArgParserGenerator(Runner).from_args(args)
    LOGGER.info(f"Loading {args.input_path}")
    model = loader.load(args.input_path)
    with runner.init_inference(model=model) as runner_session, NpzWriter(args.output_dir) as writer:
        get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
        dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)
        LOGGER.info(f"Data loader initialized; Running inference")
        for ids, x, y_real in tqdm(dataloader_fn(), unit="batch", mininterval=10):
            y_pred = runner_session(x)
            data = _verify_and_format_dump(args, ids=ids, x=x, y_pred=y_pred, y_real=y_real)
            writer.write(**data)
        LOGGER.info(f"Inference finished")


if __name__ == "__main__":
    main()
PyTorch/Segmentation/nnUNet/triton/run_inference_on_triton.py
@ -0,0 +1,287 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
r"""
|
||||
To infer the model deployed on Triton, you can use `run_inference_on_triton.py` script.
|
||||
It sends a request with data obtained from pointed data loader and dumps received data into npz files.
|
||||
Those files are stored in directory pointed by `--output-dir` argument.
|
||||
|
||||
Currently, the client communicates with the Triton server asynchronously using GRPC protocol.
|
||||
|
||||
Example call:
|
||||
|
||||
```shell script
|
||||
python ./triton/run_inference_on_triton.py \
|
||||
--server-url localhost:8001 \
|
||||
--model-name ResNet50 \
|
||||
--model-version 1 \
|
||||
--dump-labels \
|
||||
--output-dir /results/dump_triton
|
||||
```
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import functools
|
||||
import logging
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
# pytype: disable=import-error
|
||||
try:
|
||||
from tritonclient import utils as client_utils # noqa: F401
|
||||
from tritonclient.grpc import (
|
||||
InferenceServerClient,
|
||||
InferInput,
|
||||
InferRequestedOutput,
|
||||
)
|
||||
except ImportError:
|
||||
import tritongrpcclient as grpc_client
|
||||
from tritongrpcclient import (
|
||||
InferenceServerClient,
|
||||
InferInput,
|
||||
InferRequestedOutput,
|
||||
)
|
||||
# pytype: enable=import-error
|
||||
|
||||
# method from PEP-366 to support relative import in executed modules
|
||||
if __package__ is None:
|
||||
__package__ = Path(__file__).parent.name
|
||||
|
||||
from .deployment_toolkit.args import ArgParserGenerator
|
||||
from .deployment_toolkit.core import DATALOADER_FN_NAME, load_from_file
|
||||
from .deployment_toolkit.dump import NpzWriter
|
||||
|
||||
LOGGER = logging.getLogger("run_inference_on_triton")
|
||||
|
||||
|
||||
class AsyncGRPCTritonRunner:
|
||||
DEFAULT_MAX_RESP_WAIT_S = 120
|
||||
DEFAULT_MAX_UNRESP_REQS = 128
|
||||
DEFAULT_MAX_FINISH_WAIT_S = 900 # 15min
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
server_url: str,
|
||||
model_name: str,
|
||||
model_version: str,
|
||||
*,
|
||||
dataloader,
|
||||
verbose=False,
|
||||
resp_wait_s: Optional[float] = None,
|
||||
max_unresponded_reqs: Optional[int] = None,
|
||||
):
|
||||
self._server_url = server_url
|
||||
self._model_name = model_name
|
||||
self._model_version = model_version
|
||||
self._dataloader = dataloader
|
||||
self._verbose = verbose
|
||||
self._response_wait_t = self.DEFAULT_MAX_RESP_WAIT_S if resp_wait_s is None else resp_wait_s
|
||||
self._max_unresp_reqs = self.DEFAULT_MAX_UNRESP_REQS if max_unresponded_reqs is None else max_unresponded_reqs
|
||||
|
||||
self._results = queue.Queue()
|
||||
self._processed_all = False
|
||||
self._errors = []
|
||||
self._num_waiting_for = 0
|
||||
self._sync = threading.Condition()
|
||||
self._req_thread = threading.Thread(target=self.req_loop, daemon=True)
|
||||
|
||||
def __iter__(self):
|
||||
self._req_thread.start()
|
||||
timeout_s = 0.050  # check the processed_all and error flags every 50 ms
|
||||
while True:
|
||||
try:
|
||||
ids, x, y_pred, y_real = self._results.get(timeout=timeout_s)
|
||||
yield ids, x, y_pred, y_real
|
||||
except queue.Empty:
|
||||
shall_stop = self._processed_all or self._errors
|
||||
if shall_stop:
|
||||
break
|
||||
|
||||
LOGGER.debug("Waiting for request thread to stop")
|
||||
self._req_thread.join()
|
||||
if self._errors:
|
||||
error_msg = "\n".join(map(str, self._errors))
|
||||
raise RuntimeError(error_msg)
|
||||
|
||||
def _on_result(self, ids, x, y_real, output_names, result, error):
|
||||
with self._sync:
|
||||
if error:
|
||||
self._errors.append(error)
|
||||
else:
|
||||
y_pred = {name: result.as_numpy(name) for name in output_names}
|
||||
self._results.put((ids, x, y_pred, y_real))
|
||||
self._num_waiting_for -= 1
|
||||
self._sync.notify_all()
|
||||
|
||||
def req_loop(self):
|
||||
client = InferenceServerClient(self._server_url, verbose=self._verbose)
|
||||
self._errors = self._verify_triton_state(client)
|
||||
if self._errors:
|
||||
return
|
||||
|
||||
LOGGER.debug(
|
||||
f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} " f"are up and ready!"
|
||||
)
|
||||
|
||||
model_config = client.get_model_config(self._model_name, self._model_version)
|
||||
model_metadata = client.get_model_metadata(self._model_name, self._model_version)
|
||||
LOGGER.info(f"Model config {model_config}")
|
||||
LOGGER.info(f"Model metadata {model_metadata}")
|
||||
|
||||
inputs = {tm.name: tm for tm in model_metadata.inputs}
|
||||
outputs = {tm.name: tm for tm in model_metadata.outputs}
|
||||
output_names = list(outputs)
|
||||
outputs_req = [InferRequestedOutput(name) for name in outputs]
|
||||
|
||||
self._num_waiting_for = 0
|
||||
|
||||
for ids, x, y_real in self._dataloader:
|
||||
infer_inputs = []
|
||||
for name in inputs:
|
||||
data = x[name]
|
||||
infer_input = InferInput(name, data.shape, inputs[name].datatype)
|
||||
|
||||
target_np_dtype = client_utils.triton_to_np_dtype(inputs[name].datatype)
|
||||
data = data.astype(target_np_dtype)
|
||||
|
||||
infer_input.set_data_from_numpy(data)
|
||||
infer_inputs.append(infer_input)
|
||||
|
||||
with self._sync:
|
||||
|
||||
def _check_can_send():
|
||||
return self._num_waiting_for < self._max_unresp_reqs
|
||||
|
||||
can_send = self._sync.wait_for(_check_can_send, timeout=self._response_wait_t)
|
||||
if not can_send:
|
||||
error_msg = f"Runner could not send new requests for {self._response_wait_t}s"
|
||||
self._errors.append(error_msg)
|
||||
break
|
||||
|
||||
callback = functools.partial(AsyncGRPCTritonRunner._on_result, self, ids, x, y_real, output_names)
|
||||
client.async_infer(
|
||||
model_name=self._model_name,
|
||||
model_version=self._model_version,
|
||||
inputs=infer_inputs,
|
||||
outputs=outputs_req,
|
||||
callback=callback,
|
||||
)
|
||||
self._num_waiting_for += 1
|
||||
|
||||
# wait until all requested data has been received
|
||||
with self._sync:
|
||||
|
||||
def _all_processed():
|
||||
LOGGER.debug(f"wait for {self._num_waiting_for} unprocessed jobs")
|
||||
return self._num_waiting_for == 0
|
||||
|
||||
self._processed_all = self._sync.wait_for(_all_processed, self.DEFAULT_MAX_FINISH_WAIT_S)
|
||||
if not self._processed_all:
|
||||
error_msg = f"Runner {self._response_wait_t}s timeout received while waiting for results from server"
|
||||
self._errors.append(error_msg)
|
||||
LOGGER.debug("Finished request thread")
|
||||
|
||||
def _verify_triton_state(self, triton_client):
|
||||
errors = []
|
||||
if not triton_client.is_server_live():
|
||||
errors.append(f"Triton server {self._server_url} is not live")
|
||||
elif not triton_client.is_server_ready():
|
||||
errors.append(f"Triton server {self._server_url} is not ready")
|
||||
elif not triton_client.is_model_ready(self._model_name, self._model_version):
|
||||
errors.append(f"Model {self._model_name}:{self._model_version} is not ready")
|
||||
return errors
|
||||
|
||||
|
||||
def _parse_args():
|
||||
parser = argparse.ArgumentParser(description="Infer model on Triton server", allow_abbrev=False)
|
||||
parser.add_argument(
|
||||
"--server-url", type=str, default="localhost:8001", help="Inference server URL (default localhost:8001)"
|
||||
)
|
||||
parser.add_argument("--model-name", help="The name of the model used for inference.", required=True)
|
||||
parser.add_argument("--model-version", help="The version of the model used for inference.", required=True)
|
||||
parser.add_argument("--dataloader", help="Path to python file containing dataloader.", required=True)
|
||||
parser.add_argument("--dump-labels", help="Dump labels to output dir", action="store_true", default=False)
|
||||
parser.add_argument("--dump-inputs", help="Dump inputs to output dir", action="store_true", default=False)
|
||||
parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
|
||||
parser.add_argument("--output-dir", required=True, help="Path to directory where outputs will be saved")
|
||||
parser.add_argument("--response-wait-time", required=False, help="Maximal time to wait for response", default=120)
|
||||
parser.add_argument(
|
||||
"--max-unresponded-requests", required=False, help="Maximal number of unresponded requests", default=128
|
||||
)
|
||||
|
||||
args, *_ = parser.parse_known_args()
|
||||
|
||||
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
|
||||
ArgParserGenerator(get_dataloader_fn).update_argparser(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = _parse_args()
|
||||
|
||||
log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
|
||||
log_level = logging.INFO if not args.verbose else logging.DEBUG
|
||||
logging.basicConfig(level=log_level, format=log_format)
|
||||
|
||||
LOGGER.info(f"args:")
|
||||
for key, value in vars(args).items():
|
||||
LOGGER.info(f" {key} = {value}")
|
||||
|
||||
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
|
||||
dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)
|
||||
|
||||
runner = AsyncGRPCTritonRunner(
|
||||
args.server_url,
|
||||
args.model_name,
|
||||
args.model_version,
|
||||
dataloader=dataloader_fn(),
|
||||
verbose=False,
|
||||
resp_wait_s=args.response_wait_time,
|
||||
max_unresponded_reqs=args.max_unresponded_requests,
|
||||
)
|
||||
|
||||
with NpzWriter(output_dir=args.output_dir) as writer:
|
||||
start = time.time()
|
||||
for ids, x, y_pred, y_real in tqdm(runner, unit="batch", mininterval=10):
|
||||
data = _verify_and_format_dump(args, ids, x, y_pred, y_real)
|
||||
writer.write(**data)
|
||||
stop = time.time()
|
||||
|
||||
LOGGER.info(f"\nThe inference took {stop - start:0.3f}s")
|
||||
|
||||
|
||||
def _verify_and_format_dump(args, ids, x, y_pred, y_real):
|
||||
data = {"outputs": y_pred, "ids": {"ids": ids}}
|
||||
if args.dump_inputs:
|
||||
data["inputs"] = x
|
||||
if args.dump_labels:
|
||||
if not y_real:
|
||||
raise ValueError(
|
||||
"Found empty label values. Please provide labels in dataloader_fn or do not use --dump-labels argument"
|
||||
)
|
||||
data["labels"] = y_real
|
||||
return data
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
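For the nnU-Net model prepared by the scripts in this directory, a possible invocation could look as follows. This is only a sketch: `${MODEL_NAME}` and `${SHARED_DIR}` come from `triton/scripts/setup_environment.sh`, while the dataloader path (`triton/dataloader.py`) and any dataloader-specific arguments appended by `ArgParserGenerator` are assumptions.

```shell script
python ./triton/run_inference_on_triton.py \
    --server-url localhost:8001 \
    --model-name ${MODEL_NAME} \
    --model-version 1 \
    --dataloader triton/dataloader.py \
    --dump-labels \
    --output-dir ${SHARED_DIR}/dump_triton
```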
178
PyTorch/Segmentation/nnUNet/triton/run_offline_performance_test_on_triton.py
Executable file
|
@ -0,0 +1,178 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
r"""
|
||||
For models with variable-sized inputs you must provide the --input-shape argument so that perf_analyzer knows
|
||||
what shape tensors to use. For example, for a model that has an input called IMAGE that has shape [ 3, N, M ],
|
||||
where N and M are variable-size dimensions, to tell perf_analyzer to send batch-size 4 requests of shape [ 3, 224, 224 ]
|
||||
use `--shape IMAGE:3,224,224`.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
# method from PEP-366 to support relative import in executed modules
|
||||
if __package__ is None:
|
||||
__package__ = Path(__file__).parent.name
|
||||
|
||||
from .deployment_toolkit.report import save_results, show_results, sort_results
|
||||
from .deployment_toolkit.warmup import warmup
|
||||
|
||||
|
||||
def calculate_average_latency(r):
|
||||
avg_sum_fields = [
|
||||
"Client Send",
|
||||
"Network+Server Send/Recv",
|
||||
"Server Queue",
|
||||
"Server Compute",
|
||||
"Server Compute Input",
|
||||
"Server Compute Infer",
|
||||
"Server Compute Output",
|
||||
"Client Recv",
|
||||
]
|
||||
avg_latency = sum([int(r.get(f, 0)) for f in avg_sum_fields])
|
||||
|
||||
return avg_latency
|
||||
|
||||
|
||||
def update_performance_data(results: List, batch_size: int, performance_partial_file: str):
|
||||
row: Dict = {"batch_size": batch_size}
|
||||
with open(performance_partial_file, "r") as csvfile:
|
||||
reader = csv.DictReader(csvfile)
|
||||
for r in reader:
|
||||
avg_latency = calculate_average_latency(r)
|
||||
row = {**row, **r, "avg latency": avg_latency}
|
||||
|
||||
results.append(row)
|
||||
|
||||
|
||||
def _parse_batch_sizes(batch_sizes: str):
|
||||
batches = batch_sizes.split(sep=",")
|
||||
return list(map(lambda x: int(x.strip()), batches))
|
||||
|
||||
|
||||
def offline_performance(
|
||||
model_name: str,
|
||||
batch_sizes: List[int],
|
||||
result_path: str,
|
||||
input_shapes: Optional[List[str]] = None,
|
||||
profiling_data: str = "random",
|
||||
triton_instances: int = 1,
|
||||
server_url: str = "localhost",
|
||||
measurement_window: int = 10000,
|
||||
shared_memory: bool = False
|
||||
):
|
||||
print("\n")
|
||||
print(f"==== Static batching analysis start ====")
|
||||
print("\n")
|
||||
|
||||
input_shapes = " ".join(map(lambda shape: f" --shape {shape}", input_shapes)) if input_shapes else ""
|
||||
|
||||
results: List[Dict] = list()
|
||||
for batch_size in batch_sizes:
|
||||
print(f"Running performance tests for batch size: {batch_size}")
|
||||
performance_partial_file = f"triton_performance_partial_{batch_size}.csv"
|
||||
|
||||
exec_args = f"""-max-threads {triton_instances} \
|
||||
-m {model_name} \
|
||||
-x 1 \
|
||||
-c {triton_instances} \
|
||||
-t {triton_instances} \
|
||||
-p {measurement_window} \
|
||||
-v \
|
||||
-i http \
|
||||
-u {server_url}:8000 \
|
||||
-b {batch_size} \
|
||||
-f {performance_partial_file} \
|
||||
--input-data {profiling_data} {input_shapes}"""
|
||||
|
||||
if shared_memory:
|
||||
exec_args += " --shared-memory=cuda"
|
||||
|
||||
result = os.system(f"perf_client {exec_args}")
|
||||
if result != 0:
|
||||
print(f"Failed running performance tests. Perf client failed with exit code {result}")
|
||||
sys.exit(1)
|
||||
|
||||
update_performance_data(results, batch_size, performance_partial_file)
|
||||
os.remove(performance_partial_file)
|
||||
|
||||
results = sort_results(results=results)
|
||||
|
||||
save_results(filename=result_path, data=results)
|
||||
show_results(results=results)
|
||||
|
||||
print("Performance results for static batching stored in: {0}".format(result_path))
|
||||
|
||||
print("\n")
|
||||
print(f"==== Analysis done ====")
|
||||
print("\n")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--model-name", type=str, required=True, help="Name of the model to test")
|
||||
parser.add_argument(
|
||||
"--input-data", type=str, required=False, default="random", help="Input data to perform profiling."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-shape",
|
||||
action="append",
|
||||
required=False,
|
||||
help="Input data shape in form INPUT_NAME:<full_shape_without_batch_axis>.",
|
||||
)
|
||||
parser.add_argument("--batch-sizes", type=str, required=True, help="List of batch sizes to tests. Comma separated.")
|
||||
parser.add_argument("--result-path", type=str, required=True, help="Path where result file is going to be stored.")
|
||||
parser.add_argument("--triton-instances", type=int, default=1, help="Number of Triton Server instances")
|
||||
parser.add_argument("--server-url", type=str, required=False, default="localhost", help="Url to Triton server")
|
||||
parser.add_argument(
|
||||
"--measurement-window", required=False, help="Time which perf_analyzer will wait for results", default=10000
|
||||
)
|
||||
parser.add_argument("--shared-memory", help="Use shared memory for communication with Triton", action="store_true",
|
||||
default=False)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
warmup(
|
||||
server_url=args.server_url,
|
||||
model_name=args.model_name,
|
||||
batch_sizes=_parse_batch_sizes(args.batch_sizes),
|
||||
triton_instances=args.triton_instances,
|
||||
profiling_data=args.input_data,
|
||||
input_shapes=args.input_shape,
|
||||
measurement_window=args.measurement_window,
|
||||
shared_memory=args.shared_memory
|
||||
)
|
||||
|
||||
offline_performance(
|
||||
server_url=args.server_url,
|
||||
model_name=args.model_name,
|
||||
batch_sizes=_parse_batch_sizes(args.batch_sizes),
|
||||
triton_instances=args.triton_instances,
|
||||
profiling_data=args.input_data,
|
||||
input_shapes=args.input_shape,
|
||||
result_path=args.result_path,
|
||||
measurement_window=args.measurement_window,
|
||||
shared_memory=args.shared_memory
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
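As a usage sketch, the offline test could be driven with the environment variables exported by the setup scripts further below (`MODEL_NAME`, `BATCH_SIZE`, `TRITON_INSTANCES`, `TRITON_SERVER_URL`, `SHARED_DIR`); the result file name is an arbitrary choice:

```shell script
python triton/run_offline_performance_test_on_triton.py \
    --server-url ${TRITON_SERVER_URL} \
    --model-name ${MODEL_NAME} \
    --batch-sizes ${BATCH_SIZE} \
    --triton-instances ${TRITON_INSTANCES} \
    --result-path ${SHARED_DIR}/triton_performance_offline.csv
```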
188
PyTorch/Segmentation/nnUNet/triton/run_online_performance_test_on_triton.py
Executable file
|
@ -0,0 +1,188 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
r"""
|
||||
For models with variable-sized inputs you must provide the --input-shape argument so that perf_analyzer knows
|
||||
what shape tensors to use. For example, for a model that has an input called IMAGE that has shape [ 3, N, M ],
|
||||
where N and M are variable-size dimensions, to tell perf_analyzer to send batch-size 4 requests of shape [ 3, 224, 224 ]
|
||||
use `--shape IMAGE:3,224,224`.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
# method from PEP-366 to support relative import in executed modules
|
||||
if __package__ is None:
|
||||
__package__ = Path(__file__).parent.name
|
||||
|
||||
from .deployment_toolkit.report import save_results, show_results, sort_results
|
||||
from .deployment_toolkit.warmup import warmup
|
||||
|
||||
|
||||
def calculate_average_latency(r):
|
||||
avg_sum_fields = [
|
||||
"Client Send",
|
||||
"Network+Server Send/Recv",
|
||||
"Server Queue",
|
||||
"Server Compute",
|
||||
"Server Compute Input",
|
||||
"Server Compute Infer",
|
||||
"Server Compute Output",
|
||||
"Client Recv",
|
||||
]
|
||||
avg_latency = sum([int(r.get(f, 0)) for f in avg_sum_fields])
|
||||
|
||||
return avg_latency
|
||||
|
||||
|
||||
def update_performance_data(results: List, performance_file: str):
|
||||
with open(performance_file, "r") as csvfile:
|
||||
reader = csv.DictReader(csvfile)
|
||||
for row in reader:
|
||||
row["avg latency"] = calculate_average_latency(row)
|
||||
|
||||
results.append(row)
|
||||
|
||||
|
||||
def _parse_batch_sizes(batch_sizes: str):
|
||||
batches = batch_sizes.split(sep=",")
|
||||
return list(map(lambda x: int(x.strip()), batches))
|
||||
|
||||
|
||||
def online_performance(
|
||||
model_name: str,
|
||||
batch_sizes: List[int],
|
||||
result_path: str,
|
||||
input_shapes: Optional[List[str]] = None,
|
||||
profiling_data: str = "random",
|
||||
triton_instances: int = 1,
|
||||
triton_gpu_engine_count: int = 1,
|
||||
server_url: str = "localhost",
|
||||
measurement_window: int = 10000,
|
||||
shared_memory: bool = False
|
||||
):
|
||||
print("\n")
|
||||
print(f"==== Dynamic batching analysis start ====")
|
||||
print("\n")
|
||||
|
||||
input_shapes = " ".join(map(lambda shape: f" --shape {shape}", input_shapes)) if input_shapes else ""
|
||||
|
||||
print(f"Running performance tests for dynamic batching")
|
||||
performance_file = f"triton_performance_dynamic_partial.csv"
|
||||
|
||||
max_batch_size = max(batch_sizes)
|
||||
max_total_requests = 2 * max_batch_size * triton_instances * triton_gpu_engine_count
|
||||
max_concurrency = min(256, max_total_requests)
|
||||
batch_size = max(1, max_total_requests // 256)
|
||||
|
||||
step = max(1, max_concurrency // 32)
|
||||
min_concurrency = step
|
||||
|
||||
exec_args = f"""-m {model_name} \
|
||||
-x 1 \
|
||||
-p {measurement_window} \
|
||||
-v \
|
||||
-i http \
|
||||
-u {server_url}:8000 \
|
||||
-b {batch_size} \
|
||||
-f {performance_file} \
|
||||
--concurrency-range {min_concurrency}:{max_concurrency}:{step} \
|
||||
--input-data {profiling_data} {input_shapes}"""
|
||||
|
||||
if shared_memory:
|
||||
exec_args += " --shared-memory=cuda"
|
||||
|
||||
result = os.system(f"perf_client {exec_args}")
|
||||
if result != 0:
|
||||
print(f"Failed running performance tests. Perf client failed with exit code {result}")
|
||||
sys.exit(1)
|
||||
|
||||
results = list()
|
||||
update_performance_data(results=results, performance_file=performance_file)
|
||||
|
||||
results = sort_results(results=results)
|
||||
|
||||
save_results(filename=result_path, data=results)
|
||||
show_results(results=results)
|
||||
|
||||
os.remove(performance_file)
|
||||
|
||||
print("Performance results for dynamic batching stored in: {0}".format(result_path))
|
||||
|
||||
print("\n")
|
||||
print(f"==== Analysis done ====")
|
||||
print("\n")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--model-name", type=str, required=True, help="Name of the model to test")
|
||||
parser.add_argument(
|
||||
"--input-data", type=str, required=False, default="random", help="Input data to perform profiling."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-shape",
|
||||
action="append",
|
||||
required=False,
|
||||
help="Input data shape in form INPUT_NAME:<full_shape_without_batch_axis>.",
|
||||
)
|
||||
parser.add_argument("--batch-sizes", type=str, required=True, help="List of batch sizes to tests. Comma separated.")
|
||||
parser.add_argument("--triton-instances", type=int, default=1, help="Number of Triton Server instances")
|
||||
parser.add_argument(
|
||||
"--number-of-model-instances", type=int, default=1, help="Number of models instances on Triton Server"
|
||||
)
|
||||
parser.add_argument("--result-path", type=str, required=True, help="Path where result file is going to be stored.")
|
||||
parser.add_argument("--server-url", type=str, required=False, default="localhost", help="Url to Triton server")
|
||||
parser.add_argument(
|
||||
"--measurement-window", required=False, help="Time which perf_analyzer will wait for results", default=10000
|
||||
)
|
||||
parser.add_argument("--shared-memory", help="Use shared memory for communication with Triton", action="store_true",
|
||||
default=False)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
warmup(
|
||||
server_url=args.server_url,
|
||||
model_name=args.model_name,
|
||||
batch_sizes=_parse_batch_sizes(args.batch_sizes),
|
||||
triton_instances=args.triton_instances,
|
||||
triton_gpu_engine_count=args.number_of_model_instances,
|
||||
profiling_data=args.input_data,
|
||||
input_shapes=args.input_shape,
|
||||
measurement_window=args.measurement_window,
|
||||
shared_memory=args.shared_memory
|
||||
)
|
||||
|
||||
online_performance(
|
||||
server_url=args.server_url,
|
||||
model_name=args.model_name,
|
||||
batch_sizes=_parse_batch_sizes(args.batch_sizes),
|
||||
triton_instances=args.triton_instances,
|
||||
triton_gpu_engine_count=args.number_of_model_instances,
|
||||
profiling_data=args.input_data,
|
||||
input_shapes=args.input_shape,
|
||||
result_path=args.result_path,
|
||||
measurement_window=args.measurement_window,
|
||||
shared_memory=args.shared_memory
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
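Analogously to the offline script, a usage sketch for the online (dynamic batching) test, reusing the variables exported by the setup scripts (`NUMBER_OF_MODEL_INSTANCES` comes from `setup_parameters.sh`); the result file name is an arbitrary choice:

```shell script
python triton/run_online_performance_test_on_triton.py \
    --server-url ${TRITON_SERVER_URL} \
    --model-name ${MODEL_NAME} \
    --batch-sizes ${BATCH_SIZE} \
    --triton-instances ${TRITON_INSTANCES} \
    --number-of-model-instances ${NUMBER_OF_MODEL_INSTANCES} \
    --result-path ${SHARED_DIR}/triton_performance_online.csv
```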
16
PyTorch/Segmentation/nnUNet/triton/scripts/docker/build.sh
Executable file
|
@ -0,0 +1,16 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
docker build -t nnunet . -f Dockerfile
|
|
@ -0,0 +1,27 @@
|
|||
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
docker run -it --rm \
|
||||
--gpus "device=all" \
|
||||
--net=host \
|
||||
--shm-size=1g \
|
||||
--ulimit memlock=-1 \
|
||||
--ulimit stack=67108864 \
|
||||
-e WORKDIR="$(pwd)" \
|
||||
-e PYTHONPATH=$(pwd) \
|
||||
-v $(pwd):$(pwd) \
|
||||
-v /mnt/nvdl/usr/jzarzycki/nnunet_pyt/results:/data \
|
||||
-v /mnt/nvdl/usr/jzarzycki/nnunet_pyt/results:/results \
|
||||
-w $(pwd) \
|
||||
nnunet:latest bash
|
32
PyTorch/Segmentation/nnUNet/triton/scripts/docker/triton_inference_server.sh
Executable file
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:=all}
|
||||
|
||||
docker run --rm -d \
|
||||
-p 8000:8000 \
|
||||
-p 8001:8001 \
|
||||
-p 8002:8002 \
|
||||
--runtime=nvidia \
|
||||
-e NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES} \
|
||||
-v ${MODEL_REPOSITORY_PATH}:${MODEL_REPOSITORY_PATH} \
|
||||
--shm-size=1g \
|
||||
--ulimit memlock=-1 \
|
||||
--ulimit stack=67108864 \
|
||||
nvcr.io/nvidia/tritonserver:21.02-py3 tritonserver \
|
||||
--model-store=${MODEL_REPOSITORY_PATH} \
|
||||
--strict-model-config=false \
|
||||
--exit-on-error=true \
|
||||
--model-control-mode=explicit
|
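Because the server is started with `--model-control-mode=explicit`, it does not load models automatically; a load request has to be issued once the model has been placed in `${MODEL_REPOSITORY_PATH}`. A minimal sketch using Triton's HTTP model-repository endpoint (assuming port 8000 is exposed as mapped above):

```shell script
# ask the running Triton server to load the nnU-Net model from the model store
curl -X POST http://${TRITON_SERVER_URL}:8000/v2/repository/models/${MODEL_NAME}/load
```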
27
PyTorch/Segmentation/nnUNet/triton/scripts/download_data.sh
Executable file
|
@ -0,0 +1,27 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
if [ -f "${CHECKPOINT_DIR}/nnunet_pyt_ckpt_3d_fold2_amp_21.02.0.zip" ]; then
|
||||
echo "Checkpoint already downloaded."
|
||||
else
|
||||
echo "Downloading checkpoint ..."
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/nnunet_pyt_ckpt_3d_fold2_amp/versions/21.02.0/zip -O nnunet_pyt_ckpt_3d_fold2_amp_21.02.0.zip || {
|
||||
echo "ERROR: Failed to download checkpoint from NGC"
|
||||
exit 1
|
||||
}
|
||||
unzip nnunet_pyt_ckpt_3d_fold2_amp_21.02.0.zip -d ${CHECKPOINT_DIR}
|
||||
rm nnunet_pyt_ckpt_3d_fold2_amp_21.02.0.zip
|
||||
echo "ok"
|
||||
fi
|
||||
|
19
PyTorch/Segmentation/nnUNet/triton/scripts/process_dataset.sh
Executable file
|
@ -0,0 +1,19 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
python download.py --task 01 --results ${DATASETS_DIR}
|
||||
python preprocess.py --task 01 --dim 3 --data ${DATASETS_DIR} --results ${DATASETS_DIR}
|
||||
|
||||
|
33
PyTorch/Segmentation/nnUNet/triton/scripts/setup_environment.sh
Executable file
|
@ -0,0 +1,33 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
WORKDIR="$(pwd)"
|
||||
export WORKSPACE_DIR=${WORKDIR}/workspace
|
||||
export DATASETS_DIR=${WORKSPACE_DIR}/datasets_dir/01_3d/
|
||||
export CHECKPOINT_DIR=${WORKSPACE_DIR}/checkpoint_dir
|
||||
export MODEL_REPOSITORY_PATH=${WORKSPACE_DIR}/model_store
|
||||
export SHARED_DIR=${WORKSPACE_DIR}/shared_dir
|
||||
|
||||
echo "Preparing directories"
|
||||
mkdir -p ${WORKSPACE_DIR}
|
||||
mkdir -p ${DATASETS_DIR}
|
||||
mkdir -p ${CHECKPOINT_DIR}
|
||||
mkdir -p ${MODEL_REPOSITORY_PATH}
|
||||
mkdir -p ${SHARED_DIR}
|
||||
|
||||
echo "Setting up environment"
|
||||
export MODEL_NAME=nnunet
|
||||
export TRITON_LOAD_MODEL_METHOD=explicit
|
||||
export TRITON_INSTANCES=1
|
||||
export TRITON_SERVER_URL=127.0.0.1
|
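These variables are consumed by the other scripts in `triton/scripts/`, so the file is meant to be sourced in the current shell rather than executed in a subshell. A sketch of the assumed order of steps:

```shell script
# export paths and model settings into the current shell, then fetch the checkpoint and dataset
source triton/scripts/setup_environment.sh
source triton/scripts/setup_parameters.sh
bash triton/scripts/download_data.sh
bash triton/scripts/process_dataset.sh
```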
24
PyTorch/Segmentation/nnUNet/triton/scripts/setup_parameters.sh
Executable file
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
export PRECISION="fp16"
|
||||
export FORMAT="ts-script"
|
||||
export BATCH_SIZE="1,2,4"
|
||||
export BACKEND_ACCELERATOR="cuda"
|
||||
export MAX_BATCH_SIZE="4"
|
||||
export NUMBER_OF_MODEL_INSTANCES="1"
|
||||
export TRITON_MAX_QUEUE_DELAY="1"
|
||||
export TRITON_PREFERRED_BATCH_SIZES="2 4"
|
||||
|
|
@ -190,9 +190,6 @@ def get_main_args(strings=None):
|
|||
arg("--tta", action="store_true", help="Enable test time augmentation")
|
||||
arg("--amp", action="store_true", help="Enable automatic mixed precision")
|
||||
arg("--benchmark", action="store_true", help="Run model benchmarking")
|
||||
arg("--deep_supervision", action="store_true", help="Enable deep supervision")
|
||||
arg("--drop_block", action="store_true", help="Enable drop block")
|
||||
arg("--attention", action="store_true", help="Enable attention in decoder")
|
||||
arg("--residual", action="store_true", help="Enable residual block in encoder")
|
||||
arg("--focal", action="store_true", help="Use focal loss instead of cross entropy")
|
||||
arg("--sync_batchnorm", action="store_true", help="Enable synchronized batchnorm")
|
||||
|
|