Merge pull request #773 from swethmandava/master

remove trt, fix queuing delay typo in triton readme for bert
This commit is contained in:
Swetha Mandava 2020-12-03 08:24:48 -08:00 committed by GitHub
commit 99b1c898ce
4 changed files with 2 additions and 195 deletions


@@ -187,7 +187,7 @@ Performance numbers for BERT Large, sequence length=384 are obtained from [exper
![](../data/images/bert_trt_throughput_vs_latency.png?raw=true)
The plot above shows that throughput gains taper off from increasing batch size above 12. There is minimal gain in throughput going from batch size 12 to 128. However, running inference with a single large batch might be faster than running several small inference requests. Therefore, we choose to maximize batch size for Dynamic Batching with a maximum acceptable queuing delay of 50ms and maximum acceptable inference latency of 100ms.
The plot above shows that throughput gains taper off from increasing batch size above 12. There is minimal gain in throughput going from batch size 12 to 128. However, running inference with a single large batch might be faster than running several small inference requests. Therefore, we choose to maximize batch size for Dynamic Batching with a maximum acceptable queuing delay of 1ms and maximum acceptable inference latency of 100ms.
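For reference, these Dynamic Batching settings map onto the `dynamic_batching` stanza of the model's `config.pbtxt`. A minimal sketch, assuming Triton's standard model-configuration schema (the batch size of 12 and the 1 ms queue delay come from the paragraph above; the exact file shipped in this repo may differ):

```
dynamic_batching {
  preferred_batch_size: [ 12 ]
  max_queue_delay_microseconds: 1000
}
```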
### Dynamic Batching Support
@@ -232,4 +232,4 @@ April 2020
TRTIS -> TRITON
October 2019
Initial release
Initial release


@@ -1,95 +0,0 @@
#!/usr/bin/env python3
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorrt as trt
import os
import pycuda.driver as cuda
import pycuda.autoinit # lgtm[py/unused-import]
import numpy as np
import helpers.tokenization as tokenization
import helpers.data_processing as dp
class BertCalibrator(trt.IInt8MinMaxCalibrator):
    def __init__(self, squad_json, vocab_file, cache_file, batch_size, max_seq_length, num_inputs):
        # Whenever you specify a custom constructor for a TensorRT class,
        # you MUST call the constructor of the parent explicitly.
        trt.IInt8MinMaxCalibrator.__init__(self)

        self.cache_file = cache_file

        # Every time get_batch is called, the next batch of size batch_size will be copied to the device and returned.
        self.data = dp.read_squad_json(squad_json)
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.current_index = 0
        self.num_inputs = num_inputs
        self.tokenizer = tokenization.BertTokenizer(vocab_file=vocab_file, do_lower_case=True)
        self.doc_stride = 128
        self.max_query_length = 64

        # Allocate enough memory for a whole batch.
        self.device_inputs = [cuda.mem_alloc(self.max_seq_length * trt.int32.itemsize * self.batch_size) for binding in range(3)]

    def free(self):
        for dinput in self.device_inputs:
            dinput.free()

    def get_batch_size(self):
        return self.batch_size

    # TensorRT passes along the names of the engine bindings to the get_batch function.
    # You don't necessarily have to use them, but they can be useful to understand the order of
    # the inputs. The bindings list is expected to have the same ordering as 'names'.
    def get_batch(self, bindings, names):
        if self.current_index + self.batch_size > self.num_inputs:
            print("Calibrating index {:} batch size {:} exceed max input limit {:} sentences".format(self.current_index, self.batch_size, self.num_inputs))
            return None

        current_batch = int(self.current_index / self.batch_size)
        if current_batch % 10 == 0:
            print("Calibrating batch {:}, containing {:} sentences".format(current_batch, self.batch_size))

        input_ids = []
        segment_ids = []
        input_mask = []
        for i in range(self.batch_size):
            example = self.data[self.current_index + i]
            features = dp.convert_example_to_features(example.doc_tokens, example.question_text, self.tokenizer, self.max_seq_length, self.doc_stride, self.max_query_length)
            if len(input_ids) and len(segment_ids) and len(input_mask):
                input_ids = np.concatenate((input_ids, features[0].input_ids))
                segment_ids = np.concatenate((segment_ids, features[0].segment_ids))
                input_mask = np.concatenate((input_mask, features[0].input_mask))
            else:
                input_ids = features[0].input_ids
                segment_ids = features[0].segment_ids
                input_mask = features[0].input_mask

        cuda.memcpy_htod(self.device_inputs[0], input_ids.ravel())
        cuda.memcpy_htod(self.device_inputs[1], segment_ids.ravel())
        cuda.memcpy_htod(self.device_inputs[2], input_mask.ravel())

        self.current_index += self.batch_size
        return self.device_inputs

    def read_calibration_cache(self):
        # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None.
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                return f.read()

    def write_calibration_cache(self, cache):
        with open(self.cache_file, "wb") as f:
            f.write(cache)
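The class above (deleted in this PR along with the rest of the TensorRT path) implements TensorRT's `IInt8MinMaxCalibrator` interface. As a hypothetical sketch of how such a calibrator is typically attached to an INT8 engine build via the TensorRT Python API — the paths, sizes, and `build_int8_engine` wrapper below are illustrative placeholders, not code from this repo:

```python
import tensorrt as trt

def build_int8_engine(builder, network):
    # Illustrative only: file paths and sizes below are placeholders.
    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.INT8)  # request INT8 kernels
    # TensorRT drives calibration by calling get_batch() until it returns None,
    # then persists the resulting scales via write_calibration_cache().
    config.int8_calibrator = BertCalibrator(
        squad_json="dev-v1.1.json",            # placeholder path
        vocab_file="vocab.txt",                # placeholder path
        cache_file="bert_calibration.cache",
        batch_size=8,
        max_seq_length=384,
        num_inputs=800,
    )
    return builder.build_engine(network, config)
```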

File diff suppressed because one or more lines are too long


@@ -1,97 +0,0 @@
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)

def evaluate(dataset, predictions, f1_acc):
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                              ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    if (f1 < f1_acc - 0.5):
        print("&&&& FAILED TensorRT BERT Squad Accuracy matches reference.")
    else:
        print("&&&& PASSED TensorRT BERT Squad Accuracy matches reference.")
    return {'exact_match': exact_match, 'f1': f1}

if __name__ == '__main__':
    expected_version = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + expected_version)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    parser.add_argument('f1_acc', help='Reference Accuracy')
    args = parser.parse_args()
    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        if (dataset_json['version'] != expected_version):
            print('Evaluation expects v-' + expected_version +
                  ', but got dataset with v-' + dataset_json['version'],
                  file=sys.stderr)
        dataset = dataset_json['data']
    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)
    f1_acc = float(args.f1_acc)
    print(json.dumps(evaluate(dataset, predictions, f1_acc)))
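As an illustration of the two metrics used above (not part of the original script), calling `exact_match_score` and `f1_score` on a toy example, assuming the functions above are in scope (e.g., imported from the script):

```python
# Toy example: article removal in normalize_answer makes "the Broncos" match "Broncos" exactly.
pred = "the Broncos"
answers = ["Denver Broncos", "Broncos", "The Denver Broncos"]

em = metric_max_over_ground_truths(exact_match_score, pred, answers)
f1 = metric_max_over_ground_truths(f1_score, pred, answers)
print(em, f1)  # -> True 1.0 (best F1 is against "Broncos")
```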