Merge pull request #773 from swethmandava/master

remove trt, fix queuing delay typo in triton readme for bert
2020-12-03 08:24:48 -08:00 · 2020-12-03 08:24:48 -08:00 · 99b1c898ce
parent 66667f1839 33ea90e1cc
commit 99b1c898ce
4 changed files with 2 additions and 195 deletions
--- a/TensorFlow/LanguageModeling/BERT/triton/README.md
+++ b/TensorFlow/LanguageModeling/BERT/triton/README.md
@@ -187,7 +187,7 @@ Performance numbers for BERT Large, sequence length=384 are obtained from [exper

 ![](../data/images/bert_trt_throughput_vs_latency.png?raw=true)

-The plot above shows that throughput gains taper off from increasing batch size above 12. There is minimal gain in throughput going from batch size 12 to 128. However, running inference with a single large batch might be faster than running several small inference requests. Therefore, we choose to maximize batch size for Dynamic Batching with a maximum acceptable queuing delay of 50ms and maximum acceptable inference latency of 100ms.
+The plot above shows that throughput gains taper off from increasing batch size above 12. There is minimal gain in throughput going from batch size 12 to 128. However, running inference with a single large batch might be faster than running several small inference requests. Therefore, we choose to maximize batch size for Dynamic Batching with a maximum acceptable queuing delay of 1ms and maximum acceptable inference latency of 100ms.

 ### Dynamic Batching Support

@@ -232,4 +232,4 @@ April 2020
 TRTIS -> TRITON

 October 2019
-Initial release
+Initial release
--- a/TensorFlow/LanguageModeling/BERT/trt/helpers/calibrator.py
+++ b/TensorFlow/LanguageModeling/BERT/trt/helpers/calibrator.py
@@ -1,95 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tensorrt as trt
-import os
-
-import pycuda.driver as cuda
-import pycuda.autoinit # lgtm[py/unused-import]
-import numpy as np
-import helpers.tokenization as tokenization
-import helpers.data_processing as dp
-
-class BertCalibrator(trt.IInt8MinMaxCalibrator):
-    def __init__(self, squad_json, vocab_file, cache_file, batch_size, max_seq_length, num_inputs):
-        # Whenever you specify a custom constructor for a TensorRT class,
-        # you MUST call the constructor of the parent explicitly.
-        trt.IInt8MinMaxCalibrator.__init__(self)
-
-        self.cache_file = cache_file
-
-        # Every time get_batch is called, the next batch of size batch_size will be copied to the device and returned.
-        self.data = dp.read_squad_json(squad_json)
-        self.max_seq_length = max_seq_length
-        self.batch_size = batch_size
-        self.current_index = 0
-        self.num_inputs = num_inputs
-        self.tokenizer = tokenization.BertTokenizer(vocab_file=vocab_file, do_lower_case=True)
-        self.doc_stride = 128
-        self.max_query_length = 64
-
-        # Allocate enough memory for a whole batch.
-        self.device_inputs = [cuda.mem_alloc(self.max_seq_length * trt.int32.itemsize * self.batch_size) for binding in range(3)]
-
-    def free(self):
-        for dinput in self.device_inputs:
-            dinput.free()
-
-    def get_batch_size(self):
-        return self.batch_size
-
-    # TensorRT passes along the names of the engine bindings to the get_batch function.
-    # You don't necessarily have to use them, but they can be useful to understand the order of
-    # the inputs. The bindings list is expected to have the same ordering as 'names'.
-    def get_batch(self, bindings, names):
-        if self.current_index + self.batch_size > self.num_inputs:
-            print("Calibrating index {:} batch size {:} exceed max input limit {:} sentences".format(self.current_index, self.batch_size, self.num_inputs))
-            return None
-
-        current_batch = int(self.current_index / self.batch_size)
-        if current_batch % 10 == 0:
-            print("Calibrating batch {:}, containing {:} sentences".format(current_batch, self.batch_size))
-
-        input_ids = []
-        segment_ids = []
-        input_mask = []
-        for i in range(self.batch_size):
-            example = self.data[self.current_index + i]
-            features = dp.convert_example_to_features(example.doc_tokens, example.question_text, self.tokenizer, self.max_seq_length, self.doc_stride, self.max_query_length)
-            if len(input_ids) and len(segment_ids) and len(input_mask):
-                input_ids = np.concatenate((input_ids, features[0].input_ids))
-                segment_ids = np.concatenate((segment_ids, features[0].segment_ids))
-                input_mask = np.concatenate((input_mask, features[0].input_mask))
-            else:
-                input_ids = features[0].input_ids
-                segment_ids = features[0].segment_ids
-                input_mask = features[0].input_mask
-
-        cuda.memcpy_htod(self.device_inputs[0], input_ids.ravel())
-        cuda.memcpy_htod(self.device_inputs[1], segment_ids.ravel())
-        cuda.memcpy_htod(self.device_inputs[2], input_mask.ravel())
-
-        self.current_index += self.batch_size
-        return self.device_inputs
-
-    def read_calibration_cache(self):
-        # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None.
-        if os.path.exists(self.cache_file):
-            with open(self.cache_file, "rb") as f:
-                return f.read()
-
-    def write_calibration_cache(self, cache):
-        with open(self.cache_file, "wb") as f:
-            f.write(cache)
--- a/TensorFlow/LanguageModeling/BERT/trt/squad/dev-v1.1.json
+++ b/TensorFlow/LanguageModeling/BERT/trt/squad/dev-v1.1.json
--- a/TensorFlow/LanguageModeling/BERT/trt/squad/evaluate-v1.1.py
+++ b/TensorFlow/LanguageModeling/BERT/trt/squad/evaluate-v1.1.py
@@ -1,97 +0,0 @@
-""" Official evaluation script for v1.1 of the SQuAD dataset. """
-from __future__ import print_function
-from collections import Counter
-import string
-import re
-import argparse
-import json
-import sys
-
-def normalize_answer(s):
-    """Lower text and remove punctuation, articles and extra whitespace."""
-    def remove_articles(text):
-        return re.sub(r'\b(a|an|the)\b', ' ', text)
-
-    def white_space_fix(text):
-        return ' '.join(text.split())
-
-    def remove_punc(text):
-        exclude = set(string.punctuation)
-        return ''.join(ch for ch in text if ch not in exclude)
-
-    def lower(text):
-        return text.lower()
-
-    return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def f1_score(prediction, ground_truth):
-    prediction_tokens = normalize_answer(prediction).split()
-    ground_truth_tokens = normalize_answer(ground_truth).split()
-    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
-    num_same = sum(common.values())
-    if num_same == 0:
-        return 0
-    precision = 1.0 * num_same / len(prediction_tokens)
-    recall = 1.0 * num_same / len(ground_truth_tokens)
-    f1 = (2 * precision * recall) / (precision + recall)
-    return f1
-
-
-def exact_match_score(prediction, ground_truth):
-    return (normalize_answer(prediction) == normalize_answer(ground_truth))
-
-
-def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
-    scores_for_ground_truths = []
-    for ground_truth in ground_truths:
-        score = metric_fn(prediction, ground_truth)
-        scores_for_ground_truths.append(score)
-    return max(scores_for_ground_truths)
-
-def evaluate(dataset, predictions, f1_acc):
-    f1 = exact_match = total = 0
-    for article in dataset:
-        for paragraph in article['paragraphs']:
-            for qa in paragraph['qas']:
-                total += 1
-                if qa['id'] not in predictions:
-                    message = 'Unanswered question ' + qa['id'] + \
-                              ' will receive score 0.'
-                    print(message, file=sys.stderr)
-                    continue
-                ground_truths = list(map(lambda x: x['text'], qa['answers']))
-                prediction = predictions[qa['id']]
-                exact_match += metric_max_over_ground_truths(
-                    exact_match_score, prediction, ground_truths)
-                f1 += metric_max_over_ground_truths(
-                    f1_score, prediction, ground_truths)
-
-    exact_match = 100.0 * exact_match / total
-    f1 = 100.0 * f1 / total
-    if (f1 < f1_acc - 0.5):
-        print("&&&& FAILED TensorRT BERT Squad Accuracy matches reference.")
-    else:
-        print("&&&& PASSED TensorRT BERT Squad Accuracy matches reference.")
-    return {'exact_match': exact_match, 'f1': f1}
-
-if __name__ == '__main__':
-    expected_version = '1.1'
-    parser = argparse.ArgumentParser(
-        description='Evaluation for SQuAD ' + expected_version)
-    parser.add_argument('dataset_file', help='Dataset file')
-    parser.add_argument('prediction_file', help='Prediction File')
-    parser.add_argument('f1_acc', help='Reference Accuracy')
-    args = parser.parse_args()
-    with open(args.dataset_file) as dataset_file:
-        dataset_json = json.load(dataset_file)
-        if (dataset_json['version'] != expected_version):
-            print('Evaluation expects v-' + expected_version +
-                  ', but got dataset with v-' + dataset_json['version'],
-                  file=sys.stderr)
-        dataset = dataset_json['data']
-    with open(args.prediction_file) as prediction_file:
-        predictions = json.load(prediction_file)
-        f1_acc = float(args.f1_acc)
-    print(json.dumps(evaluate(dataset, predictions, f1_acc)))
-