byshiue b2e89e6e80
[FT] FasterTransformer 3.0 Release (#696)
[FT] feat: Add FasterTransformer v3.0

1. Add INT8 quantization support for the C++ and TensorFlow ops.
2. Provide tools to quantize the model.
3. Fix the bugs that prevented building this project with CMake 3.15 and 3.16.
4. Deprecate FasterTransformer v1.
2020-09-23 10:03:37 +08:00

146 lines
6.1 KiB

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
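# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.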
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import argparse
import timeit
import torch
# import torch.cuda.nvtx as nvtx
from onmt.utils.misc import sequence_mask
from utils.decoder import DecoderWeights, CustomDecoder, ONMTDecoder, init_op_cache, init_onmt_cache
def _average_latency_ms(run_once, iterations):
    """Return the mean wall-clock time of one ``run_once()`` call, in ms.

    ``run_once`` is called ``iterations`` times as an untimed warm-up and then
    ``iterations`` more times under the timer.
    """
    for _ in range(iterations):
        run_once()
    # CUDA kernels are launched asynchronously: wait for the warm-up work to
    # finish before starting the clock, and for the timed work to finish
    # before stopping it, so the measurement covers execution, not just launch.
    torch.cuda.synchronize()
    start = timeit.default_timer()
    for _ in range(iterations):
        run_once()
    torch.cuda.synchronize()
    return (timeit.default_timer() - start) * 1000 / iterations


def main():
    """Compare the ONMT decoder with the FasterTransformer custom decoder.

    Parses the command line, builds random decoder input and memory tensors on
    the GPU, runs both decoders step by step while printing the relative
    difference of their outputs, and, with ``--time``, prints the average
    latency of each decoder over 10 timed iterations.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('batch_size', type=int,
                        help='batch size')
    parser.add_argument('layer_num', type=int,
                        help='number of layers')
    parser.add_argument('seq_len', type=int,
                        help='sequence length')
    parser.add_argument('head_num', type=int,
                        help='head number')
    parser.add_argument('head_size', type=int,
                        help='size per head')
    parser.add_argument('--step', type=int, default=0,
                        help='decoding step number')
    parser.add_argument('--fp16', action='store_true',
                        help='is fp16')
    parser.add_argument('--time', action='store_true',
                        help='test the time or not.')
    parser.add_argument('--module_path', type=str, default='./',
                        help='directory containing the th_fastertransformer dynamic lib')
    parser.add_argument('--ths', action='store_true',
                        help='use TorchScript mode')
    parser.add_argument('--ths_path', type=str, default='./lib/',
                        help='path of the ths_fastertransformer dynamic lib file')
    args = parser.parse_args()

    hidden_dim = args.head_num * args.head_size
    # A non-positive --step (including the default 0) falls back to seq_len;
    # previously the fallback was unconditionally overwritten by args.step.
    step = args.step if args.step > 0 else args.seq_len

    print("\n=============== Argument ===============")
    print('batch_size: ' + str(args.batch_size))
    print('layer_num: ' + str(args.layer_num))
    print('seq_len: ' + str(args.seq_len))
    print('head_num: ' + str(args.head_num))
    print('head_size: ' + str(args.head_size))
    print('hidden_dim: ' + str(hidden_dim))
    print('step: ' + str(step))
    print('use_fp16: ' + str(args.fp16))
    print('TorchScript mode: ' + str(args.ths))
    print('test_time: ' + str(args.time))

    # Random decoder input (one token per batch entry) and encoder memory.
    inp = torch.empty(args.batch_size, 1, hidden_dim).cuda()
    mem = torch.empty(args.batch_size, args.seq_len, hidden_dim).cuda()
    torch.nn.init.uniform_(inp, -1, 1)
    torch.nn.init.uniform_(mem, -1, 1)
    if args.fp16:
        inp = inp.half()
        mem = mem.half()
    mem_seq_lens = torch.randint(1, args.seq_len+1, (args.batch_size,), dtype=torch.int32).cuda()
    src_pad_mask = ~sequence_mask(mem_seq_lens, args.seq_len).unsqueeze(1)

    weights = DecoderWeights(args.layer_num, hidden_dim)

    onmt_decoder = ONMTDecoder(args.layer_num, args.head_num, args.head_size, weights)
    # NOTE(review): the statements guarded by the two `if args.fp16:` lines
    # were missing from this file. They are restored here so the decoder and
    # weights match the inputs above (CUDA tensors, half precision under
    # --fp16). This assumes ONMTDecoder is a torch.nn.Module and that
    # DecoderWeights exposes to_cuda()/to_half() -- confirm against
    # utils/decoder.py.
    onmt_decoder.cuda()
    if args.fp16:
        onmt_decoder.half()
    onmt_decoder.eval()

    weights.to_cuda()
    if args.fp16:
        weights.to_half()

    # Build exactly one custom decoder; previously the TorchScript variant was
    # always overwritten by the non-TorchScript one.
    if args.ths:
        custom_decoder = CustomDecoder(args.layer_num, args.head_num, args.head_size, weights,
                                       args.fp16, os.path.abspath(args.ths_path), args.ths)
    else:
        custom_decoder = CustomDecoder(args.layer_num, args.head_num, args.head_size, weights,
                                       args.fp16, os.path.abspath(args.module_path))

    with torch.no_grad():
        self_cache, mem_cache = init_op_cache(args.layer_num, args.batch_size, 1, args.seq_len, hidden_dim, args.fp16)
        cache = init_onmt_cache(args.layer_num, mem)
        output1 = inp
        output2 = inp

        # Feed each decoder its own previous output and report how far the two
        # implementations diverge at every step.
        for i in range(step):
            output1 = onmt_decoder(output1, mem, src_pad_mask, cache, 0)
            output2, self_cache, mem_cache = custom_decoder(output2, mem, mem_seq_lens, self_cache, mem_cache)
            diff = torch.abs((output1 - output2) / output1)
            print('step: {} Mean relative diff: {} Max relative diff: {} Min relative diff: {}'.format(
                i, torch.mean(diff), torch.max(diff), torch.min(diff)))

        if args.time:
            iterations = 10

            def run_onmt():
                # One full decode of `step` steps with a fresh ONMT cache.
                onmt_cache = init_onmt_cache(args.layer_num, mem)
                out = inp
                for _ in range(step):
                    out = onmt_decoder(out, mem, src_pad_mask, onmt_cache, 0)

            def run_custom():
                # One full decode of `step` steps with fresh op caches.
                op_self_cache, op_mem_cache = init_op_cache(
                    args.layer_num, args.batch_size, 1, args.seq_len, hidden_dim, args.fp16)
                out = inp
                for _ in range(step):
                    out, op_self_cache, op_mem_cache = custom_decoder(
                        out, mem, mem_seq_lens, op_self_cache, op_mem_cache)

            onmt_ms = _average_latency_ms(run_onmt, iterations)
            custom_ms = _average_latency_ms(run_custom, iterations)
            print("[INFO] ONMTDecoder time costs: {:.2f} ms".format(onmt_ms))
            print("[INFO] FTDecoder time costs: {:.2f} ms".format(custom_ms))
if __name__ == '__main__':