DeepLearningExamples/FasterTransformer/v3.1/fastertransformer/cuda/topk_kernels.cuh
byshiue ae76b894b9
Byshiue patch 2 (#788)
[FasterTransformer] feat: Update FasterTransformer v3.1
2020-12-14 07:28:11 +08:00

190 lines
6.3 KiB
Plaintext

/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <array>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include "fastertransformer/arguments.h"
#include "fastertransformer/cuda/cuda_kernels.h"
#include <float.h>
#include <type_traits>
namespace fastertransformer{
// Feature switch: defined unconditionally here. Code that tests it is not in
// this header.
#define DO_SPLIT_SMALL_TOP_K_SOFTMAX

// Tuning constants; their users live outside this header.
// NOTE(review): names suggest a thread-block size and an upper bound on
// vocabulary partitions for the small-top-k softmax path - confirm at the
// use sites.
static constexpr int SMALL_TOP_K_SOFTMAX_THREADBLOCK_SIZE = 256;
static constexpr int SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128;

// Namespace-scope MAX_K. The TopK template below declares its own MAX_K
// template parameter, which hides this constant inside that template.
static constexpr int MAX_K = 4;

// Magnitude used as the "lowest value" sentinel when the element type is half
// (FLT_MAX is used for every other element type).
static constexpr float HALF_FLT_MAX = 65504.F;
/*
 * Fixed-capacity list of MAX_K (value, id) candidates.
 *
 * u[] stores the values and p[] the ids, slot by slot. An id of -1 marks a
 * slot as empty (that is the state init() produces). insert() moves better
 * candidates toward slot 0: larger value first, and for equal values the
 * smaller id first.
 */
template<typename T, int MAX_K>
struct TopK
{
    int p[MAX_K];  // candidate ids; -1 means the slot is empty
    T u[MAX_K];    // candidate values, parallel to p[]

    /*
     * Offers the candidate (elem, elem_id) to the list.
     *
     * Step 1: the last slot is overwritten when it is empty, when elem is
     *         strictly larger than its value, or when the values are equal
     *         and elem_id is the smaller id.
     * Step 2: a single pass from the last slot toward slot 0 swaps each pair
     *         of adjacent slots whenever the slot nearer the tail should rank
     *         ahead of the slot nearer the front (same three rules, with
     *         "empty" tested on the slot nearer the front).
     *
     * The empty-slot checks are what distinguish this from a plain
     * value/id comparison: they let a real candidate displace an unfilled
     * slot even when its value is not larger than the sentinel.
     *
     * NOTE(review): an offered elem_id of -1 whose value equals the last
     * slot's value also satisfies the tie rule, because -1 is smaller than
     * any non-negative id.
     */
    __device__ __forceinline__ void insert(T elem, int elem_id)
    {
        const int last = MAX_K - 1;

        const bool tailIsEmpty = (p[last] == -1);
        const bool beatsTail = (elem > u[last]);
        const bool winsTailTie = (elem == u[last]) && (elem_id < p[last]);
        if (beatsTail || tailIsEmpty || winsTailTie)
        {
            p[last] = elem_id;
            u[last] = elem;
        }

        // Bubble pass: 'back' walks from the last slot down to slot 1 and is
        // compared with its neighbour 'front' = back - 1.
        for (int back = last; back > 0; --back)
        {
            const int front = back - 1;

            const bool frontIsEmpty = (p[front] == -1);
            const bool backIsLarger = (u[back] > u[front]);
            const bool backWinsTie = (u[back] == u[front]) && (p[back] < p[front]);
            if (backIsLarger || frontIsEmpty || backWinsTie)
            {
                // Exchange the two slots (value and id together).
                const T backValue = u[back];
                const int backId = p[back];
                u[back] = u[front];
                p[back] = p[front];
                u[front] = backValue;
                p[front] = backId;
            }
        }
    }

    /*
     * Resets every slot to empty: id -1 and the lowest sentinel value, which
     * is -HALF_FLT_MAX when T is half and -FLT_MAX for any other T.
     */
    __device__ __forceinline__ void init()
    {
        const float magnitude = std::is_same<T, half>::value ? HALF_FLT_MAX : FLT_MAX;
        const T largest = magnitude;

        int slot = 0;
        while (slot < MAX_K)
        {
            p[slot] = -1;
            u[slot] = -largest;
            ++slot;
        }
    }
};
/*
 * Combines two TopK lists into one.
 *
 * Starts from a copy of `a` and offers every slot of `b` (value and id, in
 * slot order 0..MAX_K-1) to it through TopK::insert(). Empty slots of `b`
 * are offered as well; insert() decides whether they are kept.
 *
 * Returns the combined list by value; neither argument is modified.
 */
template<typename T, int MAX_K>
__device__ __forceinline__ TopK<T, MAX_K> reduce_topk_op(const TopK<T, MAX_K>& a, const TopK<T, MAX_K>& b)
{
    TopK<T, MAX_K> merged(a);

    int slot = 0;
    while (slot < MAX_K)
    {
        merged.insert(b.u[slot], b.p[slot]);
        ++slot;
    }

    return merged;
}
/*
 * Single best (value, id) candidate.
 *
 * A default-constructed object, and one reset with init(), holds id -1 and
 * the lowest sentinel value: -HALF_FLT_MAX when T is half, -FLT_MAX for any
 * other T.
 */
template<typename T>
struct TopK_2
{
    int p = -1;                                                        // id of the current best; -1 until something is accepted
    T u = -(std::is_same<T, half>::value ? HALF_FLT_MAX : FLT_MAX);    // value of the current best

    /*
     * Replaces the stored candidate with (elem, elem_id) only when elem is
     * strictly greater than the stored value. Equal values leave the stored
     * candidate untouched, so the earlier one is kept.
     */
    __device__ __forceinline__ void insert(T elem, int elem_id)
    {
        if (!(elem > u))
        {
            return;
        }
        p = elem_id;
        u = elem;
    }

    /*
     * Restores the default state: id -1 and the lowest sentinel value.
     */
    __device__ __forceinline__ void init()
    {
        p = -1;
        u = -(std::is_same<T, half>::value ? HALF_FLT_MAX : FLT_MAX);
    }
};
/*
 * Picks the better of two TopK_2 candidates.
 *
 * Returns a copy of `a` when a.u is strictly greater than b.u; in every other
 * case, including equal values, returns a copy of `b`.
 */
template<typename T>
__device__ __forceinline__ TopK_2<T> reduce_topk_op_2(const TopK_2<T>& a, const TopK_2<T>& b)
{
    if (a.u > b.u)
    {
        return a;
    }
    return b;
}
// Declaration only; the definition is not in this header, so the notes below
// are read off the signature.
//   log_probs        - T buffer, non-const (the callee is allowed to write it)
//   topk_tmp_id_buf  - int buffer; NOTE(review): presumably scratch space for
//                      candidate ids - confirm against the definition
//   topk_tmp_val_buf - T buffer; presumably scratch space for candidate
//                      values - confirm
//   ids              - int buffer; presumably receives the selected ids - confirm
//   args             - beam-search decoding arguments, passed by value
//   stream           - CUDA stream handle; presumably the stream the work is
//                      enqueued on - confirm
template <typename T>
void topK_kernelLauncher(T* log_probs,
int* topk_tmp_id_buf,
T* topk_tmp_val_buf,
int* ids,
DecodingBeamsearchArguments args,
cudaStream_t stream);
// Overload of topK_kernelLauncher that takes one untyped workspace instead of
// separate temporary id/value buffers. Declaration only; the definition is not
// in this header.
//   workspace      - untyped buffer supplied by the caller
//   workspace_size - passed by non-const reference, so the callee can write it;
//                    NOTE(review): presumably used to report the required size
//                    (e.g. when workspace is null) - confirm against the definition
//   log_probs      - T buffer, non-const (the callee is allowed to write it)
//   ids            - int buffer; presumably receives the selected ids - confirm
//   args           - beam-search decoding arguments, passed by value
//   stream         - CUDA stream handle; presumably the stream the work is
//                    enqueued on - confirm
template <typename T>
void topK_kernelLauncher(void* workspace,
size_t& workspace_size,
T* log_probs,
int* ids,
DecodingBeamsearchArguments args,
cudaStream_t stream);
// Declaration only; the definition is not in this header, so the notes below
// are read off the signature.
//   log_probs     - read-only T buffer
//   bias          - read-only float buffer
//   finished      - read-only bool buffer; NOTE(review): presumably one flag
//                   per sequence/beam - confirm against the definition
//   cum_log_probs - T buffer, non-const (the callee is allowed to write it)
//   ids           - int buffer, non-const (the callee is allowed to write it)
//   tmp_storage   - untyped scratch buffer supplied by the caller; the required
//                   size is not expressed in this signature
//   args          - beam-search decoding arguments, passed by value
//   stream        - CUDA stream handle; presumably the stream the work is
//                   enqueued on - confirm
template <typename T>
void topK_softMax(const T* log_probs,
const float* bias,
const bool* finished,
T* cum_log_probs,
int* ids,
void * tmp_storage,
DecodingBeamsearchArguments args,
cudaStream_t stream);
/* *************************** end of BeamSearch kernel *********************************** */
/* ********************************** Sampling kernel *********************************** */
// Declaration only; the definition is not in this header, so the notes below
// are read off the signature.
//   workspace       - untyped buffer supplied by the caller
//   workspace_size  - passed by non-const reference, so the callee can write it;
//                     NOTE(review): presumably used to report the required
//                     size - confirm against the definition
//   log_probs       - T buffer, non-const (the callee is allowed to write it)
//   ids             - int buffer; presumably receives the sampled ids - confirm
//   sequence_length - int buffer, non-const
//   finished_buf    - bool buffer, non-const
//   random_num      - int passed by value; presumably a seed or offset for the
//                     random sampling - confirm
//   args            - sampling decoding arguments, passed by value
//   stream          - CUDA stream handle; presumably the stream the work is
//                     enqueued on - confirm
template <typename T>
void topK_sampling_kernel_kernelLauncher(void* workspace,
size_t& workspace_size,
T* log_probs,
int* ids,
int* sequence_length,
bool* finished_buf,
int random_num,
DecodingSamplingArguments args,
cudaStream_t stream);
// Declaration only; the definition is not in this header, so the notes below
// are read off the signature.
//   workspace       - untyped buffer supplied by the caller
//   workspace_size  - passed by non-const reference, so the callee can write it;
//                     NOTE(review): presumably used to report the required
//                     size - confirm against the definition
//   log_probs       - read-only T buffer
//   id_vals         - read-only int buffer
//   offset_buf      - read-only int buffer
//   finished_buf    - bool buffer, non-const
//   step            - int passed by value; presumably the current decoding
//                     step - confirm
//   args            - sampling decoding arguments, passed by non-const
//                     reference (unlike the by-value args of the top-k launcher)
//   output_ids      - int buffer, non-const; presumably receives the sampled
//                     ids - confirm
//   sequence_length - int buffer, non-const
//   n               - const int; meaning not visible from this header
//   stream          - CUDA stream handle; presumably the stream the work is
//                     enqueued on - confirm
template<typename T>
void topP_sampling_kernel_kernelLauncher(void* workspace,
size_t& workspace_size,
const T* log_probs,
const int* id_vals,
const int* offset_buf,
bool* finished_buf,
int step,
DecodingSamplingArguments& args,
int* output_ids,
int* sequence_length,
const int n,
cudaStream_t stream);
// Declaration only; the definition is not in this header, so the notes below
// are read off the signature.
//   log_probs        - read-only T buffer
//   topk_tmp_id_buf  - int buffer, non-const; NOTE(review): presumably receives
//                      candidate ids - confirm against the definition
//   topk_tmp_val_buf - T buffer, non-const; presumably receives candidate
//                      values - confirm
//   args             - sampling decoding arguments, passed by value
//   stream           - CUDA stream handle; presumably the stream the work is
//                      enqueued on - confirm
template<typename T>
void beam_topK_kernelLauncher(const T* log_probs,
int* topk_tmp_id_buf,
T* topk_tmp_val_buf,
DecodingSamplingArguments args,
cudaStream_t stream);
// Declaration only; the definition is not in this header, so the notes below
// are read off the signature.
//   workspace      - untyped buffer supplied by the caller
//   workspace_size - passed by non-const reference, so the callee can write it;
//                    NOTE(review): presumably used to report the required
//                    size - confirm against the definition
//   output_ids     - int buffer, non-const; presumably receives the sampled
//                    ids - confirm
//   logits         - read-only T buffer
//   random_num     - const int; presumably a seed or offset for the random
//                    sampling - confirm
//   args           - sampling decoding arguments, passed by non-const reference
//   stream         - CUDA stream handle; presumably the stream the work is
//                    enqueued on - confirm
template<typename T>
void topK_topP_sampling_kernel_kernelLauncher(void* workspace,
size_t& workspace_size,
int* output_ids,
const T* logits,
const int random_num,
DecodingSamplingArguments& args,
cudaStream_t stream);
/* *************************** end of Sampling kernel *********************************** */
}//namespace fastertransformer