2019-07-13 18:29:45 +02:00
|
|
|
/*
|
2020-03-02 14:10:33 +01:00
|
|
|
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
2019-07-13 18:29:45 +02:00
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <cstdio>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
|
2019-07-13 18:29:45 +02:00
|
|
|
|
|
|
|
namespace fastertransformer{
|
|
|
|
|
2020-03-02 14:10:33 +01:00
|
|
|
// Scoped enumeration with two values, FP32 and FP16.
// NOTE(review): presumably selects the arithmetic precision of an operation -
// confirm against the code that consumes it.
enum class OperationType {
  FP32,
  FP16
};
|
2019-07-13 18:29:45 +02:00
|
|
|
// Scoped enumeration with two values, CUDA and TF.
// NOTE(review): presumably identifies which allocator backend is in use
// (TF likely meaning TensorFlow) - confirm against the allocator code.
enum class AllocatorType {
  CUDA,
  TF
};
|
|
|
|
|
|
|
|
// Prints "[FT][CALL] <name of the enclosing function> " followed by a newline
// to std::cout (std::endl also flushes the stream). The do/while(0) wrapper
// makes the macro behave as a single statement at the call site.
#define PRINT_FUNC_NAME_()                                            \
  do {                                                                \
    std::cout << "[FT][CALL] " << __FUNCTION__ << " " << std::endl;   \
  } while (0)
|
|
|
|
|
|
|
|
// Returns the description string for a CUDA runtime error code, as provided
// by cudaGetErrorString().
static const char *_cudaGetErrorEnum(cudaError_t error) {
  const char *description = cudaGetErrorString(error);
  return description;
}
|
|
|
|
|
|
|
|
// Maps a cuBLAS status code to the spelling of its enumerator.
// Returns "<unknown>" for any value that is not listed in the table below.
static const char *_cudaGetErrorEnum(cublasStatus_t error) {
  struct StatusName {
    cublasStatus_t code;
    const char *name;
  };

  static const StatusName kStatusNames[] = {
    {CUBLAS_STATUS_SUCCESS,          "CUBLAS_STATUS_SUCCESS"},
    {CUBLAS_STATUS_NOT_INITIALIZED,  "CUBLAS_STATUS_NOT_INITIALIZED"},
    {CUBLAS_STATUS_ALLOC_FAILED,     "CUBLAS_STATUS_ALLOC_FAILED"},
    {CUBLAS_STATUS_INVALID_VALUE,    "CUBLAS_STATUS_INVALID_VALUE"},
    {CUBLAS_STATUS_ARCH_MISMATCH,    "CUBLAS_STATUS_ARCH_MISMATCH"},
    {CUBLAS_STATUS_MAPPING_ERROR,    "CUBLAS_STATUS_MAPPING_ERROR"},
    {CUBLAS_STATUS_EXECUTION_FAILED, "CUBLAS_STATUS_EXECUTION_FAILED"},
    {CUBLAS_STATUS_INTERNAL_ERROR,   "CUBLAS_STATUS_INTERNAL_ERROR"},
    {CUBLAS_STATUS_NOT_SUPPORTED,    "CUBLAS_STATUS_NOT_SUPPORTED"},
    {CUBLAS_STATUS_LICENSE_ERROR,    "CUBLAS_STATUS_LICENSE_ERROR"},
  };

  // Linear scan: the table is tiny and this only runs on the error path.
  for (const StatusName &entry : kStatusNames) {
    if (entry.code == error) {
      return entry.name;
    }
  }
  return "<unknown>";
}
|
|
|
|
|
|
|
|
|
|
|
|
// Throws std::runtime_error when `result` is non-zero (i.e. converts to true).
//
// The exception message has the form
//   "[FT][ERROR] CUDA runtime error: <error name> <file>:<line> \n"
// where <error name> comes from the _cudaGetErrorEnum overload matching T.
//
//   result  status code to test; a zero value means success and is a no-op
//   func    source text of the checked expression; not included in the message
//   file    source file name reported in the message
//   line    source line number reported in the message
template <typename T>
void check(T result, char const *const func, const char *const file, int const line) {
  (void)func;  // accepted for the calling macro, intentionally unused here

  if (!result) {
    return;
  }

  std::string message("[FT][ERROR] CUDA runtime error: ");
  message += _cudaGetErrorEnum(result);
  message += " ";
  message += file;
  message += ":";
  message += std::to_string(line);
  message += " \n";
  throw std::runtime_error(message);
}
|
2020-03-02 14:10:33 +01:00
|
|
|
|
2019-07-13 18:29:45 +02:00
|
|
|
// Evaluates `val` once and forwards it to check() together with the
// stringified expression and the call site's file name and line number.
// NOTE(review): `check` is looked up unqualified, so this presumably only
// resolves where check() is visible at the point of use - confirm for callers
// outside this namespace.
#define check_cuda_error(val) \
  check((val), #val, __FILE__, __LINE__)
|
2020-03-02 14:10:33 +01:00
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
void print_to_file(T* result, const int size, char* file)
|
|
|
|
{
|
|
|
|
FILE* fd = fopen(file, "w");
|
|
|
|
float* tmp = (float*)malloc(sizeof(float) * size);
|
|
|
|
check_cuda_error(cudaMemcpy(tmp, result, sizeof(float) * size, cudaMemcpyDeviceToHost));
|
|
|
|
for(int i = 0; i < size; ++i)
|
|
|
|
fprintf(fd, "%f\n", (float)tmp[i]);
|
|
|
|
free(tmp);
|
|
|
|
fclose(fd);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
void print_to_screen(T* result, const int size)
|
|
|
|
{
|
|
|
|
float* tmp = (float*)malloc(sizeof(float) * size);
|
|
|
|
check_cuda_error(cudaMemcpy(tmp, result, sizeof(float) * size, cudaMemcpyDeviceToHost));
|
|
|
|
for(int i = 0; i < size; ++i)
|
|
|
|
printf("%d, %f\n", i, (float)tmp[i]);
|
|
|
|
free(tmp);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
void check_max_val(const T* result, const int size){
|
|
|
|
T* tmp = new T[size];
|
|
|
|
cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost);
|
|
|
|
float max_val = -100000;
|
|
|
|
for(int i = 0 ; i < size; i++){
|
|
|
|
float val = (float)(tmp[i]);
|
|
|
|
if(val > max_val) max_val = val;
|
|
|
|
}
|
|
|
|
delete tmp;
|
|
|
|
printf("[INFO][CUDA] addr %p max val: %f \n", result, max_val);
|
|
|
|
}
|
|
|
|
|
2019-07-13 18:29:45 +02:00
|
|
|
}//namespace fastertransformer
|