DeepLearningExamples/FasterTransformer/v2/fastertransformer/allocator.h

127 lines
3.2 KiB
C++

/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Memory Allocator
**/
#pragma once
#include "fastertransformer/common.h"
#include "fastertransformer/utils.h"
#include <cuda_runtime.h>
#ifdef GOOGLE_CUDA
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#endif
namespace fastertransformer
{
/**
 * Abstract interface shared by every memory allocator in this header.
 *
 * Concrete allocators implement malloc()/free(). Both are const so that an
 * allocator can be used through a const reference or pointer.
 */
class IAllocator
{
public:
  // Virtual so that destroying a concrete allocator through an IAllocator
  // pointer runs the derived destructor instead of being undefined behaviour.
  virtual ~IAllocator() = default;

  /// Returns a buffer of at least `size` bytes; ownership rules are defined
  /// by the concrete allocator.
  virtual void *malloc(size_t size) const = 0;

  /// Hands back a buffer previously obtained from malloc() on this allocator.
  virtual void free(void *ptr) const = 0;
};
// Primary template for the concrete allocators, selected by an AllocatorType
// value. It is only declared here (never defined); usable allocators are the
// explicit specializations that follow in this file.
template <AllocatorType AllocType_>
class Allocator;
template <>
class Allocator<AllocatorType::CUDA> : public IAllocator
{
const int device_id_;
public:
Allocator(int device_id) : device_id_(device_id) {}
void *malloc(size_t size) const
{
void *ptr = nullptr;
int o_device = 0;
check_cuda_error(get_set_device(device_id_, &o_device));
check_cuda_error(cudaMalloc(&ptr, size));
check_cuda_error(get_set_device(o_device));
return ptr;
}
void free(void *ptr) const
{
int o_device = 0;
check_cuda_error(get_set_device(device_id_, &o_device));
check_cuda_error(cudaFree(ptr));
check_cuda_error(get_set_device(o_device));
return;
}
};
// TensorFlow-backed allocator, compiled only when GOOGLE_CUDA is defined.
// The OpKernelContext used for allocations is passed to its constructor.
#ifdef GOOGLE_CUDA
using namespace tensorflow;
template <>
class Allocator<AllocatorType::TF> : public IAllocator
{
OpKernelContext *context_;
std::vector<Tensor> *allocated_tensor_vector;
public:
Allocator(OpKernelContext *context) : context_(context)
{
allocated_tensor_vector = new std::vector<Tensor>;
}
void *malloc(size_t size) const
{
Tensor buf;
long long int buf_size = (long long int)size;
tensorflow::Status status = context_->allocate_temp(DT_UINT8, TensorShape{buf_size}, &buf);
allocated_tensor_vector->push_back(buf);
if (status != tensorflow::Status::OK())
throw std::runtime_error("TF error: context->allocate_temp failed");
auto flat = buf.flat<uint8>();
void *ptr = (void *)flat.data();
cudaMemset(ptr, 0, buf_size);
return ptr;
}
void free(void *ptr) const
{
#ifndef NDEBUG
printf("call from allocator free\n");
#endif
return;
}
~Allocator()
{
allocated_tensor_vector->clear();
delete allocated_tensor_vector;
}
};
#endif
} //namespace fastertransformer