mirror of
https://github.com/matrix-construct/construct
synced 2024-12-03 04:02:47 +01:00
1235 lines
26 KiB
C++
1235 lines
26 KiB
C++
// The Construct
|
|
//
|
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
|
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
|
//
|
|
// Permission to use, copy, modify, and/or distribute this software for any
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
// copyright notice and this permission notice is present in all copies. The
|
|
// full license for this software is available in the LICENSE file.
|
|
|
|
#include <dlfcn.h>
|
|
#include <CL/cl.h>
|
|
|
|
// Util
|
|
namespace ircd::cl
|
|
{
|
|
static bool is_error(const int &code) noexcept;
|
|
static int throw_on_error(const int &code);
|
|
template<class func, class... args> static int call(func&&, args&&...);
|
|
template<class T = string_view, class F, class id, class param> static T info(F&&, const id &, const param &, const mutable_buffer &);
|
|
}
|
|
|
|
// Runtime state
|
|
namespace ircd::cl
|
|
{
|
|
static const size_t
|
|
OPTION_MAX {8},
|
|
PLATFORM_MAX {8},
|
|
DEVICE_MAX {8};
|
|
|
|
static uint
|
|
options,
|
|
platforms,
|
|
devices[PLATFORM_MAX];
|
|
|
|
static char
|
|
option[OPTION_MAX][256];
|
|
|
|
static void
|
|
*linkage;
|
|
|
|
static cl_platform_id
|
|
platform[PLATFORM_MAX];
|
|
|
|
static cl_device_id
|
|
device[PLATFORM_MAX][DEVICE_MAX];
|
|
|
|
static cl_context
|
|
primary;
|
|
|
|
static cl_command_queue
|
|
queue[PLATFORM_MAX][DEVICE_MAX];
|
|
|
|
static void handle_notify(const char *, const void *, size_t, void *) noexcept;
|
|
}
|
|
|
|
decltype(ircd::cl::log)
|
|
ircd::cl::log
|
|
{
|
|
"cl"
|
|
};
|
|
|
|
decltype(ircd::cl::version_api)
|
|
ircd::cl::version_api
|
|
{
|
|
"OpenCL", info::versions::API, CL_TARGET_OPENCL_VERSION,
|
|
{
|
|
#if defined(CL_VERSION_MAJOR) && defined(CL_VERSION_MINOR) && defined(CL_VERSION_PATCH)
|
|
CL_VERSION_MAJOR(CL_TARGET_OPENCL_VERSION),
|
|
CL_VERSION_MINOR(CL_TARGET_OPENCL_VERSION),
|
|
CL_VERSION_PATCH(CL_TARGET_OPENCL_VERSION),
|
|
#endif
|
|
}
|
|
};
|
|
|
|
decltype(ircd::cl::version_abi)
|
|
ircd::cl::version_abi
|
|
{
|
|
"OpenCL", info::versions::ABI
|
|
};
|
|
|
|
decltype(ircd::cl::enable)
|
|
ircd::cl::enable
|
|
{
|
|
{ "name", "ircd.cl.enable" },
|
|
{ "default", false },
|
|
{ "persist", false },
|
|
};
|
|
|
|
//
|
|
// init
|
|
//
|
|
|
|
ircd::cl::init::init()
|
|
{
|
|
if(!enable)
|
|
{
|
|
log::dwarning
|
|
{
|
|
log, "OpenCL hardware acceleration is not available or enabled."
|
|
};
|
|
|
|
return;
|
|
}
|
|
|
|
const ctx::posix::enable_pthread enable_pthread;
|
|
|
|
// Setup options
|
|
strlcpy{option[options++], "LP_NUM_THREADS=0"};
|
|
strlcpy{option[options++], "MESA_GLSL_CACHE_DISABLE=true"};
|
|
strlcpy{option[options++], "AMD_DEBUG=nogfx"};
|
|
assert(options <= OPTION_MAX);
|
|
|
|
// Configure options into the environment. TODO: XXX don't overwrite
|
|
while(options--)
|
|
sys::call(putenv, option[options]);
|
|
|
|
// Load the pipe.
|
|
assert(!linkage);
|
|
if(!(linkage = dlopen("libOpenCL.so", RTLD_LAZY | RTLD_GLOBAL)))
|
|
return;
|
|
|
|
// OpenCL sez platform=null is implementation defined.
|
|
info(clGetPlatformInfo, nullptr, CL_PLATFORM_VERSION, version_abi.string);
|
|
|
|
// Get the platforms.
|
|
call(clGetPlatformIDs, PLATFORM_MAX, platform, &platforms);
|
|
|
|
char buf[4][128];
|
|
for(size_t i(0); i < platforms; ++i)
|
|
log::logf
|
|
{
|
|
log, log::level::DEBUG,
|
|
"OpenCL:%d [%u][*] :%s :%s :%s :%s",
|
|
CL_TARGET_OPENCL_VERSION,
|
|
i,
|
|
info(clGetPlatformInfo, platform[i], CL_PLATFORM_VERSION, buf[0]),
|
|
info(clGetPlatformInfo, platform[i], CL_PLATFORM_VENDOR, buf[1]),
|
|
info(clGetPlatformInfo, platform[i], CL_PLATFORM_NAME, buf[2]),
|
|
info(clGetPlatformInfo, platform[i], CL_PLATFORM_EXTENSIONS, buf[3]),
|
|
};
|
|
|
|
size_t devices_total(0);
|
|
for(size_t i(0); i < platforms; ++i)
|
|
{
|
|
static const auto type
|
|
{
|
|
CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR
|
|
};
|
|
|
|
call(clGetDeviceIDs, platform[i], type, DEVICE_MAX, device[i], devices + i);
|
|
devices_total += devices[i];
|
|
}
|
|
|
|
for(size_t i(0); i < platforms; ++i)
|
|
for(size_t j(0); j < devices[i]; ++j)
|
|
log::info
|
|
{
|
|
log, "OpenCL:%d [%u][%u] :%s :%s :%s :%s",
|
|
CL_TARGET_OPENCL_VERSION,
|
|
i,
|
|
j,
|
|
info(clGetDeviceInfo, device[i][j], CL_DEVICE_VERSION, buf[1]),
|
|
info(clGetDeviceInfo, device[i][j], CL_DEVICE_VENDOR, buf[2]),
|
|
info(clGetDeviceInfo, device[i][j], CL_DEVICE_NAME, buf[3]),
|
|
info(clGetDeviceInfo, device[i][j], CL_DRIVER_VERSION, buf[0]),
|
|
};
|
|
|
|
// Gather all devices we'll use.
|
|
size_t _devs {0};
|
|
cl_device_id _dev[DEVICE_MAX];
|
|
for(size_t i(0); i < platforms; ++i)
|
|
for(size_t j(0); j < devices[i]; ++j)
|
|
_dev[_devs++] = device[i][j];
|
|
|
|
// Create a context from gathered devices.
|
|
cl_int err {CL_SUCCESS};
|
|
cl_context_properties ctxprop {0};
|
|
primary = clCreateContext(&ctxprop, _devs, _dev, handle_notify, nullptr, &err);
|
|
throw_on_error(err);
|
|
|
|
// Create a queue for each device.
|
|
cl_command_queue_properties qprop {0};
|
|
qprop |= CL_QUEUE_PROFILING_ENABLE;
|
|
for(size_t i(0); i < platforms; ++i)
|
|
for(size_t j(0); j < devices[i]; ++j)
|
|
{
|
|
queue[i][j] = clCreateCommandQueue(primary, device[i][j], qprop, &err);
|
|
throw_on_error(err);
|
|
}
|
|
|
|
work::init();
|
|
}
|
|
|
|
ircd::cl::init::~init()
|
|
noexcept
|
|
{
|
|
if(!linkage)
|
|
return;
|
|
|
|
const ctx::posix::enable_pthread enable_pthread;
|
|
if(primary)
|
|
{
|
|
log::debug
|
|
{
|
|
log, "Shutting down OpenCL...",
|
|
};
|
|
|
|
work::fini();
|
|
}
|
|
|
|
for(size_t i(0); i < PLATFORM_MAX; ++i)
|
|
for(size_t j(0); j < DEVICE_MAX; ++j)
|
|
if(queue[i][j])
|
|
{
|
|
call(clReleaseCommandQueue, queue[i][j]);
|
|
queue[i][j] = nullptr;
|
|
}
|
|
|
|
if(primary)
|
|
{
|
|
call(clReleaseContext, primary);
|
|
primary = nullptr;
|
|
}
|
|
|
|
dlclose(linkage);
|
|
}
|
|
|
|
//
|
|
// interface
|
|
//
|
|
|
|
void
|
|
ircd::cl::sync()
|
|
{
|
|
auto &q
|
|
{
|
|
queue[0][0]
|
|
};
|
|
|
|
call
|
|
(
|
|
clFinish, q
|
|
);
|
|
}
|
|
|
|
void
|
|
ircd::cl::flush()
|
|
{
|
|
auto &q
|
|
{
|
|
queue[0][0]
|
|
};
|
|
|
|
call
|
|
(
|
|
clFlush, q
|
|
);
|
|
}
|
|
|
|
//
|
|
// exec
|
|
//
|
|
|
|
namespace ircd::cl
|
|
{
|
|
static const size_t _deps_list_max {256};
|
|
static thread_local cl_event _deps_list[_deps_list_max];
|
|
|
|
static vector_view<cl_event> make_deps_default(cl::work *const &, const exec::opts &);
|
|
static vector_view<cl_event> make_deps(cl::work *const &, const exec::opts &);
|
|
}
|
|
|
|
template<>
|
|
decltype(ircd::util::instance_list<ircd::cl::work>::allocator)
|
|
ircd::util::instance_list<ircd::cl::work>::allocator
|
|
{};
|
|
|
|
template<>
|
|
decltype(ircd::util::instance_list<ircd::cl::work>::list)
|
|
ircd::util::instance_list<ircd::cl::work>::list
|
|
{
|
|
allocator
|
|
};
|
|
|
|
decltype(ircd::cl::exec::opts_default)
|
|
ircd::cl::exec::opts_default;
|
|
|
|
ircd::cl::exec::exec(const opts &opts)
|
|
try
|
|
{
|
|
auto &q
|
|
{
|
|
queue[0][0]
|
|
};
|
|
|
|
const auto deps
|
|
{
|
|
make_deps(this, opts)
|
|
};
|
|
|
|
call
|
|
(
|
|
clEnqueueBarrierWithWaitList,
|
|
q,
|
|
deps.size(),
|
|
deps.size()? deps.data(): nullptr,
|
|
reinterpret_cast<cl_event *>(&this->handle)
|
|
);
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
log::error
|
|
{
|
|
log, "Exec Barrier :%s",
|
|
e.what(),
|
|
};
|
|
|
|
throw;
|
|
}
|
|
|
|
ircd::cl::exec::exec(kern &kern,
|
|
const kern::range &work,
|
|
const opts &opts)
|
|
try
|
|
{
|
|
size_t dim(0);
|
|
for(size_t i(0); i < work.global.size(); ++i)
|
|
dim += work.global[i] > 0;
|
|
|
|
auto &q
|
|
{
|
|
queue[0][0]
|
|
};
|
|
|
|
const auto deps
|
|
{
|
|
make_deps(this, opts)
|
|
};
|
|
|
|
call
|
|
(
|
|
clEnqueueNDRangeKernel,
|
|
q,
|
|
cl_kernel(kern.handle),
|
|
dim,
|
|
work.offset.data(),
|
|
work.global.data(),
|
|
work.local.data(),
|
|
deps.size(),
|
|
deps.size()? deps.data(): nullptr,
|
|
reinterpret_cast<cl_event *>(&this->handle)
|
|
);
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
log::error
|
|
{
|
|
log, "Exec Kern :%s",
|
|
e.what(),
|
|
};
|
|
|
|
throw;
|
|
}
|
|
|
|
ircd::cl::exec::exec(data &data,
|
|
const mutable_buffer &buf,
|
|
const opts &opts)
|
|
try
|
|
{
|
|
auto &q
|
|
{
|
|
queue[0][0]
|
|
};
|
|
|
|
const auto deps
|
|
{
|
|
make_deps(this, opts)
|
|
};
|
|
|
|
call
|
|
(
|
|
clEnqueueReadBuffer,
|
|
q,
|
|
cl_mem(data.handle),
|
|
opts.blocking,
|
|
0UL, //offset,
|
|
ircd::size(buf),
|
|
ircd::data(buf),
|
|
deps.size(),
|
|
deps.size()? deps.data(): nullptr,
|
|
reinterpret_cast<cl_event *>(&this->handle)
|
|
);
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
log::error
|
|
{
|
|
log, "Exec Read :%s",
|
|
e.what(),
|
|
};
|
|
|
|
throw;
|
|
}
|
|
|
|
ircd::cl::exec::exec(data &data,
|
|
const const_buffer &buf,
|
|
const opts &opts)
|
|
try
|
|
{
|
|
auto &q
|
|
{
|
|
queue[0][0]
|
|
};
|
|
|
|
const auto deps
|
|
{
|
|
make_deps(this, opts)
|
|
};
|
|
|
|
call
|
|
(
|
|
clEnqueueWriteBuffer,
|
|
q,
|
|
cl_mem(data.handle),
|
|
opts.blocking,
|
|
0UL, //offset,
|
|
ircd::size(buf),
|
|
mutable_cast(ircd::data(buf)),
|
|
deps.size(),
|
|
deps.size()? deps.data(): nullptr,
|
|
reinterpret_cast<cl_event *>(&this->handle)
|
|
);
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
log::error
|
|
{
|
|
log, "Exec Write :%s",
|
|
e.what(),
|
|
};
|
|
|
|
throw;
|
|
}
|
|
|
|
ircd::vector_view<cl_event>
|
|
ircd::cl::make_deps(cl::work *const &work,
|
|
const exec::opts &opts)
|
|
{
|
|
if(empty(opts.deps))
|
|
return make_deps_default(work, opts);
|
|
|
|
size_t ret(0);
|
|
vector_view<cl_event> out(_deps_list);
|
|
for(auto &exec : opts.deps)
|
|
out.at(ret++) = cl_event(exec.handle);
|
|
|
|
return vector_view<cl_event>
|
|
{
|
|
out, ret
|
|
};
|
|
}
|
|
|
|
ircd::vector_view<cl_event>
|
|
ircd::cl::make_deps_default(cl::work *const &work,
|
|
const exec::opts &opts)
|
|
{
|
|
size_t ret(0);
|
|
vector_view<cl_event> out(_deps_list);
|
|
for(auto it(rbegin(cl::work::list)); it != rend(cl::work::list); ++it)
|
|
{
|
|
cl::work *const &other{*it};
|
|
if(other == work)
|
|
continue;
|
|
|
|
if(other->context != ctx::current)
|
|
continue;
|
|
|
|
out.at(ret++) = cl_event(other->handle);
|
|
break;
|
|
}
|
|
|
|
return vector_view<cl_event>
|
|
{
|
|
out, ret
|
|
};
|
|
}
|
|
|
|
//
|
|
// kern
|
|
//
|
|
|
|
ircd::cl::kern::kern(code &code,
|
|
const string_view &name)
|
|
try
|
|
{
|
|
int err {CL_SUCCESS};
|
|
handle = clCreateKernel(cl_program(code.handle), name.c_str(), &err);
|
|
throw_on_error(err);
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
log::error
|
|
{
|
|
log, "Kernel Create '%s' :%s",
|
|
name,
|
|
e.what(),
|
|
};
|
|
|
|
throw;
|
|
}
|
|
|
|
ircd::cl::kern::kern(kern &&o)
|
|
noexcept
|
|
:handle{std::move(o.handle)}
|
|
{
|
|
o.handle = nullptr;
|
|
}
|
|
|
|
ircd::cl::kern &
|
|
ircd::cl::kern::operator=(kern &&o)
|
|
noexcept
|
|
{
|
|
this->~kern();
|
|
handle = std::move(o.handle);
|
|
o.handle = nullptr;
|
|
return *this;
|
|
}
|
|
|
|
ircd::cl::kern::~kern()
|
|
noexcept try
|
|
{
|
|
call(clReleaseKernel, cl_kernel(handle));
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
log::critical
|
|
{
|
|
log, "Kernel Release :%s",
|
|
e.what(),
|
|
};
|
|
|
|
return;
|
|
}
|
|
|
|
void
|
|
ircd::cl::kern::arg(const int i,
|
|
data &data)
|
|
{
|
|
const auto &data_handle
|
|
{
|
|
cl_mem(data.handle)
|
|
};
|
|
|
|
call(clSetKernelArg, cl_kernel(handle), i, sizeof(cl_mem), &data_handle);
|
|
}
|
|
|
|
//
|
|
// code
|
|
//
|
|
|
|
ircd::cl::code::code(const string_view &src,
|
|
const string_view &build_opts)
|
|
:code
|
|
{
|
|
vector_view<const string_view>(&src, 1),
|
|
build_opts
|
|
}
|
|
{
|
|
}
|
|
|
|
ircd::cl::code::code(const vector_view<const string_view> &srcs,
|
|
const string_view &build_opts)
|
|
{
|
|
static const size_t iov_max
|
|
{
|
|
64 //TODO: ???
|
|
};
|
|
|
|
if(unlikely(srcs.size() > iov_max))
|
|
throw error
|
|
{
|
|
"Maximum number of sources exceeded: lim:%zu got:%zu",
|
|
iov_max,
|
|
srcs.size(),
|
|
};
|
|
|
|
const size_t count
|
|
{
|
|
std::min(srcs.size(), iov_max)
|
|
};
|
|
|
|
size_t len[count];
|
|
const char *src[count];
|
|
for(size_t i(0); i < count; ++i)
|
|
src[i] = ircd::data(srcs[i]),
|
|
len[i] = ircd::size(srcs[i]);
|
|
|
|
int err {CL_SUCCESS};
|
|
handle = clCreateProgramWithSource(primary, count, src, len, &err);
|
|
throw_on_error(err);
|
|
|
|
if(!null(build_opts))
|
|
build(build_opts);
|
|
}
|
|
|
|
ircd::cl::code::code(code &&o)
|
|
noexcept
|
|
:handle{std::move(o.handle)}
|
|
{
|
|
o.handle = nullptr;
|
|
}
|
|
|
|
ircd::cl::code &
|
|
ircd::cl::code::operator=(code &&o)
|
|
noexcept
|
|
{
|
|
this->~code();
|
|
handle = std::move(o.handle);
|
|
o.handle = nullptr;
|
|
return *this;
|
|
}
|
|
|
|
ircd::cl::code::~code()
|
|
noexcept try
|
|
{
|
|
call(clReleaseProgram, cl_program(handle));
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
log::critical
|
|
{
|
|
log, "Program Release :%s",
|
|
e.what(),
|
|
};
|
|
|
|
return;
|
|
}
|
|
|
|
namespace ircd::cl
|
|
{
|
|
static void
|
|
handle_built(cl_program program, void *priv)
|
|
{
|
|
ircd::always_assert(false);
|
|
}
|
|
}
|
|
|
|
void
|
|
ircd::cl::code::build(const string_view &opts)
|
|
try
|
|
{
|
|
const uint num_devices {0};
|
|
const cl_device_id *const device_list {nullptr};
|
|
call
|
|
(
|
|
clBuildProgram,
|
|
cl_program(handle),
|
|
num_devices,
|
|
device_list,
|
|
opts.c_str(),
|
|
&cl::handle_built,
|
|
nullptr
|
|
);
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
const auto error_closure{[this]
|
|
(const mutable_buffer &buf)
|
|
{
|
|
size_t len {0}; call
|
|
(
|
|
clGetProgramBuildInfo,
|
|
cl_program(this->handle),
|
|
device[0][0],
|
|
CL_PROGRAM_BUILD_LOG,
|
|
ircd::size(buf),
|
|
ircd::data(buf),
|
|
&len
|
|
);
|
|
|
|
return len;
|
|
}};
|
|
|
|
const auto error_message
|
|
{
|
|
ircd::string(8_KiB | SHRINK_TO_FIT, error_closure)
|
|
};
|
|
|
|
ircd::tokens(error_message, '\n', []
|
|
(const string_view &line)
|
|
{
|
|
// note last line is just a CR
|
|
if(likely(line.size() > 1))
|
|
log::logf
|
|
{
|
|
log, log::DERROR, "%s", line,
|
|
};
|
|
});
|
|
|
|
throw;
|
|
}
|
|
|
|
//
|
|
// data
|
|
//
|
|
|
|
ircd::cl::data::data(const size_t size_,
|
|
const mutable_buffer &buf,
|
|
const bool wonly)
|
|
{
|
|
const auto ptr
|
|
{
|
|
ircd::size(buf)? ircd::data(buf): nullptr
|
|
};
|
|
|
|
const auto size
|
|
{
|
|
ircd::size(buf)?: size_
|
|
};
|
|
|
|
cl_mem_flags flags {0};
|
|
flags |= CL_MEM_READ_WRITE;
|
|
flags |= wonly? CL_MEM_WRITE_ONLY: 0;
|
|
flags |= ircd::size(buf)? CL_MEM_COPY_HOST_PTR: 0;
|
|
|
|
int err {CL_SUCCESS};
|
|
handle = clCreateBuffer(primary, flags, size, ptr, &err);
|
|
throw_on_error(err);
|
|
}
|
|
|
|
ircd::cl::data::data(const size_t size_,
|
|
const const_buffer &buf)
|
|
{
|
|
const auto ptr
|
|
{
|
|
ircd::size(buf)? ircd::data(buf): nullptr
|
|
};
|
|
|
|
const auto size
|
|
{
|
|
ircd::size(buf)?: size_
|
|
};
|
|
|
|
cl_mem_flags flags {0};
|
|
flags |= CL_MEM_READ_ONLY;
|
|
flags |= ircd::size(buf)? CL_MEM_COPY_HOST_PTR: 0;
|
|
|
|
int err {CL_SUCCESS};
|
|
handle = clCreateBuffer(primary, flags, size, mutable_cast(ptr), &err);
|
|
throw_on_error(err);
|
|
}
|
|
|
|
ircd::cl::data::data(const mutable_buffer &buf,
|
|
const bool wonly)
|
|
{
|
|
cl_mem_flags flags {0};
|
|
flags |= CL_MEM_USE_HOST_PTR;
|
|
flags |= wonly? CL_MEM_WRITE_ONLY: CL_MEM_READ_WRITE;
|
|
|
|
int err {CL_SUCCESS};
|
|
handle = clCreateBuffer(primary, flags, ircd::size(buf), ircd::data(buf), &err);
|
|
throw_on_error(err);
|
|
}
|
|
|
|
ircd::cl::data::data(const const_buffer &buf)
|
|
{
|
|
cl_mem_flags flags {0};
|
|
flags |= CL_MEM_USE_HOST_PTR;
|
|
flags |= CL_MEM_READ_ONLY;
|
|
|
|
int err {CL_SUCCESS};
|
|
handle = clCreateBuffer(primary, flags, ircd::size(buf), mutable_cast(ircd::data(buf)), &err);
|
|
throw_on_error(err);
|
|
}
|
|
|
|
ircd::cl::data::data(data &&o)
|
|
noexcept
|
|
:handle{std::move(o.handle)}
|
|
{
|
|
o.handle = nullptr;
|
|
}
|
|
|
|
ircd::cl::data &
|
|
ircd::cl::data::operator=(data &&o)
|
|
noexcept
|
|
{
|
|
this->~data();
|
|
handle = std::move(o.handle);
|
|
o.handle = nullptr;
|
|
return *this;
|
|
}
|
|
|
|
ircd::cl::data::~data()
|
|
noexcept try
|
|
{
|
|
call(clReleaseMemObject, cl_mem(handle));
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
log::critical
|
|
{
|
|
log, "Memory Release :%s",
|
|
e.what(),
|
|
};
|
|
|
|
return;
|
|
}
|
|
|
|
//
|
|
// cl::work (event)
|
|
//
|
|
|
|
namespace ircd::cl
|
|
{
|
|
struct alignas(16) pkg
|
|
{
|
|
cl_event event {nullptr};
|
|
ctx::ctx *ctx {nullptr};
|
|
}
|
|
__packed;
|
|
|
|
static std::unique_ptr<ctx::context> handle_context;
|
|
static std::mutex offload_mutex alignas(64);
|
|
static std::condition_variable offload_cond alignas(64);
|
|
static std::atomic<bool> offload_term alignas(64);
|
|
static std::atomic<pkg> offload_pkg alignas(64);
|
|
|
|
static void handle_event(cl_event, cl_int, void *) noexcept;
|
|
static void handle_incomplete(work &, const int &);
|
|
static void handle_offload();
|
|
static void handle_worker();
|
|
}
|
|
|
|
void
|
|
ircd::cl::work::init()
|
|
{
|
|
assert(!handle_context);
|
|
handle_context = std::make_unique<ctx::context>
|
|
(
|
|
"cl.work", handle_worker
|
|
);
|
|
}
|
|
|
|
void
|
|
ircd::cl::work::fini()
|
|
noexcept
|
|
{
|
|
cl::sync();
|
|
if(!handle_context)
|
|
return;
|
|
|
|
offload_term.store(true, std::memory_order_release);
|
|
offload_cond.notify_all();
|
|
handle_context->terminate();
|
|
handle_context->join();
|
|
handle_context.reset(nullptr);
|
|
}
|
|
|
|
void
|
|
ircd::cl::handle_worker()
|
|
{
|
|
ctx::ole::opts opts;
|
|
opts.name = "cl.work"_sv;
|
|
ctx::offload
|
|
{
|
|
opts, []
|
|
{
|
|
while(!offload_term.load(std::memory_order_relaxed))
|
|
handle_offload();
|
|
}
|
|
};
|
|
}
|
|
|
|
void
|
|
ircd::cl::handle_offload()
|
|
{
|
|
// This is a pre-locked mutex used to achieve "naked wakeup" with only the
|
|
// standard API's and contractual assumptions and atomics between threads.
|
|
static std::unique_lock lock{offload_mutex};
|
|
|
|
// Acquire the work assignment pkg. The mutex/lock does nothing here
|
|
// because the main thread never takes it (nor does anyone else). Instead
|
|
// the main thread (notifier) guarantees if our result is not visible, a
|
|
// notify is to follow, therefor it's always safe for this thread to go to
|
|
// sleep if the condition is not witnessed as satisfied.
|
|
pkg p;
|
|
bool terminated {false};
|
|
offload_cond.wait(lock, [&p, &terminated]
|
|
{
|
|
// Also check for termination as another wakeup condition.
|
|
terminated = offload_term.load(std::memory_order_relaxed);
|
|
p = offload_pkg.load(std::memory_order_acquire);
|
|
return terminated | (p.event != nullptr);
|
|
});
|
|
|
|
if(unlikely(!p.event))
|
|
{
|
|
assert(terminated);
|
|
return;
|
|
}
|
|
|
|
const auto res
|
|
{
|
|
clSetEventCallback(p.event, CL_COMPLETE, &handle_event, p.ctx)
|
|
};
|
|
|
|
if(unlikely(res != CL_SUCCESS))
|
|
ircd::terminate
|
|
{
|
|
"clSetEventCallback(%p) context:%p unexpected error #%d.",
|
|
p.event,
|
|
p.ctx,
|
|
res,
|
|
};
|
|
}
|
|
|
|
void
|
|
ircd::cl::handle_event(cl_event event,
|
|
cl_int status,
|
|
void *const priv)
|
|
noexcept
|
|
{
|
|
assert(status == CL_COMPLETE);
|
|
|
|
// Prepare response to the main thread. The event pointer is anulled to
|
|
// indicate completion.
|
|
pkg ours;
|
|
ours.event = nullptr;
|
|
ours.ctx = reinterpret_cast<ctx::ctx *>(priv);
|
|
|
|
// Expect the main thread to not have changed anything in the pacakge
|
|
// during our operation. The context should be nicely suspended this whole
|
|
// time. In case the context anulls the package to abandon the result this
|
|
// transaction will fail and we'll do nothing further from here.
|
|
//
|
|
// Note at this time abandonment is not supported since we're missing
|
|
// another phase indicating a commitment by this thread to notify. Without
|
|
// any obligation by the main thread to such a commitment, the *ctx may
|
|
// be invalid by the time this thread notifies.
|
|
pkg theirs;
|
|
theirs.ctx = ours.ctx;
|
|
theirs.event = event;
|
|
|
|
constexpr auto
|
|
success(std::memory_order_acq_rel),
|
|
failure(std::memory_order_relaxed);
|
|
if(unlikely(!offload_pkg.compare_exchange_strong(theirs, ours, success, failure)))
|
|
return;
|
|
|
|
if(unlikely(!theirs.ctx))
|
|
return;
|
|
|
|
// Wakeup the ctx for the result. This is a special notify meant to
|
|
// originate from external threads.
|
|
ctx::notify(*theirs.ctx, ctx::threadsafe);
|
|
}
|
|
|
|
void
|
|
ircd::cl::handle_incomplete(work &work,
|
|
const int &status)
|
|
{
|
|
// Expose the datagram to the other thread. On supported architectures
|
|
// we can pass 16 bytes atomically, or two pointers. One for the event
|
|
// (input) and the other for the context to notify (output).
|
|
pkg theirs, ours;
|
|
ours.event = cl_event(work.handle);
|
|
ours.ctx = ctx::current;
|
|
|
|
constexpr auto
|
|
success(std::memory_order_acq_rel),
|
|
failure(std::memory_order_relaxed);
|
|
if(unlikely(!offload_pkg.compare_exchange_strong(theirs, ours, success, failure)))
|
|
{
|
|
always_assert(false); // Conflict from two ircd::ctx.
|
|
return;
|
|
}
|
|
|
|
// When finished here the package is cleared immediately without condition.
|
|
// It's probably unsafe to clear the package with zero cooperation from the
|
|
// other thread, so we disable interrupts here for now; the other thread
|
|
// must respond.
|
|
const ctx::uninterruptible::nothrow ui;
|
|
const unwind unset{[]
|
|
{
|
|
offload_pkg.store(pkg{}, std::memory_order_release);
|
|
}};
|
|
|
|
// Send a "naked wakeup" to the other thread because we're not taking a
|
|
// lock on this condition variable for the notify. To accomplish this we
|
|
// fulfill two obligations on this side:
|
|
// 1. Ensure every package is always followed by a notify after it.
|
|
// 2. Ensure notify is followed by same release-semantics as an unlock().
|
|
offload_cond.notify_one();
|
|
std::atomic_thread_fence(std::memory_order_release); do
|
|
{
|
|
// Unconditionally suspend this context. The other thread will have
|
|
// to respond with at least one notification preceded by a result.
|
|
ctx::wait();
|
|
theirs = offload_pkg.load(std::memory_order_acquire);
|
|
}
|
|
while(theirs.event != nullptr);
|
|
assert(theirs.ctx == ctx::current);
|
|
}
|
|
|
|
//
|
|
// work::work
|
|
//
|
|
|
|
ircd::cl::work::work(void *const &handle)
|
|
{
|
|
call(clRetainEvent, cl_event(handle));
|
|
this->handle = handle;
|
|
}
|
|
|
|
ircd::cl::work::~work()
|
|
noexcept try
|
|
{
|
|
if(!this->handle)
|
|
return;
|
|
|
|
const auto handle
|
|
{
|
|
cl_event(this->handle)
|
|
};
|
|
|
|
char status_buf[8] {0};
|
|
const auto &status
|
|
{
|
|
info<int>(clGetEventInfo, handle, CL_EVENT_COMMAND_EXECUTION_STATUS, status_buf)
|
|
};
|
|
|
|
if(status != CL_COMPLETE)
|
|
handle_incomplete(*this, status);
|
|
|
|
call(clReleaseEvent, cl_event(handle));
|
|
}
|
|
catch(const std::exception &e)
|
|
{
|
|
log::critical
|
|
{
|
|
log, "Work Release :%s",
|
|
e.what(),
|
|
};
|
|
|
|
return;
|
|
}
|
|
|
|
std::array<uint64_t, 4>
|
|
ircd::cl::work::profile()
|
|
const
|
|
{
|
|
const auto handle
|
|
{
|
|
cl_event(this->handle)
|
|
};
|
|
|
|
char buf[4][8];
|
|
return std::array<uint64_t, 4>
|
|
{
|
|
info<size_t>(clGetEventProfilingInfo, handle, CL_PROFILING_COMMAND_QUEUED, buf[0]),
|
|
info<size_t>(clGetEventProfilingInfo, handle, CL_PROFILING_COMMAND_SUBMIT, buf[1]),
|
|
info<size_t>(clGetEventProfilingInfo, handle, CL_PROFILING_COMMAND_START, buf[2]),
|
|
info<size_t>(clGetEventProfilingInfo, handle, CL_PROFILING_COMMAND_END, buf[3]),
|
|
};
|
|
}
|
|
|
|
//
|
|
// callback surface
|
|
//
|
|
|
|
void
|
|
ircd::cl::handle_notify(const char *errstr,
|
|
const void *token,
|
|
size_t cb,
|
|
void *priv)
|
|
noexcept
|
|
{
|
|
if(errstr)
|
|
log::error
|
|
{
|
|
log, "OpenCL t:%p cb:%zu :%s",
|
|
token,
|
|
cb,
|
|
errstr,
|
|
};
|
|
}
|
|
|
|
//
|
|
// util
|
|
//
|
|
|
|
template<class T,
|
|
class F,
|
|
class id,
|
|
class param>
|
|
T
|
|
ircd::cl::info(F&& func,
|
|
const id &i,
|
|
const param &p,
|
|
const mutable_buffer &out)
|
|
{
|
|
using ircd::data;
|
|
using ircd::size;
|
|
|
|
size_t len {0};
|
|
call(std::forward<F>(func), i, p, size(out), data(out), &len);
|
|
const string_view str
|
|
{
|
|
data(out), len
|
|
};
|
|
|
|
return byte_view<T>(str);
|
|
}
|
|
|
|
template<class func,
|
|
class... args>
|
|
int
|
|
ircd::cl::call(func&& f,
|
|
args&&... a)
|
|
{
|
|
const int ret
|
|
{
|
|
f(std::forward<args>(a)...)
|
|
};
|
|
|
|
return throw_on_error(ret);
|
|
}
|
|
|
|
int
|
|
ircd::cl::throw_on_error(const int &code)
|
|
{
|
|
if(unlikely(is_error(code)))
|
|
throw opencl_error
|
|
{
|
|
"(#%d) :%s",
|
|
code,
|
|
reflect_error(code),
|
|
};
|
|
|
|
return code;
|
|
}
|
|
|
|
bool
|
|
ircd::cl::is_error(const int &code)
|
|
noexcept
|
|
{
|
|
return code < 0;
|
|
}
|
|
|
|
ircd::string_view
|
|
ircd::cl::reflect_error(const int code)
|
|
noexcept
|
|
{
|
|
switch(code)
|
|
{
|
|
case CL_SUCCESS: return "SUCCESS";
|
|
case CL_DEVICE_NOT_FOUND: return "DEVICE_NOT_FOUND";
|
|
case CL_DEVICE_NOT_AVAILABLE: return "DEVICE_NOT_AVAILABLE";
|
|
case CL_COMPILER_NOT_AVAILABLE: return "COMPILER_NOT_AVAILABLE";
|
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "MEM_OBJECT_ALLOCATION_FAILURE";
|
|
case CL_OUT_OF_RESOURCES: return "OUT_OF_RESOURCES";
|
|
case CL_OUT_OF_HOST_MEMORY: return "OUT_OF_HOST_MEMORY";
|
|
case CL_PROFILING_INFO_NOT_AVAILABLE: return "PROFILING_INFO_NOT_AVAILABLE";
|
|
case CL_MEM_COPY_OVERLAP: return "MEM_COPY_OVERLAP";
|
|
case CL_IMAGE_FORMAT_MISMATCH: return "IMAGE_FORMAT_MISMATCH";
|
|
case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "IMAGE_FORMAT_NOT_SUPPORTED";
|
|
case CL_BUILD_PROGRAM_FAILURE: return "BUILD_PROGRAM_FAILURE";
|
|
case CL_MAP_FAILURE: return "MAP_FAILURE";
|
|
case CL_INVALID_VALUE: return "INVALID_VALUE";
|
|
case CL_INVALID_DEVICE_TYPE: return "INVALID_DEVICE_TYPE";
|
|
case CL_INVALID_PLATFORM: return "INVALID_PLATFORM";
|
|
case CL_INVALID_DEVICE: return "INVALID_DEVICE";
|
|
case CL_INVALID_CONTEXT: return "INVALID_CONTEXT";
|
|
case CL_INVALID_QUEUE_PROPERTIES: return "INVALID_QUEUE_PROPERTIES";
|
|
case CL_INVALID_COMMAND_QUEUE: return "INVALID_COMMAND_QUEUE";
|
|
case CL_INVALID_HOST_PTR: return "INVALID_HOST_PTR";
|
|
case CL_INVALID_MEM_OBJECT: return "INVALID_MEM_OBJECT";
|
|
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "INVALID_IMAGE_FORMAT_DESCRIPTOR";
|
|
case CL_INVALID_IMAGE_SIZE: return "INVALID_IMAGE_SIZE";
|
|
case CL_INVALID_SAMPLER: return "INVALID_SAMPLER";
|
|
case CL_INVALID_BINARY: return "INVALID_BINARY";
|
|
case CL_INVALID_BUILD_OPTIONS: return "INVALID_BUILD_OPTIONS";
|
|
case CL_INVALID_PROGRAM: return "INVALID_PROGRAM";
|
|
case CL_INVALID_PROGRAM_EXECUTABLE: return "INVALID_PROGRAM_EXECUTABLE";
|
|
case CL_INVALID_KERNEL_NAME: return "INVALID_KERNEL_NAME";
|
|
case CL_INVALID_KERNEL_DEFINITION: return "INVALID_KERNEL_DEFINITION";
|
|
case CL_INVALID_KERNEL: return "INVALID_KERNEL";
|
|
case CL_INVALID_ARG_INDEX: return "INVALID_ARG_INDEX";
|
|
case CL_INVALID_ARG_VALUE: return "INVALID_ARG_VALUE";
|
|
case CL_INVALID_ARG_SIZE: return "INVALID_ARG_SIZE";
|
|
case CL_INVALID_KERNEL_ARGS: return "INVALID_KERNEL_ARGS";
|
|
case CL_INVALID_WORK_DIMENSION: return "INVALID_WORK_DIMENSION";
|
|
case CL_INVALID_WORK_GROUP_SIZE: return "INVALID_WORK_GROUP_SIZE";
|
|
case CL_INVALID_WORK_ITEM_SIZE: return "INVALID_WORK_ITEM_SIZE";
|
|
case CL_INVALID_GLOBAL_OFFSET: return "INVALID_GLOBAL_OFFSET";
|
|
case CL_INVALID_EVENT_WAIT_LIST: return "INVALID_EVENT_WAIT_LIST";
|
|
case CL_INVALID_EVENT: return "INVALID_EVENT";
|
|
case CL_INVALID_OPERATION: return "INVALID_OPERATION";
|
|
case CL_INVALID_GL_OBJECT: return "INVALID_GL_OBJECT";
|
|
case CL_INVALID_BUFFER_SIZE: return "INVALID_BUFFER_SIZE";
|
|
case CL_INVALID_MIP_LEVEL: return "INVALID_MIP_LEVEL";
|
|
case CL_INVALID_GLOBAL_WORK_SIZE: return "INVALID_GLOBAL_WORK_SIZE";
|
|
|
|
#ifdef CL_VERSION_1_1
|
|
case CL_INVALID_PROPERTY: return "INVALID_PROPERTY";
|
|
case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "MISALIGNED_SUB_BUFFER_OFFSET";
|
|
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
|
|
#endif
|
|
|
|
#ifdef CL_VERSION_1_2
|
|
case CL_COMPILE_PROGRAM_FAILURE: return "COMPILE_PROGRAM_FAILURE";
|
|
case CL_LINKER_NOT_AVAILABLE: return "LINKER_NOT_AVAILABLE";
|
|
case CL_LINK_PROGRAM_FAILURE: return "LINK_PROGRAM_FAILURE";
|
|
case CL_DEVICE_PARTITION_FAILED: return "DEVICE_PARTITION_FAILED";
|
|
case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "KERNEL_ARG_INFO_NOT_AVAILABLE";
|
|
case CL_INVALID_IMAGE_DESCRIPTOR: return "INVALID_IMAGE_DESCRIPTOR";
|
|
case CL_INVALID_COMPILER_OPTIONS: return "INVALID_COMPILER_OPTIONS";
|
|
case CL_INVALID_LINKER_OPTIONS: return "INVALID_LINKER_OPTIONS";
|
|
case CL_INVALID_DEVICE_PARTITION_COUNT: return "INVALID_DEVICE_PARTITION_COUNT";
|
|
#endif
|
|
|
|
#ifdef CL_VERSION_2_0
|
|
case CL_INVALID_PIPE_SIZE: return "INVALID_PIPE_SIZE";
|
|
case CL_INVALID_DEVICE_QUEUE: return "INVALID_DEVICE_QUEUE";
|
|
#endif
|
|
|
|
#ifdef CL_VERSION_2_2
|
|
case CL_INVALID_SPEC_ID: return "INVALID_SPEC_ID";
|
|
case CL_MAX_SIZE_RESTRICTION_EXCEEDED: return "MAX_SIZE_RESTRICTION_EXCEEDED";
|
|
#endif
|
|
}
|
|
|
|
return "???????";
|
|
}
|