construct/ircd/cl.cc

3063 lines
60 KiB
C++

// The Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#include <dlfcn.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>
// Util
namespace ircd::cl
{
static bool is_error(const int &code) noexcept;
static int throw_on_error(const int &code);
template<int maybe = CL_SUCCESS, class func, class... args> static int call(func&&, args&&...);
template<class T = string_view, int maybe = CL_SUCCESS, class F, class id, class param> static T info(F&&, const id &, const param &, const mutable_buffer &, T default_ = {});
template<class T = string_view, int maybe = CL_SUCCESS, class F, class id0, class id1, class param> static T info(F&&, const id0 &, const id1 &, const param &, const mutable_buffer &, T default_ = {});
static string_view version_str(const mutable_buffer &, const cl_version);
static uint query_warp_size_amd(cl_device_id);
static uint query_warp_size(cl_context, cl_device_id);
}
// Runtime state
namespace ircd::cl
{
static const size_t
OPTION_MAX {8},
PLATFORM_MAX {8},
DEVICE_MAX {8};
static uint
options,
platforms,
devices[PLATFORM_MAX];
static char
option[OPTION_MAX][256];
static void
*linkage;
static cl_platform_id
platform[PLATFORM_MAX];
static cl_device_id
device[PLATFORM_MAX][DEVICE_MAX];
static struct version
{
int major, minor;
}
api[PLATFORM_MAX][DEVICE_MAX];
static uint
warp_size[PLATFORM_MAX][DEVICE_MAX];
static cl_context
primary;
extern struct stats
primary_stats;
static cl_command_queue
queue[PLATFORM_MAX][DEVICE_MAX];
static void handle_notify(const char *, const void *, size_t, void *) noexcept;
}
struct ircd::cl::stats
{
template<class T>
using item = ircd::stats::item<T>;
item<uint64_t>
sync_count,
flush_count,
alloc_count,
alloc_bytes,
dealloc_count,
dealloc_bytes,
work_waits,
work_waits_async,
work_errors,
work_completes,
exec_tasks,
exec_kern_tasks,
exec_kern_threads,
exec_kern_groups,
exec_write_tasks,
exec_write_bytes,
exec_read_tasks,
exec_read_bytes,
exec_copy_tasks,
exec_copy_bytes,
exec_barrier_tasks;
};
decltype(ircd::cl::log)
ircd::cl::log
{
"cl"
};
decltype(ircd::cl::version_api)
ircd::cl::version_api
{
"OpenCL", info::versions::API, CL_TARGET_OPENCL_VERSION,
{
#if defined(CL_VERSION_MAJOR) && defined(CL_VERSION_MINOR) && defined(CL_VERSION_PATCH)
CL_VERSION_MAJOR(CL_TARGET_OPENCL_VERSION),
CL_VERSION_MINOR(CL_TARGET_OPENCL_VERSION),
CL_VERSION_PATCH(CL_TARGET_OPENCL_VERSION),
#endif
}
};
decltype(ircd::cl::version_abi)
ircd::cl::version_abi
{
"OpenCL", info::versions::ABI
};
decltype(ircd::cl::enable)
ircd::cl::enable
{
{ "name", "ircd.cl.enable" },
{ "default", true },
{ "persist", false },
};
decltype(ircd::cl::profile_queue)
ircd::cl::profile_queue
{
{ "name", "ircd.cl.queue.profile" },
{ "default", false },
{ "persist", false },
};
decltype(ircd::cl::device_queue)
ircd::cl::device_queue
{
{ "name", "ircd.cl.queue.device" },
{ "default", true },
{ "persist", false },
};
decltype(ircd::cl::queue_size)
ircd::cl::queue_size
{
{ "name", "ircd.cl.queue.size" },
{ "default", long(1_MiB) },
{ "persist", false },
};
decltype(ircd::cl::intensity)
ircd::cl::intensity
{
{ "name", "ircd.cl.intensity" },
{ "default", 0L },
};
decltype(ircd::cl::watchdog_tsc)
ircd::cl::watchdog_tsc
{
{ "name", "ircd.cl.watchdog.tsc" },
{ "default", 268'435'456L },
};
decltype(ircd::cl::path)
ircd::cl::path
{
{ "name", "ircd.cl.path" },
{ "default", "libOpenCL.so" },
{ "persist", false },
};
decltype(ircd::cl::envs)
ircd::cl::envs
{
{
{ "name", "LP_NUM_THREADS" },
{ "default", "0" },
},
{
{ "name", "MESA_NO_MINMAX_CACHE" },
{ "default", "true" },
},
{
{ "name", "MESA_SHADER_CACHE_DISABLE" },
{ "default", "true" },
},
{
{ "name", "AMD_DEBUG" },
{ "default", "nogfx,reserve_vmid" },
},
{
{ "name", "R600_DEBUG" },
{ "default", "forcedma" },
},
{
{ "name", "RADEON_THREAD" },
{ "default", "false" },
},
};
decltype(ircd::cl::primary_stats)
ircd::cl::primary_stats
{
{ { "name", "ircd.cl.sync.count" } },
{ { "name", "ircd.cl.flush.count" } },
{ { "name", "ircd.cl.alloc.count" } },
{ { "name", "ircd.cl.alloc.bytes" } },
{ { "name", "ircd.cl.dealloc.count" } },
{ { "name", "ircd.cl.dealloc.bytes" } },
{ { "name", "ircd.cl.work.waits" } },
{ { "name", "ircd.cl.work.waits.async" } },
{ { "name", "ircd.cl.work.errors" } },
{ { "name", "ircd.cl.work.completes" } },
{ { "name", "ircd.cl.exec.tasks" } },
{ { "name", "ircd.cl.exec.kern.tasks" } },
{ { "name", "ircd.cl.exec.kern.threads" } },
{ { "name", "ircd.cl.exec.kern.groups" } },
{ { "name", "ircd.cl.exec.write.tasks" } },
{ { "name", "ircd.cl.exec.write.bytes" } },
{ { "name", "ircd.cl.exec.read.tasks" } },
{ { "name", "ircd.cl.exec.read.bytes" } },
{ { "name", "ircd.cl.exec.copy.tasks" } },
{ { "name", "ircd.cl.exec.copy.bytes" } },
{ { "name", "ircd.cl.exec.barrier.tasks" } },
};
//
// init
//
ircd::cl::init::init()
{
if(!enable)
{
log::dwarning
{
log, "OpenCL hardware acceleration is not available or enabled."
};
return;
}
const ctx::posix::enable_pthread enable_pthread;
// Link the libraries.
if(!init_libs())
{
log::warning
{
log, "OpenCL hardware acceleration runtime failed to link."
};
return;
}
// Get the platforms.
if(!init_platforms())
{
log::warning
{
log, "OpenCL hardware acceleration platform not found."
};
return;
}
// Report the platforms.
log_platform_info();
// Get the devices.
if(!init_devices())
{
log::warning
{
log, "OpenCL hardware acceleration device not found."
};
return;
}
// Various other inits.
init_ctxs();
// Report the devices.
log_dev_info();
}
[[gnu::cold]]
ircd::cl::init::~init()
noexcept
{
if(!linkage)
return;
log::debug
{
log, "Shutting down OpenCL...",
};
const ctx::posix::enable_pthread enable_pthread;
fini_ctxs();
fini_libs();
}
bool
ircd::cl::init::init_libs()
{
const string_view path
{
cl::path
};
if(path.empty())
return false;
// Setup options
for(const auto &item : envs)
{
assert(options < OPTION_MAX);
fmt::sprintf
{
option[options], "%s=%s",
item.name,
string_view{item},
};
sys::call(putenv, option[options++]);
}
// Load the pipe.
assert(!linkage);
if(!(linkage = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL)))
return false;
return true;
}
void
ircd::cl::init::fini_libs()
{
if(likely(linkage))
dlclose(linkage);
}
size_t
ircd::cl::init::init_platforms()
try
{
// OpenCL sez platform=null is implementation defined.
constexpr auto ignore(CL_INVALID_PLATFORM);
info<string_view, ignore>(clGetPlatformInfo, nullptr, CL_PLATFORM_VERSION, version_abi.string);
// Get the platforms.
call(clGetPlatformIDs, PLATFORM_MAX, platform, &platforms);
return platforms;
}
catch(const std::exception &e)
{
log::logf
{
log, log::level::DERROR,
"OpenCL platforms initialization :%s",
e.what(),
};
return 0;
}
size_t
ircd::cl::init::init_devices()
try
{
// Get the devices.
size_t devices_total(0);
for(size_t i(0); i < platforms; ++i)
{
static const auto type
{
CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR
};
// OpenCL sez 0 devices throws an error but we log a warning for now.
constexpr auto ignore(CL_DEVICE_NOT_FOUND);
call<ignore>(clGetDeviceIDs, platform[i], type, DEVICE_MAX, device[i], devices + i);
devices_total += devices[i];
}
// Gather the API versions for the devices.
for(size_t i(0); i < platforms; ++i)
for(size_t j(0); j < devices[i]; ++j) try
{
// OpenCL sez:
// OpenCL<space><major_version.minor_version><space><vendor-specific information>
string_view ver; char buf[256];
ver = info(clGetDeviceInfo, device[i][j], CL_DEVICE_VERSION, buf);
ver = lstrip(ver, "OpenCL ");
ver = split(ver, ' ').first;
const auto &[major, minor]
{
split(ver, '.')
};
api[i][j].major = lex_cast<uint>(major);
api[i][j].minor = lex_cast<uint>(minor);
}
catch(const error &e)
{
log::error
{
log, "OpenCL [%u][%u] CL_DEVICE_VERSION :%s",
i, j,
e.what(),
};
}
return devices_total;
}
catch(const std::exception &e)
{
log::error
{
log, "OpenCL devices initialization :%s",
e.what(),
};
return 0;
}
size_t
ircd::cl::init::init_ctxs()
{
// Gather all devices we'll use.
size_t _devs {0};
cl_device_id _dev[DEVICE_MAX];
for(size_t i(0); i < platforms; ++i)
for(size_t j(0); j < devices[i]; ++j)
_dev[_devs++] = device[i][j];
// Create a context from gathered devices.
cl_int err {CL_SUCCESS};
cl_context_properties ctxprop {0};
primary = clCreateContext(&ctxprop, _devs, _dev, handle_notify, nullptr, &err);
throw_on_error(err);
// Device queue enabler
bool dev_queue
{
true
&& cl::device_queue // If enabled by conf
&& !cl::profile_queue // And not profiling (for now)
};
// Device queue support
char tmp[4];
for(size_t i(0); i < _devs; ++i)
dev_queue &= info<uint>(clGetDeviceInfo, _dev[i], CL_DEVICE_MAX_ON_DEVICE_QUEUES, tmp);
// Queue out-of-order execution
bool dev_ooe
{
dev_queue // tied to dev_queue
};
// Queue size in bytes
uint dev_queue_size
{
uint(cl::queue_size)
};
// Queue size limited by devices
for(size_t i(0); i < _devs; ++i)
{
const auto max_queue_size
{
info<uint>(clGetDeviceInfo, _dev[i], CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, tmp)
};
dev_queue_size = std::min(dev_queue_size, max_queue_size);
}
const uint prop[][2]
{
{ CL_QUEUE_SIZE, dev_queue_size },
{ CL_QUEUE_PROPERTIES, CL_QUEUE_ON_DEVICE & boolmask<uint>(dev_queue) },
{ CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE & boolmask<uint>(dev_ooe) },
{ CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE & boolmask<uint>(profile_queue) },
{ 0, 0 },
};
cl_queue_properties qprop[8] {0};
const auto qprops(sizeof(qprop) / sizeof(cl_queue_properties));
for(size_t i(0), j(0); j < qprops && (prop[i][0] || prop[i][1]); ++i)
if(prop[i][1])
{
qprop[j++] = prop[i][0];
qprop[j++] = prop[i][1];
}
// Create a queue for each device.
for(size_t i(0); i < platforms; ++i)
for(size_t j(0); j < devices[i]; ++j)
{
queue[i][j] = clCreateCommandQueueWithProperties(primary, device[i][j], qprop, &err);
throw_on_error(err);
}
// For any inits in the work subsystem.
work::init();
// Save the warp/wavefront size.
for(size_t i(0); i < platforms; ++i)
for(size_t j(0); j < devices[i]; ++j)
warp_size[i][j] = query_warp_size(primary, device[i][j]);
return _devs;
}
void
ircd::cl::init::fini_ctxs()
{
if(primary)
work::fini();
for(size_t i(0); i < PLATFORM_MAX; ++i)
for(size_t j(0); j < DEVICE_MAX; ++j)
if(queue[i][j])
{
call(clReleaseCommandQueue, queue[i][j]);
queue[i][j] = nullptr;
}
if(primary)
{
call(clReleaseContext, primary);
primary = nullptr;
}
}
void
ircd::cl::log_platform_info()
{
for(size_t i(0); i < platforms; ++i)
log_platform_info(i);
}
void
ircd::cl::log_platform_info(const uint i)
{
char buf[3][64];
char extbuf[320];
log::logf
{
log, log::level::DEBUG,
"OpenCL [%u][*] %-3d :%s :%s :%s :%s",
i,
CL_TARGET_OPENCL_VERSION,
info(clGetPlatformInfo, platform[i], CL_PLATFORM_VERSION, buf[0]),
info(clGetPlatformInfo, platform[i], CL_PLATFORM_VENDOR, buf[1]),
info(clGetPlatformInfo, platform[i], CL_PLATFORM_NAME, buf[2]),
info(clGetPlatformInfo, platform[i], CL_PLATFORM_EXTENSIONS, extbuf),
};
}
void
ircd::cl::log_dev_info()
{
for(size_t i(0); i < platforms; ++i)
log_dev_info(i);
}
void
ircd::cl::log_dev_info(const uint i)
{
if(unlikely(i >= PLATFORM_MAX))
throw std::out_of_range
{
"Invalid platform identifier."
};
for(size_t j(0); j < devices[i]; ++j)
log_dev_info(i, j);
}
void
ircd::cl::log_dev_info(const uint i,
const uint j)
{
if(unlikely(i >= PLATFORM_MAX || j >= DEVICE_MAX))
throw std::out_of_range
{
"Invalid platform or device identifier."
};
const auto &dev
{
device[i][j]
};
char buf[12][192];
char pbuf[8][64];
const auto type
{
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_TYPE, buf[0])
};
const auto type_str
{
type & CL_DEVICE_TYPE_CPU?
"CPU"_sv:
type & CL_DEVICE_TYPE_GPU?
"GPU"_sv:
type & CL_DEVICE_TYPE_ACCELERATOR?
"APU"_sv:
"DEV"_sv
};
const fmt::bsprintf<32> head
{
"%s id:%u:%u",
type_str,
i,
j,
};
constexpr auto ignore(CL_INVALID_VALUE);
const uint numeric_ver[]
{
info<uint, ignore>(clGetDeviceInfo, dev, CL_DEVICE_NUMERIC_VERSION, buf[0]),
info<uint, ignore>(clGetDeviceInfo, dev, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, buf[1]),
};
log::info
{
log, "%s cl:%03u clc:%s dev:%s :%s :%s :%s :%s",
string_view{head},
CL_TARGET_OPENCL_VERSION,
version_str(buf[0], numeric_ver[0]),
version_str(buf[1], numeric_ver[1]),
info(clGetDeviceInfo, dev, CL_DEVICE_VERSION, buf[2]),
info(clGetDeviceInfo, dev, CL_DRIVER_VERSION, buf[3]),
info(clGetDeviceInfo, dev, CL_DEVICE_VENDOR, buf[4]),
info(clGetDeviceInfo, dev, CL_DEVICE_NAME, buf[5]),
};
const auto wid
{
info<std::array<size_t, 3>>(clGetDeviceInfo, dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, buf[0])
};
log::info
{
log, "%s %u$mHz %u$x[simd%u] %u$x[%lu:%lu]%d %u$d[%u$x%u$x%u]",
string_view{head},
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, buf[0]),
info<uint, CL_INVALID_VALUE>(clGetDeviceInfo, dev, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, buf[1], 0U),
info<uint, CL_INVALID_VALUE>(clGetDeviceInfo, dev, CL_DEVICE_SIMD_WIDTH_AMD, buf[2], 0U),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MAX_COMPUTE_UNITS, buf[3]),
warp_size[i][j],
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, buf[4]),
info<int>(clGetDeviceInfo, dev, CL_DEVICE_PARTITION_PROPERTIES, buf[5]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, buf[6]),
wid[0], wid[1], wid[2],
};
log::info
{
log, "%s %u$bit-%s %s line %s align %s page %s param %s alloc %s",
string_view{head},
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_ADDRESS_BITS, buf[0]),
info<bool>(clGetDeviceInfo, dev, CL_DEVICE_ENDIAN_LITTLE, buf[1])?
"LE"_sv: "BE"_sv,
info<bool>(clGetDeviceInfo, dev, CL_DEVICE_ERROR_CORRECTION_SUPPORT, buf[2])?
"ECC"_sv: "non-ECC"_sv,
pretty(pbuf[0], iec(info<uint>(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, buf[3]))),
pretty(pbuf[1], iec(info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, buf[4]))),
pretty(pbuf[2], iec(info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MEM_BASE_ADDR_ALIGN, buf[5]))),
pretty(pbuf[3], iec(info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MAX_PARAMETER_SIZE, buf[6]))),
pretty(pbuf[4], iec(info<ulong>(clGetDeviceInfo, dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE, buf[7]))),
};
log::info
{
log, "%s global %s type:%02x cache %s chans %u banks %u width %u",
string_view{head},
pretty(pbuf[0], iec(info<ulong>(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_SIZE, buf[0]))),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, buf[1]),
pretty(pbuf[1], iec(info<ulong>(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, buf[2]))),
info<uint, CL_INVALID_VALUE>(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, buf[3], 0U),
info<uint, CL_INVALID_VALUE>(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, buf[4], 0U),
info<uint, CL_INVALID_VALUE>(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, buf[5], 0U),
};
log::info
{
log, "%s local %s type:%02x banks %u const %s consts %u",
string_view{head},
pretty(pbuf[0], iec(info<ulong>(clGetDeviceInfo, dev, CL_DEVICE_LOCAL_MEM_SIZE, buf[0]))),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_LOCAL_MEM_TYPE, buf[1]),
info<uint, CL_INVALID_VALUE>(clGetDeviceInfo, dev, CL_DEVICE_LOCAL_MEM_BANKS_AMD, buf[2], 0U),
pretty(pbuf[1], iec(info<ulong>(clGetDeviceInfo, dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, buf[3]))),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MAX_CONSTANT_ARGS, buf[4]),
};
log::logf
{
log, log::level::DEBUG,
"%s char%u short%u half%u int%u float%u long%u double%u",
string_view{head},
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, buf[0]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, buf[1]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, buf[2]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, buf[3]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, buf[4]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, buf[5]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, buf[6]),
};
const bool native_kernel
(
CL_EXEC_NATIVE_KERNEL
& info<ulong>(clGetDeviceInfo, dev, CL_DEVICE_EXECUTION_CAPABILITIES, buf[0])
);
const auto il_version
(
native_kernel?
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_IL_VERSION, buf[1]): 0
);
log::logf
{
log, log::level::DEBUG,
"%s SPIR-V:%b:%u queues:%u events:%u pref:%u max:%u printf:%lu :%s",
string_view{head},
native_kernel,
il_version,
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MAX_ON_DEVICE_QUEUES, buf[0]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_MAX_ON_DEVICE_EVENTS, buf[1]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE, buf[2]),
info<uint>(clGetDeviceInfo, dev, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, buf[3]),
info<size_t>(clGetDeviceInfo, dev, CL_DEVICE_PRINTF_BUFFER_SIZE, buf[4]),
info(clGetDeviceInfo, dev, CL_DEVICE_OPENCL_C_VERSION, buf[5])
};
char extensions_buf[2048];
const string_view extensions
{
info(clGetDeviceInfo, dev, CL_DEVICE_EXTENSIONS, extensions_buf)
};
log::logf
{
log, log::level::DEBUG,
"%s :%s",
string_view{head},
extensions,
};
}
/// Silly quirks of OpenCL force us to setup a context, compile a program, and
/// instantiate a kernel to find out the warp/wavefront size characteristic.
/// Note that other thread-grouping characteristics are available from device
/// properties directly.
uint
ircd::cl::query_warp_size(cl_context context,
cl_device_id dev)
try
{
// Attempt to get this value the easy way from a device API.
uint ret(0);
if((ret = query_warp_size_amd(dev)))
return ret;
assert(primary);
assert(context == primary); //TODO: XXX
// Get this value the hard way from the cl_kernel API which requires
// building a program first and querying its properties.
cl::code code
{
"__kernel void ircd_test() {}"_sv
};
code.compile();
code.link();
cl::kern kern
{
code, "ircd_test"
};
ret = kern.preferred_group_size_multiple(device);
return ret;
}
catch(const ctx::interrupted &)
{
throw;
}
catch(const ctx::terminated &)
{
throw;
}
catch(const std::exception &e)
{
log::logf
{
log, log::level::WARNING,
"context(%p) device(%p) query warp size :%s",
static_cast<const void *>(context),
static_cast<const void *>(dev),
e.what(),
};
return 0;
}
uint
ircd::cl::query_warp_size_amd(cl_device_id dev)
try
{
char buf[4];
constexpr auto ign(CL_INVALID_VALUE);
return info<uint, ign>(clGetDeviceInfo, dev, CL_DEVICE_WAVEFRONT_WIDTH_AMD, buf, 0);
}
catch(const std::exception &e)
{
log::error
{
log, "device(%p) query warp size (AMD) :%s",
static_cast<const void *>(dev),
e.what(),
};
return 0;
}
ircd::string_view
ircd::cl::version_str(const mutable_buffer &buf,
const cl_version val)
{
return fmt::sprintf
{
buf, "%u.%u.%u",
CL_VERSION_MAJOR(val),
CL_VERSION_MINOR(val),
CL_VERSION_PATCH(val),
};
}
//
// interface
//
void
ircd::cl::sync()
{
if(unlikely(!primary))
return;
auto &q
{
queue[0][0]
};
call
(
clFinish, q
);
++primary_stats.sync_count;
}
void
ircd::cl::flush()
{
auto &q
{
queue[0][0]
};
call
(
clFlush, q
);
++primary_stats.flush_count;
}
//
// exec
//
namespace ircd::cl
{
static const size_t _deps_list_max {32};
static thread_local cl_event _deps_list[_deps_list_max];
static cl_event *addressof_handle(cl::work *const &);
static void check_submit_blocking(cl::exec *const &, const exec::opts &);
static void handle_submitted(cl::exec *const &, const exec::opts &);
static vector_view<cl_event> make_deps_default(cl::work *const &, const exec::opts &);
static vector_view<cl_event> make_deps(cl::work *const &, const exec::opts &);
}
template<>
decltype(ircd::util::instance_list<ircd::cl::work>::allocator)
ircd::util::instance_list<ircd::cl::work>::allocator
{};
template<>
decltype(ircd::util::instance_list<ircd::cl::work>::list)
ircd::util::instance_list<ircd::cl::work>::list
{
allocator
};
decltype(ircd::cl::exec::opts_default)
ircd::cl::exec::opts_default;
ircd::cl::exec::exec(const opts &opts)
try
{
auto &q
{
queue[0][0]
};
const auto deps
{
make_deps(this, opts)
};
assert(!this->handle);
call
(
clEnqueueBarrierWithWaitList,
q,
deps.size(),
deps.size()? deps.data(): nullptr,
addressof_handle(this)
);
primary_stats.exec_barrier_tasks += 1;
handle_submitted(this, opts);
}
catch(const std::exception &e)
{
log::error
{
log, "Exec Barrier :%s",
e.what(),
};
throw;
}
ircd::cl::exec::exec(kern &kern,
const kern::range &range,
const opts &opts)
try
{
size_t dim(0);
for(size_t i(0); i < range.global.size(); ++i)
dim += range.global[i] > 0 && dim == i;
if(!dim)
return;
if(unlikely(run::level != run::level::RUN))
throw unavailable
{
"Unable to submit work items in runlevel %s",
reflect(run::level),
};
assert(run::level == run::level::RUN);
auto &q
{
queue[0][0]
};
auto &dev
{
device[0][0]
};
const auto deps
{
make_deps(this, opts)
};
assert(!this->object);
this->object = &kern;
const auto max_local_size
{
kern.group_size(dev)
};
const auto reqd_local_size
{
kern.compile_group_size(dev)
};
const auto hint_local_size
{
kern.preferred_group_size_multiple(dev)
};
size_t local[dim];
for(size_t d(0); d < dim; ++d)
{
local[d] = reqd_local_size[d]?: range.local[d]?: hint_local_size;
local[d] = std::min(local[d], max_local_size);
}
size_t global_size{range.global[0]};
size_t local_size{local[0]};
for(size_t d(1); d < dim; ++d)
{
global_size *= range.global[d];
local_size *= local[d];
}
assert(global_size % local_size == 0);
const size_t groups
{
global_size / local_size
};
assert(groups > 0);
size_t intensity
{
cl::intensity?
std::max(opts.intensity, uint(cl::intensity)):
opts.intensity
};
if(intensity < groups)
while(intensity > 1 && groups % intensity != 0)
intensity--;
const size_t tasks
{
intensity && intensity < groups?
groups / intensity:
1
};
assert(!this->handle);
for(size_t i(0); i < tasks; ++i)
{
kern::range sub_range(range);
for(size_t d(0); d < dim; ++d)
{
sub_range.global[d] /= tasks;
sub_range.offset[d] += sub_range.global[d] * i;
}
call
(
clEnqueueNDRangeKernel,
q,
cl_kernel(kern.handle),
dim,
sub_range.offset.data(),
sub_range.global.data(),
local + 0,
deps.size(),
deps.size()? deps.data(): nullptr,
i == tasks - 1? addressof_handle(this): nullptr
);
}
primary_stats.exec_kern_tasks += tasks;
primary_stats.exec_kern_threads += global_size;
primary_stats.exec_kern_groups += groups;
handle_submitted(this, opts);
}
catch(const std::exception &e)
{
log::error
{
log, "Exec Kern :%s",
e.what(),
};
throw;
}
ircd::cl::exec::exec(data &dst,
const data &src,
const opts &opts)
try
{
auto &q
{
queue[0][0]
};
assert(src.handle);
assert(dst.handle);
const size_t size
{
opts.size == -1UL?
std::min(dst.size(), src.size()):
opts.size
};
if(!size)
return;
assert(!this->object);
this->object = &dst;
const auto deps
{
make_deps(this, opts)
};
assert(!this->handle);
call
(
clEnqueueCopyBuffer,
q,
cl_mem(src.handle),
cl_mem(dst.handle),
opts.offset[1],
opts.offset[0],
size,
deps.size(),
deps.size()? deps.data(): nullptr,
addressof_handle(this)
);
primary_stats.exec_copy_bytes += size;
primary_stats.exec_copy_tasks += 1;
handle_submitted(this, opts);
}
catch(const std::exception &e)
{
log::error
{
log, "Exec Copy :%s",
e.what(),
};
throw;
}
ircd::cl::exec::exec(data &data,
const std::memory_order order,
const opts &opts)
try
{
if(unlikely(run::level < run::level::RUN))
throw unavailable
{
"Unable to write to device in runlevel %s",
reflect(run::level),
};
assert(run::level >= run::level::RUN);
const auto max_size
{
opts.size == -1UL?
data.size():
opts.size
};
const auto size
{
size_t(opts.offset[0]) < max_size?
max_size - opts.offset[0]:
0UL
};
assert(size_t(size) <= data.size());
assert(size_t(opts.offset[0]) <= data.size());
if(!size)
return;
bool read {false}, write {false}, invalidate {false};
bool blocking {opts.blocking};
switch(order)
{
case std::memory_order_relaxed:
return;
case std::memory_order_consume:
read = true;
break;
case std::memory_order_acquire:
read = true;
write = true;
break;
case std::memory_order_seq_cst:
read = true;
write = true;
blocking = true;
break;
case std::memory_order_acq_rel:
invalidate = true;
break;
case std::memory_order_release:
break;
}
const cl_map_flags flags
{
(boolmask<cl_map_flags>(read) & CL_MAP_READ) |
(boolmask<cl_map_flags>(write) & CL_MAP_WRITE) |
(boolmask<cl_map_flags>(invalidate) & CL_MAP_WRITE_INVALIDATE_REGION)
};
if(!flags && !data.mapped)
return;
assert(flags || data.mapped);
assert(!this->object);
this->object = &data;
auto &q
{
queue[0][0]
};
const auto deps
{
make_deps(this, opts)
};
int err {CL_SUCCESS};
assert(!this->handle);
if(flags)
data.mapped = clEnqueueMapBuffer
(
q,
cl_mem(data.handle),
blocking,
flags,
opts.offset[0],
size,
deps.size(),
deps.size()? deps.data(): nullptr,
addressof_handle(this),
&err
);
else
call
(
clEnqueueUnmapMemObject,
q,
cl_mem(data.handle),
std::exchange(data.mapped, nullptr),
0, // deps
nullptr, // depslist
addressof_handle(this)
);
throw_on_error(err);
primary_stats.exec_read_bytes += read? size: 0;
primary_stats.exec_read_tasks += read;
primary_stats.exec_write_bytes += write || invalidate? size: 0;
primary_stats.exec_write_tasks += write || invalidate;
handle_submitted(this, opts);
assert(data.mapped || !flags);
assert(this->handle);
}
catch(const std::exception &e)
{
log::error
{
log, "Exec Map order:%d :%s",
int(order),
e.what(),
};
throw;
}
void
ircd::cl::handle_submitted(cl::exec *const &exec,
const exec::opts &opts)
{
assert(run::level == run::level::RUN || run::level == run::level::QUIT);
primary_stats.exec_tasks += 1;
if(opts.flush)
cl::flush();
if(opts.sync)
cl::sync();
if(likely(!opts.blocking))
check_submit_blocking(exec, opts);
}
/// Checks if the OpenCL runtime blocked this thread to sound the alarms.
void
ircd::cl::check_submit_blocking(cl::exec *const &exec,
const exec::opts &opts)
{
const uint64_t &threshold
{
watchdog_tsc
};
if(!threshold)
return;
const auto submit_cycles
{
prof::cycles() - exec->ts
};
if(likely(submit_cycles < threshold))
return;
char nbuf[64];
const auto name
{
exec->name(nbuf)
};
char pbuf[32];
log::dwarning
{
log, "clEnqueue() kernel '%s' blocking the host for %s cycles on submit.",
name?: "<unnamed kernel or unknown command type>"_sv,
pretty(pbuf, si(submit_cycles), 1),
};
}
ircd::vector_view<cl_event>
ircd::cl::make_deps(cl::work *const &work,
const exec::opts &opts)
{
if(empty(opts.deps) && !opts.indep)
return make_deps_default(work, opts);
if(empty(opts.deps))
return {};
size_t ret(0);
vector_view<cl_event> out(_deps_list);
for(auto &exec : opts.deps)
out.at(ret++) = cl_event(exec.handle);
return vector_view<cl_event>
{
out, ret
};
}
ircd::vector_view<cl_event>
ircd::cl::make_deps_default(cl::work *const &work,
const exec::opts &opts)
{
size_t ret(0);
vector_view<cl_event> out(_deps_list);
for(auto it(rbegin(cl::work::list)); it != rend(cl::work::list); ++it)
{
cl::work *const &other{*it};
if(other == work)
continue;
if(!other->handle)
continue;
if(other->context != ctx::current)
continue;
out.at(ret++) = cl_event(other->handle);
break;
}
return vector_view<cl_event>
{
out, ret
};
}
cl_event *
ircd::cl::addressof_handle(cl::work *const &work)
{
const auto handle
{
std::addressof(work->handle)
};
cl_event *const ret
(
reinterpret_cast<cl_event *>(handle)
);
return ret;
}
//
// kern
//
ircd::cl::kern::kern(code &code,
const string_view &name)
try
{
int err {CL_SUCCESS};
handle = clCreateKernel(cl_program(code.handle), name.c_str(), &err);
throw_on_error(err);
const std::array<size_t, 3> cgs
{
#ifdef RB_DEBUG
compile_group_size(device[0][0]) //TODO: XXX
#else
0, 0, 0
#endif
};
char buf[1][16];
char pbuf[2][48];
if constexpr((false))
log::debug
{
log, "kernel stack %s local %s group:%zu pref:%zu comp:%zu:%zu:%zu :%s",
pretty(pbuf[0], iec(stack_mem_size())),
pretty(pbuf[1], iec(local_mem_size())),
group_size(),
preferred_group_size_multiple(),
cgs[0], cgs[1], cgs[2],
name,
};
}
catch(const std::exception &e)
{
log::error
{
log, "Kernel Create '%s' :%s",
name,
e.what(),
};
throw;
}
ircd::cl::kern::~kern()
noexcept try
{
if(likely(handle))
call(clReleaseKernel, cl_kernel(handle));
}
catch(const std::exception &e)
{
log::critical
{
log, "Kernel Release :%s",
e.what(),
};
return;
}
void
ircd::cl::kern::arg(const int i,
data &data)
{
const auto &data_handle
{
cl_mem(data.handle)
};
call(clSetKernelArg, cl_kernel(handle), i, sizeof(cl_mem), &data_handle);
}
void
ircd::cl::kern::arg(const int i,
const const_buffer &buf)
{
call(clSetKernelArg, cl_kernel(handle), i, ircd::size(buf), ircd::data(buf));
}
std::array<size_t, 3>
ircd::cl::kern::compile_group_size(void *const dev)
const
{
char buf[24];
const auto handle(cl_kernel(this->handle));
constexpr auto flag(CL_KERNEL_COMPILE_WORK_GROUP_SIZE);
return info<std::array<size_t, 3>>(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf);
}
size_t
ircd::cl::kern::preferred_group_size_multiple(void *const dev)
const
{
char buf[16];
const auto handle(cl_kernel(this->handle));
constexpr auto flag(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE);
return info<size_t>(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf);
}
size_t
ircd::cl::kern::group_size(void *const dev)
const
{
char buf[16];
const auto handle(cl_kernel(this->handle));
constexpr auto flag(CL_KERNEL_WORK_GROUP_SIZE);
return info<size_t>(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf);
}
size_t
ircd::cl::kern::local_mem_size(void *const dev)
const
{
char buf[16];
const auto handle(cl_kernel(this->handle));
constexpr auto flag(CL_KERNEL_LOCAL_MEM_SIZE);
return info<ulong>(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf);
}
size_t
ircd::cl::kern::stack_mem_size(void *const dev)
const
{
char buf[16];
const auto handle(cl_kernel(this->handle));
constexpr auto flag(CL_KERNEL_PRIVATE_MEM_SIZE);
return info<ulong>(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf);
}
uint
ircd::cl::kern::argc()
const
{
const auto handle
{
cl_kernel(this->handle)
};
char buf[sizeof(uint)];
return info<uint>(clGetKernelInfo, handle, CL_KERNEL_NUM_ARGS, buf);
}
ircd::string_view
ircd::cl::kern::name(const mutable_buffer &buf)
const
{
const auto handle
{
cl_kernel(this->handle)
};
return handle?
info(clGetKernelInfo, handle, CL_KERNEL_FUNCTION_NAME, buf):
string_view{};
}
//
// code
//
namespace ircd::cl
{
static void build_handle_error_log_line(const string_view &line);
static void build_handle_error(code &, const opencl_error &);
static void build_handle(cl_program program, void *priv);
}
decltype(ircd::cl::code::iov_max)
ircd::cl::code::iov_max
{
64
};
//
// code::code
//
ircd::cl::code::code(path_t,
const string_view &paths)
:code
{
path,
vector_view<const string_view>(&paths, 1),
}
{
}
ircd::cl::code::code(path_t,
const vector_view<const string_view> &paths_)
{
static const auto is_cl
{
[](const auto &path)
{
return fs::is_reg(path) && fs::is_extension(path, ".cl");
}
};
std::vector<std::string> buf;
for(const auto &path : paths_)
if(fs::is_dir(path))
{
for(const auto &file : fs::ls(path))
if(is_cl(file))
buf.emplace_back(fs::read(fs::fd(file)));
}
else if(is_cl(path))
buf.emplace_back(fs::read(fs::fd(path)));
const auto count(buf.size());
if(unlikely(count > iov_max))
throw error
{
"Maximum number of sources exceeded: lim:%zu got:%zu",
iov_max,
count,
};
string_view src[count];
for(uint i(0); i < count; ++i)
src[i] = buf[i];
create(vector_view<const string_view>(src, count));
}
ircd::cl::code::code(const string_view &src)
:code
{
vector_view<const string_view>(&src, 1),
}
{
}
ircd::cl::code::code(const vector_view<const string_view> &srcs)
{
create(srcs);
}
ircd::cl::code::code(const vector_view<const const_buffer> &bins)
{
create(bins);
}
ircd::cl::code::code(const const_buffer &bc)
{
char pbuf[1][48];
log::logf
{
log, log::level::DEBUG,
"code(%p) loading %s bitcode:%p",
this,
pretty(pbuf[0], si(ircd::size(bc))),
ircd::data(bc),
};
assert(!handle);
int err {CL_SUCCESS};
handle = clCreateProgramWithIL
(
primary,
ircd::data(bc),
ircd::size(bc),
&err
);
throw_on_error(err);
}
ircd::cl::code::~code()
noexcept try
{
if(likely(handle))
call(clReleaseProgram, cl_program(handle));
}
catch(const std::exception &e)
{
log::critical
{
log, "Program Release :%s",
e.what(),
};
return;
}
void
ircd::cl::code::create(const vector_view<const string_view> &srcs)
{
const auto count(srcs.size());
if(unlikely(count > iov_max))
throw error
{
"Maximum number of sources exceeded: lim:%zu got:%zu",
iov_max,
srcs.size(),
};
size_t len[count];
const char *src[count];
for(size_t i(0); i < count; ++i)
src[i] = ircd::data(srcs[i]),
len[i] = ircd::size(srcs[i]);
char pbuf[1][48];
log::logf
{
log, log::level::DEBUG,
"code(%p) creating %s srcs:%zu",
this,
pretty(pbuf[0], si(std::accumulate(len, len + count, 0))),
count,
};
assert(!handle);
int err {CL_SUCCESS};
handle = clCreateProgramWithSource(primary, count, src, len, &err);
throw_on_error(err);
}
void
ircd::cl::code::create(const vector_view<const const_buffer> &bins)
{
const auto count(bins.size());
if(unlikely(count > iov_max))
throw error
{
"Maximum number of binaries exceeded: lim:%zu got:%zu",
iov_max,
count,
};
size_t len[iov_max + 1] {0};
const uint8_t *bin[iov_max + 1] {nullptr};
for(size_t i(0); i < count; ++i)
bin[i] = reinterpret_cast<const uint8_t *>(ircd::data(bins[i])),
len[i] = ircd::size(bins[i]);
size_t devs {0};
cl_device_id dev[DEVICE_MAX] {0};
for(size_t i(0); i < platforms; ++i)
for(size_t j(0); j < devices[i]; ++j)
dev[devs++] = device[i][j];
char pbuf[1][48];
log::logf
{
log, log::level::DEBUG,
"code(%p) loading %s bins:%zu devs:%zu",
this,
pretty(pbuf[0], si(std::accumulate(len, len + count, 0))),
count,
devs,
};
assert(len);
assert(devs);
assert(!handle);
int err {CL_SUCCESS};
int binerr[iov_max + 1] {CL_SUCCESS};
handle = clCreateProgramWithBinary(primary, devs, dev, len, bin, binerr, &err);
throw_on_error(err);
for(size_t i(0); i < count; ++i)
throw_on_error(binerr[i]);
}
void
ircd::cl::code::build(const string_view &opts)
try
{
const uint num_devices {1};
const cl_device_id *device_list
{
device[0] //TODO: XXX
};
log::logf
{
log, log::level::DEBUG,
"code(%p) building devs:%zu %c%s",
this,
num_devices,
opts? ':': ' ',
opts,
};
call
(
clBuildProgram,
cl_program(handle),
num_devices,
device_list,
opts.c_str(),
cl::build_handle,
this
);
}
catch(const opencl_error &e)
{
build_handle_error(*this, e);
throw;
}
catch(const std::exception &e)
{
log::error
{
log, "code(%p) :Failed to build :%s",
this,
e.what(),
};
throw;
}
void
ircd::cl::code::link(const string_view &opts)
try
{
const uint num_devices {1};
const cl_device_id *device_list
{
device[0] //TODO: XXX
};
assert(this->handle);
const uint num_progs {1};
const cl_program progs[]
{
cl_program(this->handle)
};
log::logf
{
log, log::level::DEBUG,
"code(%p) linking devs:%zu progs:%zu opts:%zu$B %c%s",
this,
num_devices,
num_progs,
ircd::size(opts),
opts? ':': ' ',
opts,
};
int err
{
CL_COMPILER_NOT_AVAILABLE
};
#ifdef CL_VERSION_1_2
void *handle
{
clLinkProgram
(
primary,
num_devices,
device_list,
opts.c_str(),
num_progs,
progs,
cl::build_handle,
this,
&err
)
};
#endif
throw_on_error(err);
std::swap(handle, this->handle);
call(clReleaseProgram, cl_program(handle));
}
catch(const opencl_error &e)
{
build_handle_error(*this, e);
throw;
}
catch(const std::exception &e)
{
log::error
{
log, "code(%p) :Failed to link :%s",
this,
e.what(),
};
throw;
}
void
ircd::cl::code::compile(const string_view &opts)
try
{
const uint num_devices {1};
const cl_device_id *device_list
{
device[0] //TODO: XXX
};
const uint num_headers {0};
const cl_program *header_progs
{
nullptr
};
const char *header_names[]
{
nullptr
};
log::logf
{
log, log::level::DEBUG,
"code(%p) compiling devs:%zu headers:%zu %c%s",
this,
num_devices,
num_headers,
opts? ':': ' ',
opts,
};
#ifdef CL_VERSION_1_2
call
(
clCompileProgram,
cl_program(handle),
num_devices,
device_list,
opts.c_str(),
num_headers,
header_progs,
nullptr, //header_names, // clover api bug?
cl::build_handle,
this
);
#else
throw_on_error(CL_COMPILER_NOT_AVAILABLE);
#endif
}
catch(const opencl_error &e)
{
build_handle_error(*this, e);
throw;
}
catch(const std::exception &e)
{
log::error
{
log, "code(%p) :Failed to compile :%s",
this,
e.what(),
};
throw;
}
ircd::string_view
ircd::cl::code::src(const mutable_buffer &buf)
const
{
const auto &handle
{
cl_program(this->handle)
};
return info(clGetProgramInfo, handle, CL_PROGRAM_SOURCE, buf);
}
ircd::vector_view<const ircd::mutable_buffer>
ircd::cl::code::bin(vector_view<mutable_buffer> buf)
const
{
const auto &handle
{
cl_program(this->handle)
};
const auto devs
{
this->devs()
};
assert(devs <= ircd::size(buf));
const auto count
{
std::min(devs, ircd::size(buf))
};
size_t bin_sz[count];
const auto bins
{
this->bins({bin_sz, count})
};
assert(bins <= count);
const auto num
{
std::min(bins, count)
};
for(size_t i(0); i < num; ++i)
buf[i] = mutable_buffer
{
buf[i], bin_sz[i]
};
uintptr_t ptr[num];
for(size_t i(0); i < num; ++i)
ptr[i] = uintptr_t(ircd::data(buf[i]));
info(clGetProgramInfo, handle, CL_PROGRAM_BINARIES, mutable_buffer
{
reinterpret_cast<char *>(ptr), sizeof(uintptr_t) * num
});
return buf;
}
size_t
ircd::cl::code::bins_size()
const
{
const auto devs
{
this->devs()
};
size_t bin_sz[devs];
const auto bins
{
this->bins({bin_sz, devs})
};
assert(bins <= devs);
const auto ret
{
std::accumulate(bin_sz, bin_sz + bins, 0UL)
};
return ret;
}
size_t
ircd::cl::code::bins(const vector_view<size_t> &buf)
const
{
const auto &handle
{
cl_program(this->handle)
};
const auto count
{
devs()
};
assert(count <= size(buf));
info(clGetProgramInfo, handle, CL_PROGRAM_BINARY_SIZES, mutable_buffer
{
reinterpret_cast<char *>(ircd::data(buf)), ircd::size(buf) * sizeof(size_t)
});
return count;
}
size_t
ircd::cl::code::devs()
const
{
char buf[sizeof(uint)];
const auto &handle
{
cl_program(this->handle)
};
return info<uint>(clGetProgramInfo, handle, CL_PROGRAM_NUM_DEVICES, buf);
}
long
ircd::cl::code::status()
const
{
const auto &handle
{
cl_program(this->handle)
};
cl_build_status buf;
const mutable_buffer mb
{
reinterpret_cast<char *>(&buf), sizeof(buf)
};
const auto &dev
{
device[0][0], //TODO: XXX
};
const auto ret
{
info<cl_build_status>(clGetProgramBuildInfo, handle, dev, CL_PROGRAM_BUILD_STATUS, mb)
};
return ret;
}
void
ircd::cl::build_handle(cl_program program,
void *const priv)
{
cl::code *const &code
{
reinterpret_cast<cl::code *>(priv)
};
assert(code);
char pbuf[1][48];
log::logf
{
log, log::level::DEBUG,
"program(%p) devs:%zu binsz:%s :Build complete.",
(const void *)program,
code->devs(),
pretty(pbuf[0], si(code->bins_size())),
};
}
void
ircd::cl::build_handle_error(code &code,
const opencl_error &e)
{
const auto string_closure
{
[&code](const mutable_buffer &buf)
{
size_t len {0}; call
(
clGetProgramBuildInfo,
cl_program(code.handle),
device[0][0], //TODO: XXX
CL_PROGRAM_BUILD_LOG,
ircd::size(buf),
ircd::data(buf),
&len
);
return len;
}
};
const auto error_message
{
ircd::string(8_KiB | SHRINK_TO_FIT, string_closure)
};
const auto lines
{
ircd::tokens(error_message, '\n', build_handle_error_log_line)
};
}
void
ircd::cl::build_handle_error_log_line(const string_view &line)
{
// note last line is just a CR
if(line.size() <= 1)
return;
const auto &[loc, line_] { split(line, ' ') };
const auto &[fac, msg] { split(line_, ' ') };
const auto &[fname, pos] { split(loc, ':') };
const auto &[row, col] { split(pos, ':') };
const auto level
{
startswith(fac, "warning")?
log::level::WARNING:
startswith(fac, "error")?
log::level::ERROR:
log::level::ERROR
};
log::logf
{
log, level, "%s", line
};
}
//
// data
//
decltype(ircd::cl::data::gart_page_size)
ircd::cl::data::gart_page_size
{
{ "name", "ircd.cl.data.gart.page_size" },
{ "default", 4096 },
{ "help", "Override (un)detected gart page size." },
};
ircd::cl::data::data(const size_t size,
const bool host_read,
const bool host_write)
{
if(!size)
return;
cl_mem_flags flags {0};
flags |= CL_MEM_READ_WRITE;
flags |= host_read && !host_write? CL_MEM_HOST_READ_ONLY: 0;
flags |= !host_read && host_write? CL_MEM_HOST_WRITE_ONLY: 0;
flags |= !host_read && !host_write? CL_MEM_HOST_NO_ACCESS: 0;
char pbuf[48];
log::debug
{
log, "data(%p) device %s %lu@ host[read:%b write:%b] flags:%08x",
this,
pretty(pbuf, iec(size)),
alignment(size),
host_read,
host_write,
flags,
};
int err {CL_SUCCESS};
handle = clCreateBuffer
(
primary,
flags,
size,
nullptr,
&err
);
throw_on_error(err);
primary_stats.alloc_count += 1;
primary_stats.alloc_bytes += size;
}
ircd::cl::data::data(const mutable_buffer &buf,
const bool wonly)
{
const auto &ptr
{
ircd::data(buf)
};
const auto &size
{
ircd::size(buf)
};
if(!size)
return;
cl_mem_flags flags {0};
flags |= CL_MEM_USE_HOST_PTR;
flags |= wonly? CL_MEM_WRITE_ONLY: CL_MEM_READ_WRITE;
char pbuf[48];
log::debug
{
log, "data(%p) mutable %p %lu@ %s %lu@ wonly:%b flags:%08x",
this,
ptr,
alignment(ptr),
pretty(pbuf, iec(size)),
alignment(size),
wonly,
flags,
};
assert(!ptr || aligned(buf, size_t(gart_page_size)));
assert(padded(size, size_t(gart_page_size)));
int err {CL_SUCCESS};
handle = clCreateBuffer
(
primary,
flags,
size,
ptr,
&err
);
throw_on_error(err);
primary_stats.alloc_count += 1;
primary_stats.alloc_bytes += size;
}
ircd::cl::data::data(const const_buffer &buf)
{
const auto &ptr
{
ircd::data(buf)
};
const auto &size
{
ircd::size(buf)
};
if(!size)
return;
cl_mem_flags flags {0};
flags |= CL_MEM_USE_HOST_PTR;
flags |= CL_MEM_READ_ONLY;
char pbuf[48];
log::debug
{
log, "data(%p) immutable %p %lu@ %s %lu@ flags:%08x",
this,
ptr,
alignment(ptr),
pretty(pbuf, iec(size)),
alignment(size),
flags,
};
assert(!ptr || aligned(buf, size_t(gart_page_size)));
assert(padded(size, size_t(gart_page_size)));
int err {CL_SUCCESS};
handle = clCreateBuffer
(
primary,
flags,
size,
mutable_cast(ptr),
&err
);
throw_on_error(err);
primary_stats.alloc_count += 1;
primary_stats.alloc_bytes += size;
}
ircd::cl::data::data(data &master,
const pair<size_t, off_t> &slice)
{
constexpr auto type
{
CL_BUFFER_CREATE_TYPE_REGION
};
if(!master.handle)
return;
cl_buffer_region region {0};
region.size = slice.first;
region.origin = master.offset() + slice.second;
if(!region.size)
return;
const auto root
{
master.master()?: master.handle
};
char pbuf[48];
if constexpr((false))
log::debug
{
log, "data(%p) master(%p) region offset:%zu %lu@ %s %lu@",
this,
root,
region.origin,
alignment(region.origin),
pretty(pbuf, iec(region.size)),
alignment(region.size),
};
assert(aligned(region.origin, size_t(gart_page_size)));
assert(padded(region.size, size_t(gart_page_size)));
assert(root);
int err {CL_SUCCESS};
handle = clCreateSubBuffer
(
cl_mem(root),
cl_mem_flags{0},
type,
&region,
&err
);
throw_on_error(err);
}
ircd::cl::data::~data()
noexcept try
{
assert(handle || !mapped);
if(likely(handle))
{
const auto size
{
this->size()
};
call(clReleaseMemObject, cl_mem(handle));
primary_stats.dealloc_count += 1;
primary_stats.dealloc_bytes += size;
}
}
catch(const std::exception &e)
{
log::critical
{
log, "Memory Release :%s",
e.what(),
};
return;
}
char *
ircd::cl::data::ptr()
const
{
assert(handle);
char buf[sizeof(void *)] {0};
const auto ret
{
this->mapped?
static_cast<char *>(this->mapped):
info<char *>(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_HOST_PTR, buf)
};
return ret;
}
size_t
ircd::cl::data::refs()
const
{
assert(handle);
char buf[sizeof(size_t)] {0};
return info<uint>(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_REFERENCE_COUNT, buf);
}
off_t
ircd::cl::data::offset()
const
{
assert(handle);
char buf[sizeof(off_t)] {0};
return info<off_t>(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_OFFSET, buf);
}
size_t
ircd::cl::data::size()
const
{
assert(handle);
char buf[sizeof(size_t)] {0};
return info<size_t>(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_SIZE, buf);
}
uint
ircd::cl::data::flags()
const
{
assert(handle);
char buf[sizeof(uint)] {0};
return info<uint>(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_FLAGS, buf);
}
void *
ircd::cl::data::master()
const
{
assert(handle);
char buf[sizeof(void *)] {0};
constexpr auto qtype(CL_MEM_ASSOCIATED_MEMOBJECT);
return info<cl_mem>(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), qtype, buf);
}
//
// cl::work (event)
//
namespace ircd::cl
{
struct completion;
extern conf::item<bool> offload_enable;
extern const ctx::ole::opts offload_opts;
static void handle_event_callback(cl_event, cl_int, void *) noexcept;
static int wait_event_callback(work &, const int, const int);
static int wait_event_offload(work &, const int, const int);
static int wait_event(work &, const int, const int);
}
struct alignas(64) ircd::cl::completion
{
cl_event event {nullptr};
cl_int status {CL_COMPLETE};
ctx::dock dock;
};
decltype(ircd::cl::offload_enable)
ircd::cl::offload_enable
{
{ "name", "ircd.cl.offload.enable" },
{ "default", true },
};
decltype(ircd::cl::offload_opts)
ircd::cl::offload_opts
{
"cl"
};
[[gnu::visibility("hidden")]]
void
ircd::cl::work::init()
{
}
[[using gnu: cold, visibility("hidden")]]
void
ircd::cl::work::fini()
noexcept
{
cl::sync();
}
//
// work::work
//
ircd::cl::work::work()
{
if(unlikely(!cl::linkage))
throw unavailable
{
"OpenCL runtime is not available."
};
assert(cl::linkage);
}
ircd::cl::work::work(void *const &handle)
{
call(clRetainEvent, cl_event(handle));
this->handle = handle;
}
ircd::cl::work::~work()
noexcept try
{
if(!handle)
return;
const unwind free{[this]
{
assert(handle);
call(clReleaseEvent, cl_event(handle));
}};
wait();
}
catch(const std::exception &e)
{
log::critical
{
log, "Work Release :%s",
e.what(),
};
return;
}
void
ircd::cl::work::wait(const uint desired)
try
{
static_assert(CL_COMPLETE == 0);
constexpr auto execution_status
{
CL_EVENT_COMMAND_EXECUTION_STATUS
};
char buf[4];
int status {0};
if(likely(handle))
status = info<int>(clGetEventInfo, cl_event(handle), execution_status, buf);
if(status > int(desired))
status = wait_event(*this, status, desired);
if(unlikely(status < 0))
throw_on_error(status);
assert(int(status) == int(desired));
}
catch(const std::exception &e)
{
log::error
{
log, "work(%p)::wait(%u) :%s",
this,
desired,
e.what(),
};
throw;
}
ircd::string_view
ircd::cl::work::name(const mutable_buffer &buf)
const
{
switch(const auto type(this->type()); type)
{
default:
return nullptr;
case CL_COMMAND_READ_BUFFER: return "READ_BUFFER";
case CL_COMMAND_WRITE_BUFFER: return "WRITE_BUFFER";
case CL_COMMAND_COPY_BUFFER: return "COPY_BUFFER";
case CL_COMMAND_MAP_BUFFER: return "MAP_BUFFER";
case CL_COMMAND_UNMAP_MEM_OBJECT: return "UNMAP_MEM_OBJECT";
case CL_COMMAND_NDRANGE_KERNEL:
{
const auto kern
{
reinterpret_cast<const cl::kern *>(object)
};
return kern? kern->name(buf): "NDRANGE_KERNEL";
}
};
}
int
ircd::cl::work::type()
const
{
const auto handle
{
cl_event(this->handle)
};
if(!handle)
return 0;
char buf[4];
return info<int>(clGetEventInfo, handle, CL_EVENT_COMMAND_TYPE, buf);
}
//
// cl::work::prof
//
ircd::cl::work::prof::prof(const cl::work &w)
{
const auto h
{
cl_event(w.handle)
};
if(!profile_queue || !h)
{
for(uint i(0); i < this->size(); ++i)
(*this)[i] = 0ns;
return;
}
char b[5][8];
(*this)[0] = info<nanoseconds>(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_QUEUED, b[0]);
(*this)[1] = info<nanoseconds>(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_SUBMIT, b[1]);
(*this)[2] = info<nanoseconds>(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_START, b[2]);
(*this)[3] = info<nanoseconds>(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_END, b[3]);
(*this)[4] = info<nanoseconds>(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_COMPLETE, b[4]);
}
//
// cl::work (internal)
//
int
ircd::cl::wait_event(work &work,
const int status,
const int desired)
{
assert(work.handle);
assert(status > desired);
const ctx::uninterruptible::nothrow ui;
const bool use_offload
{
// conf item
bool(offload_enable)
};
const auto ret
{
use_offload?
wait_event_offload(work, status, desired):
wait_event_callback(work, status, desired)
};
const bool is_err
{
ret < 0
};
const bool is_complete
{
ret == CL_COMPLETE
};
primary_stats.work_completes += is_complete;
primary_stats.work_errors += is_err;
primary_stats.work_waits += 1;
return ret;
}
int
ircd::cl::wait_event_offload(work &work,
const int status,
const int desired)
{
completion c
{
cl_event(work.handle),
status,
};
ctx::ole::offload
{
offload_opts, [&c]
{
assert(c.status != CL_COMPLETE);
call(clWaitForEvents, 1UL, &c.event);
c.status = CL_COMPLETE;
}
};
assert(c.status == CL_COMPLETE);
return c.status;
}
int
ircd::cl::wait_event_callback(work &work,
const int status,
const int desired)
{
// Completion state structure on this ircd::ctx's stack.
completion c
{
cl_event(work.handle),
status,
};
// Completion condition closure to be satisfied.
const auto condition{[&c, &desired]
() noexcept -> bool
{
return !c.event || c.status <= desired;
}};
// Register callback with OpenCL; note that the callback might be
// dispatched immediately from this call itself (see below).
call
(
clSetEventCallback,
c.event,
desired,
&handle_event_callback,
&c
);
// This stats item counts clSetEventCallback()'s which return before OpenCL
// calls the callback, verifying asynchronicity. If this stats item remains
// zero, the OpenCL runtime has hijacked our thread for a blocking wait.
primary_stats.work_waits_async += !condition();
// Yield ircd::ctx while condition unsatisfied
c.dock.wait(condition);
return c.status;
}
void
ircd::cl::handle_event_callback(cl_event event,
cl_int status,
void *const priv)
noexcept
{
auto *const c
{
reinterpret_cast<completion *>(priv)
};
assert(priv != nullptr);
assert(event != nullptr);
assert(c->event == event);
c->status = status;
c->dock.notify_one();
}
//
// callback surface
//
void
ircd::cl::handle_notify(const char *errstr,
const void *token,
size_t cb,
void *priv)
noexcept
{
if(errstr)
log::error
{
log, "OpenCL t:%p cb:%zu :%s",
token,
cb,
errstr,
};
}
//
// util
//
template<class T,
int maybe,
class F,
class id0,
class param>
T
ircd::cl::info(F&& func,
const id0 &i0,
const param &p,
const mutable_buffer &out,
T default_)
{
using ircd::data;
using ircd::size;
size_t len {0};
const auto code
{
call<maybe>(std::forward<F>(func), i0, p, size(out), data(out), &len)
};
assert(code == CL_SUCCESS || code == maybe);
if constexpr(maybe != CL_SUCCESS)
if(code == maybe)
return default_;
return byte_view<T>
{
string_view
{
data(out), len
}
};
}
template<class T,
int maybe,
class F,
class id0,
class id1,
class param>
T
ircd::cl::info(F&& func,
const id0 &i0,
const id1 &i1,
const param &p,
const mutable_buffer &out,
T default_)
{
using ircd::data;
using ircd::size;
size_t len {0};
const auto code
{
call<maybe>(std::forward<F>(func), i0, i1, p, size(out), data(out), &len)
};
assert(code == CL_SUCCESS || code == maybe);
if constexpr(maybe != CL_SUCCESS)
if(code == maybe)
return default_;
return byte_view<T>
{
string_view
{
data(out), len
}
};
}
template<int maybe,
class func,
class... args>
int
ircd::cl::call(func&& f,
args&&... a)
{
const int ret
{
f(std::forward<args>(a)...)
};
if constexpr(maybe != CL_SUCCESS)
if(ret == maybe)
return ret;
return throw_on_error(ret);
}
int
ircd::cl::throw_on_error(const int &code)
{
if(unlikely(is_error(code)))
throw opencl_error
{
"(%d) :%s",
code,
reflect_error(code),
};
return code;
}
bool
ircd::cl::is_error(const int &code)
noexcept
{
return code < 0;
}
ircd::string_view
ircd::cl::reflect_error(const int code)
noexcept
{
switch(code)
{
case CL_SUCCESS: return "SUCCESS";
case CL_DEVICE_NOT_FOUND: return "DEVICE_NOT_FOUND";
case CL_DEVICE_NOT_AVAILABLE: return "DEVICE_NOT_AVAILABLE";
case CL_COMPILER_NOT_AVAILABLE: return "COMPILER_NOT_AVAILABLE";
case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "MEM_OBJECT_ALLOCATION_FAILURE";
case CL_OUT_OF_RESOURCES: return "OUT_OF_RESOURCES";
case CL_OUT_OF_HOST_MEMORY: return "OUT_OF_HOST_MEMORY";
case CL_PROFILING_INFO_NOT_AVAILABLE: return "PROFILING_INFO_NOT_AVAILABLE";
case CL_MEM_COPY_OVERLAP: return "MEM_COPY_OVERLAP";
case CL_IMAGE_FORMAT_MISMATCH: return "IMAGE_FORMAT_MISMATCH";
case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "IMAGE_FORMAT_NOT_SUPPORTED";
case CL_BUILD_PROGRAM_FAILURE: return "BUILD_PROGRAM_FAILURE";
case CL_MAP_FAILURE: return "MAP_FAILURE";
case CL_INVALID_VALUE: return "INVALID_VALUE";
case CL_INVALID_DEVICE_TYPE: return "INVALID_DEVICE_TYPE";
case CL_INVALID_PLATFORM: return "INVALID_PLATFORM";
case CL_INVALID_DEVICE: return "INVALID_DEVICE";
case CL_INVALID_CONTEXT: return "INVALID_CONTEXT";
case CL_INVALID_QUEUE_PROPERTIES: return "INVALID_QUEUE_PROPERTIES";
case CL_INVALID_COMMAND_QUEUE: return "INVALID_COMMAND_QUEUE";
case CL_INVALID_HOST_PTR: return "INVALID_HOST_PTR";
case CL_INVALID_MEM_OBJECT: return "INVALID_MEM_OBJECT";
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "INVALID_IMAGE_FORMAT_DESCRIPTOR";
case CL_INVALID_IMAGE_SIZE: return "INVALID_IMAGE_SIZE";
case CL_INVALID_SAMPLER: return "INVALID_SAMPLER";
case CL_INVALID_BINARY: return "INVALID_BINARY";
case CL_INVALID_BUILD_OPTIONS: return "INVALID_BUILD_OPTIONS";
case CL_INVALID_PROGRAM: return "INVALID_PROGRAM";
case CL_INVALID_PROGRAM_EXECUTABLE: return "INVALID_PROGRAM_EXECUTABLE";
case CL_INVALID_KERNEL_NAME: return "INVALID_KERNEL_NAME";
case CL_INVALID_KERNEL_DEFINITION: return "INVALID_KERNEL_DEFINITION";
case CL_INVALID_KERNEL: return "INVALID_KERNEL";
case CL_INVALID_ARG_INDEX: return "INVALID_ARG_INDEX";
case CL_INVALID_ARG_VALUE: return "INVALID_ARG_VALUE";
case CL_INVALID_ARG_SIZE: return "INVALID_ARG_SIZE";
case CL_INVALID_KERNEL_ARGS: return "INVALID_KERNEL_ARGS";
case CL_INVALID_WORK_DIMENSION: return "INVALID_WORK_DIMENSION";
case CL_INVALID_WORK_GROUP_SIZE: return "INVALID_WORK_GROUP_SIZE";
case CL_INVALID_WORK_ITEM_SIZE: return "INVALID_WORK_ITEM_SIZE";
case CL_INVALID_GLOBAL_OFFSET: return "INVALID_GLOBAL_OFFSET";
case CL_INVALID_EVENT_WAIT_LIST: return "INVALID_EVENT_WAIT_LIST";
case CL_INVALID_EVENT: return "INVALID_EVENT";
case CL_INVALID_OPERATION: return "INVALID_OPERATION";
case CL_INVALID_GL_OBJECT: return "INVALID_GL_OBJECT";
case CL_INVALID_BUFFER_SIZE: return "INVALID_BUFFER_SIZE";
case CL_INVALID_MIP_LEVEL: return "INVALID_MIP_LEVEL";
case CL_INVALID_GLOBAL_WORK_SIZE: return "INVALID_GLOBAL_WORK_SIZE";
#ifdef CL_VERSION_1_1
case CL_INVALID_PROPERTY: return "INVALID_PROPERTY";
case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "MISALIGNED_SUB_BUFFER_OFFSET";
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
#endif
#ifdef CL_VERSION_1_2
case CL_COMPILE_PROGRAM_FAILURE: return "COMPILE_PROGRAM_FAILURE";
case CL_LINKER_NOT_AVAILABLE: return "LINKER_NOT_AVAILABLE";
case CL_LINK_PROGRAM_FAILURE: return "LINK_PROGRAM_FAILURE";
case CL_DEVICE_PARTITION_FAILED: return "DEVICE_PARTITION_FAILED";
case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "KERNEL_ARG_INFO_NOT_AVAILABLE";
case CL_INVALID_IMAGE_DESCRIPTOR: return "INVALID_IMAGE_DESCRIPTOR";
case CL_INVALID_COMPILER_OPTIONS: return "INVALID_COMPILER_OPTIONS";
case CL_INVALID_LINKER_OPTIONS: return "INVALID_LINKER_OPTIONS";
case CL_INVALID_DEVICE_PARTITION_COUNT: return "INVALID_DEVICE_PARTITION_COUNT";
#endif
#ifdef CL_VERSION_2_0
case CL_INVALID_PIPE_SIZE: return "INVALID_PIPE_SIZE";
case CL_INVALID_DEVICE_QUEUE: return "INVALID_DEVICE_QUEUE";
#endif
#ifdef CL_VERSION_2_2
case CL_INVALID_SPEC_ID: return "INVALID_SPEC_ID";
case CL_MAX_SIZE_RESTRICTION_EXCEEDED: return "MAX_SIZE_RESTRICTION_EXCEEDED";
#endif
}
return "???????";
}