// The Construct // // Copyright (C) Matrix Construct Developers, Authors & Contributors // Copyright (C) 2016-2021 Jason Volk // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. #include #include #include // Util namespace ircd::cl { static bool is_error(const int &code) noexcept; static int throw_on_error(const int &code); template static int call(func&&, args&&...); template static T info(F&&, const id &, const param &, const mutable_buffer &, T default_ = {}); template static T info(F&&, const id0 &, const id1 &, const param &, const mutable_buffer &, T default_ = {}); static string_view version_str(const mutable_buffer &, const cl_version); static uint query_warp_size_amd(cl_device_id); static uint query_warp_size(cl_context, cl_device_id); } // Runtime state namespace ircd::cl { static const size_t OPTION_MAX {8}, PLATFORM_MAX {8}, DEVICE_MAX {8}; static uint options, platforms, devices[PLATFORM_MAX]; static char option[OPTION_MAX][256]; static void *linkage; static cl_platform_id platform[PLATFORM_MAX]; static cl_device_id device[PLATFORM_MAX][DEVICE_MAX]; static struct version { int major, minor; } api[PLATFORM_MAX][DEVICE_MAX]; static uint warp_size[PLATFORM_MAX][DEVICE_MAX]; static cl_context primary; extern struct stats primary_stats; static cl_command_queue queue[PLATFORM_MAX][DEVICE_MAX]; static void handle_notify(const char *, const void *, size_t, void *) noexcept; } struct ircd::cl::stats { template using item = ircd::stats::item; item sync_count, flush_count, alloc_count, alloc_bytes, dealloc_count, dealloc_bytes, work_waits, work_waits_async, work_errors, work_completes, exec_tasks, exec_kern_tasks, exec_kern_threads, exec_kern_groups, exec_write_tasks, exec_write_bytes, exec_read_tasks, exec_read_bytes, exec_copy_tasks, exec_copy_bytes, exec_barrier_tasks; }; decltype(ircd::cl::log) ircd::cl::log { "cl" }; decltype(ircd::cl::version_api) ircd::cl::version_api { "OpenCL", info::versions::API, CL_TARGET_OPENCL_VERSION, { #if defined(CL_VERSION_MAJOR) && defined(CL_VERSION_MINOR) && defined(CL_VERSION_PATCH) CL_VERSION_MAJOR(CL_TARGET_OPENCL_VERSION), CL_VERSION_MINOR(CL_TARGET_OPENCL_VERSION), CL_VERSION_PATCH(CL_TARGET_OPENCL_VERSION), #endif } }; decltype(ircd::cl::version_abi) ircd::cl::version_abi { "OpenCL", info::versions::ABI }; decltype(ircd::cl::enable) ircd::cl::enable { { "name", "ircd.cl.enable" }, { "default", true }, { "persist", false }, }; decltype(ircd::cl::profile_queue) ircd::cl::profile_queue { { "name", "ircd.cl.queue.profile" }, { "default", false }, { "persist", false }, }; decltype(ircd::cl::device_queue) ircd::cl::device_queue { { "name", "ircd.cl.queue.device" }, { "default", true }, { "persist", false }, }; decltype(ircd::cl::queue_size) ircd::cl::queue_size { { "name", "ircd.cl.queue.size" }, { "default", long(1_MiB) }, { "persist", false }, }; decltype(ircd::cl::intensity) ircd::cl::intensity { { "name", "ircd.cl.intensity" }, { "default", 0L }, }; decltype(ircd::cl::watchdog_tsc) ircd::cl::watchdog_tsc { { "name", "ircd.cl.watchdog.tsc" }, { "default", 268'435'456L }, }; decltype(ircd::cl::path) ircd::cl::path { { "name", "ircd.cl.path" }, { "default", "libOpenCL.so" }, { "persist", false }, }; decltype(ircd::cl::envs) ircd::cl::envs { { { "name", "LP_NUM_THREADS" }, { "default", "0" }, }, { { "name", "MESA_NO_MINMAX_CACHE" }, { "default", "true" }, }, { { "name", "MESA_SHADER_CACHE_DISABLE" }, { "default", "true" }, }, { { "name", "AMD_DEBUG" }, { "default", "nogfx,reserve_vmid" }, }, { { "name", "R600_DEBUG" }, { "default", "forcedma" }, }, { { "name", "RADEON_THREAD" }, { "default", "false" }, }, }; decltype(ircd::cl::primary_stats) ircd::cl::primary_stats { { { "name", "ircd.cl.sync.count" } }, { { "name", "ircd.cl.flush.count" } }, { { "name", "ircd.cl.alloc.count" } }, { { "name", "ircd.cl.alloc.bytes" } }, { { "name", "ircd.cl.dealloc.count" } }, { { "name", "ircd.cl.dealloc.bytes" } }, { { "name", "ircd.cl.work.waits" } }, { { "name", "ircd.cl.work.waits.async" } }, { { "name", "ircd.cl.work.errors" } }, { { "name", "ircd.cl.work.completes" } }, { { "name", "ircd.cl.exec.tasks" } }, { { "name", "ircd.cl.exec.kern.tasks" } }, { { "name", "ircd.cl.exec.kern.threads" } }, { { "name", "ircd.cl.exec.kern.groups" } }, { { "name", "ircd.cl.exec.write.tasks" } }, { { "name", "ircd.cl.exec.write.bytes" } }, { { "name", "ircd.cl.exec.read.tasks" } }, { { "name", "ircd.cl.exec.read.bytes" } }, { { "name", "ircd.cl.exec.copy.tasks" } }, { { "name", "ircd.cl.exec.copy.bytes" } }, { { "name", "ircd.cl.exec.barrier.tasks" } }, }; // // init // ircd::cl::init::init() { if(!enable) { log::dwarning { log, "OpenCL hardware acceleration is not available or enabled." }; return; } const ctx::posix::enable_pthread enable_pthread; // Link the libraries. if(!init_libs()) { log::warning { log, "OpenCL hardware acceleration runtime failed to link." }; return; } // Get the platforms. if(!init_platforms()) { log::warning { log, "OpenCL hardware acceleration platform not found." }; return; } // Report the platforms. log_platform_info(); // Get the devices. if(!init_devices()) { log::warning { log, "OpenCL hardware acceleration device not found." }; return; } // Various other inits. init_ctxs(); // Report the devices. log_dev_info(); } [[gnu::cold]] ircd::cl::init::~init() noexcept { if(!linkage) return; log::debug { log, "Shutting down OpenCL...", }; const ctx::posix::enable_pthread enable_pthread; fini_ctxs(); fini_libs(); } bool ircd::cl::init::init_libs() { const string_view path { cl::path }; if(path.empty()) return false; // Setup options for(const auto &item : envs) { assert(options < OPTION_MAX); fmt::sprintf { option[options], "%s=%s", item.name, string_view{item}, }; sys::call(putenv, option[options++]); } // Load the pipe. assert(!linkage); if(!(linkage = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL))) return false; return true; } void ircd::cl::init::fini_libs() { if(likely(linkage)) dlclose(linkage); } size_t ircd::cl::init::init_platforms() try { // OpenCL sez platform=null is implementation defined. constexpr auto ignore(CL_INVALID_PLATFORM); info(clGetPlatformInfo, nullptr, CL_PLATFORM_VERSION, version_abi.string); // Get the platforms. call(clGetPlatformIDs, PLATFORM_MAX, platform, &platforms); return platforms; } catch(const std::exception &e) { log::logf { log, log::level::DERROR, "OpenCL platforms initialization :%s", e.what(), }; return 0; } size_t ircd::cl::init::init_devices() try { // Get the devices. size_t devices_total(0); for(size_t i(0); i < platforms; ++i) { static const auto type { CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR }; // OpenCL sez 0 devices throws an error but we log a warning for now. constexpr auto ignore(CL_DEVICE_NOT_FOUND); call(clGetDeviceIDs, platform[i], type, DEVICE_MAX, device[i], devices + i); devices_total += devices[i]; } // Gather the API versions for the devices. for(size_t i(0); i < platforms; ++i) for(size_t j(0); j < devices[i]; ++j) try { // OpenCL sez: // OpenCL string_view ver; char buf[256]; ver = info(clGetDeviceInfo, device[i][j], CL_DEVICE_VERSION, buf); ver = lstrip(ver, "OpenCL "); ver = split(ver, ' ').first; const auto &[major, minor] { split(ver, '.') }; api[i][j].major = lex_cast(major); api[i][j].minor = lex_cast(minor); } catch(const error &e) { log::error { log, "OpenCL [%u][%u] CL_DEVICE_VERSION :%s", i, j, e.what(), }; } return devices_total; } catch(const std::exception &e) { log::error { log, "OpenCL devices initialization :%s", e.what(), }; return 0; } size_t ircd::cl::init::init_ctxs() { // Gather all devices we'll use. size_t _devs {0}; cl_device_id _dev[DEVICE_MAX]; for(size_t i(0); i < platforms; ++i) for(size_t j(0); j < devices[i]; ++j) _dev[_devs++] = device[i][j]; // Create a context from gathered devices. cl_int err {CL_SUCCESS}; cl_context_properties ctxprop {0}; primary = clCreateContext(&ctxprop, _devs, _dev, handle_notify, nullptr, &err); throw_on_error(err); // Device queue enabler bool dev_queue { true && cl::device_queue // If enabled by conf && !cl::profile_queue // And not profiling (for now) }; // Device queue support char tmp[4]; for(size_t i(0); i < _devs; ++i) dev_queue &= info(clGetDeviceInfo, _dev[i], CL_DEVICE_MAX_ON_DEVICE_QUEUES, tmp); // Queue out-of-order execution bool dev_ooe { dev_queue // tied to dev_queue }; // Queue size in bytes uint dev_queue_size { uint(cl::queue_size) }; // Queue size limited by devices for(size_t i(0); i < _devs; ++i) { const auto max_queue_size { info(clGetDeviceInfo, _dev[i], CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, tmp) }; dev_queue_size = std::min(dev_queue_size, max_queue_size); } const uint prop[][2] { { CL_QUEUE_SIZE, dev_queue_size }, { CL_QUEUE_PROPERTIES, CL_QUEUE_ON_DEVICE & boolmask(dev_queue) }, { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE & boolmask(dev_ooe) }, { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE & boolmask(profile_queue) }, { 0, 0 }, }; cl_queue_properties qprop[8] {0}; const auto qprops(sizeof(qprop) / sizeof(cl_queue_properties)); for(size_t i(0), j(0); j < qprops && (prop[i][0] || prop[i][1]); ++i) if(prop[i][1]) { qprop[j++] = prop[i][0]; qprop[j++] = prop[i][1]; } // Create a queue for each device. for(size_t i(0); i < platforms; ++i) for(size_t j(0); j < devices[i]; ++j) { queue[i][j] = clCreateCommandQueueWithProperties(primary, device[i][j], qprop, &err); throw_on_error(err); } // For any inits in the work subsystem. work::init(); // Save the warp/wavefront size. for(size_t i(0); i < platforms; ++i) for(size_t j(0); j < devices[i]; ++j) warp_size[i][j] = query_warp_size(primary, device[i][j]); return _devs; } void ircd::cl::init::fini_ctxs() { if(primary) work::fini(); for(size_t i(0); i < PLATFORM_MAX; ++i) for(size_t j(0); j < DEVICE_MAX; ++j) if(queue[i][j]) { call(clReleaseCommandQueue, queue[i][j]); queue[i][j] = nullptr; } if(primary) { call(clReleaseContext, primary); primary = nullptr; } } void ircd::cl::log_platform_info() { for(size_t i(0); i < platforms; ++i) log_platform_info(i); } void ircd::cl::log_platform_info(const uint i) { char buf[3][64]; char extbuf[320]; log::logf { log, log::level::DEBUG, "OpenCL [%u][*] %-3d :%s :%s :%s :%s", i, CL_TARGET_OPENCL_VERSION, info(clGetPlatformInfo, platform[i], CL_PLATFORM_VERSION, buf[0]), info(clGetPlatformInfo, platform[i], CL_PLATFORM_VENDOR, buf[1]), info(clGetPlatformInfo, platform[i], CL_PLATFORM_NAME, buf[2]), info(clGetPlatformInfo, platform[i], CL_PLATFORM_EXTENSIONS, extbuf), }; } void ircd::cl::log_dev_info() { for(size_t i(0); i < platforms; ++i) log_dev_info(i); } void ircd::cl::log_dev_info(const uint i) { if(unlikely(i >= PLATFORM_MAX)) throw std::out_of_range { "Invalid platform identifier." }; for(size_t j(0); j < devices[i]; ++j) log_dev_info(i, j); } void ircd::cl::log_dev_info(const uint i, const uint j) { if(unlikely(i >= PLATFORM_MAX || j >= DEVICE_MAX)) throw std::out_of_range { "Invalid platform or device identifier." }; const auto &dev { device[i][j] }; char buf[12][192]; char pbuf[8][64]; const auto type { info(clGetDeviceInfo, dev, CL_DEVICE_TYPE, buf[0]) }; const auto type_str { type & CL_DEVICE_TYPE_CPU? "CPU"_sv: type & CL_DEVICE_TYPE_GPU? "GPU"_sv: type & CL_DEVICE_TYPE_ACCELERATOR? "APU"_sv: "DEV"_sv }; const fmt::bsprintf<32> head { "%s id:%u:%u", type_str, i, j, }; constexpr auto ignore(CL_INVALID_VALUE); const uint numeric_ver[] { info(clGetDeviceInfo, dev, CL_DEVICE_NUMERIC_VERSION, buf[0]), info(clGetDeviceInfo, dev, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, buf[1]), }; log::info { log, "%s cl:%03u clc:%s dev:%s :%s :%s :%s :%s", string_view{head}, CL_TARGET_OPENCL_VERSION, version_str(buf[0], numeric_ver[0]), version_str(buf[1], numeric_ver[1]), info(clGetDeviceInfo, dev, CL_DEVICE_VERSION, buf[2]), info(clGetDeviceInfo, dev, CL_DRIVER_VERSION, buf[3]), info(clGetDeviceInfo, dev, CL_DEVICE_VENDOR, buf[4]), info(clGetDeviceInfo, dev, CL_DEVICE_NAME, buf[5]), }; const auto wid { info>(clGetDeviceInfo, dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, buf[0]) }; log::info { log, "%s %u$mHz %u$x[simd%u] %u$x[%lu:%lu]%d %u$d[%u$x%u$x%u]", string_view{head}, info(clGetDeviceInfo, dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, buf[0]), info(clGetDeviceInfo, dev, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, buf[1], 0U), info(clGetDeviceInfo, dev, CL_DEVICE_SIMD_WIDTH_AMD, buf[2], 0U), info(clGetDeviceInfo, dev, CL_DEVICE_MAX_COMPUTE_UNITS, buf[3]), warp_size[i][j], info(clGetDeviceInfo, dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, buf[4]), info(clGetDeviceInfo, dev, CL_DEVICE_PARTITION_PROPERTIES, buf[5]), info(clGetDeviceInfo, dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, buf[6]), wid[0], wid[1], wid[2], }; log::info { log, "%s %u$bit-%s %s line %s align %s page %s param %s alloc %s", string_view{head}, info(clGetDeviceInfo, dev, CL_DEVICE_ADDRESS_BITS, buf[0]), info(clGetDeviceInfo, dev, CL_DEVICE_ENDIAN_LITTLE, buf[1])? "LE"_sv: "BE"_sv, info(clGetDeviceInfo, dev, CL_DEVICE_ERROR_CORRECTION_SUPPORT, buf[2])? "ECC"_sv: "non-ECC"_sv, pretty(pbuf[0], iec(info(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, buf[3]))), pretty(pbuf[1], iec(info(clGetDeviceInfo, dev, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, buf[4]))), pretty(pbuf[2], iec(info(clGetDeviceInfo, dev, CL_DEVICE_MEM_BASE_ADDR_ALIGN, buf[5]))), pretty(pbuf[3], iec(info(clGetDeviceInfo, dev, CL_DEVICE_MAX_PARAMETER_SIZE, buf[6]))), pretty(pbuf[4], iec(info(clGetDeviceInfo, dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE, buf[7]))), }; log::info { log, "%s global %s type:%02x cache %s chans %u banks %u width %u", string_view{head}, pretty(pbuf[0], iec(info(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_SIZE, buf[0]))), info(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, buf[1]), pretty(pbuf[1], iec(info(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, buf[2]))), info(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, buf[3], 0U), info(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, buf[4], 0U), info(clGetDeviceInfo, dev, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, buf[5], 0U), }; log::info { log, "%s local %s type:%02x banks %u const %s consts %u", string_view{head}, pretty(pbuf[0], iec(info(clGetDeviceInfo, dev, CL_DEVICE_LOCAL_MEM_SIZE, buf[0]))), info(clGetDeviceInfo, dev, CL_DEVICE_LOCAL_MEM_TYPE, buf[1]), info(clGetDeviceInfo, dev, CL_DEVICE_LOCAL_MEM_BANKS_AMD, buf[2], 0U), pretty(pbuf[1], iec(info(clGetDeviceInfo, dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, buf[3]))), info(clGetDeviceInfo, dev, CL_DEVICE_MAX_CONSTANT_ARGS, buf[4]), }; log::logf { log, log::level::DEBUG, "%s char%u short%u half%u int%u float%u long%u double%u", string_view{head}, info(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, buf[0]), info(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, buf[1]), info(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, buf[2]), info(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, buf[3]), info(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, buf[4]), info(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, buf[5]), info(clGetDeviceInfo, dev, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, buf[6]), }; const bool native_kernel ( CL_EXEC_NATIVE_KERNEL & info(clGetDeviceInfo, dev, CL_DEVICE_EXECUTION_CAPABILITIES, buf[0]) ); const auto il_version ( native_kernel? info(clGetDeviceInfo, dev, CL_DEVICE_IL_VERSION, buf[1]): 0 ); log::logf { log, log::level::DEBUG, "%s SPIR-V:%b:%u queues:%u events:%u pref:%u max:%u printf:%lu :%s", string_view{head}, native_kernel, il_version, info(clGetDeviceInfo, dev, CL_DEVICE_MAX_ON_DEVICE_QUEUES, buf[0]), info(clGetDeviceInfo, dev, CL_DEVICE_MAX_ON_DEVICE_EVENTS, buf[1]), info(clGetDeviceInfo, dev, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE, buf[2]), info(clGetDeviceInfo, dev, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, buf[3]), info(clGetDeviceInfo, dev, CL_DEVICE_PRINTF_BUFFER_SIZE, buf[4]), info(clGetDeviceInfo, dev, CL_DEVICE_OPENCL_C_VERSION, buf[5]) }; char extensions_buf[2048]; const string_view extensions { info(clGetDeviceInfo, dev, CL_DEVICE_EXTENSIONS, extensions_buf) }; log::logf { log, log::level::DEBUG, "%s :%s", string_view{head}, extensions, }; } /// Silly quirks of OpenCL force us to setup a context, compile a program, and /// instantiate a kernel to find out the warp/wavefront size characteristic. /// Note that other thread-grouping characteristics are available from device /// properties directly. uint ircd::cl::query_warp_size(cl_context context, cl_device_id dev) try { // Attempt to get this value the easy way from a device API. uint ret(0); if((ret = query_warp_size_amd(dev))) return ret; assert(primary); assert(context == primary); //TODO: XXX // Get this value the hard way from the cl_kernel API which requires // building a program first and querying its properties. cl::code code { "__kernel void ircd_test() {}"_sv }; code.compile(); code.link(); cl::kern kern { code, "ircd_test" }; ret = kern.preferred_group_size_multiple(device); return ret; } catch(const ctx::interrupted &) { throw; } catch(const ctx::terminated &) { throw; } catch(const std::exception &e) { log::logf { log, log::level::WARNING, "context(%p) device(%p) query warp size :%s", static_cast(context), static_cast(dev), e.what(), }; return 0; } uint ircd::cl::query_warp_size_amd(cl_device_id dev) try { char buf[4]; constexpr auto ign(CL_INVALID_VALUE); return info(clGetDeviceInfo, dev, CL_DEVICE_WAVEFRONT_WIDTH_AMD, buf, 0); } catch(const std::exception &e) { log::error { log, "device(%p) query warp size (AMD) :%s", static_cast(dev), e.what(), }; return 0; } ircd::string_view ircd::cl::version_str(const mutable_buffer &buf, const cl_version val) { return fmt::sprintf { buf, "%u.%u.%u", CL_VERSION_MAJOR(val), CL_VERSION_MINOR(val), CL_VERSION_PATCH(val), }; } // // interface // void ircd::cl::sync() { if(unlikely(!primary)) return; auto &q { queue[0][0] }; call ( clFinish, q ); ++primary_stats.sync_count; } void ircd::cl::flush() { auto &q { queue[0][0] }; call ( clFlush, q ); ++primary_stats.flush_count; } // // exec // namespace ircd::cl { static const size_t _deps_list_max {32}; static thread_local cl_event _deps_list[_deps_list_max]; static cl_event *addressof_handle(cl::work *const &); static void check_submit_blocking(cl::exec *const &, const exec::opts &); static void handle_submitted(cl::exec *const &, const exec::opts &); static vector_view make_deps_default(cl::work *const &, const exec::opts &); static vector_view make_deps(cl::work *const &, const exec::opts &); } template<> decltype(ircd::util::instance_list::allocator) ircd::util::instance_list::allocator {}; template<> decltype(ircd::util::instance_list::list) ircd::util::instance_list::list { allocator }; decltype(ircd::cl::exec::opts_default) ircd::cl::exec::opts_default; ircd::cl::exec::exec(const opts &opts) try { auto &q { queue[0][0] }; const auto deps { make_deps(this, opts) }; assert(!this->handle); call ( clEnqueueBarrierWithWaitList, q, deps.size(), deps.size()? deps.data(): nullptr, addressof_handle(this) ); primary_stats.exec_barrier_tasks += 1; handle_submitted(this, opts); } catch(const std::exception &e) { log::error { log, "Exec Barrier :%s", e.what(), }; throw; } ircd::cl::exec::exec(kern &kern, const kern::range &range, const opts &opts) try { size_t dim(0); for(size_t i(0); i < range.global.size(); ++i) dim += range.global[i] > 0 && dim == i; if(!dim) return; if(unlikely(run::level != run::level::RUN)) throw unavailable { "Unable to submit work items in runlevel %s", reflect(run::level), }; assert(run::level == run::level::RUN); auto &q { queue[0][0] }; auto &dev { device[0][0] }; const auto deps { make_deps(this, opts) }; assert(!this->object); this->object = &kern; const auto max_local_size { kern.group_size(dev) }; const auto reqd_local_size { kern.compile_group_size(dev) }; const auto hint_local_size { kern.preferred_group_size_multiple(dev) }; size_t local[dim]; for(size_t d(0); d < dim; ++d) { local[d] = reqd_local_size[d]?: range.local[d]?: hint_local_size; local[d] = std::min(local[d], max_local_size); } size_t global_size{range.global[0]}; size_t local_size{local[0]}; for(size_t d(1); d < dim; ++d) { global_size *= range.global[d]; local_size *= local[d]; } assert(global_size % local_size == 0); const size_t groups { global_size / local_size }; assert(groups > 0); size_t intensity { cl::intensity? std::max(opts.intensity, uint(cl::intensity)): opts.intensity }; if(intensity < groups) while(intensity > 1 && groups % intensity != 0) intensity--; const size_t tasks { intensity && intensity < groups? groups / intensity: 1 }; assert(!this->handle); for(size_t i(0); i < tasks; ++i) { kern::range sub_range(range); for(size_t d(0); d < dim; ++d) { sub_range.global[d] /= tasks; sub_range.offset[d] += sub_range.global[d] * i; } call ( clEnqueueNDRangeKernel, q, cl_kernel(kern.handle), dim, sub_range.offset.data(), sub_range.global.data(), local + 0, deps.size(), deps.size()? deps.data(): nullptr, i == tasks - 1? addressof_handle(this): nullptr ); } primary_stats.exec_kern_tasks += tasks; primary_stats.exec_kern_threads += global_size; primary_stats.exec_kern_groups += groups; handle_submitted(this, opts); } catch(const std::exception &e) { log::error { log, "Exec Kern :%s", e.what(), }; throw; } ircd::cl::exec::exec(data &dst, const data &src, const opts &opts) try { auto &q { queue[0][0] }; assert(src.handle); assert(dst.handle); const size_t size { opts.size == -1UL? std::min(dst.size(), src.size()): opts.size }; if(!size) return; assert(!this->object); this->object = &dst; const auto deps { make_deps(this, opts) }; assert(!this->handle); call ( clEnqueueCopyBuffer, q, cl_mem(src.handle), cl_mem(dst.handle), opts.offset[1], opts.offset[0], size, deps.size(), deps.size()? deps.data(): nullptr, addressof_handle(this) ); primary_stats.exec_copy_bytes += size; primary_stats.exec_copy_tasks += 1; handle_submitted(this, opts); } catch(const std::exception &e) { log::error { log, "Exec Copy :%s", e.what(), }; throw; } ircd::cl::exec::exec(data &data, const std::memory_order order, const opts &opts) try { if(unlikely(run::level < run::level::RUN)) throw unavailable { "Unable to write to device in runlevel %s", reflect(run::level), }; assert(run::level >= run::level::RUN); const auto max_size { opts.size == -1UL? data.size(): opts.size }; const auto size { size_t(opts.offset[0]) < max_size? max_size - opts.offset[0]: 0UL }; assert(size_t(size) <= data.size()); assert(size_t(opts.offset[0]) <= data.size()); if(!size) return; bool read {false}, write {false}, invalidate {false}; bool blocking {opts.blocking}; switch(order) { case std::memory_order_relaxed: return; case std::memory_order_consume: read = true; break; case std::memory_order_acquire: read = true; write = true; break; case std::memory_order_seq_cst: read = true; write = true; blocking = true; break; case std::memory_order_acq_rel: invalidate = true; break; case std::memory_order_release: break; } const cl_map_flags flags { (boolmask(read) & CL_MAP_READ) | (boolmask(write) & CL_MAP_WRITE) | (boolmask(invalidate) & CL_MAP_WRITE_INVALIDATE_REGION) }; if(!flags && !data.mapped) return; assert(flags || data.mapped); assert(!this->object); this->object = &data; auto &q { queue[0][0] }; const auto deps { make_deps(this, opts) }; int err {CL_SUCCESS}; assert(!this->handle); if(flags) data.mapped = clEnqueueMapBuffer ( q, cl_mem(data.handle), blocking, flags, opts.offset[0], size, deps.size(), deps.size()? deps.data(): nullptr, addressof_handle(this), &err ); else call ( clEnqueueUnmapMemObject, q, cl_mem(data.handle), std::exchange(data.mapped, nullptr), 0, // deps nullptr, // depslist addressof_handle(this) ); throw_on_error(err); primary_stats.exec_read_bytes += read? size: 0; primary_stats.exec_read_tasks += read; primary_stats.exec_write_bytes += write || invalidate? size: 0; primary_stats.exec_write_tasks += write || invalidate; handle_submitted(this, opts); assert(data.mapped || !flags); assert(this->handle); } catch(const std::exception &e) { log::error { log, "Exec Map order:%d :%s", int(order), e.what(), }; throw; } void ircd::cl::handle_submitted(cl::exec *const &exec, const exec::opts &opts) { assert(run::level == run::level::RUN || run::level == run::level::QUIT); primary_stats.exec_tasks += 1; if(opts.flush) cl::flush(); if(opts.sync) cl::sync(); if(likely(!opts.blocking)) check_submit_blocking(exec, opts); } /// Checks if the OpenCL runtime blocked this thread to sound the alarms. void ircd::cl::check_submit_blocking(cl::exec *const &exec, const exec::opts &opts) { const uint64_t &threshold { watchdog_tsc }; if(!threshold) return; const auto submit_cycles { prof::cycles() - exec->ts }; if(likely(submit_cycles < threshold)) return; char nbuf[64]; const auto name { exec->name(nbuf) }; char pbuf[32]; log::dwarning { log, "clEnqueue() kernel '%s' blocking the host for %s cycles on submit.", name?: ""_sv, pretty(pbuf, si(submit_cycles), 1), }; } ircd::vector_view ircd::cl::make_deps(cl::work *const &work, const exec::opts &opts) { if(empty(opts.deps) && !opts.indep) return make_deps_default(work, opts); if(empty(opts.deps)) return {}; size_t ret(0); vector_view out(_deps_list); for(auto &exec : opts.deps) out.at(ret++) = cl_event(exec.handle); return vector_view { out, ret }; } ircd::vector_view ircd::cl::make_deps_default(cl::work *const &work, const exec::opts &opts) { size_t ret(0); vector_view out(_deps_list); for(auto it(rbegin(cl::work::list)); it != rend(cl::work::list); ++it) { cl::work *const &other{*it}; if(other == work) continue; if(!other->handle) continue; if(other->context != ctx::current) continue; out.at(ret++) = cl_event(other->handle); break; } return vector_view { out, ret }; } cl_event * ircd::cl::addressof_handle(cl::work *const &work) { const auto handle { std::addressof(work->handle) }; cl_event *const ret ( reinterpret_cast(handle) ); return ret; } // // kern // ircd::cl::kern::kern(code &code, const string_view &name) try { int err {CL_SUCCESS}; handle = clCreateKernel(cl_program(code.handle), name.c_str(), &err); throw_on_error(err); const std::array cgs { #ifdef RB_DEBUG compile_group_size(device[0][0]) //TODO: XXX #else 0, 0, 0 #endif }; char buf[1][16]; char pbuf[2][48]; if constexpr((false)) log::debug { log, "kernel stack %s local %s group:%zu pref:%zu comp:%zu:%zu:%zu :%s", pretty(pbuf[0], iec(stack_mem_size())), pretty(pbuf[1], iec(local_mem_size())), group_size(), preferred_group_size_multiple(), cgs[0], cgs[1], cgs[2], name, }; } catch(const std::exception &e) { log::error { log, "Kernel Create '%s' :%s", name, e.what(), }; throw; } ircd::cl::kern::~kern() noexcept try { if(likely(handle)) call(clReleaseKernel, cl_kernel(handle)); } catch(const std::exception &e) { log::critical { log, "Kernel Release :%s", e.what(), }; return; } void ircd::cl::kern::arg(const int i, data &data) { const auto &data_handle { cl_mem(data.handle) }; call(clSetKernelArg, cl_kernel(handle), i, sizeof(cl_mem), &data_handle); } void ircd::cl::kern::arg(const int i, const const_buffer &buf) { call(clSetKernelArg, cl_kernel(handle), i, ircd::size(buf), ircd::data(buf)); } std::array ircd::cl::kern::compile_group_size(void *const dev) const { char buf[24]; const auto handle(cl_kernel(this->handle)); constexpr auto flag(CL_KERNEL_COMPILE_WORK_GROUP_SIZE); return info>(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf); } size_t ircd::cl::kern::preferred_group_size_multiple(void *const dev) const { char buf[16]; const auto handle(cl_kernel(this->handle)); constexpr auto flag(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE); return info(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf); } size_t ircd::cl::kern::group_size(void *const dev) const { char buf[16]; const auto handle(cl_kernel(this->handle)); constexpr auto flag(CL_KERNEL_WORK_GROUP_SIZE); return info(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf); } size_t ircd::cl::kern::local_mem_size(void *const dev) const { char buf[16]; const auto handle(cl_kernel(this->handle)); constexpr auto flag(CL_KERNEL_LOCAL_MEM_SIZE); return info(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf); } size_t ircd::cl::kern::stack_mem_size(void *const dev) const { char buf[16]; const auto handle(cl_kernel(this->handle)); constexpr auto flag(CL_KERNEL_PRIVATE_MEM_SIZE); return info(clGetKernelWorkGroupInfo, handle, cl_device_id(dev), flag, buf); } uint ircd::cl::kern::argc() const { const auto handle { cl_kernel(this->handle) }; char buf[sizeof(uint)]; return info(clGetKernelInfo, handle, CL_KERNEL_NUM_ARGS, buf); } ircd::string_view ircd::cl::kern::name(const mutable_buffer &buf) const { const auto handle { cl_kernel(this->handle) }; return handle? info(clGetKernelInfo, handle, CL_KERNEL_FUNCTION_NAME, buf): string_view{}; } // // code // namespace ircd::cl { static void build_handle_error_log_line(const string_view &line); static void build_handle_error(code &, const opencl_error &); static void build_handle(cl_program program, void *priv); } decltype(ircd::cl::code::iov_max) ircd::cl::code::iov_max { 64 }; // // code::code // ircd::cl::code::code(path_t, const string_view &paths) :code { path, vector_view(&paths, 1), } { } ircd::cl::code::code(path_t, const vector_view &paths_) { static const auto is_cl { [](const auto &path) { return fs::is_reg(path) && fs::is_extension(path, ".cl"); } }; std::vector buf; for(const auto &path : paths_) if(fs::is_dir(path)) { for(const auto &file : fs::ls(path)) if(is_cl(file)) buf.emplace_back(fs::read(fs::fd(file))); } else if(is_cl(path)) buf.emplace_back(fs::read(fs::fd(path))); const auto count(buf.size()); if(unlikely(count > iov_max)) throw error { "Maximum number of sources exceeded: lim:%zu got:%zu", iov_max, count, }; string_view src[count]; for(uint i(0); i < count; ++i) src[i] = buf[i]; create(vector_view(src, count)); } ircd::cl::code::code(const string_view &src) :code { vector_view(&src, 1), } { } ircd::cl::code::code(const vector_view &srcs) { create(srcs); } ircd::cl::code::code(const vector_view &bins) { create(bins); } ircd::cl::code::code(const const_buffer &bc) { char pbuf[1][48]; log::logf { log, log::level::DEBUG, "code(%p) loading %s bitcode:%p", this, pretty(pbuf[0], si(ircd::size(bc))), ircd::data(bc), }; assert(!handle); int err {CL_SUCCESS}; handle = clCreateProgramWithIL ( primary, ircd::data(bc), ircd::size(bc), &err ); throw_on_error(err); } ircd::cl::code::~code() noexcept try { if(likely(handle)) call(clReleaseProgram, cl_program(handle)); } catch(const std::exception &e) { log::critical { log, "Program Release :%s", e.what(), }; return; } void ircd::cl::code::create(const vector_view &srcs) { const auto count(srcs.size()); if(unlikely(count > iov_max)) throw error { "Maximum number of sources exceeded: lim:%zu got:%zu", iov_max, srcs.size(), }; size_t len[count]; const char *src[count]; for(size_t i(0); i < count; ++i) src[i] = ircd::data(srcs[i]), len[i] = ircd::size(srcs[i]); char pbuf[1][48]; log::logf { log, log::level::DEBUG, "code(%p) creating %s srcs:%zu", this, pretty(pbuf[0], si(std::accumulate(len, len + count, 0))), count, }; assert(!handle); int err {CL_SUCCESS}; handle = clCreateProgramWithSource(primary, count, src, len, &err); throw_on_error(err); } void ircd::cl::code::create(const vector_view &bins) { const auto count(bins.size()); if(unlikely(count > iov_max)) throw error { "Maximum number of binaries exceeded: lim:%zu got:%zu", iov_max, count, }; size_t len[iov_max + 1] {0}; const uint8_t *bin[iov_max + 1] {nullptr}; for(size_t i(0); i < count; ++i) bin[i] = reinterpret_cast(ircd::data(bins[i])), len[i] = ircd::size(bins[i]); size_t devs {0}; cl_device_id dev[DEVICE_MAX] {0}; for(size_t i(0); i < platforms; ++i) for(size_t j(0); j < devices[i]; ++j) dev[devs++] = device[i][j]; char pbuf[1][48]; log::logf { log, log::level::DEBUG, "code(%p) loading %s bins:%zu devs:%zu", this, pretty(pbuf[0], si(std::accumulate(len, len + count, 0))), count, devs, }; assert(len); assert(devs); assert(!handle); int err {CL_SUCCESS}; int binerr[iov_max + 1] {CL_SUCCESS}; handle = clCreateProgramWithBinary(primary, devs, dev, len, bin, binerr, &err); throw_on_error(err); for(size_t i(0); i < count; ++i) throw_on_error(binerr[i]); } void ircd::cl::code::build(const string_view &opts) try { const uint num_devices {1}; const cl_device_id *device_list { device[0] //TODO: XXX }; log::logf { log, log::level::DEBUG, "code(%p) building devs:%zu %c%s", this, num_devices, opts? ':': ' ', opts, }; call ( clBuildProgram, cl_program(handle), num_devices, device_list, opts.c_str(), cl::build_handle, this ); } catch(const opencl_error &e) { build_handle_error(*this, e); throw; } catch(const std::exception &e) { log::error { log, "code(%p) :Failed to build :%s", this, e.what(), }; throw; } void ircd::cl::code::link(const string_view &opts) try { const uint num_devices {1}; const cl_device_id *device_list { device[0] //TODO: XXX }; assert(this->handle); const uint num_progs {1}; const cl_program progs[] { cl_program(this->handle) }; log::logf { log, log::level::DEBUG, "code(%p) linking devs:%zu progs:%zu opts:%zu$B %c%s", this, num_devices, num_progs, ircd::size(opts), opts? ':': ' ', opts, }; int err { CL_COMPILER_NOT_AVAILABLE }; #ifdef CL_VERSION_1_2 void *handle { clLinkProgram ( primary, num_devices, device_list, opts.c_str(), num_progs, progs, cl::build_handle, this, &err ) }; #endif throw_on_error(err); std::swap(handle, this->handle); call(clReleaseProgram, cl_program(handle)); } catch(const opencl_error &e) { build_handle_error(*this, e); throw; } catch(const std::exception &e) { log::error { log, "code(%p) :Failed to link :%s", this, e.what(), }; throw; } void ircd::cl::code::compile(const string_view &opts) try { const uint num_devices {1}; const cl_device_id *device_list { device[0] //TODO: XXX }; const uint num_headers {0}; const cl_program *header_progs { nullptr }; const char *header_names[] { nullptr }; log::logf { log, log::level::DEBUG, "code(%p) compiling devs:%zu headers:%zu %c%s", this, num_devices, num_headers, opts? ':': ' ', opts, }; #ifdef CL_VERSION_1_2 call ( clCompileProgram, cl_program(handle), num_devices, device_list, opts.c_str(), num_headers, header_progs, nullptr, //header_names, // clover api bug? cl::build_handle, this ); #else throw_on_error(CL_COMPILER_NOT_AVAILABLE); #endif } catch(const opencl_error &e) { build_handle_error(*this, e); throw; } catch(const std::exception &e) { log::error { log, "code(%p) :Failed to compile :%s", this, e.what(), }; throw; } ircd::string_view ircd::cl::code::src(const mutable_buffer &buf) const { const auto &handle { cl_program(this->handle) }; return info(clGetProgramInfo, handle, CL_PROGRAM_SOURCE, buf); } ircd::vector_view ircd::cl::code::bin(vector_view buf) const { const auto &handle { cl_program(this->handle) }; const auto devs { this->devs() }; assert(devs <= ircd::size(buf)); const auto count { std::min(devs, ircd::size(buf)) }; size_t bin_sz[count]; const auto bins { this->bins({bin_sz, count}) }; assert(bins <= count); const auto num { std::min(bins, count) }; for(size_t i(0); i < num; ++i) buf[i] = mutable_buffer { buf[i], bin_sz[i] }; uintptr_t ptr[num]; for(size_t i(0); i < num; ++i) ptr[i] = uintptr_t(ircd::data(buf[i])); info(clGetProgramInfo, handle, CL_PROGRAM_BINARIES, mutable_buffer { reinterpret_cast(ptr), sizeof(uintptr_t) * num }); return buf; } size_t ircd::cl::code::bins_size() const { const auto devs { this->devs() }; size_t bin_sz[devs]; const auto bins { this->bins({bin_sz, devs}) }; assert(bins <= devs); const auto ret { std::accumulate(bin_sz, bin_sz + bins, 0UL) }; return ret; } size_t ircd::cl::code::bins(const vector_view &buf) const { const auto &handle { cl_program(this->handle) }; const auto count { devs() }; assert(count <= size(buf)); info(clGetProgramInfo, handle, CL_PROGRAM_BINARY_SIZES, mutable_buffer { reinterpret_cast(ircd::data(buf)), ircd::size(buf) * sizeof(size_t) }); return count; } size_t ircd::cl::code::devs() const { char buf[sizeof(uint)]; const auto &handle { cl_program(this->handle) }; return info(clGetProgramInfo, handle, CL_PROGRAM_NUM_DEVICES, buf); } long ircd::cl::code::status() const { const auto &handle { cl_program(this->handle) }; cl_build_status buf; const mutable_buffer mb { reinterpret_cast(&buf), sizeof(buf) }; const auto &dev { device[0][0], //TODO: XXX }; const auto ret { info(clGetProgramBuildInfo, handle, dev, CL_PROGRAM_BUILD_STATUS, mb) }; return ret; } void ircd::cl::build_handle(cl_program program, void *const priv) { cl::code *const &code { reinterpret_cast(priv) }; assert(code); char pbuf[1][48]; log::logf { log, log::level::DEBUG, "program(%p) devs:%zu binsz:%s :Build complete.", (const void *)program, code->devs(), pretty(pbuf[0], si(code->bins_size())), }; } void ircd::cl::build_handle_error(code &code, const opencl_error &e) { const auto string_closure { [&code](const mutable_buffer &buf) { size_t len {0}; call ( clGetProgramBuildInfo, cl_program(code.handle), device[0][0], //TODO: XXX CL_PROGRAM_BUILD_LOG, ircd::size(buf), ircd::data(buf), &len ); return len; } }; const auto error_message { ircd::string(8_KiB | SHRINK_TO_FIT, string_closure) }; const auto lines { ircd::tokens(error_message, '\n', build_handle_error_log_line) }; } void ircd::cl::build_handle_error_log_line(const string_view &line) { // note last line is just a CR if(line.size() <= 1) return; const auto &[loc, line_] { split(line, ' ') }; const auto &[fac, msg] { split(line_, ' ') }; const auto &[fname, pos] { split(loc, ':') }; const auto &[row, col] { split(pos, ':') }; const auto level { startswith(fac, "warning")? log::level::WARNING: startswith(fac, "error")? log::level::ERROR: log::level::ERROR }; log::logf { log, level, "%s", line }; } // // data // decltype(ircd::cl::data::gart_page_size) ircd::cl::data::gart_page_size { { "name", "ircd.cl.data.gart.page_size" }, { "default", 4096 }, { "help", "Override (un)detected gart page size." }, }; ircd::cl::data::data(const size_t size, const bool host_read, const bool host_write) { if(!size) return; cl_mem_flags flags {0}; flags |= CL_MEM_READ_WRITE; flags |= host_read && !host_write? CL_MEM_HOST_READ_ONLY: 0; flags |= !host_read && host_write? CL_MEM_HOST_WRITE_ONLY: 0; flags |= !host_read && !host_write? CL_MEM_HOST_NO_ACCESS: 0; char pbuf[48]; log::debug { log, "data(%p) device %s %lu@ host[read:%b write:%b] flags:%08x", this, pretty(pbuf, iec(size)), alignment(size), host_read, host_write, flags, }; int err {CL_SUCCESS}; handle = clCreateBuffer ( primary, flags, size, nullptr, &err ); throw_on_error(err); primary_stats.alloc_count += 1; primary_stats.alloc_bytes += size; } ircd::cl::data::data(const mutable_buffer &buf, const bool wonly) { const auto &ptr { ircd::data(buf) }; const auto &size { ircd::size(buf) }; if(!size) return; cl_mem_flags flags {0}; flags |= CL_MEM_USE_HOST_PTR; flags |= wonly? CL_MEM_WRITE_ONLY: CL_MEM_READ_WRITE; char pbuf[48]; log::debug { log, "data(%p) mutable %p %lu@ %s %lu@ wonly:%b flags:%08x", this, ptr, alignment(ptr), pretty(pbuf, iec(size)), alignment(size), wonly, flags, }; assert(!ptr || aligned(buf, size_t(gart_page_size))); assert(padded(size, size_t(gart_page_size))); int err {CL_SUCCESS}; handle = clCreateBuffer ( primary, flags, size, ptr, &err ); throw_on_error(err); primary_stats.alloc_count += 1; primary_stats.alloc_bytes += size; } ircd::cl::data::data(const const_buffer &buf) { const auto &ptr { ircd::data(buf) }; const auto &size { ircd::size(buf) }; if(!size) return; cl_mem_flags flags {0}; flags |= CL_MEM_USE_HOST_PTR; flags |= CL_MEM_READ_ONLY; char pbuf[48]; log::debug { log, "data(%p) immutable %p %lu@ %s %lu@ flags:%08x", this, ptr, alignment(ptr), pretty(pbuf, iec(size)), alignment(size), flags, }; assert(!ptr || aligned(buf, size_t(gart_page_size))); assert(padded(size, size_t(gart_page_size))); int err {CL_SUCCESS}; handle = clCreateBuffer ( primary, flags, size, mutable_cast(ptr), &err ); throw_on_error(err); primary_stats.alloc_count += 1; primary_stats.alloc_bytes += size; } ircd::cl::data::data(data &master, const pair &slice) { constexpr auto type { CL_BUFFER_CREATE_TYPE_REGION }; if(!master.handle) return; cl_buffer_region region {0}; region.size = slice.first; region.origin = master.offset() + slice.second; if(!region.size) return; const auto root { master.master()?: master.handle }; char pbuf[48]; if constexpr((false)) log::debug { log, "data(%p) master(%p) region offset:%zu %lu@ %s %lu@", this, root, region.origin, alignment(region.origin), pretty(pbuf, iec(region.size)), alignment(region.size), }; assert(aligned(region.origin, size_t(gart_page_size))); assert(padded(region.size, size_t(gart_page_size))); assert(root); int err {CL_SUCCESS}; handle = clCreateSubBuffer ( cl_mem(root), cl_mem_flags{0}, type, ®ion, &err ); throw_on_error(err); } ircd::cl::data::~data() noexcept try { assert(handle || !mapped); if(likely(handle)) { const auto size { this->size() }; call(clReleaseMemObject, cl_mem(handle)); primary_stats.dealloc_count += 1; primary_stats.dealloc_bytes += size; } } catch(const std::exception &e) { log::critical { log, "Memory Release :%s", e.what(), }; return; } char * ircd::cl::data::ptr() const { assert(handle); char buf[sizeof(void *)] {0}; const auto ret { this->mapped? static_cast(this->mapped): info(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_HOST_PTR, buf) }; return ret; } size_t ircd::cl::data::refs() const { assert(handle); char buf[sizeof(size_t)] {0}; return info(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_REFERENCE_COUNT, buf); } off_t ircd::cl::data::offset() const { assert(handle); char buf[sizeof(off_t)] {0}; return info(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_OFFSET, buf); } size_t ircd::cl::data::size() const { assert(handle); char buf[sizeof(size_t)] {0}; return info(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_SIZE, buf); } uint ircd::cl::data::flags() const { assert(handle); char buf[sizeof(uint)] {0}; return info(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), CL_MEM_FLAGS, buf); } void * ircd::cl::data::master() const { assert(handle); char buf[sizeof(void *)] {0}; constexpr auto qtype(CL_MEM_ASSOCIATED_MEMOBJECT); return info(clGetMemObjectInfo, cl_mem(mutable_cast(handle)), qtype, buf); } // // cl::work (event) // namespace ircd::cl { struct completion; extern conf::item offload_enable; extern const ctx::ole::opts offload_opts; static void handle_event_callback(cl_event, cl_int, void *) noexcept; static int wait_event_callback(work &, const int, const int); static int wait_event_offload(work &, const int, const int); static int wait_event(work &, const int, const int); } struct alignas(64) ircd::cl::completion { cl_event event {nullptr}; cl_int status {CL_COMPLETE}; ctx::dock dock; }; decltype(ircd::cl::offload_enable) ircd::cl::offload_enable { { "name", "ircd.cl.offload.enable" }, { "default", true }, }; decltype(ircd::cl::offload_opts) ircd::cl::offload_opts { "cl" }; [[gnu::visibility("hidden")]] void ircd::cl::work::init() { } [[using gnu: cold, visibility("hidden")]] void ircd::cl::work::fini() noexcept { cl::sync(); } // // work::work // ircd::cl::work::work() { if(unlikely(!cl::linkage)) throw unavailable { "OpenCL runtime is not available." }; assert(cl::linkage); } ircd::cl::work::work(void *const &handle) { call(clRetainEvent, cl_event(handle)); this->handle = handle; } ircd::cl::work::~work() noexcept try { if(!handle) return; const unwind free{[this] { assert(handle); call(clReleaseEvent, cl_event(handle)); }}; wait(); } catch(const std::exception &e) { log::critical { log, "Work Release :%s", e.what(), }; return; } void ircd::cl::work::wait(const uint desired) try { static_assert(CL_COMPLETE == 0); constexpr auto execution_status { CL_EVENT_COMMAND_EXECUTION_STATUS }; char buf[4]; int status {0}; if(likely(handle)) status = info(clGetEventInfo, cl_event(handle), execution_status, buf); if(status > int(desired)) status = wait_event(*this, status, desired); if(unlikely(status < 0)) throw_on_error(status); assert(int(status) == int(desired)); } catch(const std::exception &e) { log::error { log, "work(%p)::wait(%u) :%s", this, desired, e.what(), }; throw; } ircd::string_view ircd::cl::work::name(const mutable_buffer &buf) const { switch(const auto type(this->type()); type) { default: return nullptr; case CL_COMMAND_READ_BUFFER: return "READ_BUFFER"; case CL_COMMAND_WRITE_BUFFER: return "WRITE_BUFFER"; case CL_COMMAND_COPY_BUFFER: return "COPY_BUFFER"; case CL_COMMAND_MAP_BUFFER: return "MAP_BUFFER"; case CL_COMMAND_UNMAP_MEM_OBJECT: return "UNMAP_MEM_OBJECT"; case CL_COMMAND_NDRANGE_KERNEL: { const auto kern { reinterpret_cast(object) }; return kern? kern->name(buf): "NDRANGE_KERNEL"; } }; } int ircd::cl::work::type() const { const auto handle { cl_event(this->handle) }; if(!handle) return 0; char buf[4]; return info(clGetEventInfo, handle, CL_EVENT_COMMAND_TYPE, buf); } // // cl::work::prof // ircd::cl::work::prof::prof(const cl::work &w) { const auto h { cl_event(w.handle) }; if(!profile_queue || !h) { for(uint i(0); i < this->size(); ++i) (*this)[i] = 0ns; return; } char b[5][8]; (*this)[0] = info(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_QUEUED, b[0]); (*this)[1] = info(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_SUBMIT, b[1]); (*this)[2] = info(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_START, b[2]); (*this)[3] = info(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_END, b[3]); (*this)[4] = info(clGetEventProfilingInfo, h, CL_PROFILING_COMMAND_COMPLETE, b[4]); } // // cl::work (internal) // int ircd::cl::wait_event(work &work, const int status, const int desired) { assert(work.handle); assert(status > desired); const ctx::uninterruptible::nothrow ui; const bool use_offload { // conf item bool(offload_enable) }; const auto ret { use_offload? wait_event_offload(work, status, desired): wait_event_callback(work, status, desired) }; const bool is_err { ret < 0 }; const bool is_complete { ret == CL_COMPLETE }; primary_stats.work_completes += is_complete; primary_stats.work_errors += is_err; primary_stats.work_waits += 1; return ret; } int ircd::cl::wait_event_offload(work &work, const int status, const int desired) { completion c { cl_event(work.handle), status, }; ctx::ole::offload { offload_opts, [&c] { assert(c.status != CL_COMPLETE); call(clWaitForEvents, 1UL, &c.event); c.status = CL_COMPLETE; } }; assert(c.status == CL_COMPLETE); return c.status; } int ircd::cl::wait_event_callback(work &work, const int status, const int desired) { // Completion state structure on this ircd::ctx's stack. completion c { cl_event(work.handle), status, }; // Completion condition closure to be satisfied. const auto condition{[&c, &desired] () noexcept -> bool { return !c.event || c.status <= desired; }}; // Register callback with OpenCL; note that the callback might be // dispatched immediately from this call itself (see below). call ( clSetEventCallback, c.event, desired, &handle_event_callback, &c ); // This stats item counts clSetEventCallback()'s which return before OpenCL // calls the callback, verifying asynchronicity. If this stats item remains // zero, the OpenCL runtime has hijacked our thread for a blocking wait. primary_stats.work_waits_async += !condition(); // Yield ircd::ctx while condition unsatisfied c.dock.wait(condition); return c.status; } void ircd::cl::handle_event_callback(cl_event event, cl_int status, void *const priv) noexcept { auto *const c { reinterpret_cast(priv) }; assert(priv != nullptr); assert(event != nullptr); assert(c->event == event); c->status = status; c->dock.notify_one(); } // // callback surface // void ircd::cl::handle_notify(const char *errstr, const void *token, size_t cb, void *priv) noexcept { if(errstr) log::error { log, "OpenCL t:%p cb:%zu :%s", token, cb, errstr, }; } // // util // template T ircd::cl::info(F&& func, const id0 &i0, const param &p, const mutable_buffer &out, T default_) { using ircd::data; using ircd::size; size_t len {0}; const auto code { call(std::forward(func), i0, p, size(out), data(out), &len) }; assert(code == CL_SUCCESS || code == maybe); if constexpr(maybe != CL_SUCCESS) if(code == maybe) return default_; return byte_view { string_view { data(out), len } }; } template T ircd::cl::info(F&& func, const id0 &i0, const id1 &i1, const param &p, const mutable_buffer &out, T default_) { using ircd::data; using ircd::size; size_t len {0}; const auto code { call(std::forward(func), i0, i1, p, size(out), data(out), &len) }; assert(code == CL_SUCCESS || code == maybe); if constexpr(maybe != CL_SUCCESS) if(code == maybe) return default_; return byte_view { string_view { data(out), len } }; } template int ircd::cl::call(func&& f, args&&... a) { const int ret { f(std::forward(a)...) }; if constexpr(maybe != CL_SUCCESS) if(ret == maybe) return ret; return throw_on_error(ret); } int ircd::cl::throw_on_error(const int &code) { if(unlikely(is_error(code))) throw opencl_error { "(%d) :%s", code, reflect_error(code), }; return code; } bool ircd::cl::is_error(const int &code) noexcept { return code < 0; } ircd::string_view ircd::cl::reflect_error(const int code) noexcept { switch(code) { case CL_SUCCESS: return "SUCCESS"; case CL_DEVICE_NOT_FOUND: return "DEVICE_NOT_FOUND"; case CL_DEVICE_NOT_AVAILABLE: return "DEVICE_NOT_AVAILABLE"; case CL_COMPILER_NOT_AVAILABLE: return "COMPILER_NOT_AVAILABLE"; case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "MEM_OBJECT_ALLOCATION_FAILURE"; case CL_OUT_OF_RESOURCES: return "OUT_OF_RESOURCES"; case CL_OUT_OF_HOST_MEMORY: return "OUT_OF_HOST_MEMORY"; case CL_PROFILING_INFO_NOT_AVAILABLE: return "PROFILING_INFO_NOT_AVAILABLE"; case CL_MEM_COPY_OVERLAP: return "MEM_COPY_OVERLAP"; case CL_IMAGE_FORMAT_MISMATCH: return "IMAGE_FORMAT_MISMATCH"; case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "IMAGE_FORMAT_NOT_SUPPORTED"; case CL_BUILD_PROGRAM_FAILURE: return "BUILD_PROGRAM_FAILURE"; case CL_MAP_FAILURE: return "MAP_FAILURE"; case CL_INVALID_VALUE: return "INVALID_VALUE"; case CL_INVALID_DEVICE_TYPE: return "INVALID_DEVICE_TYPE"; case CL_INVALID_PLATFORM: return "INVALID_PLATFORM"; case CL_INVALID_DEVICE: return "INVALID_DEVICE"; case CL_INVALID_CONTEXT: return "INVALID_CONTEXT"; case CL_INVALID_QUEUE_PROPERTIES: return "INVALID_QUEUE_PROPERTIES"; case CL_INVALID_COMMAND_QUEUE: return "INVALID_COMMAND_QUEUE"; case CL_INVALID_HOST_PTR: return "INVALID_HOST_PTR"; case CL_INVALID_MEM_OBJECT: return "INVALID_MEM_OBJECT"; case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "INVALID_IMAGE_FORMAT_DESCRIPTOR"; case CL_INVALID_IMAGE_SIZE: return "INVALID_IMAGE_SIZE"; case CL_INVALID_SAMPLER: return "INVALID_SAMPLER"; case CL_INVALID_BINARY: return "INVALID_BINARY"; case CL_INVALID_BUILD_OPTIONS: return "INVALID_BUILD_OPTIONS"; case CL_INVALID_PROGRAM: return "INVALID_PROGRAM"; case CL_INVALID_PROGRAM_EXECUTABLE: return "INVALID_PROGRAM_EXECUTABLE"; case CL_INVALID_KERNEL_NAME: return "INVALID_KERNEL_NAME"; case CL_INVALID_KERNEL_DEFINITION: return "INVALID_KERNEL_DEFINITION"; case CL_INVALID_KERNEL: return "INVALID_KERNEL"; case CL_INVALID_ARG_INDEX: return "INVALID_ARG_INDEX"; case CL_INVALID_ARG_VALUE: return "INVALID_ARG_VALUE"; case CL_INVALID_ARG_SIZE: return "INVALID_ARG_SIZE"; case CL_INVALID_KERNEL_ARGS: return "INVALID_KERNEL_ARGS"; case CL_INVALID_WORK_DIMENSION: return "INVALID_WORK_DIMENSION"; case CL_INVALID_WORK_GROUP_SIZE: return "INVALID_WORK_GROUP_SIZE"; case CL_INVALID_WORK_ITEM_SIZE: return "INVALID_WORK_ITEM_SIZE"; case CL_INVALID_GLOBAL_OFFSET: return "INVALID_GLOBAL_OFFSET"; case CL_INVALID_EVENT_WAIT_LIST: return "INVALID_EVENT_WAIT_LIST"; case CL_INVALID_EVENT: return "INVALID_EVENT"; case CL_INVALID_OPERATION: return "INVALID_OPERATION"; case CL_INVALID_GL_OBJECT: return "INVALID_GL_OBJECT"; case CL_INVALID_BUFFER_SIZE: return "INVALID_BUFFER_SIZE"; case CL_INVALID_MIP_LEVEL: return "INVALID_MIP_LEVEL"; case CL_INVALID_GLOBAL_WORK_SIZE: return "INVALID_GLOBAL_WORK_SIZE"; #ifdef CL_VERSION_1_1 case CL_INVALID_PROPERTY: return "INVALID_PROPERTY"; case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "MISALIGNED_SUB_BUFFER_OFFSET"; case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; #endif #ifdef CL_VERSION_1_2 case CL_COMPILE_PROGRAM_FAILURE: return "COMPILE_PROGRAM_FAILURE"; case CL_LINKER_NOT_AVAILABLE: return "LINKER_NOT_AVAILABLE"; case CL_LINK_PROGRAM_FAILURE: return "LINK_PROGRAM_FAILURE"; case CL_DEVICE_PARTITION_FAILED: return "DEVICE_PARTITION_FAILED"; case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "KERNEL_ARG_INFO_NOT_AVAILABLE"; case CL_INVALID_IMAGE_DESCRIPTOR: return "INVALID_IMAGE_DESCRIPTOR"; case CL_INVALID_COMPILER_OPTIONS: return "INVALID_COMPILER_OPTIONS"; case CL_INVALID_LINKER_OPTIONS: return "INVALID_LINKER_OPTIONS"; case CL_INVALID_DEVICE_PARTITION_COUNT: return "INVALID_DEVICE_PARTITION_COUNT"; #endif #ifdef CL_VERSION_2_0 case CL_INVALID_PIPE_SIZE: return "INVALID_PIPE_SIZE"; case CL_INVALID_DEVICE_QUEUE: return "INVALID_DEVICE_QUEUE"; #endif #ifdef CL_VERSION_2_2 case CL_INVALID_SPEC_ID: return "INVALID_SPEC_ID"; case CL_MAX_SIZE_RESTRICTION_EXCEEDED: return "MAX_SIZE_RESTRICTION_EXCEEDED"; #endif } return "???????"; }