0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-12-27 16:04:15 +01:00

ircd:🆑 Replace niceness value with intensity by range partition.

This commit is contained in:
Jason Volk 2022-09-27 12:26:35 -07:00
parent 508d27eb40
commit da614e33a5
3 changed files with 73 additions and 41 deletions

View file

@ -40,7 +40,7 @@ namespace ircd::cl
extern conf::item<bool> enable; extern conf::item<bool> enable;
extern conf::item<bool> profile_queue; extern conf::item<bool> profile_queue;
extern conf::item<uint64_t> watchdog_tsc; extern conf::item<uint64_t> watchdog_tsc;
extern conf::item<milliseconds> nice_rate; extern conf::item<uint64_t> intensity;
extern conf::item<std::string> path; extern conf::item<std::string> path;
extern conf::item<std::string> envs[]; extern conf::item<std::string> envs[];
} }

View file

@ -65,11 +65,14 @@ struct ircd::cl::exec::opts
/// For operations which have an offset (or two); otherwise ignored. /// For operations which have an offset (or two); otherwise ignored.
off_t offset[2] {0}; off_t offset[2] {0};
/// Tune the intensity of the execution. For headless deployments the /// Tune the intensity of the execution. The value is intended to correlate
/// maximum intensity is advised. Lesser values are more intense. The /// with parallel resource consumption on the device by shaping the work
/// default of -1 is the maximum. The value of zero yields the ircd::ctx /// groups submitted over the range. The minimum value of 1 will serialize
/// after submission, but does not otherwise decrease the intensity. /// execution. Values greater than the number of CU's will not increase
int nice {-1}; /// concurrency but may still partition a large range with multiple command
/// submissions to increase niceness. The default of zero will maximize
/// intensity and minimize command submissions (to one).
uint intensity {0};
/// Starts a new dependency chain; allowing empty deps without implicit /// Starts a new dependency chain; allowing empty deps without implicit
/// dependency on the last work item constructed on the ircd::ctx. /// dependency on the last work item constructed on the ircd::ctx.

View file

@ -141,11 +141,11 @@ ircd::cl::profile_queue
{ "persist", false }, { "persist", false },
}; };
decltype(ircd::cl::nice_rate) decltype(ircd::cl::intensity)
ircd::cl::nice_rate ircd::cl::intensity
{ {
{ "name", "ircd.cl.nice.rate" }, { "name", "ircd.cl.intensity" },
{ "default", 1L }, { "default", 0L },
}; };
decltype(ircd::cl::watchdog_tsc) decltype(ircd::cl::watchdog_tsc)
@ -854,13 +854,13 @@ catch(const std::exception &e)
} }
ircd::cl::exec::exec(kern &kern, ircd::cl::exec::exec(kern &kern,
const kern::range &work, const kern::range &range,
const opts &opts) const opts &opts)
try try
{ {
size_t dim(0); size_t dim(0);
for(size_t i(0); i < work.global.size(); ++i) for(size_t i(0); i < range.global.size(); ++i)
dim += work.global[i] > 0 && dim == i; dim += range.global[i] > 0 && dim == i;
if(!dim) if(!dim)
return; return;
@ -886,32 +886,67 @@ try
assert(!this->object); assert(!this->object);
this->object = &kern; this->object = &kern;
size_t global_size(range.global[0]);
size_t local_size(range.local[0]);
for(size_t d(1); d < dim; ++d)
{
global_size *= range.global[d];
local_size *= range.local[d];
}
assert(global_size % local_size == 0);
const size_t groups
{
global_size / local_size
};
assert(groups > 0);
size_t intensity
{
cl::intensity?
std::max(opts.intensity, uint(cl::intensity)):
opts.intensity
};
if(intensity < groups)
while(intensity > 1 && groups % intensity != 0)
intensity--;
const size_t tasks
{
intensity && intensity < groups?
groups / intensity:
1
};
assert(!this->handle); assert(!this->handle);
call for(size_t i(0); i < tasks; ++i)
( {
clEnqueueNDRangeKernel, kern::range sub_range(range);
q, for(size_t d(0); d < dim; ++d)
cl_kernel(kern.handle), {
dim, sub_range.global[d] /= tasks;
work.offset.data(), sub_range.offset[d] += sub_range.global[d] * i;
work.global.data(), }
work.local.data(),
deps.size(),
deps.size()? deps.data(): nullptr,
addressof_handle(this)
);
size_t global_size(work.global[0]); call
for(size_t i(1); i < dim; ++i) (
global_size *= work.global[i]; clEnqueueNDRangeKernel,
q,
cl_kernel(kern.handle),
dim,
sub_range.offset.data(),
sub_range.global.data(),
sub_range.local.data(),
deps.size(),
deps.size()? deps.data(): nullptr,
i == tasks - 1? addressof_handle(this): nullptr
);
}
size_t local_size(work.local[0]); primary_stats.exec_kern_tasks += tasks;
for(size_t i(1); i < dim; ++i)
local_size *= work.local[i];
primary_stats.exec_kern_tasks += 1;
primary_stats.exec_kern_threads += global_size; primary_stats.exec_kern_threads += global_size;
primary_stats.exec_kern_groups += global_size / local_size; primary_stats.exec_kern_groups += groups;
handle_submitted(this, opts); handle_submitted(this, opts);
} }
catch(const std::exception &e) catch(const std::exception &e)
@ -1136,12 +1171,6 @@ ircd::cl::handle_submitted(cl::exec *const &exec,
if(likely(!opts.blocking)) if(likely(!opts.blocking))
check_submit_blocking(exec, opts); check_submit_blocking(exec, opts);
if(opts.nice == 0)
ctx::yield();
if(opts.nice > 0)
ctx::sleep(opts.nice * milliseconds(nice_rate));
} }
/// Checks if the OpenCL runtime blocked this thread to sound the alarms. /// Checks if the OpenCL runtime blocked this thread to sound the alarms.