0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-12-26 07:23:53 +01:00

ircd::cl: Replace niceness value with intensity by range partition.

This commit is contained in:
Jason Volk 2022-09-27 12:26:35 -07:00
parent 508d27eb40
commit da614e33a5
3 changed files with 73 additions and 41 deletions

View file

@ -40,7 +40,7 @@ namespace ircd::cl
extern conf::item<bool> enable;
extern conf::item<bool> profile_queue;
extern conf::item<uint64_t> watchdog_tsc;
extern conf::item<milliseconds> nice_rate;
extern conf::item<uint64_t> intensity;
extern conf::item<std::string> path;
extern conf::item<std::string> envs[];
}

View file

@ -65,11 +65,14 @@ struct ircd::cl::exec::opts
/// For operations which have an offset (or two); otherwise ignored.
off_t offset[2] {0};
/// Tune the intensity of the execution. For headless deployments the
/// maximum intensity is advised. Lesser values are more intense. The
/// default of -1 is the maximum. The value of zero yields the ircd::ctx
/// after submission, but does not otherwise decrease the intensity.
int nice {-1};
/// Tune the intensity of the execution. The value is intended to correlate
/// with parallel resource consumption on the device by shaping the work
/// groups submitted over the range. The minimum value of 1 will serialize
/// execution. Values greater than the number of CU's will not increase
/// concurrency but may still partition a large range with multiple command
/// submissions to increase niceness. The default of zero will maximize
/// intensity and minimize command submissions (to one).
uint intensity {0};
/// Starts a new dependency chain; allowing empty deps without implicit
/// dependency on the last work item constructed on the ircd::ctx.

View file

@ -141,11 +141,11 @@ ircd::cl::profile_queue
{ "persist", false },
};
decltype(ircd::cl::nice_rate)
ircd::cl::nice_rate
decltype(ircd::cl::intensity)
ircd::cl::intensity
{
{ "name", "ircd.cl.nice.rate" },
{ "default", 1L },
{ "name", "ircd.cl.intensity" },
{ "default", 0L },
};
decltype(ircd::cl::watchdog_tsc)
@ -854,13 +854,13 @@ catch(const std::exception &e)
}
ircd::cl::exec::exec(kern &kern,
const kern::range &work,
const kern::range &range,
const opts &opts)
try
{
size_t dim(0);
for(size_t i(0); i < work.global.size(); ++i)
dim += work.global[i] > 0 && dim == i;
for(size_t i(0); i < range.global.size(); ++i)
dim += range.global[i] > 0 && dim == i;
if(!dim)
return;
@ -886,32 +886,67 @@ try
assert(!this->object);
this->object = &kern;
size_t global_size(range.global[0]);
size_t local_size(range.local[0]);
for(size_t d(1); d < dim; ++d)
{
global_size *= range.global[d];
local_size *= range.local[d];
}
assert(global_size % local_size == 0);
const size_t groups
{
global_size / local_size
};
assert(groups > 0);
size_t intensity
{
cl::intensity?
std::max(opts.intensity, uint(cl::intensity)):
opts.intensity
};
if(intensity < groups)
while(intensity > 1 && groups % intensity != 0)
intensity--;
const size_t tasks
{
intensity && intensity < groups?
groups / intensity:
1
};
assert(!this->handle);
for(size_t i(0); i < tasks; ++i)
{
kern::range sub_range(range);
for(size_t d(0); d < dim; ++d)
{
sub_range.global[d] /= tasks;
sub_range.offset[d] += sub_range.global[d] * i;
}
call
(
clEnqueueNDRangeKernel,
q,
cl_kernel(kern.handle),
dim,
work.offset.data(),
work.global.data(),
work.local.data(),
sub_range.offset.data(),
sub_range.global.data(),
sub_range.local.data(),
deps.size(),
deps.size()? deps.data(): nullptr,
addressof_handle(this)
i == tasks - 1? addressof_handle(this): nullptr
);
}
size_t global_size(work.global[0]);
for(size_t i(1); i < dim; ++i)
global_size *= work.global[i];
size_t local_size(work.local[0]);
for(size_t i(1); i < dim; ++i)
local_size *= work.local[i];
primary_stats.exec_kern_tasks += 1;
primary_stats.exec_kern_tasks += tasks;
primary_stats.exec_kern_threads += global_size;
primary_stats.exec_kern_groups += global_size / local_size;
primary_stats.exec_kern_groups += groups;
handle_submitted(this, opts);
}
catch(const std::exception &e)
@ -1136,12 +1171,6 @@ ircd::cl::handle_submitted(cl::exec *const &exec,
if(likely(!opts.blocking))
check_submit_blocking(exec, opts);
if(opts.nice == 0)
ctx::yield();
if(opts.nice > 0)
ctx::sleep(opts.nice * milliseconds(nice_rate));
}
/// Checks if the OpenCL runtime blocked this thread to sound the alarms.