ircd:🆑 Replace niceness value with intensity by range partition.

2024-12-27 16:04:15 +01:00 · 2022-09-27 12:26:35 -07:00 · 2022-09-27 12:26:35 -07:00 · da614e33a5
commit da614e33a5
parent 508d27eb40
3 changed files with 73 additions and 41 deletions
--- a/include/ircd/cl/cl.h
+++ b/include/ircd/cl/cl.h
@ -40,7 +40,7 @@ namespace ircd::cl
 	extern conf::item<bool> enable;
 	extern conf::item<bool> profile_queue;
 	extern conf::item<uint64_t> watchdog_tsc;
-	extern conf::item<milliseconds> nice_rate;
+	extern conf::item<uint64_t> intensity;
 	extern conf::item<std::string> path;
 	extern conf::item<std::string> envs[];
 }
--- a/include/ircd/cl/exec.h
+++ b/include/ircd/cl/exec.h
@ -65,11 +65,14 @@ struct ircd::cl::exec::opts
 	/// For operations which have an offset (or two); otherwise ignored.
 	off_t offset[2] {0};
-	/// Tune the intensity of the execution. For headless deployments the
+	/// Tune the intensity of the execution. The value is intended to correlate
-	/// maximum intensity is advised. Lesser values are more intense. The
+	/// with parallel resource consumption on the device by shaping the work
-	/// default of -1 is the maximum. The value of zero yields the ircd::ctx
+	/// groups submitted over the range. The minimum value of 1 will serialize
-	/// after submission, but does not otherwise decrease the intensity.
+	/// execution. Values greater than the number of CU's will not increase
-	int nice {-1};
+	/// concurrency but may still partition a large range with multiple command
 	/// submissions to increase niceness. The default of zero will maximize
 	/// intensity and minimize command submissions (to one).
 	uint intensity {0};
 	/// Starts a new dependency chain; allowing empty deps without implicit
 	/// dependency on the last work item constructed on the ircd::ctx.
--- a/ircd/cl.cc
+++ b/ircd/cl.cc
@ -141,11 +141,11 @@ ircd::cl::profile_queue
 	{ "persist",   false                    },
 };
-decltype(ircd::cl::nice_rate)
+decltype(ircd::cl::intensity)
-ircd::cl::nice_rate
+ircd::cl::intensity
 {
-	{ "name",      "ircd.cl.nice.rate"  },
+	{ "name",      "ircd.cl.intensity"  },
-	{ "default",   1L                   },
+	{ "default",   0L                   },
 };
 decltype(ircd::cl::watchdog_tsc)
@ -854,13 +854,13 @@ catch(const std::exception &e)
 }
 ircd::cl::exec::exec(kern &kern,
-                     const kern::range &work,
+                     const kern::range &range,
                     const opts &opts)
 try
 {
 	size_t dim(0);
-	for(size_t i(0); i < work.global.size(); ++i)
+	for(size_t i(0); i < range.global.size(); ++i)
-		dim += work.global[i] > 0 && dim == i;
+		dim += range.global[i] > 0 && dim == i;
 	if(!dim)
 		return;
@ -886,32 +886,67 @@ try
 	assert(!this->object);
 	this->object = &kern;
 	size_t global_size(range.global[0]);
 	size_t local_size(range.local[0]);
 	for(size_t d(1); d < dim; ++d)
 	{
 		global_size *= range.global[d];
 		local_size *= range.local[d];
 	}
 	assert(global_size % local_size == 0);
 	const size_t groups
 	{
 		global_size / local_size
 	};
 	assert(groups > 0);
 	size_t intensity
 	{
 		cl::intensity?
 			std::max(opts.intensity, uint(cl::intensity)):
 			opts.intensity
 	};
 	if(intensity < groups)
 		while(intensity > 1 && groups % intensity != 0)
 			intensity--;
 	const size_t tasks
 	{
 		intensity && intensity < groups?
 			groups / intensity:
 			1
 	};
 	assert(!this->handle);
-	call
+	for(size_t i(0); i < tasks; ++i)
-	(
+	{
-		clEnqueueNDRangeKernel,
+		kern::range sub_range(range);
-		q,
+		for(size_t d(0); d < dim; ++d)
-		cl_kernel(kern.handle),
+		{
-		dim,
+			sub_range.global[d] /= tasks;
-		work.offset.data(),
+			sub_range.offset[d] += sub_range.global[d] * i;
-		work.global.data(),
+		}
 		work.local.data(),
 		deps.size(),
 		deps.size()? deps.data(): nullptr,
 		addressof_handle(this)
 	);
-	size_t global_size(work.global[0]);
+		call
-	for(size_t i(1); i < dim; ++i)
+		(
-		global_size *= work.global[i];
+			clEnqueueNDRangeKernel,
 			q,
 			cl_kernel(kern.handle),
 			dim,
 			sub_range.offset.data(),
 			sub_range.global.data(),
 			sub_range.local.data(),
 			deps.size(),
 			deps.size()? deps.data(): nullptr,
 			i == tasks - 1? addressof_handle(this): nullptr
 		);
 	}
-	size_t local_size(work.local[0]);
+	primary_stats.exec_kern_tasks += tasks;
 	for(size_t i(1); i < dim; ++i)
 		local_size *= work.local[i];
 	primary_stats.exec_kern_tasks += 1;
 	primary_stats.exec_kern_threads += global_size;
-	primary_stats.exec_kern_groups += global_size / local_size;
+	primary_stats.exec_kern_groups += groups;
 	handle_submitted(this, opts);
 }
 catch(const std::exception &e)
@ -1136,12 +1171,6 @@ ircd::cl::handle_submitted(cl::exec *const &exec,
 	if(likely(!opts.blocking))
 		check_submit_blocking(exec, opts);
 	if(opts.nice == 0)
 		ctx::yield();
 	if(opts.nice > 0)
 		ctx::sleep(opts.nice * milliseconds(nice_rate));
 }
 /// Checks if the OpenCL runtime blocked this thread to sound the alarms.