0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-12-26 07:23:53 +01:00

ircd::cl: Replace niceness value with intensity by range partition.

This commit is contained in:
Jason Volk 2022-09-27 12:26:35 -07:00
parent 508d27eb40
commit da614e33a5
3 changed files with 73 additions and 41 deletions

View file

@ -40,7 +40,7 @@ namespace ircd::cl
extern conf::item<bool> enable;
extern conf::item<bool> profile_queue;
extern conf::item<uint64_t> watchdog_tsc;
extern conf::item<milliseconds> nice_rate;
extern conf::item<uint64_t> intensity;
extern conf::item<std::string> path;
extern conf::item<std::string> envs[];
}

View file

@ -65,11 +65,14 @@ struct ircd::cl::exec::opts
/// For operations which have an offset (or two); otherwise ignored.
off_t offset[2] {0};
/// Tune the intensity of the execution. For headless deployments the
/// maximum intensity is advised. Lesser values are more intense. The
/// default of -1 is the maximum. The value of zero yields the ircd::ctx
/// after submission, but does not otherwise decrease the intensity.
int nice {-1};
/// Tune the intensity of the execution. The value is intended to correlate
/// with parallel resource consumption on the device by shaping the work
/// groups submitted over the range. The minimum value of 1 will serialize
/// execution. Values greater than the number of CU's will not increase
/// concurrency but may still partition a large range with multiple command
/// submissions to increase niceness. The default of zero will maximize
/// intensity and minimize command submissions (to one).
uint intensity {0};
/// Starts a new dependency chain; allowing empty deps without implicit
/// dependency on the last work item constructed on the ircd::ctx.

View file

@ -141,11 +141,11 @@ ircd::cl::profile_queue
{ "persist", false },
};
decltype(ircd::cl::nice_rate)
ircd::cl::nice_rate
decltype(ircd::cl::intensity)
ircd::cl::intensity
{
{ "name", "ircd.cl.nice.rate" },
{ "default", 1L },
{ "name", "ircd.cl.intensity" },
{ "default", 0L },
};
decltype(ircd::cl::watchdog_tsc)
@ -854,13 +854,13 @@ catch(const std::exception &e)
}
ircd::cl::exec::exec(kern &kern,
const kern::range &work,
const kern::range &range,
const opts &opts)
try
{
size_t dim(0);
for(size_t i(0); i < work.global.size(); ++i)
dim += work.global[i] > 0 && dim == i;
for(size_t i(0); i < range.global.size(); ++i)
dim += range.global[i] > 0 && dim == i;
if(!dim)
return;
@ -886,32 +886,67 @@ try
assert(!this->object);
this->object = &kern;
size_t global_size(range.global[0]);
size_t local_size(range.local[0]);
for(size_t d(1); d < dim; ++d)
{
global_size *= range.global[d];
local_size *= range.local[d];
}
assert(global_size % local_size == 0);
const size_t groups
{
global_size / local_size
};
assert(groups > 0);
size_t intensity
{
cl::intensity?
std::max(opts.intensity, uint(cl::intensity)):
opts.intensity
};
if(intensity < groups)
while(intensity > 1 && groups % intensity != 0)
intensity--;
const size_t tasks
{
intensity && intensity < groups?
groups / intensity:
1
};
assert(!this->handle);
for(size_t i(0); i < tasks; ++i)
{
kern::range sub_range(range);
for(size_t d(0); d < dim; ++d)
{
sub_range.global[d] /= tasks;
sub_range.offset[d] += sub_range.global[d] * i;
}
call
(
clEnqueueNDRangeKernel,
q,
cl_kernel(kern.handle),
dim,
work.offset.data(),
work.global.data(),
work.local.data(),
sub_range.offset.data(),
sub_range.global.data(),
sub_range.local.data(),
deps.size(),
deps.size()? deps.data(): nullptr,
addressof_handle(this)
i == tasks - 1? addressof_handle(this): nullptr
);
}
size_t global_size(work.global[0]);
for(size_t i(1); i < dim; ++i)
global_size *= work.global[i];
size_t local_size(work.local[0]);
for(size_t i(1); i < dim; ++i)
local_size *= work.local[i];
primary_stats.exec_kern_tasks += 1;
primary_stats.exec_kern_tasks += tasks;
primary_stats.exec_kern_threads += global_size;
primary_stats.exec_kern_groups += global_size / local_size;
primary_stats.exec_kern_groups += groups;
handle_submitted(this, opts);
}
catch(const std::exception &e)
@ -1136,12 +1171,6 @@ ircd::cl::handle_submitted(cl::exec *const &exec,
if(likely(!opts.blocking))
check_submit_blocking(exec, opts);
if(opts.nice == 0)
ctx::yield();
if(opts.nice > 0)
ctx::sleep(opts.nice * milliseconds(nice_rate));
}
/// Checks if the OpenCL runtime blocked this thread to sound the alarms.