ircd::gpt: Split debug related; improve flush options; minor cleanup.

2024-06-10 22:18:54 +02:00 · 2021-05-14 05:50:45 -07:00 · 2021-05-14 05:50:45 -07:00 · c3cb249f78
parent 179d9abcf7
commit c3cb249f78
6 changed files with 179 additions and 122 deletions
--- a/include/ircd/gpt/pipe/pipe.h
+++ b/include/ircd/gpt/pipe/pipe.h
@@ -22,6 +22,8 @@ namespace ircd::gpt::pipe
 	extern code *default_code;
 	extern desc *default_desc;

+	void generate(task &);
+
 	void init(), fini() noexcept;
 };

--- a/include/ircd/gpt/task/gate.h
+++ b/include/ircd/gpt/task/gate.h
@@ -15,7 +15,8 @@
 ///
 struct ircd_gpt_gate
 {
-	ushort code[8];
+	ushort offset;
+	ushort code[7];
 }
 __attribute__((aligned(16)));

--- a/include/ircd/gpt/task/opts.h
+++ b/include/ircd/gpt/task/opts.h
@@ -26,11 +26,14 @@ struct ircd_gpt_opts

 	/// Reference to the model (currently not available in device software).
 	#ifndef __cplusplus
-	const intptr_t model;
+	const void *model;
 	#else
 	const ircd::gpt::model::decoder *model;
 	#endif

+	/// Seed for the task's PRNG.
+	ulong seed;
+
 	/// Limit number of output tokens. Default of -1 is unlimited; the number
 	/// of tokens generated will be limited by other factors.
 	uint limit;
@@ -81,18 +84,21 @@ struct ircd_gpt_opts
 	/// Number of possible target n-grams.
 	uint logits;

-	/// Seed for the task's PRNG.
-	ulong seed;
-
 	/// Training steps
-	ulong training_steps;
+	uint training_steps;

 	/// Validation steps
-	ulong validation_steps;
+	uint validation_steps;
+
+	/// Testing steps
+	uint testing_steps;

 	/// Target label
 	ushort label;

+	/// Bitmask toggling debug modes (0x01: per-token debug log; 0x02: also log input tokens)
+	ushort debug;
+
 	/// Learning rate
 	float alpha;

@@ -115,7 +121,7 @@ __attribute__((aligned(4096)));
 #ifdef __cplusplus
 namespace ircd::gpt
 {
-	using opts = struct ircd_gpt_opts;
+	using opts = ::ircd_gpt_opts;
 }

 static_assert(sizeof(struct ircd_gpt_opts) == 4096);
--- a/ircd/gpt.cc
+++ b/ircd/gpt.cc
@@ -39,6 +39,8 @@ namespace ircd::gpt
 	static u16 argmax(const float *, const opts &);
 	static void embed(float *, const u16 token, const u16 position, const opts &);

+	static void generate_debug(task &, const uint &, const uint &);
+
 	static f32
 	logit alignas(64) [65536],
 	embeds alignas(64) [1024 * 768],
@@ -91,21 +93,76 @@ ircd::gpt::generate(const vector_view<u16> &out,
 	ctrl.tokens.count = 0;
 	ctrl.tokens.head = 0;

-	for(uint j(0); j < in.size(); ++j)
-		ctrl.token[ctrl.tokens.count++] = in[j];
-
+	uint j(0);
 	for(uint i(0); i < opts.gates; ++i)
-		for(uint k(0); k < 8; ++k)
+	{
+		const auto &gate
+		{
+			opts.gate[i]
+		};
+
+		while(j < in.size() && j < gate.offset && ctrl.tokens.count < opts.buffer_tokens)
+			ctrl.token[ctrl.tokens.count++] = in[j++];
+
+		for(uint k(0); k < 7; ++k)
 		{
 			if(ctrl.tokens.count >= opts.buffer_tokens)
 				break;

-			if(opts.gate[i].code[k] == 0)
+			if(gate.code[k] == 0)
 				break;

-			ctrl.token[ctrl.tokens.count] = opts.gate[i].code[k];
-			ctrl.tokens.count++;
+			ctrl.token[ctrl.tokens.count++] = gate.code[k];
 		}
+	}
+
+	while(j < in.size() && ctrl.tokens.count < opts.buffer_tokens)
+		ctrl.token[ctrl.tokens.count++] = in[j++];
+
+	const size_t in_size
+	{
+		ctrl.tokens.count
+	};
+
+	generate(task);
+
+	for(uint i(0); i < ctrl.tokens.count && ret < out.size() && !halt; ++i)
+	{
+		const auto j
+		{
+			(i + ctrl.tokens.head) % opts.buffer_tokens
+		};
+
+		const auto tok
+		{
+			ctrl.token[j]
+		};
+
+		if(j >= in_size)
+			out[ret++] = tok;
+
+		if(likely(~opts.debug & 0x01))
+			continue;
+
+		if(likely(~opts.debug & 0x02))
+			if(j < in_size)
+				continue;
+
+		generate_debug(task, j, in_size);
+	}
+
+	ctx::interruption_point();
+	return vector_view<u16>
+	{
+		out, ret
+	};
+}
+
+void
+ircd::gpt::generate(task &task)
+{
+	const auto &opts(*task.opts);
+	auto &ctrl(*task.ctrl);

 	const size_t in_size
 	{
@@ -153,11 +210,10 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		ctrl.cert.last = ctrl.cert.mean;
 		ctrl.prop = false;
 		pipe::default_model->invalid = true;
-		return {};
+		return;
 	}

 	cycles = 0;
-	milliseconds last_time {0};
 	util::timer stopwatch;
 	{
 		const prof::scope_cycles task_cycles
@@ -165,66 +221,69 @@ ircd::gpt::generate(const vector_view<u16> &out,
 			cycles
 		};

-		generate(task);
+		pipe::generate(task);
 	}
-	last_time = stopwatch.at<milliseconds>();
+
+	const milliseconds last_time
+	{
+		stopwatch.at<milliseconds>()
+	};
+
 	ctrl.epic.elapsed += last_time.count();
+}

-	for(uint j(0); j < ctrl.tokens.count && ret < out.size() && !halt; ++j)
+void
+ircd::gpt::generate_debug(task &task,
+                          const uint &i,
+                          const uint &in_size)
+{
+	const auto &opts(*task.opts);
+	auto &ctrl(*task.ctrl);
+
+	const auto j
 	{
-		const auto tok
-		{
-			ctrl.token[j]
-		};
+		(i + ctrl.tokens.head) % opts.buffer_tokens
+	};

-		if(j >= in_size)
-			out[ret++] = tok;
-
-		if(j < in_size)
-			continue;
-
-		static char dbuf[512] {0};
-		char report[1536] {0};
-		char tmbuf[4][64] {0};
-		const size_t bsz(ctrl.tokens.count - in_size);
-		const size_t report_size = snprintf
-		(
-			report, sizeof(report),
-			"%-3u %4u:%-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP  %5.1f%% %s %04x  %8s %8s | %8s",
-			j,
-			ret - 1,
-			ctrl.tokens.count,
-			ctrl.epic.epoch,
-			ctrl.epic.cycle,
-			std::clamp(ctrl.cert.mean * 100.0f, 0.0f, 100.0f),
-			std::clamp(ctrl.perp.mean, 0.0f, 100.0f),
-			std::clamp(ctrl.loss.mean, 0.0f, 99.99f),
-			opts.label == tok? '+': ' ',
-			' ', // flag place
-			' ', // flag place
-			opts.label,
-			std::clamp(ctrl.loss.last, 0.0f, 99.99f),
-			std::clamp(ctrl.perp.last, 0.0f, 100.0f),
-			std::clamp(ctrl.cert.last * 100.0f, 0.0f, 100.0f),
-			vocab::debug(dbuf, tok).c_str(),
-			tok,
-			pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
-			pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
-			pretty(tmbuf[2], milliseconds(ctrl.epic.elapsed), 1).c_str()
-		);
-
-		log::logf
-		{
-			log, log::level::DEBUG,
-			"%s",
-			string_view{report, report_size}
-		};
-	}
-
-	ctx::interruption_point();
-	return vector_view<u16>
+	const auto tok
 	{
-		out, ret
+		ctrl.token[j]
+	};
+
+	static char dbuf[512];
+	static char report[1536];
+	static char tmbuf[4][64];
+	const size_t bsz(ctrl.tokens.count - in_size);
+	const size_t report_size = snprintf
+	(
+		report, sizeof(report),
+		"%-3u %-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP  %5.1f%% %s %04x  %8s %8s | %8s",
+		j,
+		ctrl.tokens.count,
+		ctrl.epic.epoch,
+		ctrl.epic.cycle,
+		std::clamp(ctrl.cert.mean * 100.0f, 0.0f, 100.0f),
+		std::clamp(ctrl.perp.mean, 0.0f, 100.0f),
+		std::clamp(ctrl.loss.mean, 0.0f, 99.99f),
+		opts.label == tok? '+': ' ',
+		' ', // flag place
+		' ', // flag place
+		opts.label,
+		std::clamp(ctrl.loss.last, 0.0f, 99.99f),
+		std::clamp(ctrl.perp.last, 0.0f, 100.0f),
+		std::clamp(ctrl.cert.last * 100.0f, 0.0f, 100.0f),
+		vocab::debug(dbuf, tok).c_str(),
+		tok,
+		pretty(tmbuf[0], milliseconds(0ms / bsz), 1).c_str(),
+		pretty(tmbuf[1], si(0UL / bsz), 1).c_str(),
+		pretty(tmbuf[2], milliseconds(ctrl.epic.elapsed), 1).c_str()
+	);
+
+	log::logf
+	{
+		log, log::level::DEBUG,
+		"%s",
+		string_view{report, report_size}
 	};
 }

@@ -849,7 +908,11 @@ ircd_gpt_opts::ircd_gpt_opts(const ircd::gpt::model::decoder *const model)
 noexcept
 :model
 {
-	model
+	model?: ircd::gpt::model::default_model
+}
+,seed
+{
+	1234567890UL
 }
 ,limit
 {
@@ -915,10 +978,6 @@ noexcept
 {
 	50257
 }
-,seed
-{
-	1234567890UL
-}
 ,training_steps
 {
 	250000
@@ -927,10 +986,18 @@ noexcept
 {
 	5000
 }
+,testing_steps
+{
+	5000
+}
 ,label
 {
 	198
 }
+,debug
+{
+	0x01
+}
 ,alpha
 {
 	0.001f
--- a/ircd/gpt_cl.cl
+++ b/ircd/gpt_cl.cl
@@ -600,8 +600,6 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
 	if(li != 0)
 		return;

-	// On the last cycle, with no prior call or error code set, indicate
-	// a nominal exit condition.
 	if(ctrl->epic.cycle + 1 >= opts->limit)
 		ctrl->epic.epoch += 1;

@@ -620,8 +618,7 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
                   __global const float *const restrict logit)
 {
 	// To read from cells other than idx[0] we need this barrier.
-	if(opts->top_k > 1)
-		barrier(CLK_LOCAL_MEM_FENCE);
+	barrier(CLK_LOCAL_MEM_FENCE);

 	// Mask for write-leader
 	if(li != 0)
@@ -661,9 +658,10 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
 	loss_sum = ctrl->loss.sum[0] + ctrl->loss.sum[1] + ctrl->loss.sum[2] + loss,
 	perp_sum = ctrl->perp.sum[0] + ctrl->perp.sum[1] + ctrl->perp.sum[2] + perp,
 	cert_sum = ctrl->cert.sum[0] + ctrl->cert.sum[1] + ctrl->cert.sum[2] + cert,
-	loss_mean = loss_sum / (ctrl->epic.epoch + 1.0f),
-	perp_mean = perp_sum / (ctrl->epic.epoch + 1.0f),
-	cert_mean = cert_sum / (ctrl->epic.epoch + 1.0f);
+	mean_div = ctrl->epic.epoch + 1.0f,
+	loss_mean = loss_sum / mean_div,
+	perp_mean = perp_sum / mean_div,
+	cert_mean = cert_sum / mean_div;

 	ctrl->loss.last = loss;
 	ctrl->loss.sum[sum_sel] += loss;
--- a/ircd/gpt_pipe.cc
+++ b/ircd/gpt_pipe.cc
@@ -12,14 +12,13 @@ namespace ircd::gpt::pipe
 {
 	static void profile_dumplog(pipe::exec &);

-	static ircd::cl::exec::opts
-	negative_opts, positive_opts, selfattn_opts,
-	cathode_opts, anode_opts, lmhead_opts, lmamax_opts,
-	backprop_opts;
-
-	extern conf::item<size_t> flush_cycles;
 	extern conf::item<size_t> queue_cycles;
 	extern const ircd::run::changed handle_quit;
+
+	static ircd::cl::exec::opts
+	send_opts_opts, send_ctrl_opts, send_coil_opts, send_head_opts,
+	anode_opts, negative_opts, positive_opts, cathode_opts,
+	lmhead_opts, lmamax_opts, backprop_opts, recv_ctrl_opts;
 }

 decltype(ircd::gpt::pipe::queue_cycles)
@@ -29,13 +28,6 @@ ircd::gpt::pipe::queue_cycles
 	{ "default",  1L,                   },
 };

-decltype(ircd::gpt::pipe::flush_cycles)
-ircd::gpt::pipe::flush_cycles
-{
-	{ "name",     "ircd.gpt.pipe.flush" },
-	{ "default",  1L,                   },
-};
-
 decltype(ircd::gpt::pipe::default_model)
 ircd::gpt::pipe::default_model;

@@ -75,6 +67,13 @@ ircd::gpt::pipe::init()
 		*pipe::default_code, *pipe::default_model
 	};

+	//XXX
+	send_ctrl_opts.flush = true;
+	send_ctrl_opts.nice = 1;
+	lmamax_opts.flush = true;
+	lmamax_opts.nice = 2;
+	recv_ctrl_opts.flush = true;
+
 	log::debug
 	{
 		log, "Pipe initialized from model:%p data:%p code:%p desc:%p",
@@ -113,7 +112,7 @@ noexcept
 //

 void
-ircd::gpt::generate(task &task)
+ircd::gpt::pipe::generate(task &task)
 {
 	assert(pipe::default_model);

@@ -131,9 +130,10 @@ ircd::gpt::generate(task &task)

 	ctrl.epic.cycle = 0;
 	ctrl.epic.host_tsc = prof::cycles();
-	volatile const size_t tokens(ctrl.tokens.count);
-	volatile const auto epoch(ctrl.epic.epoch);
-	volatile size_t cycle(ctrl.epic.cycle);
+
+	const auto tokens(ctrl.tokens.count);
+	const auto epoch(ctrl.epic.epoch);
+	volatile auto cycle(ctrl.epic.cycle);

 	std::deque<pipe::exec> list;
 	for(; cycle < opts.limit; ++cycle)
@@ -150,23 +150,6 @@ ircd::gpt::generate(task &task)
 			task, tokens + cycle, rel, acq
 		);

-		// Conditions for a cl::flush here
-		const bool flush
-		{
-			// Flushing here is enabled by the configuration
-			pipe::flush_cycles
-
-			// Skip flushing on cycles already performing IO or waiting.
-			&& !acq && list.size() <= pipe::queue_cycles
-
-			// The configuration item can specify an interval greater than
-			// one between flushes.
-			&& cycle % pipe::flush_cycles == 0
-		};
-
-		if(flush)
-			cl::flush();
-
 		if(ctx::interruption_requested())
 			if(acq || termination(ctx::cur()))
 				break;
@@ -301,19 +284,19 @@ ircd::gpt::pipe::exec::exec(task &task,
 }
 ,release_opts
 {
-	desc->opts, send_opts
+	desc->opts, send_opts, send_opts_opts,
 }
 ,release_ctrl
 {
-	desc->ctrl, send_ctrl
+	desc->ctrl, send_ctrl, send_ctrl_opts
 }
 ,release_coil
 {
-	desc->model->decode->master[0], send_coil
+	desc->model->decode->master[0], send_coil, send_coil_opts
 }
 ,release_head
 {
-	desc->model->embed->master[0], send_head
+	desc->model->embed->master[0], send_head, send_head_opts
 }
 ,lm_embed
 {
@@ -364,7 +347,7 @@ ircd::gpt::pipe::exec::exec(task &task,
 }
 ,acquire_ctrl
 {
-	desc->ctrl, recv_ctrl
+	desc->ctrl, recv_ctrl, recv_ctrl_opts
 }
 {
 	if(release && desc->model->invalid)