diff --git a/include/ircd/gpt/ctrl.h b/include/ircd/gpt/ctrl.h index c1f8a9173..448e1379b 100644 --- a/include/ircd/gpt/ctrl.h +++ b/include/ircd/gpt/ctrl.h @@ -161,6 +161,7 @@ namespace ircd::gpt string_view debug_token(const mutable_buffer &, const opts &, const ctrl &, const uint fmt = -1U); string_view debug_head(const mutable_buffer &, const opts &, const ctrl_clk &); string_view debug_head(const mutable_buffer &, const opts &, const ctrl &); + string_view debug(const mutable_buffer &, const opts &, const ctrl_prof &); string_view debug(const mutable_buffer &, const opts &, const ctrl_logit &, const uint fmt = 0); string_view debug(const mutable_buffer &, const opts &, const ctrl_label &, const uint fmt = 0); string_view debug(const mutable_buffer &, const opts &, const ctrl &); diff --git a/ircd/gpt.cc b/ircd/gpt.cc index 9fe0c6da5..1123f548b 100644 --- a/ircd/gpt.cc +++ b/ircd/gpt.cc @@ -818,9 +818,13 @@ noexcept bool ircd::gpt::samp::operator()() { + ctx::interruption_point(); + if(dispatch > 0) { - ctx::interruption_point(); + if(!cycle) + ctrl.prof.released = prof::cycles(); + queue.emplace_back(*this); desc.cached = tokens; tokens += count >= tokens; @@ -979,6 +983,7 @@ ircd::gpt::samp::evaluate(pipe::cycle &cycle) assert(ctrl.magic != 0xDEADBEEF); assert(ctrl.magic == 0xC7012C70UL); + ctrl.prof.acquired = prof::cycles(); ctrl.clk.cycle += cycling; ctrl.clk.samp += sampling; ctrl.clk.step += stepping; @@ -1151,7 +1156,7 @@ ircd::gpt::debug(const mutable_buffer &out, return fmt::sprintf { - out, "%s %s %c T%02d %4u %6.2f%% %10.7f$L %c %s %s", + out, "%s %s %c T%02d %3u %6.2f%% %10.7f$L %c %s %s %s", vocab::debug(buf[0], ctrl.select.logit.token, 1), debug(buf[1], opts, ctrl.select), ctrl.target.logit.token == ctrl.top[0].token? '=' : ' ', @@ -1162,6 +1167,7 @@ ircd::gpt::debug(const mutable_buffer &out, ctrl.target.logit.token == ctrl.select.logit.token? '=' : ' ', debug(buf[2], opts, ctrl.target), vocab::debug(buf[3], ctrl.target.logit.token, 1), + debug(buf[4], opts, ctrl.prof), }; } @@ -1212,13 +1218,41 @@ ircd::gpt::debug(const mutable_buffer &out, { return fmt::sprintf { - out, "%6.2f%% %10.7f$L %5.1f$P", + out, "%6.2f%% %10.7f$L %4.1f$P", logit.samax * 100.0f, +0.0f - logf(logit.samax), (1.0f - logit.samax) * log2f(opts.logits), }; } +ircd::string_view +ircd::gpt::debug(const mutable_buffer &out, + const opts &opts, + const ctrl_prof &prof) +{ + thread_local char buf[1][32]; + + const auto kern_cycles + { + prof.finished - prof.entered + }; + + const auto host_cycles + { + prof.acquired - prof.released + }; + + return fmt::sprintf + { + out, "%s", + kern_cycles > 0? + pretty(buf[0], si(kern_cycles), 1): + host_cycles > 0? + pretty(buf[0], si(host_cycles), 1): + string_view{}, + }; +} + ircd::string_view ircd::gpt::debug_head(const mutable_buffer &out, const opts &opts, @@ -1226,11 +1260,12 @@ ircd::gpt::debug_head(const mutable_buffer &out, { thread_local char head[64]; + assert(ctrl.count > 0); return fmt::sprintf { out, "%s[%4u]-%1u", debug_head(head, opts, ctrl.clk), - ctrl.count, + ctrl.count - 1, ctrl.dispatch, }; } diff --git a/ircd/gpt_gpu.cl b/ircd/gpt_gpu.cl index 3a9b81e68..d64e5c707 100644 --- a/ircd/gpt_gpu.cl +++ b/ircd/gpt_gpu.cl @@ -81,8 +81,10 @@ ircd_gpt_enter(__global const void *const restrict model, ln = get_local_size(0), cycle = ctrl->clk.cycle; + #if defined(__clang__) && !defined(__SPIR) if(li == 0) - ;//ctrl->prof.entered = __builtin_readcyclecounter(); + ctrl->prof.entered = __builtin_readcyclecounter(); + #endif } __kernel void @@ -1031,8 +1033,10 @@ ircd_gpt_leave(__global const void *const restrict model, stepping = sampling && (ctrl->clk.samp + 1) >= batch_size, epoching = stepping && (ctrl->clk.step + 1) >= steps; + #if defined(__clang__) && !defined(__SPIR) if(li == 0) - ;//ctrl->prof.finished = __builtin_readcyclecounter(); + ctrl->prof.finished = __builtin_readcyclecounter(); + #endif if(li == 0) { diff --git a/ircd/gpt_pipe.cc b/ircd/gpt_pipe.cc index e81380199..c3bc34f13 100644 --- a/ircd/gpt_pipe.cc +++ b/ircd/gpt_pipe.cc @@ -270,7 +270,7 @@ ircd::gpt::pipe::cycle::cycle(gpt::samp &samp) }, cl::exec // Initial cycle kernel { - desc.enter, range.embed, + desc.enter, range.select, }, cl::exec // Compute token and positional embeddings. {