// Reviewer notes. These lines sit before the first "diff --git" header, where
// patch-applying tools skip unrecognized leading text.
//
// Summary
//   Adds two overloads of ircd::gpt::vocab::count():
//     - count(const string_view &) is declared as returning the number of
//       tokens which a tokenize() of the input would output.
//     - count(const vector_view &) is declared as returning the length of the
//       string which a detokenize() of the input would output.
//   Both are declared noexcept. The patch also removes the "// detokenize" and
//   "// tokenize" banner comments from ircd/gpt_vocab.cc.
//
// How the new definitions work
//   Neither overload computes its answer analytically. Each one runs the real
//   conversion into a function-local thread_local scratch array and returns
//   size() of the result:
//     - token -> text: detokenize(buf, in) into char buf[32_KiB]
//     - text -> token: tokenize(buf, in) into u16 buf[2048]
//   The tokenize() overload which accepts an output buffer is not shown in
//   this patch; only the single-token overload appears as header context.
//
// Points to confirm before relying on these functions
//   - assert(ret < max) is the only capacity check made here, and assert() is
//     compiled out when NDEBUG is defined. NOTE(review): what tokenize() and
//     detokenize() do when their output does not fit the buffer is not visible
//     in this patch -- confirm they stop at the end of the buffer, and decide
//     whether a silently capped count is acceptable in release builds.
//   - NOTE(review): `vector_view` appears without template arguments in this
//     copy of the patch; presumably an angle-bracketed argument list was lost
//     when the text was extracted -- verify against the repository.
//   - NOTE(review): leading whitespace on the diff lines was reconstructed
//     (tabs assumed). If a hunk fails to match, retry with whitespace-
//     insensitive matching or regenerate the patch from the repository.
//
diff --git a/include/ircd/gpt/vocab.h b/include/ircd/gpt/vocab.h
index c63d8637e..69147dc9d 100644
--- a/include/ircd/gpt/vocab.h
+++ b/include/ircd/gpt/vocab.h
@@ -42,9 +42,15 @@ namespace ircd::gpt::vocab
 	// Tokenize one token. Error thrown if input is not exactly one token.
 	u16 tokenize(const string_view &in, const bool prefix_space = false);
 
+	// Return the number of tokens which would be output by a tokenize().
+	size_t count(const string_view &in) noexcept;
+
 	// Decode token values to build output text string.
 	string_view detokenize(const mutable_buffer &out, const vector_view &in) noexcept;
 
+	// Return the length of the string which would be output by a detokenize().
+	size_t count(const vector_view &in) noexcept;
+
 	// Other tools
 	string_view debug(const mutable_buffer &buf, const u16 token, const uint fmt_msk = -1U);
 }
diff --git a/ircd/gpt_vocab.cc b/ircd/gpt_vocab.cc
index a53c2b4c5..bd1ba1540 100644
--- a/ircd/gpt_vocab.cc
+++ b/ircd/gpt_vocab.cc
@@ -213,9 +213,29 @@ ircd::gpt::vocab::debug(const mutable_buffer &out,
 	};
 }
 
-//
-// detokenize
-//
+size_t
+ircd::gpt::vocab::count(const vector_view &in)
+noexcept
+{
+	static const size_t max
+	{
+		32_KiB
+	};
+
+	thread_local char buf[max];
+	const auto res
+	{
+		detokenize(buf, in)
+	};
+
+	const auto ret
+	{
+		res.size()
+	};
+
+	assert(ret < max);
+	return ret;
+}
 
 ircd::string_view
 ircd::gpt::vocab::detokenize(const mutable_buffer &out,
@@ -252,9 +272,29 @@ noexcept
 	};
 }
 
-//
-// tokenize
-//
+size_t
+ircd::gpt::vocab::count(const string_view &in)
+noexcept
+{
+	static const size_t max
+	{
+		2048
+	};
+
+	thread_local u16 buf[max];
+	const auto res
+	{
+		tokenize(buf, in)
+	};
+
+	const auto ret
+	{
+		res.size()
+	};
+
+	assert(ret < max);
+	return ret;
+}
 
 uint16_t
 ircd::gpt::vocab::tokenize(const string_view &in,