diff --git a/include/ircd/gpt/token.h b/include/ircd/gpt/token.h index b928c8758..e7b953468 100644 --- a/include/ircd/gpt/token.h +++ b/include/ircd/gpt/token.h @@ -29,7 +29,7 @@ class ircd::gpt::token operator string_view() const; token(const_buffer &buf) noexcept; - token(const string_view &); + token(const string_view &, const bool prefix_space = false); token(const uint16_t &) noexcept; }; @@ -44,9 +44,11 @@ noexcept {} /// Must resolve to one token or error thrown. +/// prefix_space=true internally prepends space for potentially better token. inline -ircd::gpt::token::token(const string_view &str) -:val{vocab::tokenize(str)} +ircd::gpt::token::token(const string_view &str, + const bool prefix_space) +:val{vocab::tokenize(str, prefix_space)} {} /// Consumes input for one token off front of buf diff --git a/include/ircd/gpt/vocab.h b/include/ircd/gpt/vocab.h index ac314d797..c63d8637e 100644 --- a/include/ircd/gpt/vocab.h +++ b/include/ircd/gpt/vocab.h @@ -40,7 +40,7 @@ namespace ircd::gpt::vocab u16 tokenize(const_buffer &) noexcept; // Tokenize one token. Error thrown if input is not exactly one token. - u16 tokenize(const string_view &in); + u16 tokenize(const string_view &in, const bool prefix_space = false); // Decode token values to build output text string. string_view detokenize(const mutable_buffer &out, const vector_view &in) noexcept; diff --git a/ircd/gpt_vocab.cc b/ircd/gpt_vocab.cc index 282059be1..a53c2b4c5 100644 --- a/ircd/gpt_vocab.cc +++ b/ircd/gpt_vocab.cc @@ -257,18 +257,24 @@ noexcept // uint16_t -ircd::gpt::vocab::tokenize(const string_view &in) +ircd::gpt::vocab::tokenize(const string_view &in, + const bool prefix_space) { - char str_buf[16]; - const string_view str + char str_buf[16] {' '}; + const mutable_buffer buf { - str_buf, copy(str_buf, in) + str_buf + prefix_space, sizeof(str_buf) - prefix_space }; - u16 buf[16]; + const string_view str + { + str_buf, copy(buf, in) + prefix_space + }; + + u16 out_buf[16]; const auto out { - tokenize(buf, str) + tokenize(out_buf, str) }; if(unlikely(out.size() != 1)) @@ -278,7 +284,7 @@ ircd::gpt::vocab::tokenize(const string_view &in) out.size() }; - return buf[0]; + return out_buf[0]; } uint16_t