0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2025-01-14 00:34:18 +01:00

ircd::gpt::vocab: Add space-prefix convenience argument.

This commit is contained in:
Jason Volk 2022-07-01 16:33:09 -07:00
parent 08d39efca6
commit 56d944f33e
3 changed files with 19 additions and 11 deletions

View file

@ -29,7 +29,7 @@ class ircd::gpt::token
operator string_view() const; operator string_view() const;
token(const_buffer &buf) noexcept; token(const_buffer &buf) noexcept;
token(const string_view &); token(const string_view &, const bool prefix_space = false);
token(const uint16_t &) noexcept; token(const uint16_t &) noexcept;
}; };
@ -44,9 +44,11 @@ noexcept
{} {}
/// Must resolve to one token or error thrown. /// Must resolve to one token or error thrown.
/// prefix_space=true internally prepends space for potentially better token.
inline inline
ircd::gpt::token::token(const string_view &str) ircd::gpt::token::token(const string_view &str,
:val{vocab::tokenize(str)} const bool prefix_space)
:val{vocab::tokenize(str, prefix_space)}
{} {}
/// Consumes input for one token off front of buf /// Consumes input for one token off front of buf

View file

@ -40,7 +40,7 @@ namespace ircd::gpt::vocab
u16 tokenize(const_buffer &) noexcept; u16 tokenize(const_buffer &) noexcept;
// Tokenize one token. Error thrown if input is not exactly one token. // Tokenize one token. Error thrown if input is not exactly one token.
u16 tokenize(const string_view &in); u16 tokenize(const string_view &in, const bool prefix_space = false);
// Decode token values to build output text string. // Decode token values to build output text string.
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept; string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;

View file

@ -257,18 +257,24 @@ noexcept
// //
uint16_t uint16_t
ircd::gpt::vocab::tokenize(const string_view &in) ircd::gpt::vocab::tokenize(const string_view &in,
const bool prefix_space)
{ {
char str_buf[16]; char str_buf[16] {' '};
const string_view str const mutable_buffer buf
{ {
str_buf, copy(str_buf, in) str_buf + prefix_space, sizeof(str_buf) - prefix_space
}; };
u16 buf[16]; const string_view str
{
str_buf, copy(buf, in) + prefix_space
};
u16 out_buf[16];
const auto out const auto out
{ {
tokenize(buf, str) tokenize(out_buf, str)
}; };
if(unlikely(out.size() != 1)) if(unlikely(out.size() != 1))
@ -278,7 +284,7 @@ ircd::gpt::vocab::tokenize(const string_view &in)
out.size() out.size()
}; };
return buf[0]; return out_buf[0];
} }
uint16_t uint16_t