0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-12-26 07:23:53 +01:00

ircd::gpt::vocab: Add space-prefix convenience argument.

This commit is contained in:
Jason Volk 2022-07-01 16:33:09 -07:00
parent 08d39efca6
commit 56d944f33e
3 changed files with 19 additions and 11 deletions

View file

@ -29,7 +29,7 @@ class ircd::gpt::token
operator string_view() const;
token(const_buffer &buf) noexcept;
token(const string_view &);
token(const string_view &, const bool prefix_space = false);
token(const uint16_t &) noexcept;
};
@ -44,9 +44,11 @@ noexcept
{}
/// Must resolve to exactly one token; otherwise an error is thrown.
/// When prefix_space=true a space is internally prepended, potentially yielding a better token.
inline
ircd::gpt::token::token(const string_view &str)
:val{vocab::tokenize(str)}
ircd::gpt::token::token(const string_view &str,
const bool prefix_space)
:val{vocab::tokenize(str, prefix_space)}
{}
/// Consumes input for one token off the front of buf.

View file

@ -40,7 +40,7 @@ namespace ircd::gpt::vocab
u16 tokenize(const_buffer &) noexcept;
// Tokenize one token. Error thrown if input is not exactly one token.
u16 tokenize(const string_view &in);
u16 tokenize(const string_view &in, const bool prefix_space = false);
// Decode token values to build output text string.
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;

View file

@ -257,18 +257,24 @@ noexcept
//
uint16_t
ircd::gpt::vocab::tokenize(const string_view &in)
ircd::gpt::vocab::tokenize(const string_view &in,
const bool prefix_space)
{
char str_buf[16];
const string_view str
char str_buf[16] {' '};
const mutable_buffer buf
{
str_buf, copy(str_buf, in)
str_buf + prefix_space, sizeof(str_buf) - prefix_space
};
u16 buf[16];
const string_view str
{
str_buf, copy(buf, in) + prefix_space
};
u16 out_buf[16];
const auto out
{
tokenize(buf, str)
tokenize(out_buf, str)
};
if(unlikely(out.size() != 1))
@ -278,7 +284,7 @@ ircd::gpt::vocab::tokenize(const string_view &in)
out.size()
};
return buf[0];
return out_buf[0];
}
uint16_t