mirror of
https://github.com/matrix-construct/construct
synced 2024-12-26 07:23:53 +01:00
ircd::gpt::vocab: Add space-prefix convenience argument.
This commit is contained in:
parent
08d39efca6
commit
56d944f33e
3 changed files with 19 additions and 11 deletions
|
@ -29,7 +29,7 @@ class ircd::gpt::token
|
|||
operator string_view() const;
|
||||
|
||||
token(const_buffer &buf) noexcept;
|
||||
token(const string_view &);
|
||||
token(const string_view &, const bool prefix_space = false);
|
||||
token(const uint16_t &) noexcept;
|
||||
};
|
||||
|
||||
|
@ -44,9 +44,11 @@ noexcept
|
|||
{}
|
||||
|
||||
/// The input must resolve to exactly one token; otherwise an error is thrown.
|
||||
/// When prefix_space=true, a space is internally prepended to the input, which may yield a better token.
|
||||
inline
|
||||
ircd::gpt::token::token(const string_view &str)
|
||||
:val{vocab::tokenize(str)}
|
||||
ircd::gpt::token::token(const string_view &str,
|
||||
const bool prefix_space)
|
||||
:val{vocab::tokenize(str, prefix_space)}
|
||||
{}
|
||||
|
||||
/// Consumes input for one token off front of buf
|
||||
|
|
|
@ -40,7 +40,7 @@ namespace ircd::gpt::vocab
|
|||
u16 tokenize(const_buffer &) noexcept;
|
||||
|
||||
// Tokenize one token. An error is thrown if the input is not exactly one token.
|
||||
u16 tokenize(const string_view &in);
|
||||
u16 tokenize(const string_view &in, const bool prefix_space = false);
|
||||
|
||||
// Decode token values to build output text string.
|
||||
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;
|
||||
|
|
|
@ -257,18 +257,24 @@ noexcept
|
|||
//
|
||||
|
||||
uint16_t
|
||||
ircd::gpt::vocab::tokenize(const string_view &in)
|
||||
ircd::gpt::vocab::tokenize(const string_view &in,
|
||||
const bool prefix_space)
|
||||
{
|
||||
char str_buf[16];
|
||||
const string_view str
|
||||
char str_buf[16] {' '};
|
||||
const mutable_buffer buf
|
||||
{
|
||||
str_buf, copy(str_buf, in)
|
||||
str_buf + prefix_space, sizeof(str_buf) - prefix_space
|
||||
};
|
||||
|
||||
u16 buf[16];
|
||||
const string_view str
|
||||
{
|
||||
str_buf, copy(buf, in) + prefix_space
|
||||
};
|
||||
|
||||
u16 out_buf[16];
|
||||
const auto out
|
||||
{
|
||||
tokenize(buf, str)
|
||||
tokenize(out_buf, str)
|
||||
};
|
||||
|
||||
if(unlikely(out.size() != 1))
|
||||
|
@ -278,7 +284,7 @@ ircd::gpt::vocab::tokenize(const string_view &in)
|
|||
out.size()
|
||||
};
|
||||
|
||||
return buf[0];
|
||||
return out_buf[0];
|
||||
}
|
||||
|
||||
uint16_t
|
||||
|
|
Loading…
Reference in a new issue