mirror of
https://github.com/matrix-construct/construct
synced 2025-01-14 00:34:18 +01:00
ircd::gpt::vocab: Add space-prefix convenience argument.
This commit is contained in:
parent
08d39efca6
commit
56d944f33e
3 changed files with 19 additions and 11 deletions
|
@ -29,7 +29,7 @@ class ircd::gpt::token
|
||||||
operator string_view() const;
|
operator string_view() const;
|
||||||
|
|
||||||
token(const_buffer &buf) noexcept;
|
token(const_buffer &buf) noexcept;
|
||||||
token(const string_view &);
|
token(const string_view &, const bool prefix_space = false);
|
||||||
token(const uint16_t &) noexcept;
|
token(const uint16_t &) noexcept;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -44,9 +44,11 @@ noexcept
|
||||||
{}
|
{}
|
||||||
|
|
||||||
/// Must resolve to exactly one token; otherwise an error is thrown.
|
/// Must resolve to exactly one token; otherwise an error is thrown.
|
||||||
|
/// When prefix_space is true, a space is prepended internally, which can potentially resolve to a better token.
|
||||||
inline
|
inline
|
||||||
ircd::gpt::token::token(const string_view &str)
|
ircd::gpt::token::token(const string_view &str,
|
||||||
:val{vocab::tokenize(str)}
|
const bool prefix_space)
|
||||||
|
:val{vocab::tokenize(str, prefix_space)}
|
||||||
{}
|
{}
|
||||||
|
|
||||||
/// Consumes input for one token off front of buf
|
/// Consumes input for one token off front of buf
|
||||||
|
|
|
@ -40,7 +40,7 @@ namespace ircd::gpt::vocab
|
||||||
u16 tokenize(const_buffer &) noexcept;
|
u16 tokenize(const_buffer &) noexcept;
|
||||||
|
|
||||||
// Tokenize a single token. An error is thrown if the input is not exactly one token.
|
// Tokenize a single token. An error is thrown if the input is not exactly one token.
|
||||||
u16 tokenize(const string_view &in);
|
u16 tokenize(const string_view &in, const bool prefix_space = false);
|
||||||
|
|
||||||
// Decode token values to build output text string.
|
// Decode token values to build output text string.
|
||||||
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;
|
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;
|
||||||
|
|
|
@ -257,18 +257,24 @@ noexcept
|
||||||
//
|
//
|
||||||
|
|
||||||
uint16_t
|
uint16_t
|
||||||
ircd::gpt::vocab::tokenize(const string_view &in)
|
ircd::gpt::vocab::tokenize(const string_view &in,
|
||||||
|
const bool prefix_space)
|
||||||
{
|
{
|
||||||
char str_buf[16];
|
char str_buf[16] {' '};
|
||||||
const string_view str
|
const mutable_buffer buf
|
||||||
{
|
{
|
||||||
str_buf, copy(str_buf, in)
|
str_buf + prefix_space, sizeof(str_buf) - prefix_space
|
||||||
};
|
};
|
||||||
|
|
||||||
u16 buf[16];
|
const string_view str
|
||||||
|
{
|
||||||
|
str_buf, copy(buf, in) + prefix_space
|
||||||
|
};
|
||||||
|
|
||||||
|
u16 out_buf[16];
|
||||||
const auto out
|
const auto out
|
||||||
{
|
{
|
||||||
tokenize(buf, str)
|
tokenize(out_buf, str)
|
||||||
};
|
};
|
||||||
|
|
||||||
if(unlikely(out.size() != 1))
|
if(unlikely(out.size() != 1))
|
||||||
|
@ -278,7 +284,7 @@ ircd::gpt::vocab::tokenize(const string_view &in)
|
||||||
out.size()
|
out.size()
|
||||||
};
|
};
|
||||||
|
|
||||||
return buf[0];
|
return out_buf[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
uint16_t
|
uint16_t
|
||||||
|
|
Loading…
Reference in a new issue