mirror of
https://github.com/matrix-construct/construct
synced 2024-09-27 19:28:52 +02:00
ircd::gpt::vocab: Add tokenization and detokenization count() convenience.
This commit is contained in:
parent
6092fabe42
commit
4a8302038a
2 changed files with 52 additions and 6 deletions
|
@ -42,9 +42,15 @@ namespace ircd::gpt::vocab
|
|||
// Tokenize one token. Error thrown if input is not exactly one token.
|
||||
u16 tokenize(const string_view &in, const bool prefix_space = false);
|
||||
|
||||
// Return the number of tokens which would be output by a tokenize().
|
||||
size_t count(const string_view &in) noexcept;
|
||||
|
||||
// Decode token values to build output text string.
|
||||
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;
|
||||
|
||||
// Return the length of the string which would be output by a detokenize().
|
||||
size_t count(const vector_view<const u16> &in) noexcept;
|
||||
|
||||
// Other tools
|
||||
string_view debug(const mutable_buffer &buf, const u16 token, const uint fmt_msk = -1U);
|
||||
}
|
||||
|
|
|
@ -213,9 +213,29 @@ ircd::gpt::vocab::debug(const mutable_buffer &out,
|
|||
};
|
||||
}
|
||||
|
||||
//
|
||||
// detokenize
|
||||
//
|
||||
size_t
|
||||
ircd::gpt::vocab::count(const vector_view<const u16> &in)
|
||||
noexcept
|
||||
{
|
||||
static const size_t max
|
||||
{
|
||||
32_KiB
|
||||
};
|
||||
|
||||
thread_local char buf[max];
|
||||
const auto res
|
||||
{
|
||||
detokenize(buf, in)
|
||||
};
|
||||
|
||||
const auto ret
|
||||
{
|
||||
res.size()
|
||||
};
|
||||
|
||||
assert(ret < max);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ircd::string_view
|
||||
ircd::gpt::vocab::detokenize(const mutable_buffer &out,
|
||||
|
@ -252,9 +272,29 @@ noexcept
|
|||
};
|
||||
}
|
||||
|
||||
//
|
||||
// tokenize
|
||||
//
|
||||
size_t
|
||||
ircd::gpt::vocab::count(const string_view &in)
|
||||
noexcept
|
||||
{
|
||||
static const size_t max
|
||||
{
|
||||
2048
|
||||
};
|
||||
|
||||
thread_local u16 buf[max];
|
||||
const auto res
|
||||
{
|
||||
tokenize(buf, in)
|
||||
};
|
||||
|
||||
const auto ret
|
||||
{
|
||||
res.size()
|
||||
};
|
||||
|
||||
assert(ret < max);
|
||||
return ret;
|
||||
}
|
||||
|
||||
uint16_t
|
||||
ircd::gpt::vocab::tokenize(const string_view &in,
|
||||
|
|
Loading…
Reference in a new issue