0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-06-18 18:08:21 +02:00

ircd::gpt::vocab: Add tokenization and detokenization count() convenience.

This commit is contained in:
Jason Volk 2022-09-21 16:41:20 -07:00
parent 6092fabe42
commit 4a8302038a
2 changed files with 52 additions and 6 deletions

View file

@ -42,9 +42,15 @@ namespace ircd::gpt::vocab
// Tokenize one token. Error thrown if input is not exactly one token.
// NOTE(review): prefix_space presumably tokenizes as though the input were
// preceded by a space -- confirm against the definition.
u16 tokenize(const string_view &in, const bool prefix_space = false);
// Return the number of tokens which would be output by a tokenize().
// The definition tokenizes into a per-thread scratch array of 2048 tokens and
// asserts that the result is below that capacity.
size_t count(const string_view &in) noexcept;
// Decode token values to build output text string. The text is written into
// `out` and the returned view is the decoded result.
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;
// Return the length of the string which would be output by a detokenize().
// The definition detokenizes into a per-thread scratch buffer of 32 KiB and
// asserts that the result is below that capacity.
size_t count(const vector_view<const u16> &in) noexcept;
// Other tools
// NOTE(review): debug() appears to format a description of `token` into `buf`;
// fmt_msk defaults to all bits set (-1U) and looks like a bitmask selecting
// what is included -- confirm against the definition.
string_view debug(const mutable_buffer &buf, const u16 token, const uint fmt_msk = -1U);
}

View file

@ -213,9 +213,29 @@ ircd::gpt::vocab::debug(const mutable_buffer &out,
};
}
//
// detokenize
//
size_t
ircd::gpt::vocab::count(const vector_view<const u16> &in)
noexcept
{
static const size_t max
{
32_KiB
};
thread_local char buf[max];
const auto res
{
detokenize(buf, in)
};
const auto ret
{
res.size()
};
assert(ret < max);
return ret;
}
ircd::string_view
ircd::gpt::vocab::detokenize(const mutable_buffer &out,
@ -252,9 +272,29 @@ noexcept
};
}
//
// tokenize
//
size_t
ircd::gpt::vocab::count(const string_view &in)
noexcept
{
static const size_t max
{
2048
};
thread_local u16 buf[max];
const auto res
{
tokenize(buf, in)
};
const auto ret
{
res.size()
};
assert(ret < max);
return ret;
}
uint16_t
ircd::gpt::vocab::tokenize(const string_view &in,