mirror of https://github.com/matrix-construct/construct

ircd::gpt: Split vocab related into separate unit.

Jason Volk 2021-03-01 14:49:08 -08:00
parent 59f9aca938
commit 29b99dcf4d
3 changed files with 18 additions and 9 deletions


@@ -26,6 +26,11 @@ namespace ircd::gpt::vocab
token [65536][16],
merge [65536][2][16];
// Paths to the files containing token and merge data.
extern conf::item<std::string>
tokens_path,
merges_path;
// Tokenize UTF-8 input string of any length into proper token values,
vector_view<u16>
tokenize(const vector_view<u16> &out,
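
For orientation, a minimal usage sketch of the tokenize() interface declared above. The declaration is truncated in this hunk, so the shape of the input parameter and the size of the output buffer are assumptions here, not taken from the commit:

// Hypothetical caller; the input-text argument is assumed since the
// declaration above is truncated, and the 1024-entry buffer is arbitrary.
ircd::u16 buf[1024];
const ircd::vector_view<ircd::u16> out{buf};
const auto tokens
{
	ircd::gpt::vocab::tokenize(out, "input text to encode")
};
// Presumably a view over the leading portion of buf that was written.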


@@ -217,7 +217,7 @@ libircd_la_SOURCES += png.cc
if OPENCL
libircd_la_SOURCES += cl.cc
endif
libircd_la_SOURCES += gpt.cc
libircd_la_SOURCES += gpt_vocab.cc
libircd_la_SOURCES += openssl.cc
libircd_la_SOURCES += rfc1459.cc
libircd_la_SOURCES += rfc3986.cc


@@ -13,19 +13,19 @@ namespace ircd::gpt::vocab
static u16 find_token(const u8x16) noexcept;
static uint find_tokens(u16x16 &, const uint, const u8x16 (&)[16], const uint) noexcept;
static u16 find_merge(const u8x16, const u8x16) noexcept;
static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint) noexcept;
static uint bpe_merge(u8x16 (&)[16][2], u16 (&)[16], const uint, const u16) noexcept;
static uint bpe_postpare(u8x16 (&)[16], const u8x16 (&)[16][2], const uint) noexcept;
static uint bpe_prepare(u8x16 (&)[16][2], const u8x16) noexcept;
static uint bpe_tokenize(u8x16 (&)[16], const u8x16) noexcept;
static u64x2 pre_tokenize_split(u8x16 (&)[16], u32x16, u32x16, u32x16) noexcept;
static u64x2 pre_tokenize(u8x16 (&)[16], const u8x16, const u8x16) noexcept;
static u64x2 tokenize_block(u16x16 &, const u8x16, const u8x16) noexcept;
static void init_tokens() noexcept;
static void init_merges() noexcept;
extern conf::item<std::string> tokens_path;
extern conf::item<std::string> merges_path;
static u64x2 tokenize_block(u16x16 &, const u8x16, const u8x16) noexcept;
static void init_tokens(), init_merges();
}
decltype(ircd::gpt::vocab::tokens)
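
Read together, these declarations suggest the unit's pipeline: the input block is split into pre-tokens, each pre-token is byte-pair encoded (pairs prepared, merged by rank, post-processed), and the resulting byte strings are looked up in the token table. The helper below is a sketch of that composition inferred only from the declared signatures; the actual tokenize_block() body is not shown in this hunk, and tokenize_one() is a name invented here:

// Hypothetical internal helper illustrating how the declared stages could
// chain for a single pre-token; not the committed implementation.
static uint
tokenize_one(u16x16 &out,
             const u8x16 pre_token)
noexcept
{
	u8x16 str[16];
	const uint strs
	{
		bpe_tokenize(str, pre_token)       // drives bpe_prepare/bpe_merge/bpe_postpare
	};

	return find_tokens(out, 0, str, strs); // map each merged string to its id
}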
@@ -64,7 +64,6 @@ ircd::gpt::vocab::merges_path
void
ircd::gpt::vocab::init_tokens()
noexcept
{
if(!tokens_path)
return;
@@ -89,7 +88,6 @@ noexcept
void
ircd::gpt::vocab::init_merges()
noexcept
{
if(!merges_path)
return;
@@ -130,7 +128,7 @@ noexcept
{
mutable_buffer buf(out);
for(const u16 &token : in)
consume(buf, copy(buf, const_buffer(vocab::token[token], simd::strlen(vocab::token[token]))));
consume(buf, copy(buf, const_buffer(vocab::token[token], size(string_view(vocab::token[token])))));
return string_view
{
@@ -412,6 +410,12 @@ ircd::gpt::vocab::bpe_tokenize(u8x16 (&str)[16],
const u8x16 pre_token)
noexcept
{
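// A pre-token shorter than two characters has no byte pairs to merge; emit it
// unchanged as a single entry.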
if(simd::strlen(pre_token) < 2)
{
str[0] = pre_token;
return 1;
}
u8x16 pair[16][2];
auto pairs
{