mirror of
https://github.com/matrix-construct/construct
synced 2024-11-19 00:10:59 +01:00
ircd::gpt: Split vocab related into separate unit.
This commit is contained in:
parent
59f9aca938
commit
29b99dcf4d
3 changed files with 18 additions and 9 deletions
|
@ -26,6 +26,11 @@ namespace ircd::gpt::vocab
|
||||||
token [65536][16],
|
token [65536][16],
|
||||||
merge [65536][2][16];
|
merge [65536][2][16];
|
||||||
|
|
||||||
|
// Paths to the files containing token and merge data.
|
||||||
|
extern conf::item<std::string>
|
||||||
|
tokens_path,
|
||||||
|
merges_path;
|
||||||
|
|
||||||
// Tokenize UTF-8 input string of any length into proper token values,
|
// Tokenize UTF-8 input string of any length into proper token values,
|
||||||
vector_view<u16>
|
vector_view<u16>
|
||||||
tokenize(const vector_view<u16> &out,
|
tokenize(const vector_view<u16> &out,
|
||||||
|
|
|
@ -217,7 +217,7 @@ libircd_la_SOURCES += png.cc
|
||||||
if OPENCL
|
if OPENCL
|
||||||
libircd_la_SOURCES += cl.cc
|
libircd_la_SOURCES += cl.cc
|
||||||
endif
|
endif
|
||||||
libircd_la_SOURCES += gpt.cc
|
libircd_la_SOURCES += gpt_vocab.cc
|
||||||
libircd_la_SOURCES += openssl.cc
|
libircd_la_SOURCES += openssl.cc
|
||||||
libircd_la_SOURCES += rfc1459.cc
|
libircd_la_SOURCES += rfc1459.cc
|
||||||
libircd_la_SOURCES += rfc3986.cc
|
libircd_la_SOURCES += rfc3986.cc
|
||||||
|
|
|
@ -13,19 +13,19 @@ namespace ircd::gpt::vocab
|
||||||
static u16 find_token(const u8x16) noexcept;
|
static u16 find_token(const u8x16) noexcept;
|
||||||
static uint find_tokens(u16x16 &, const uint, const u8x16 (&)[16], const uint) noexcept;
|
static uint find_tokens(u16x16 &, const uint, const u8x16 (&)[16], const uint) noexcept;
|
||||||
static u16 find_merge(const u8x16, const u8x16) noexcept;
|
static u16 find_merge(const u8x16, const u8x16) noexcept;
|
||||||
|
|
||||||
static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint) noexcept;
|
static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint) noexcept;
|
||||||
static uint bpe_merge(u8x16 (&)[16][2], u16 (&)[16], const uint, const u16) noexcept;
|
static uint bpe_merge(u8x16 (&)[16][2], u16 (&)[16], const uint, const u16) noexcept;
|
||||||
static uint bpe_postpare(u8x16 (&)[16], const u8x16 (&)[16][2], const uint) noexcept;
|
static uint bpe_postpare(u8x16 (&)[16], const u8x16 (&)[16][2], const uint) noexcept;
|
||||||
static uint bpe_prepare(u8x16 (&)[16][2], const u8x16) noexcept;
|
static uint bpe_prepare(u8x16 (&)[16][2], const u8x16) noexcept;
|
||||||
static uint bpe_tokenize(u8x16 (&)[16], const u8x16) noexcept;
|
static uint bpe_tokenize(u8x16 (&)[16], const u8x16) noexcept;
|
||||||
|
|
||||||
static u64x2 pre_tokenize_split(u8x16 (&)[16], u32x16, u32x16, u32x16) noexcept;
|
static u64x2 pre_tokenize_split(u8x16 (&)[16], u32x16, u32x16, u32x16) noexcept;
|
||||||
static u64x2 pre_tokenize(u8x16 (&)[16], const u8x16, const u8x16) noexcept;
|
static u64x2 pre_tokenize(u8x16 (&)[16], const u8x16, const u8x16) noexcept;
|
||||||
static u64x2 tokenize_block(u16x16 &, const u8x16, const u8x16) noexcept;
|
|
||||||
static void init_tokens() noexcept;
|
|
||||||
static void init_merges() noexcept;
|
|
||||||
|
|
||||||
extern conf::item<std::string> tokens_path;
|
static u64x2 tokenize_block(u16x16 &, const u8x16, const u8x16) noexcept;
|
||||||
extern conf::item<std::string> merges_path;
|
|
||||||
|
static void init_tokens(), init_merges();
|
||||||
}
|
}
|
||||||
|
|
||||||
decltype(ircd::gpt::vocab::tokens)
|
decltype(ircd::gpt::vocab::tokens)
|
||||||
|
@ -64,7 +64,6 @@ ircd::gpt::vocab::merges_path
|
||||||
|
|
||||||
void
|
void
|
||||||
ircd::gpt::vocab::init_tokens()
|
ircd::gpt::vocab::init_tokens()
|
||||||
noexcept
|
|
||||||
{
|
{
|
||||||
if(!tokens_path)
|
if(!tokens_path)
|
||||||
return;
|
return;
|
||||||
|
@ -89,7 +88,6 @@ noexcept
|
||||||
|
|
||||||
void
|
void
|
||||||
ircd::gpt::vocab::init_merges()
|
ircd::gpt::vocab::init_merges()
|
||||||
noexcept
|
|
||||||
{
|
{
|
||||||
if(!merges_path)
|
if(!merges_path)
|
||||||
return;
|
return;
|
||||||
|
@ -130,7 +128,7 @@ noexcept
|
||||||
{
|
{
|
||||||
mutable_buffer buf(out);
|
mutable_buffer buf(out);
|
||||||
for(const u16 &token : in)
|
for(const u16 &token : in)
|
||||||
consume(buf, copy(buf, const_buffer(vocab::token[token], simd::strlen(vocab::token[token]))));
|
consume(buf, copy(buf, const_buffer(vocab::token[token], size(string_view(vocab::token[token])))));
|
||||||
|
|
||||||
return string_view
|
return string_view
|
||||||
{
|
{
|
||||||
|
@ -412,6 +410,12 @@ ircd::gpt::vocab::bpe_tokenize(u8x16 (&str)[16],
|
||||||
const u8x16 pre_token)
|
const u8x16 pre_token)
|
||||||
noexcept
|
noexcept
|
||||||
{
|
{
|
||||||
|
if(simd::strlen(pre_token) < 2)
|
||||||
|
{
|
||||||
|
str[0] = pre_token;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
u8x16 pair[16][2];
|
u8x16 pair[16][2];
|
||||||
auto pairs
|
auto pairs
|
||||||
{
|
{
|
Loading…
Reference in a new issue