0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-09-27 11:18:51 +02:00

ircd::gpt::vocab: Fix string length accumulation.

This commit is contained in:
Jason Volk 2021-04-09 20:52:20 -07:00
parent 0a87754c99
commit 0a6be0efed

View file

@ -547,23 +547,29 @@ ircd::gpt::vocab::unk_tokenize(u16x16 &token,
u64 tokens(0), consumed(0);
const auto len(simd::strlen(str));
while(consumed < len && num + tokens < 16)
for(uint i(0); i < len; ++i)
{
u16 slen(0), tok;
for(uint i(1); i < len; ++i)
{
u8x16 s(str);
for(uint j(0); j < consumed; ++j)
s = shr<8>(s);
for(uint j(len - i); j < 16; ++j)
for(uint j(i); j < 16; ++j)
s[j] = 0;
if((token[num + tokens] = find_token(s)) != u16(-1))
{
consumed += len - i;
++tokens;
break;
}
if((tok = find_token(s)) == u16(-1))
continue;
slen = simd::strlen(s);
token[num + tokens] = tok;
}
//assert(slen > 0);
consumed += slen;
tokens += bool(slen);
}
assert(len >= consumed);
assert(num + tokens <= 16);
const auto overflow{len - consumed};