mirror of https://github.com/matrix-construct/construct

ircd::gpt: Split vocab related into separate unit.

Jason Volk 2021-03-01 14:49:08 -08:00
parent 59f9aca938
commit 29b99dcf4d
3 changed files with 18 additions and 9 deletions


@@ -26,6 +26,11 @@ namespace ircd::gpt::vocab
token [65536][16],
merge [65536][2][16];
// Paths to the files containing token and merge data.
extern conf::item<std::string>
tokens_path,
merges_path;
// Tokenize UTF-8 input string of any length into proper token values,
vector_view<u16>
tokenize(const vector_view<u16> &out,
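
For orientation, a minimal usage sketch of the tokenize() interface declared above. The declaration is truncated in this hunk, so the shape of the input parameter and the size of the output buffer are assumptions here, not taken from the commit:

// Hypothetical caller; the input-text argument is assumed since the
// declaration above is truncated, and the 1024-entry buffer is arbitrary.
ircd::u16 buf[1024];
const ircd::vector_view<ircd::u16> out{buf};
const auto tokens
{
	ircd::gpt::vocab::tokenize(out, "input text to encode")
};
// Presumably a view over the leading portion of buf that was written.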


@@ -217,7 +217,7 @@ libircd_la_SOURCES += png.cc
if OPENCL
libircd_la_SOURCES += cl.cc
endif
libircd_la_SOURCES += gpt.cc
libircd_la_SOURCES += gpt_vocab.cc
libircd_la_SOURCES += openssl.cc
libircd_la_SOURCES += rfc1459.cc
libircd_la_SOURCES += rfc3986.cc


@@ -13,19 +13,19 @@ namespace ircd::gpt::vocab
static u16 find_token(const u8x16) noexcept;
static uint find_tokens(u16x16 &, const uint, const u8x16 (&)[16], const uint) noexcept;
static u16 find_merge(const u8x16, const u8x16) noexcept;
static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint) noexcept;
static uint bpe_merge(u8x16 (&)[16][2], u16 (&)[16], const uint, const u16) noexcept;
static uint bpe_postpare(u8x16 (&)[16], const u8x16 (&)[16][2], const uint) noexcept;
static uint bpe_prepare(u8x16 (&)[16][2], const u8x16) noexcept;
static uint bpe_tokenize(u8x16 (&)[16], const u8x16) noexcept;
static u64x2 pre_tokenize_split(u8x16 (&)[16], u32x16, u32x16, u32x16) noexcept;
static u64x2 pre_tokenize(u8x16 (&)[16], const u8x16, const u8x16) noexcept;
static u64x2 tokenize_block(u16x16 &, const u8x16, const u8x16) noexcept;
static void init_tokens() noexcept;
static void init_merges() noexcept;
extern conf::item<std::string> tokens_path;
extern conf::item<std::string> merges_path;
static u64x2 tokenize_block(u16x16 &, const u8x16, const u8x16) noexcept;
static void init_tokens(), init_merges();
}
decltype(ircd::gpt::vocab::tokens)
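
Read together, these declarations suggest the unit's pipeline: the input block is split into pre-tokens, each pre-token is byte-pair encoded (pairs prepared, merged by rank, post-processed), and the resulting byte strings are looked up in the token table. The helper below is a sketch of that composition inferred only from the declared signatures; the actual tokenize_block() body is not shown in this hunk, and tokenize_one() is a name invented here:

// Hypothetical internal helper illustrating how the declared stages could
// chain for a single pre-token; not the committed implementation.
static uint
tokenize_one(u16x16 &out,
             const u8x16 pre_token)
noexcept
{
	u8x16 str[16];
	const uint strs
	{
		bpe_tokenize(str, pre_token)       // drives bpe_prepare/bpe_merge/bpe_postpare
	};

	return find_tokens(out, 0, str, strs); // map each merged string to its id
}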
@@ -64,7 +64,6 @@ ircd::gpt::vocab::merges_path
void
ircd::gpt::vocab::init_tokens()
noexcept
{
if(!tokens_path)
return;
@@ -89,7 +88,6 @@ noexcept
void
ircd::gpt::vocab::init_merges()
noexcept
{
if(!merges_path)
return;
@@ -130,7 +128,7 @@ noexcept
{
mutable_buffer buf(out);
for(const u16 &token : in)
consume(buf, copy(buf, const_buffer(vocab::token[token], simd::strlen(vocab::token[token]))));
consume(buf, copy(buf, const_buffer(vocab::token[token], size(string_view(vocab::token[token])))));
return string_view
{
@@ -412,6 +410,12 @@ ircd::gpt::vocab::bpe_tokenize(u8x16 (&str)[16],
const u8x16 pre_token)
noexcept
{
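// A pre-token shorter than two characters has no byte pairs to merge; emit it
// unchanged as a single entry.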
if(simd::strlen(pre_token) < 2)
{
str[0] = pre_token;
return 1;
}
u8x16 pair[16][2];
auto pairs
{