// Tensor Construct // // Copyright (C) Matrix Construct Developers, Authors & Contributors // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net> // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. namespace ircd::gpt::vocab { static u8x16 get_token(const u16); static u16 find_token(const u8x16); static u16 find_merge(const u8x16, const u8x16); static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint); static uint bpe_merge(u8x16 (&)[16][2], u16 (&)[16], const uint, const u16); static uint bpe_postpare(u8x16 (&)[16], const u8x16 (&)[16][2], const uint); static uint bpe_prepare(u8x16 (&)[16][2], const u8x16); static uint bpe_tokenize(u8x16 (&)[16], const u8x16); static std::array<u32x16, 3> pre_tokenize_split(const u8x16, const i8x16); static u64x2 pre_tokenize(u8x16 (&)[16], const u8x16, const u8x16); static u64x2 unk_tokenize(u16x16 &, const u8x16, u64); static u64x2 tokenize_block(u16x16 &, const u8x16, const i8x16) noexcept; static void init_tokens(), init_merges(); extern const char32_t charset[256]; } /// Remapping of single byte characters (Control (C0) and Basic Latin (ASCII)). [[gnu::visibility("internal")]] decltype(ircd::gpt::vocab::charset) ircd::gpt::vocab::charset alignas(64) { U'Ā', U'ā', U'Ă', U'ă', U'Ą', U'ą', U'Ć', U'ć', // [0x07] U'Ĉ', U'ĉ', U'Ċ', U'ċ', U'Č', U'č', U'Ď', U'ď', // [0x0F] U'Đ', U'đ', U'Ē', U'ē', U'Ĕ', U'ĕ', U'Ė', U'ė', // [0x17] U'Ę', U'ę', U'Ě', U'ě', U'Ĝ', U'ĝ', U'Ğ', U'ğ', // [0x1F] U'Ġ', U'!', U'"', U'#', U'$', U'%', U'&', U'\'', // [0x27] U'(', U')', U'*', U'+', U',', U'-', U'.', U'/', // [0x2F] U'0', U'1', U'2', U'3', U'4', U'5', U'6', U'7', // [0x37] U'8', U'9', U':', U';', U'<', U'=', U'>', U'?', // [0x3F] U'@', U'A', U'B', U'C', U'D', U'E', U'F', U'G', // [0x47] U'H', U'I', U'J', U'K', U'L', U'M', U'N', U'O', // [0x4F] U'P', U'Q', U'R', U'S', U'T', U'U', U'V', U'W', // [0x57] U'X', U'Y', U'Z', U'[', U'\\', U']', U'^', U'_', // [0x5F] U'`', U'a', U'b', U'c', U'd', U'e', U'f', U'g', // [0x67] U'h', U'i', U'j', U'k', U'l', U'm', U'n', U'o', // [0x6F] U'p', U'q', U'r', U's', U't', U'u', U'v', U'w', // [0x77] U'x', U'y', U'z', U'{', U'|', U'}', U'~', U'ġ', // [0x7F] U'Ģ', U'ģ', U'Ĥ', U'ĥ', U'Ħ', U'ħ', U'Ĩ', U'ĩ', // [0x87] U'Ī', U'ī', U'Ĭ', U'ĭ', U'Į', U'į', U'İ', U'ı', // [0x8F] U'IJ', U'ij', U'Ĵ', U'ĵ', U'Ķ', U'ķ', U'ĸ', U'Ĺ', // [0x97] U'ĺ', U'Ļ', U'ļ', U'Ľ', U'ľ', U'Ŀ', U'ŀ', U'Ł', // [0x9F] U'ł', U'¡', U'¢', U'£', U'¤', U'¥', U'¦', U'§', // [0xA7] U'¨', U'©', U'ª', U'«', U'¬', U'Ń', U'®', U'¯', // [0xAF] U'°', U'±', U'²', U'³', U'´', U'µ', U'¶', U'·', // [0xB7] U'¸', U'¹', U'º', U'»', U'¼', U'½', U'¾', U'¿', // [0xBF] U'À', U'Á', U'Â', U'Ã', U'Ä', U'Å', U'Æ', U'Ç', // [0xC7] U'È', U'É', U'Ê', U'Ë', U'Ì', U'Í', U'Î', U'Ï', // [0xCF] U'Ð', U'Ñ', U'Ò', U'Ó', U'Ô', U'Õ', U'Ö', U'×', // [0xD7] U'Ø', U'Ù', U'Ú', U'Û', U'Ü', U'Ý', U'Þ', U'ß', // [0xDF] U'à', U'á', U'â', U'ã', U'ä', U'å', U'æ', U'ç', // [0xE7] U'è', U'é', U'ê', U'ë', U'ì', U'í', U'î', U'ï', // [0xEF] U'ð', U'ñ', U'ò', U'ó', U'ô', U'õ', U'ö', U'÷', // [0xF7] U'ø', U'ù', U'ú', U'û', U'ü', U'ý', U'þ', U'ÿ', // [0xFF] }; decltype(ircd::gpt::vocab::tokens) ircd::gpt::vocab::tokens; decltype(ircd::gpt::vocab::merges) ircd::gpt::vocab::merges; decltype(ircd::gpt::vocab::token) ircd::gpt::vocab::token alignas(64); decltype(ircd::gpt::vocab::merge) ircd::gpt::vocab::merge alignas(64); decltype(ircd::gpt::vocab::tokens_path) ircd::gpt::vocab::tokens_path { { { "name", "ircd.gpt.vocab.tokens.path" }, { "default", string_view{} }, }, init_tokens }; decltype(ircd::gpt::vocab::merges_path) ircd::gpt::vocab::merges_path { { { "name", "ircd.gpt.vocab.merges.path" }, { "default", string_view{} }, }, init_merges }; void ircd::gpt::vocab::init_tokens() { if(!tokens_path) return; const ircd::fs::fd file { string_view{tokens_path} }; const ircd::fs::map vocab_json { file, ircd::fs::map::opts{} }; tokens = 0; for(const auto &[key, val] : json::object(vocab_json)) { assert(tokens == lex_cast<uint16_t>(val)); auto &buf { token[tokens++] }; const auto unescaped { json::unescape(buf, key) }; for(size_t i(size(unescaped)); i < 16; ++i) buf[i] = 0; } } void ircd::gpt::vocab::init_merges() { if(!merges_path) return; const ircd::fs::fd file { string_view{merges_path} }; const ircd::fs::map merges_txt { file, ircd::fs::map::opts{} }; merges = 0; ircd::tokens(split(merges_txt, '\n').second, '\n', [] (const string_view &line) { const auto &[a, b] { split(line, ' ') }; copy(merge[merges][0], a); copy(merge[merges][1], b); ++merges; }); } ircd::string_view ircd::gpt::vocab::debug(const mutable_buffer &out, const u16 idx, const uint mask) { const auto *const token { reinterpret_cast<const u8x16 *>(vocab::token) }; thread_local char strbuf[2][512]; return fmt::sprintf { out, "%5u %s%32s%s%s%s", idx, mask & 0x1? "[ "_sv: string_view{}, mask & 0x1? simd::print_chr(strbuf[0], token[idx]): string_view{}, mask & 0x1? " ]"_sv: string_view{}, mask & 0x2? " "_sv: string_view{}, mask & 0x2? simd::print_mem(strbuf[1], token[idx]): string_view{}, }; } // // detokenize // ircd::string_view ircd::gpt::vocab::detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept { size_t off(0); for(const u16 &id : in) { const auto &token { vocab::token[id] }; const string_view text { token, strnlen(token, 16) }; string_view dest { data(out + off), copy(out + off, text) }; dest = replace(out + off, dest, "Ġ"_sv, " "_sv); dest = replace(out + off, dest, "Ċ"_sv, "\n"_sv); off += size(dest); } assert(off <= size(out)); return string_view { data(out), off }; } // // tokenize // uint16_t ircd::gpt::vocab::tokenize(const string_view &in) { char str_buf[16]; const string_view str { str_buf, copy(str_buf, in) }; u16 buf[16]; const auto out { tokenize(buf, str) }; if(unlikely(out.size() != 1)) throw error { "Input tokenizes to %zu tokens.", out.size() }; return buf[0]; } uint16_t ircd::gpt::vocab::tokenize(const_buffer &in) noexcept { char str_buf[16]; const string_view str { str_buf, copy(str_buf, in) }; u16 buf[16]; const auto out { tokenize(buf, str) }; const auto &tok { get_token(buf[0]) }; const auto consumed { simd::strlen(tok) }; consume(in, consumed); return buf[0]; } ircd::vector_view<ircd::u16> ircd::gpt::vocab::tokenize(const vector_view<u16> &out, const string_view &in) noexcept { using input_t = u8x16; using block_t = u16x16; assert(out.size() >= simd::lanes<block_t>()); const u64x2 max { out.size(), in.size(), }; const auto block { reinterpret_cast<block_t *>(out.data()) }; const auto consumed { simd::tokens<input_t, block_t>(block, in.data(), max, gpt::vocab::tokenize_block) }; assert(consumed[0] <= out.size()); assert(consumed[0] <= consumed[1]); return vector_view<u16> ( out.data(), consumed[0] ); } ircd::u64x2 ircd::gpt::vocab::tokenize_block(u16x16 &token, const u8x16 in, const i8x16 in_mask) noexcept { u8x16 pre_token[16]; const auto [pre_tokens, consumed] { pre_tokenize(pre_token, in, in_mask) }; u64x2 ret { 0, consumed }; assert(consumed); for(uint i(0); i < pre_tokens && ret[0] < 16; ++i) { // one token in hand is worth two in the bpe if(likely((token[ret[0]] = find_token(pre_token[i])) != u16(-1))) { ++ret[0]; continue; } u8x16 str[16]; const uint strs { bpe_tokenize(str, pre_token[i]) }; for(uint j(0); j < strs && ret[0] < 16; ++j) { if(likely((token[ret[0]] = find_token(str[j])) != u16(-1))) { ++ret[0]; continue; } ret += unk_tokenize(token, str[j], ret[0]); } } assert(ret[1]); return ret; } // // pre-tokenizer // /// Pre-tokenizationis formalized by the regular expression: /// /// 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ /// /// The return value in [0] indicates the number of tokens populated in the /// array; the value in [1] indicates the bytes consumed from the input. /// /// Split single vector of UTF-32 codepoints into vectors of UTF-8 strings for /// each token determined by the input masks. Returns the number of tokens in /// [0] and the number of codepoints consumed in [1]. ircd::u64x2 ircd::gpt::vocab::pre_tokenize(u8x16 (&token)[16], const u8x16 in, const u8x16 in_mask) { auto [ch, ch_mask, tok_mask] { pre_tokenize_split(in, in_mask) }; // Replace single-byte codepoints from the LUT. u32x16 rch; for(uint i(0); i < 16; ++i) rch[i] = ch[i] > 0xFF? ch[i]: charset[ch[i]]; u64x2 ret {0, 0}; for(uint i(0); ret[0] < 16 && ret[1] < 16; ++i) { static const u32x16 lane0_mask { -1U, 0 }; // Create a mask from all non-leading characters of input tokens with // a mask of just the leading character of the first token. To be sure // extra characters are not included we rinse it with the ch_mask. const u32x16 cover_mask ( (lane0_mask | tok_mask) & ch_mask ); // Get the number of codepoints of the first token from the cover. const auto cp_num { std::min(simd::lzcnt(~cover_mask | ~ch_mask) / 32UL, 16UL) }; // Input codepoint lengths const u32x16 cp_len ( utf8::length(ch) & cover_mask ); // Output codepoint lengths const u32x16 rcp_len ( utf8::length(rch) & cover_mask ); // Generate utf-8 codepoints const u8x64 rch8 ( utf8::encode_sparse(rch & cover_mask) ); u32x16 idx; uint off(0), len(0); for(uint j(0); j < cp_num; ++j) idx[j] = off, off += rcp_len[j], len += cp_len[j]; // One token over the line... if(ret[1] + off >= 16 && i > 0) break; // We have to return the proper number of bytes for what was truncated // from the input, but the truncation is determined after a transform // which may have a different size; this has to be offset back now. if(ret[1] + off > 16) len = 16; // Pack the utf-8 codepoints into the result token token[i] = u8x16{0}; for(uint j(0); j < cp_num; ++j) for(uint k(0); k < rcp_len[j] && idx[j] + k < 16; ++k) token[i][idx[j] + k] = rch8[j * 4 + k]; // Shift the token off the input to consume the next. for(uint j(0); j < cp_num; ++j) { ch = shr<32>(ch); rch = shr<32>(rch); ch_mask = shr<32>(ch_mask); tok_mask = shr<32>(tok_mask); } ret[0] += 1; ret[1] += len; assert(ret[1] <= 16); assert(len <= 16); } return ret; } std::array<ircd::u32x16, 3> ircd::gpt::vocab::pre_tokenize_split(const u8x16 in, const i8x16 in_mask) { const i8x16 is_ascii_ctrl ( in < 0x20 ); const i8x16 is_ascii_space ( in == ' ' ); const i8x16 is_ascii_number ( in >= '0' && in <= '9' ); const i8x16 is_ascii_letter (0 | (in >= 'a' && in <= 'z') | (in >= 'A' && in <= 'Z') ); const i8x16 is_ascii_punct (0 | (in >= '!' && in <= '/') | (in >= ':' && in <= '@') | (in >= '[' && in <= '`') | (in >= '{' && in <= '~') ); const i8x16 ascii_categorized (0 | is_ascii_ctrl | is_ascii_space | is_ascii_punct | is_ascii_letter | is_ascii_number ); const i8x16 maybe_notascii ( ~ascii_categorized & in_mask ); const i8x16 null_mask ( in == 0 && in_mask != 0 ); const u32x16 ch ( utf8::decode(in) ); const i32x16 ch_mask (0 | (ch != 0) | lane_cast<i32x16>(null_mask) ); const u32x16 uc_ch ( ch & (lane_cast<i32x16>(maybe_notascii)) ); const u32x16 uc_cat ( icu::category(uc_ch) ); const i32x16 is_L (0 | ((uc_cat & 0x0000003eU) != 0) | (lane_cast<i32x16>(is_ascii_letter)) ); const i32x16 is_N (0 | ((uc_cat & 0x00000e00U) != 0) | (lane_cast<i32x16>(is_ascii_number)) ); const i32x16 is_Z (0 | ((uc_cat & 0x00007000U) != 0) | (lane_cast<i32x16>(is_ascii_space)) ); const i32x16 is_C0 (0 | (lane_cast<i32x16>(is_ascii_ctrl)) ); const i32x16 is_punct (0 | (lane_cast<i32x16>(is_ascii_punct)) ); // Decide characters which do not start a new token based on the // preceding character. const i32x16 is_trail (0 | (is_L & shl<32>(is_L)) | (is_N & shl<32>(is_N)) | (is_Z & shl<32>(is_Z)) | (is_L & shl<32>(is_punct)) | (is_punct & shl<32>(is_punct)) ); // Decide characters which may start a token. const i32x16 is_head ( (~is_trail | is_C0) & ch_mask ); // Decide if candidate token is preceded by a space. const i32x16 leading_space ( is_head & shl<32>(is_Z) ); // Mask if next char is also the same char. const i32x16 is_rep ( is_head & (shl<32>(ch) == ch) ); // Decide the starting character of each token. const i32x16 tok_head (0 | (is_head & ~leading_space & ~is_rep) | shr<32>(leading_space) ); const i32x16 tok_trail ( ~tok_head ); const i32x16 tok_mask ( tok_trail ); return { ch, ch_mask, tok_mask }; } // // post-tokenizer // [[gnu::noinline]] ircd::u64x2 ircd::gpt::vocab::unk_tokenize(u16x16 &token, const u8x16 str, const u64 num) { const auto len { simd::strlen(str) }; u64 tokens(0), consumed(0); while(consumed < len && num + tokens < 16) { uint slen(0); for(uint i(0); i < len - consumed; ++i) { u8x16 s(str); for(uint j(0); j < consumed; ++j) s = shr<8>(s); for(uint j(i + 1); j < 16; ++j) s[j] = 0; u16 tok; if((tok = find_token(s)) == u16(-1)) continue; token[num + tokens] = tok; slen = simd::strlen(s); } // Last possible branch; token is bytewise identity. if(!slen) token[num + tokens] = str[consumed]; assert(slen < 16); consumed += std::max(slen, 1U); tokens += 1U; } assert(len >= consumed); assert(num + tokens <= 16); const auto overflow{len - consumed}; assert(overflow == 0 || num + tokens == 16); assert(consumed > 0 || tokens == 0); assert(tokens > 0 || len == 0); return u64x2 { // return number of tokens created only; the caller already counted // the length of str as consumed input. tokens, 0 }; } // // byte-pair encoding // [[gnu::noinline]] uint ircd::gpt::vocab::bpe_tokenize(u8x16 (&str)[16], const u8x16 pre_token) { if(simd::strlen(pre_token) < 2) { str[0] = pre_token; return 1; } u8x16 pair[16][2]; auto pairs { bpe_prepare(pair, pre_token) }; u16 score[16] {0}; for(uint j(0); j < 16 && pairs > 1; ++j) { const auto best_score { bpe_score(score, pair, pairs) }; const auto merges { bpe_merge(pair, score, pairs, best_score) }; pairs -= merges; if(!merges) break; } const uint strs { bpe_postpare(str, pair, pairs) }; return strs; } uint ircd::gpt::vocab::bpe_prepare(u8x16 (&out)[16][2], const u8x16 in) { const auto len { simd::strlen(in) }; const u8x16 cplen ( utf8::length(in) ); u32x16 idx; for(uint i(0), off(0); i < 16; off += cplen[i++]) idx[i] = off; uint ret(0); for(uint phase(0); phase < 2; ++phase) for(uint i(phase); i < 16; i += 2, ++ret) { if(idx[i] >= 16 || !in[idx[i]]) break; out[i][0] = u8x16{0}; out[i][1] = u8x16{0}; for(uint k(0); k < 2; ++k) for(uint j(0); j < cplen[i + k] && idx[i + k] + j < 16; ++j) out[i][k][j] = in[idx[i + k] + j]; } return ret; } uint ircd::gpt::vocab::bpe_postpare(u8x16 (&out)[16], const u8x16 (&in)[16][2], const uint num) { uint ret(0); for(uint j(0); j < num; ++j) if(simd::strlen(in[j][0])) out[ret++] = in[j][0]; if(likely(num)) if(simd::strlen(in[num - 1][1])) out[ret++] = in[num - 1][1]; return ret; } uint ircd::gpt::vocab::bpe_merge(u8x16 (&pair)[16][2], u16 (&score)[16], const uint num, const u16 best_score) { uint ret(0); for(uint i(0); i < num - ret; ++i) { if(score[i] != best_score) continue; pair[i][0] = simd::strcat(pair[i][0], pair[i][1]); score[i] = 0; if(i > 0) { pair[i - 1][1] = simd::strcat(pair[i - 1][1], pair[i][1]); score[i - 1] = 0; } if(i < 15) pair[i][1] = pair[i + 1][1]; for(uint j(i + 1); j + 1 < num; ++j) { pair[j][0] = pair[j + 1][0]; pair[j][1] = pair[j + 1][1]; score[j] = score[j + 1]; } ++ret; } return ret; } ircd::u16 ircd::gpt::vocab::bpe_score(u16 (&score)[16], const u8x16 (&pair)[16][2], const uint num) { uint best(-1U), is_min; for(uint i(0); i < num; i++) { // Only find the merge if the score is set to zero. if(!score[i]) score[i] = find_merge(pair[i][0], pair[i][1]); // If the score is set to -1 this index is inactive or wasn't a // valid pair. is_min = boolmask<uint>(score[i] != u16(-1)); is_min &= boolmask<uint>(score[i] < best); best = (is_min & score[i]) | (~is_min & best); } return best; } // // queries // ircd::u16 ircd::gpt::vocab::find_token(const u8x16 string) { for(uint i(0); i < tokens; ++i) if(simd::streq(string, get_token(i))) return i; return u16(-1U); } ircd::u16 ircd::gpt::vocab::find_merge(const u8x16 a, const u8x16 b) { const auto &__restrict__ merge { reinterpret_cast<const u8x16 (&)[65536][2]>(vocab::merge) }; for(uint i(0); i < merges; ++i) { if(likely(!simd::streq(a, merge[i][0]))) continue; if(likely(!simd::streq(b, merge[i][1]))) continue; return i; } return u16(-1U); } ircd::u8x16 ircd::gpt::vocab::get_token(const u16 idx) { const auto *const __restrict__ token { reinterpret_cast<const u8x16 *>(vocab::token) }; return token[idx]; }