// Matrix Construct // // Copyright (C) Matrix Construct Developers, Authors & Contributors // Copyright (C) 2016-2021 Jason Volk // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. #pragma once #define HAVE_IRCD_GPT_VOCAB_H /// Vocabulary Tokenization & Encoding /// namespace ircd::gpt::vocab { IRCD_EXCEPTION(gpt::error, error) // Actual number of tokens and merges stored in following lists. extern size_t tokens, merges; // Lists of tokens and merges. Values are strings up to length maxlen which // are null terminated if shorter. extern char token [65536][16], merge [65536][2][16]; // Paths to the files containing token and merge datas. extern conf::item tokens_path, merges_path; // Tokenize UTF-8 input string of any length into proper token values, vector_view tokenize(const vector_view &out, const string_view &in) noexcept; // Tokenize one token. The buffer is advanced consuming one token per call. u16 tokenize(const_buffer &) noexcept; // Tokenize one token. Error thrown if input is not exactly one token. u16 tokenize(const string_view &in); // Decode token values to build output text string. string_view detokenize(const mutable_buffer &out, const vector_view &in) noexcept; // Other tools string_view debug(const mutable_buffer &buf, const u16 token, const uint fmt_msk = -1U); }