// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

// Include guard for this header, followed by a marker macro recording that
// the header has been included.
#pragma once
#define HAVE_IRCD_GPT_VOCAB_H

/// Vocabulary Tokenization & Encoding
|
|
|
|
///
|
|
|
|
namespace ircd::gpt::vocab
|
|
|
|
{
|
2021-03-05 15:33:05 -08:00
|
|
|
IRCD_EXCEPTION(gpt::error, error)
|
|
|
|
|
2021-02-25 19:05:02 -08:00
|
|
|
// Actual number of tokens and merges stored in following lists.
|
|
|
|
extern size_t
|
|
|
|
tokens,
|
|
|
|
merges;
|
|
|
|
|
|
|
|
// Lists of tokens and merges. Values are strings up to length maxlen which
|
|
|
|
// are null terminated if shorter.
|
|
|
|
extern char
|
|
|
|
token [65536][16],
|
|
|
|
merge [65536][2][16];
|
|
|
|
|
2021-03-01 14:49:08 -08:00
|
|
|
// Paths to the files containing token and merge datas.
|
|
|
|
extern conf::item<std::string>
|
|
|
|
tokens_path,
|
|
|
|
merges_path;
|
|
|
|
|
2021-02-25 19:05:02 -08:00
|
|
|
// Tokenize UTF-8 input string of any length into proper token values,
|
2022-06-19 18:59:29 -07:00
|
|
|
vector_view<u16> tokenize(const vector_view<u16> &out, const string_view &in) noexcept;
|
|
|
|
|
|
|
|
// Tokenize one token. The buffer is advanced consuming one token per call.
|
|
|
|
u16 tokenize(const_buffer &) noexcept;
|
|
|
|
|
|
|
|
// Tokenize one token. Error thrown if input is not exactly one token.
|
2022-07-01 16:33:09 -07:00
|
|
|
u16 tokenize(const string_view &in, const bool prefix_space = false);
|
2021-02-25 19:05:02 -08:00
|
|
|
|
2022-09-21 16:41:20 -07:00
|
|
|
// Return the number of tokens which would be output by a tokenize().
|
|
|
|
size_t count(const string_view &in) noexcept;
|
|
|
|
|
2021-02-25 19:05:02 -08:00
|
|
|
// Decode token values to build output text string.
|
2022-06-19 18:59:29 -07:00
|
|
|
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;
|
2021-03-09 04:48:17 -08:00
|
|
|
|
2022-09-21 16:41:20 -07:00
|
|
|
// Return the length of the string which would be output by a detokenize().
|
|
|
|
size_t count(const vector_view<const u16> &in) noexcept;
|
|
|
|
|
2021-03-09 04:48:17 -08:00
|
|
|
// Other tools
|
2022-06-19 18:59:29 -07:00
|
|
|
string_view debug(const mutable_buffer &buf, const u16 token, const uint fmt_msk = -1U);
|
2021-02-25 19:05:02 -08:00
|
|
|
}
|