
ircd::gpt: Add task struct; mmap cached model directly; improve init.

Jason Volk 2021-03-10 00:18:23 -08:00
parent 6f3adfd160
commit 4da7d2ae43
6 changed files with 309 additions and 145 deletions


@@ -0,0 +1,27 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_GENERATE_H
namespace ircd::gpt
{
vector_view<u16>
generate(const vector_view<u16> &out,
const vector_view<const u16> &in,
const opts * = &default_opts,
task * = nullptr);
string_view
generate(const mutable_buffer &out,
const string_view &in,
const opts * = &default_opts,
task * = nullptr);
}
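
For orientation, a caller of the new pointer-taking overloads might look like the sketch below. This is not part of the commit; the wrapper name and option values are illustrative, and it assumes an ircd translation unit where these headers are available.

// Illustrative only, not project code: drive the new generate() overload
// with explicit options and a task object for cross-call state.
ircd::string_view
complete_prompt(const ircd::mutable_buffer &out,
                const ircd::string_view &prompt)
{
	ircd::gpt::opts opts;    // default-constructed options
	opts.top_k = 1;          // deterministically take the top logit
	opts.limit = 32;         // cap the number of generated tokens

	ircd::gpt::task task;    // counters accumulate across calls
	task.opts = &opts;       // (assumed) associate the task with its options

	return ircd::gpt::generate(out, prompt, &opts, &task);
}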


@@ -18,25 +18,23 @@ namespace ircd::gpt
IRCD_EXCEPTION(ircd::error, error)
struct opts;
struct context;
struct task;
extern const opts default_opts;
extern log::log log;
vector_view<u16>
generate(const vector_view<u16> &out,
const vector_view<const u16> &in,
const opts & = default_opts);
string_view
generate(const mutable_buffer &out,
const string_view &in,
const opts & = default_opts);
}
#include "vocab.h"
#include "model.h"
#include "task.h"
#include "generate.h"
/// Primary Options
///
/// Use this structure to configure and control specifics of the machine.
/// These settings are immutable for the duration of an operation. To maintain
/// state between calls see task.h.
///
struct ircd::gpt::opts
{
/// Specifies the nominal halting condition based on the sequence of
@@ -64,9 +62,21 @@ struct ircd::gpt::opts
/// Limit number of output tokens. Default of -1 is unlimited; the number
/// of tokens generated will be limited by other factors.
uint limit {-1U};
uint limit
{
-1U
};
/// Flip random coins over the top k logits each round. Setting to 1
/// deterministically selects the top logit.
uint top_k {2};
uint top_k
{
2
};
/// Pointer to the model
const model::decoder *model
{
model::default_model
};
};
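
As a standalone illustration of what top_k means here (choose among the k highest logits; a value of 1 degenerates to plain argmax), the following sketch is not taken from gpt.cc and the function name is hypothetical.

// Standalone sketch of the top_k idea, not the project's selection code:
// sample uniformly among the k largest logits; k == 1 is plain argmax.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

static uint16_t
select_top_k(const float *const logit,
             const size_t vocab,
             const size_t top_k,
             std::mt19937 &rng)
{
	const size_t k
	{
		std::clamp<size_t>(top_k, 1, vocab)
	};

	// Order token indices so the k highest logits come first.
	std::vector<uint16_t> idx(vocab);
	std::iota(begin(idx), end(idx), 0);
	std::partial_sort(begin(idx), begin(idx) + k, end(idx), [&logit]
	(const auto &a, const auto &b)
	{
		return logit[a] > logit[b];
	});

	// "Flip random coins over the top k logits."
	std::uniform_int_distribution<size_t> coin {0, k - 1};
	return idx[coin(rng)];
}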


@@ -18,6 +18,8 @@ namespace ircd::gpt::model
struct ffnn;
struct block;
struct decoder;
extern const decoder *default_model;
}
/// Attention aperture

include/ircd/gpt/task.h (new file, +57)

@@ -0,0 +1,57 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_TASK_H
/// Context to maintain state across calls.
///
struct ircd::gpt::task
{
enum status :char;
/// Reference to the attached options.
const gpt::opts *opts {nullptr};
/// Current task status.
enum status status {'\0'};
/// Accumulates the number of executions by the user. Each call to the
/// interface is an execution.
uint64_t epoch {0};
/// Accumulates the number of tokens produced by the task. Several tokens
/// may be produced each epoch.
uint64_t produced {0};
/// Accumulates the number of tokens witnessed by the task. The number of
/// tokens in the context for each produced token is counted as witnessed.
uint64_t witnessed {0};
/// Accumulates the number of CPU reference cycles consumed by the task.
/// This counter does not reflect time when the task is queued or waiting
/// or offloaded to a co-processor/accelerator.
uint64_t cycles {0};
/// Accumulates the total time in milliseconds over all executions of the
/// task. This counter reflects total wall-clock time of all phases of
/// the execution.
milliseconds time {0ms};
};
/// The current status of a task is indicated with intelligible characters
enum ircd::gpt::task::status
:char
{
QUEUED = 'Q', ///< Queued for execution.
RUNNING = 'R', ///< Currently being executed.
ACCEPT = 'A', ///< Execution completed successfully.
ERROR = 'E', ///< Execution did not complete successfully.
};
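
A small sketch of reading these counters back after a round of generation; the field names come from the struct above, while the wrapper function and format string are illustrative and assume an ircd translation unit.

// Sketch only, not project code: report a task's accumulated counters
// through the module's log (ircd::gpt::log, declared in gpt.h).
void
report_task(const ircd::gpt::task &task)
{
	ircd::log::info
	{
		ircd::gpt::log, "task status:%c epoch:%lu produced:%lu witnessed:%lu cycles:%lu",
		char(task.status),
		task.epoch,
		task.produced,
		task.witnessed,
		task.cycles,
	};
}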


@@ -28,28 +28,22 @@ namespace ircd::gpt
static void logits(float *, const float (&)[768], const model::decoder &);
static void tail(float *, const float *, const model::decoder &);
static u16 argmax(const float *, const opts &);
static vector_view<f32> embed(const vector_view<f32> &out, const u16 token, const u16 position);
std::unique_ptr<model::decoder> device
{
new model::decoder{}
};
static void embed(float *, const u16 token, const u16 position, const opts &);
static f32
logit alignas(64) [65536],
scratch alignas(64) [1024 * 768];
}
decltype(ircd::gpt::default_opts)
ircd::gpt::default_opts;
decltype(ircd::gpt::log)
ircd::gpt::log
{
"gpt"
};
decltype(ircd::gpt::default_opts)
ircd::gpt::default_opts;
namespace ircd::gpt::model
{
constexpr float embed_pdrop
@@ -86,7 +80,8 @@ namespace ircd::gpt::model
ircd::string_view
ircd::gpt::generate(const mutable_buffer &out,
const string_view &in,
const opts &opts)
const opts *opts,
task *task)
{
u16 buf[2][256];
const auto input_tokens
@@ -96,7 +91,7 @@ ircd::gpt::generate(const mutable_buffer &out,
const auto output_tokens
{
generate(buf[1], input_tokens, opts)
generate(buf[1], input_tokens, opts, task)
};
const auto output
@@ -110,12 +105,13 @@ ircd::gpt::generate(const mutable_buffer &out,
ircd::vector_view<ircd::u16>
ircd::gpt::generate(const vector_view<u16> &out,
const vector_view<const u16> &in,
const opts &opts)
const opts *opts,
task *task)
{
size_t ret(0);
bool halt(false);
uint errc[3] {0}, accc[3] {0};
for(uint i(0); !halt && i < out.size() && ret < opts.limit; ++i)
for(uint i(0); !halt && i < out.size() && ret < opts->limit; ++i)
{
const size_t tokens
{
@@ -134,10 +130,7 @@ ircd::gpt::generate(const vector_view<u16> &out,
data(scratch) + j * 768, 768
};
const auto embedding
{
embed(dst, in[j], j)
};
embed(data(dst), in[j], j, *opts);
}
for(uint j(0); j < ret; ++j)
@@ -147,32 +140,29 @@ ircd::gpt::generate(const vector_view<u16> &out,
data(scratch) + (in.size() + j) * 768, 768
};
const auto embedding
{
embed(dst, out[j], in.size() + j)
};
embed(data(dst), out[j], in.size() + j, *opts);
}
transform(data(scratch), tokens, *device);
transform(data(scratch), tokens, *opts->model);
const vector_view<f32> last_embed
{
data(scratch) + ((tokens - 1) * 768), 768
};
tail(logit, data(last_embed), *device);
out[i] = argmax(logit, opts);
tail(logit, data(last_embed), *opts->model);
out[i] = argmax(logit, *opts);
for(uint j(0); j < 3; ++j)
{
errc[j] = out[i] == opts.error_code[j][errc[j]]? errc[j] + 1: 0;
accc[j] = out[i] == opts.accept_code[j][accc[j]]? accc[j] + 1: 0;
errc[j] = out[i] == opts->error_code[j][errc[j]]? errc[j] + 1: 0;
accc[j] = out[i] == opts->accept_code[j][accc[j]]? accc[j] + 1: 0;
}
for(uint j(0); j < 3; ++j)
{
halt |= errc[j] >= 3 || (errc[j] && opts.error_code[j][errc[j] + 1] == -1U);
halt |= accc[j] >= 3 || (accc[j] && opts.accept_code[j][accc[j] + 1] == -1U);
halt |= errc[j] >= 3 || (errc[j] && opts->error_code[j][errc[j] + 1] == -1U);
halt |= accc[j] >= 3 || (accc[j] && opts->accept_code[j][accc[j] + 1] == -1U);
}
++ret;
@@ -184,30 +174,25 @@ ircd::gpt::generate(const vector_view<u16> &out,
};
}
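
The halting logic above matches the produced tokens incrementally against short error/accept code sequences terminated by -1U. A standalone restatement of that check follows, for clarity only; it is not the project code and the struct name is hypothetical.

// Standalone restatement of the halting check. Each of three candidate
// sequences (up to three tokens, -1U terminates a shorter one) is matched
// incrementally against the generated tokens; a completed sequence halts.
#include <cstdint>

struct halt_matcher
{
	const uint32_t (&codes)[3][3];   // e.g. illustrative stop-token sequences
	unsigned counter[3] {0};

	bool operator()(const uint16_t token) noexcept
	{
		bool halt(false);
		for(unsigned j(0); j < 3; ++j)
		{
			// Advance this sequence on a match, otherwise restart it.
			counter[j] = token == codes[j][counter[j]]? counter[j] + 1: 0;

			// Complete: three tokens matched, or the next slot is -1U.
			if(counter[j] >= 3 || (counter[j] && codes[j][counter[j]] == -1U))
			{
				halt = true;
				counter[j] = 0;   // reset so indexing stays in bounds
			}
		}
		return halt;
	}
};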
ircd::vector_view<ircd::f32>
ircd::gpt::embed(const vector_view<f32> &out,
void
ircd::gpt::embed(float *const out,
const u16 token,
const u16 position)
const u16 position,
const opts &opts)
{
assert(device);
assert(opts.model);
const auto &wpe
{
device->wpe[position]
opts.model->wpe[position]
};
const auto &wte
{
device->wte[token]
opts.model->wte[token]
};
for(uint j(0); j < 768; ++j)
out[j] = wte[j] + wpe[j];
return vector_view<f32>
{
data(out), 768
};
}
uint16_t


@@ -10,6 +10,9 @@
namespace ircd::gpt::model
{
using init_func = void (*)(decoder &, const string_view &, const size_t &, const json::array &);
using init_handler = std::pair<string_view, init_func>;
static void
init_f_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_f_bias(decoder &, const string_view &, const size_t &, const json::array &),
@@ -27,18 +30,23 @@ namespace ircd::gpt::model
init_h_attn_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
init() noexcept;
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &);
extern conf::item<std::string> path;
extern const std::pair
<
string_view,
void (*)(decoder &, const string_view &, const size_t &, const json::array &)
>
static bool init_from_cache(const string_view &);
static void init_from_json_handle(decoder &, const init_handler &, const size_t &);
static void init_from_json(const string_view &, const string_view &);
static void init() noexcept;
extern const init_handler
manifest[],
manifest_h[],
manifest_td[];
extern conf::item<std::string> path;
extern conf::item<std::string> cache_path;
static fs::map default_model_shm;
static std::unique_ptr<decoder> default_model_res;
}
decltype(ircd::gpt::model::manifest_h)
@@ -76,6 +84,13 @@ ircd::gpt::model::manifest_td
{ "train.jsonl", nullptr, },
};
decltype(ircd::gpt::model::cache_path)
ircd::gpt::model::cache_path
{
{ "name", "ircd.gpt.model.cache.path" },
{ "default", "model.cache.localhost" },
};
decltype(ircd::gpt::model::path)
ircd::gpt::model::path
{
@@ -86,11 +101,8 @@ ircd::gpt::model::path
init
};
//TODO: XXX
namespace ircd::gpt
{
extern const std::unique_ptr<model::decoder> device;
}
decltype(ircd::gpt::model::default_model)
ircd::gpt::model::default_model;
void
ircd::gpt::model::init()
@@ -99,93 +111,164 @@
if(!model::path)
return;
const size_t layers
if(!init_from_cache(model::cache_path))
init_from_json(model::cache_path, model::path);
}
bool
ircd::gpt::model::init_from_cache(const string_view &cache_path)
{
if(!fs::is_reg(cache_path))
return false;
const auto size
{
12
fs::size(cache_path)
};
const auto handle{[]
(const auto &a, const auto &b, const auto &i)
if(unlikely(size != sizeof(model::decoder)))
throw error
{
"Cached model `%s' size %zu differs from %zu.",
cache_path,
size,
sizeof(model::decoder),
};
const fs::fd fd
{
const auto &[fmt, handler]
{
a[b]
};
cache_path
};
char namebuf[128] {0};
const string_view path_part[2]
{
model::path, fmt::sprintf
{
namebuf, fmt, i
}
};
const fs::fd fd
{
fs::path(fs::path_scratch, path_part)
};
fs::map::opts map_opts;
const fs::map map
{
fd, map_opts
};
const json::array mat
{
map
};
assert(gpt::device);
handler(*gpt::device, path_part[1], i, mat);
log::logf
{
log, log::level::DEBUG,
"Model init [%2d][%2d] :%s",
i,
b,
path_part[1],
};
}};
ircd::timer sw;
size_t read(0), wrote(0);
if(fs::exists("model"))
fs::map::opts map_opts;
default_model_shm = fs::map
{
const auto _read
{
fs::read(fs::fd{"model"}, mutable_buffer{(char *)(gpt::device.get()), sizeof(model::decoder)})
};
fd, map_opts, sizeof(decoder)
};
read = size(_read);
} else {
memset(device.get(), 0x0, sizeof(model::decoder));
default_model = reinterpret_cast<decoder *>
(
data(default_model_shm)
);
handle(manifest, 0, 0);
handle(manifest, 1, 0);
handle(manifest, 2, 0);
handle(manifest, 3, 0);
for(size_t i(0); i < layers; ++i)
for(size_t j(0); j < 13; ++j)
handle(manifest_h, j, i);
const auto _wrote
{
fs::write("model", const_buffer{(const char *)(gpt::device.get()), sizeof(model::decoder)})
};
wrote = size(_wrote);
}
char pbuf[3][48];
log::logf
char pbuf[48];
log::info
{
log, log::level::DEBUG,
"Model init completed in %s read %s wrote %s",
sw.pretty(pbuf[0]),
pretty(pbuf[1], iec(size(read))),
pretty(pbuf[2], iec(size(wrote))),
log, "model(%p) mapped cached model `%s' %s",
data(default_model_shm),
cache_path,
pretty(pbuf, iec(size)),
};
return true;
}
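
The cache path above maps the file and uses the mapping in place as the decoder, with no parse and no large copy. The same pattern in plain POSIX terms, as a standalone sketch rather than the project's fs::fd/fs::map wrappers (the function name is hypothetical):

// Standalone POSIX sketch of the mmap-cache pattern: map a cache file whose
// size must equal sizeof(decoder) and return a pointer into the mapping.
#include <cstddef>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

template<class decoder>
const decoder *
map_cached_model(const char *const path)
{
	struct stat st;
	if(::stat(path, &st) != 0 || size_t(st.st_size) != sizeof(decoder))
		return nullptr;               // missing or stale cache

	const int fd
	{
		::open(path, O_RDONLY)
	};

	if(fd < 0)
		return nullptr;

	void *const map
	{
		::mmap(nullptr, sizeof(decoder), PROT_READ, MAP_SHARED, fd, 0)
	};

	::close(fd);                      // the mapping outlives the descriptor
	return map != MAP_FAILED?
		reinterpret_cast<const decoder *>(map):
		nullptr;
}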
void
ircd::gpt::model::init_from_json(const string_view &cache_path,
const string_view &model_path)
{
util::timer stopwatch;
auto decoder
{
std::make_unique<model::decoder>()
};
// Load the top level files, vocab etc
for(size_t i(0); i < 4; ++i)
init_from_json_handle(*decoder, manifest[i], 0);
// Load the transformer files by layer
const size_t layers {12};
for(size_t i(0); i < layers; ++i)
for(size_t j(0); j < 13; ++j)
init_from_json_handle(*decoder, manifest_h[j], i);
const const_buffer src
{
reinterpret_cast<char *>(decoder.get()), sizeof(model::decoder)
};
const auto wrote
{
fs::write(cache_path, src)
};
char pbuf[2][48];
log::info
{
log, "model(%p) parsed `%s' cached %s to `%s' in %s",
decoder.get(),
model_path,
pretty(pbuf[0], iec(size(wrote))),
cache_path,
stopwatch.pretty(pbuf[1]),
};
default_model_res = std::move(decoder);
default_model = default_model_res.get();
}
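
The JSON path ends by caching the fully parsed decoder so the next startup can take the mmap path above. The equivalent write in plain iostreams, as a standalone sketch rather than the project's fs::write (function name hypothetical):

// Standalone sketch of the cache write: dump the parsed decoder verbatim so
// a later start can map it back without re-parsing the JSON.
#include <fstream>

template<class decoder>
bool
write_model_cache(const char *const path,
                  const decoder &d)
{
	std::ofstream out
	{
		path, std::ios::binary | std::ios::trunc
	};

	out.write(reinterpret_cast<const char *>(&d), sizeof(decoder));
	return bool(out);
}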
void
ircd::gpt::model::init_from_json_handle(decoder &d,
const init_handler &handler,
const size_t &layer)
{
const auto &[fmt, func]
{
handler
};
char namebuf[128];
const string_view path_part[2]
{
model::path, fmt::sprintf
{
namebuf, fmt, layer
}
};
const auto path
{
fs::path(fs::path_scratch, path_part)
};
fs::fd::opts fdopts;
fdopts.sequential = true;
const fs::fd fd
{
path, fdopts
};
// mmap of the file
const fs::map map
{
fd
};
// Each file is a JSON array at the top level.
const json::array matrix
{
map
};
// Readable name for logging
const auto &name
{
path_part[1]
};
if(likely(func))
func(d, name, layer, matrix);
// Check for interrupt after long operation
ctx::interruption_point();
log::info
{
log, "model(%p) loaded layer:%zu :%s",
&d,
layer,
name,
};
}