
ircd::gpt: Add task struct; mmap cached model directly; improve init.

Jason Volk 2021-03-10 00:18:23 -08:00
parent 6f3adfd160
commit 4da7d2ae43
6 changed files with 309 additions and 145 deletions


@@ -0,0 +1,27 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_GENERATE_H
namespace ircd::gpt
{
vector_view<u16>
generate(const vector_view<u16> &out,
const vector_view<const u16> &in,
const opts * = &default_opts,
task * = nullptr);
string_view
generate(const mutable_buffer &out,
const string_view &in,
const opts * = &default_opts,
task * = nullptr);
}
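
For orientation, a caller of the new pointer-taking overloads might look like the sketch below. This is not part of the commit; the wrapper name and option values are illustrative, and it assumes an ircd translation unit where these headers are available.

// Illustrative only, not project code: drive the new generate() overload
// with explicit options and a task object for cross-call state.
ircd::string_view
complete_prompt(const ircd::mutable_buffer &out,
                const ircd::string_view &prompt)
{
	ircd::gpt::opts opts;    // default-constructed options
	opts.top_k = 1;          // deterministically take the top logit
	opts.limit = 32;         // cap the number of generated tokens

	ircd::gpt::task task;    // counters accumulate across calls
	task.opts = &opts;       // (assumed) associate the task with its options

	return ircd::gpt::generate(out, prompt, &opts, &task);
}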


@@ -18,25 +18,23 @@ namespace ircd::gpt
IRCD_EXCEPTION(ircd::error, error)
struct opts;
struct context;
struct task;
extern const opts default_opts;
extern log::log log;
vector_view<u16>
generate(const vector_view<u16> &out,
const vector_view<const u16> &in,
const opts & = default_opts);
string_view
generate(const mutable_buffer &out,
const string_view &in,
const opts & = default_opts);
}
#include "vocab.h"
#include "model.h"
#include "task.h"
#include "generate.h"
/// Primary Options
///
/// Use this structure to configure and control specifics of the machine.
/// These settings are immutable for the duration of an operation. To maintain
/// state between calls see task.h.
///
struct ircd::gpt::opts
{
/// Specifies the nominal halting condition based on the sequence of
@@ -64,9 +62,21 @@ struct ircd::gpt::opts
/// Limit number of output tokens. Default of -1 is unlimited; the number
/// of tokens generated will be limited by other factors.
uint limit {-1U};
uint limit
{
-1U
};
/// Flip random coins over the top k logits each round. Setting to 1
/// deterministically selects the top logit.
uint top_k {2};
uint top_k
{
2
};
/// Pointer to the model
const model::decoder *model
{
model::default_model
};
};
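
As a standalone illustration of what top_k means here (choose among the k highest logits; a value of 1 degenerates to plain argmax), the following sketch is not taken from gpt.cc and the function name is hypothetical.

// Standalone sketch of the top_k idea, not the project's selection code:
// sample uniformly among the k largest logits; k == 1 is plain argmax.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

static uint16_t
select_top_k(const float *const logit,
             const size_t vocab,
             const size_t top_k,
             std::mt19937 &rng)
{
	const size_t k
	{
		std::clamp<size_t>(top_k, 1, vocab)
	};

	// Order token indices so the k highest logits come first.
	std::vector<uint16_t> idx(vocab);
	std::iota(begin(idx), end(idx), 0);
	std::partial_sort(begin(idx), begin(idx) + k, end(idx), [&logit]
	(const auto &a, const auto &b)
	{
		return logit[a] > logit[b];
	});

	// "Flip random coins over the top k logits."
	std::uniform_int_distribution<size_t> coin {0, k - 1};
	return idx[coin(rng)];
}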


@@ -18,6 +18,8 @@ namespace ircd::gpt::model
struct ffnn;
struct block;
struct decoder;
extern const decoder *default_model;
}
/// Attention aperture

include/ircd/gpt/task.h (new file, +57)

@@ -0,0 +1,57 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_TASK_H
/// Context to maintain state across calls.
///
struct ircd::gpt::task
{
enum status :char;
/// Reference to the attached options.
const gpt::opts *opts {nullptr};
/// Current task status.
enum status status {'\0'};
/// Accumulates the number of executions by the user. Each call to the
/// interface is an execution.
uint64_t epoch {0};
/// Accumulates the number of tokens produced by the task. Several tokens
/// may be produced each epoch.
uint64_t produced {0};
/// Accumulates the number of tokens witnessed by the task. The number of
/// tokens in the context for each produced token is counted as witnessed.
uint64_t witnessed {0};
/// Accumulates the number of CPU reference cycles consumed by the task.
/// This counter does not reflect time when the task is queued or waiting
/// or offloaded to a co-processor/accelerator.
uint64_t cycles {0};
/// Accumulates the total time in milliseconds over all executions of the
/// task. This counter reflects total wall-clock time of all phases of
/// the execution.
milliseconds time {0ms};
};
/// The current status of a task is indicated with intelligible characters
enum ircd::gpt::task::status
:char
{
QUEUED = 'Q', ///< Queued for execution.
RUNNING = 'R', ///< Currently being executed.
ACCEPT = 'A', ///< Execution completed successfully.
ERROR = 'E', ///< Execution did not complete successfully.
};
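
A small sketch of reading these counters back after a round of generation; the field names come from the struct above, while the wrapper function and format string are illustrative and assume an ircd translation unit.

// Sketch only, not project code: report a task's accumulated counters
// through the module's log (ircd::gpt::log, declared in gpt.h).
void
report_task(const ircd::gpt::task &task)
{
	ircd::log::info
	{
		ircd::gpt::log, "task status:%c epoch:%lu produced:%lu witnessed:%lu cycles:%lu",
		char(task.status),
		task.epoch,
		task.produced,
		task.witnessed,
		task.cycles,
	};
}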


@@ -28,28 +28,22 @@ namespace ircd::gpt
static void logits(float *, const float (&)[768], const model::decoder &);
static void tail(float *, const float *, const model::decoder &);
static u16 argmax(const float *, const opts &);
static vector_view<f32> embed(const vector_view<f32> &out, const u16 token, const u16 position);
std::unique_ptr<model::decoder> device
{
new model::decoder{}
};
static void embed(float *, const u16 token, const u16 position, const opts &);
static f32
logit alignas(64) [65536],
scratch alignas(64) [1024 * 768];
}
decltype(ircd::gpt::default_opts)
ircd::gpt::default_opts;
decltype(ircd::gpt::log)
ircd::gpt::log
{
"gpt"
};
decltype(ircd::gpt::default_opts)
ircd::gpt::default_opts;
namespace ircd::gpt::model
{
constexpr float embed_pdrop
@@ -86,7 +80,8 @@ namespace ircd::gpt::model
ircd::string_view
ircd::gpt::generate(const mutable_buffer &out,
const string_view &in,
const opts &opts)
const opts *opts,
task *task)
{
u16 buf[2][256];
const auto input_tokens
@@ -96,7 +91,7 @@ ircd::gpt::generate(const mutable_buffer &out,
const auto output_tokens
{
generate(buf[1], input_tokens, opts)
generate(buf[1], input_tokens, opts, task)
};
const auto output
@@ -110,12 +105,13 @@ ircd::gpt::generate(const mutable_buffer &out,
ircd::vector_view<ircd::u16>
ircd::gpt::generate(const vector_view<u16> &out,
const vector_view<const u16> &in,
const opts &opts)
const opts *opts,
task *task)
{
size_t ret(0);
bool halt(false);
uint errc[3] {0}, accc[3] {0};
for(uint i(0); !halt && i < out.size() && ret < opts.limit; ++i)
for(uint i(0); !halt && i < out.size() && ret < opts->limit; ++i)
{
const size_t tokens
{
@@ -134,10 +130,7 @@ ircd::gpt::generate(const vector_view<u16> &out,
data(scratch) + j * 768, 768
};
const auto embedding
{
embed(dst, in[j], j)
};
embed(data(dst), in[j], j, *opts);
}
for(uint j(0); j < ret; ++j)
@@ -147,32 +140,29 @@ ircd::gpt::generate(const vector_view<u16> &out,
data(scratch) + (in.size() + j) * 768, 768
};
const auto embedding
{
embed(dst, out[j], in.size() + j)
};
embed(data(dst), out[j], in.size() + j, *opts);
}
transform(data(scratch), tokens, *device);
transform(data(scratch), tokens, *opts->model);
const vector_view<f32> last_embed
{
data(scratch) + ((tokens - 1) * 768), 768
};
tail(logit, data(last_embed), *device);
out[i] = argmax(logit, opts);
tail(logit, data(last_embed), *opts->model);
out[i] = argmax(logit, *opts);
for(uint j(0); j < 3; ++j)
{
errc[j] = out[i] == opts.error_code[j][errc[j]]? errc[j] + 1: 0;
accc[j] = out[i] == opts.accept_code[j][accc[j]]? accc[j] + 1: 0;
errc[j] = out[i] == opts->error_code[j][errc[j]]? errc[j] + 1: 0;
accc[j] = out[i] == opts->accept_code[j][accc[j]]? accc[j] + 1: 0;
}
for(uint j(0); j < 3; ++j)
{
halt |= errc[j] >= 3 || (errc[j] && opts.error_code[j][errc[j] + 1] == -1U);
halt |= accc[j] >= 3 || (accc[j] && opts.accept_code[j][accc[j] + 1] == -1U);
halt |= errc[j] >= 3 || (errc[j] && opts->error_code[j][errc[j] + 1] == -1U);
halt |= accc[j] >= 3 || (accc[j] && opts->accept_code[j][accc[j] + 1] == -1U);
}
++ret;
@@ -184,30 +174,25 @@ ircd::gpt::generate(const vector_view<u16> &out,
};
}
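
The halting logic above matches the produced tokens incrementally against short error/accept code sequences terminated by -1U. A standalone restatement of that check follows, for clarity only; it is not the project code and the struct name is hypothetical.

// Standalone restatement of the halting check. Each of three candidate
// sequences (up to three tokens, -1U terminates a shorter one) is matched
// incrementally against the generated tokens; a completed sequence halts.
#include <cstdint>

struct halt_matcher
{
	const uint32_t (&codes)[3][3];   // e.g. illustrative stop-token sequences
	unsigned counter[3] {0};

	bool operator()(const uint16_t token) noexcept
	{
		bool halt(false);
		for(unsigned j(0); j < 3; ++j)
		{
			// Advance this sequence on a match, otherwise restart it.
			counter[j] = token == codes[j][counter[j]]? counter[j] + 1: 0;

			// Complete: three tokens matched, or the next slot is -1U.
			if(counter[j] >= 3 || (counter[j] && codes[j][counter[j]] == -1U))
			{
				halt = true;
				counter[j] = 0;   // reset so indexing stays in bounds
			}
		}
		return halt;
	}
};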
ircd::vector_view<ircd::f32>
ircd::gpt::embed(const vector_view<f32> &out,
void
ircd::gpt::embed(float *const out,
const u16 token,
const u16 position)
const u16 position,
const opts &opts)
{
assert(device);
assert(opts.model);
const auto &wpe
{
device->wpe[position]
opts.model->wpe[position]
};
const auto &wte
{
device->wte[token]
opts.model->wte[token]
};
for(uint j(0); j < 768; ++j)
out[j] = wte[j] + wpe[j];
return vector_view<f32>
{
data(out), 768
};
}
uint16_t


@@ -10,6 +10,9 @@
namespace ircd::gpt::model
{
using init_func = void (*)(decoder &, const string_view &, const size_t &, const json::array &);
using init_handler = std::pair<string_view, init_func>;
static void
init_f_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_f_bias(decoder &, const string_view &, const size_t &, const json::array &),
@@ -27,18 +30,23 @@ namespace ircd::gpt::model
init_h_attn_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
init() noexcept;
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &);
extern conf::item<std::string> path;
extern const std::pair
<
string_view,
void (*)(decoder &, const string_view &, const size_t &, const json::array &)
>
static bool init_from_cache(const string_view &);
static void init_from_json_handle(decoder &, const init_handler &, const size_t &);
static void init_from_json(const string_view &, const string_view &);
static void init() noexcept;
extern const init_handler
manifest[],
manifest_h[],
manifest_td[];
extern conf::item<std::string> path;
extern conf::item<std::string> cache_path;
static fs::map default_model_shm;
static std::unique_ptr<decoder> default_model_res;
}
decltype(ircd::gpt::model::manifest_h)
@@ -76,6 +84,13 @@ ircd::gpt::model::manifest_td
{ "train.jsonl", nullptr, },
};
decltype(ircd::gpt::model::cache_path)
ircd::gpt::model::cache_path
{
{ "name", "ircd.gpt.model.cache.path" },
{ "default", "model.cache.localhost" },
};
decltype(ircd::gpt::model::path)
ircd::gpt::model::path
{
@@ -86,11 +101,8 @@ ircd::gpt::model::path
init
};
//TODO: XXX
namespace ircd::gpt
{
extern const std::unique_ptr<model::decoder> device;
}
decltype(ircd::gpt::model::default_model)
ircd::gpt::model::default_model;
void
ircd::gpt::model::init()
@@ -99,93 +111,164 @@
if(!model::path)
return;
const size_t layers
if(!init_from_cache(model::cache_path))
init_from_json(model::cache_path, model::path);
}
bool
ircd::gpt::model::init_from_cache(const string_view &cache_path)
{
if(!fs::is_reg(cache_path))
return false;
const auto size
{
12
fs::size(cache_path)
};
const auto handle{[]
(const auto &a, const auto &b, const auto &i)
if(unlikely(size != sizeof(model::decoder)))
throw error
{
"Cached model `%s' size %zu differs from %zu.",
cache_path,
size,
sizeof(model::decoder),
};
const fs::fd fd
{
const auto &[fmt, handler]
{
a[b]
};
cache_path
};
char namebuf[128] {0};
const string_view path_part[2]
{
model::path, fmt::sprintf
{
namebuf, fmt, i
}
};
const fs::fd fd
{
fs::path(fs::path_scratch, path_part)
};
fs::map::opts map_opts;
const fs::map map
{
fd, map_opts
};
const json::array mat
{
map
};
assert(gpt::device);
handler(*gpt::device, path_part[1], i, mat);
log::logf
{
log, log::level::DEBUG,
"Model init [%2d][%2d] :%s",
i,
b,
path_part[1],
};
}};
ircd::timer sw;
size_t read(0), wrote(0);
if(fs::exists("model"))
fs::map::opts map_opts;
default_model_shm = fs::map
{
const auto _read
{
fs::read(fs::fd{"model"}, mutable_buffer{(char *)(gpt::device.get()), sizeof(model::decoder)})
};
fd, map_opts, sizeof(decoder)
};
read = size(_read);
} else {
memset(device.get(), 0x0, sizeof(model::decoder));
default_model = reinterpret_cast<decoder *>
(
data(default_model_shm)
);
handle(manifest, 0, 0);
handle(manifest, 1, 0);
handle(manifest, 2, 0);
handle(manifest, 3, 0);
for(size_t i(0); i < layers; ++i)
for(size_t j(0); j < 13; ++j)
handle(manifest_h, j, i);
const auto _wrote
{
fs::write("model", const_buffer{(const char *)(gpt::device.get()), sizeof(model::decoder)})
};
wrote = size(_wrote);
}
char pbuf[3][48];
log::logf
char pbuf[48];
log::info
{
log, log::level::DEBUG,
"Model init completed in %s read %s wrote %s",
sw.pretty(pbuf[0]),
pretty(pbuf[1], iec(size(read))),
pretty(pbuf[2], iec(size(wrote))),
log, "model(%p) mapped cached model `%s' %s",
data(default_model_shm),
cache_path,
pretty(pbuf, iec(size)),
};
return true;
}
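
The cache path above maps the file and uses the mapping in place as the decoder, with no parse and no large copy. The same pattern in plain POSIX terms, as a standalone sketch rather than the project's fs::fd/fs::map wrappers (the function name is hypothetical):

// Standalone POSIX sketch of the mmap-cache pattern: map a cache file whose
// size must equal sizeof(decoder) and return a pointer into the mapping.
#include <cstddef>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

template<class decoder>
const decoder *
map_cached_model(const char *const path)
{
	struct stat st;
	if(::stat(path, &st) != 0 || size_t(st.st_size) != sizeof(decoder))
		return nullptr;               // missing or stale cache

	const int fd
	{
		::open(path, O_RDONLY)
	};

	if(fd < 0)
		return nullptr;

	void *const map
	{
		::mmap(nullptr, sizeof(decoder), PROT_READ, MAP_SHARED, fd, 0)
	};

	::close(fd);                      // the mapping outlives the descriptor
	return map != MAP_FAILED?
		reinterpret_cast<const decoder *>(map):
		nullptr;
}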
void
ircd::gpt::model::init_from_json(const string_view &cache_path,
const string_view &model_path)
{
util::timer stopwatch;
auto decoder
{
std::make_unique<model::decoder>()
};
// Load the top level files, vocab etc
for(size_t i(0); i < 4; ++i)
init_from_json_handle(*decoder, manifest[i], 0);
// Load the transformer files by layer
const size_t layers {12};
for(size_t i(0); i < layers; ++i)
for(size_t j(0); j < 13; ++j)
init_from_json_handle(*decoder, manifest_h[j], i);
const const_buffer src
{
reinterpret_cast<char *>(decoder.get()), sizeof(model::decoder)
};
const auto wrote
{
fs::write(cache_path, src)
};
char pbuf[2][48];
log::info
{
log, "model(%p) parsed `%s' cached %s to `%s' in %s",
decoder.get(),
model_path,
pretty(pbuf[0], iec(size(wrote))),
cache_path,
stopwatch.pretty(pbuf[1]),
};
default_model_res = std::move(decoder);
default_model = default_model_res.get();
}
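
The JSON path ends by caching the fully parsed decoder so the next startup can take the mmap path above. The equivalent write in plain iostreams, as a standalone sketch rather than the project's fs::write (function name hypothetical):

// Standalone sketch of the cache write: dump the parsed decoder verbatim so
// a later start can map it back without re-parsing the JSON.
#include <fstream>

template<class decoder>
bool
write_model_cache(const char *const path,
                  const decoder &d)
{
	std::ofstream out
	{
		path, std::ios::binary | std::ios::trunc
	};

	out.write(reinterpret_cast<const char *>(&d), sizeof(decoder));
	return bool(out);
}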
void
ircd::gpt::model::init_from_json_handle(decoder &d,
const init_handler &handler,
const size_t &layer)
{
const auto &[fmt, func]
{
handler
};
char namebuf[128];
const string_view path_part[2]
{
model::path, fmt::sprintf
{
namebuf, fmt, layer
}
};
const auto path
{
fs::path(fs::path_scratch, path_part)
};
fs::fd::opts fdopts;
fdopts.sequential = true;
const fs::fd fd
{
path, fdopts
};
// mmap of the file
const fs::map map
{
fd
};
// Each file is a JSON array at the top level.
const json::array matrix
{
map
};
// Readable name for logging
const auto &name
{
path_part[1]
};
if(likely(func))
func(d, name, layer, matrix);
// Check for interrupt after long operation
ctx::interruption_point();
log::info
{
log, "model(%p) loaded layer:%zu :%s",
&d,
layer,
name,
};
}