mirror of
https://github.com/matrix-construct/construct
synced 2024-11-26 00:32:35 +01:00
ircd::buffer: Add experimental non-temporal copy device.
This commit is contained in:
parent
2235acae61
commit
f1c8eb3c1c
2 changed files with 143 additions and 0 deletions
|
@ -105,6 +105,7 @@ namespace ircd::buffer::buffers
|
|||
#include "const_buffer.h"
|
||||
#include "copy.h"
|
||||
#include "move.h"
|
||||
#include "stream.h"
|
||||
#include "fixed_buffer.h"
|
||||
#include "window_buffer.h"
|
||||
#include "parse_buffer.h"
|
||||
|
|
142
include/ircd/buffer/stream.h
Normal file
142
include/ircd/buffer/stream.h
Normal file
|
@ -0,0 +1,142 @@
|
|||
// The Construct
|
||||
//
|
||||
// Copyright (C) The Construct Developers, Authors & Contributors
|
||||
// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice is present in all copies. The
|
||||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
#pragma once
|
||||
#define HAVE_IRCD_BUFFER_STREAM_H
|
||||
|
||||
namespace ircd::buffer
|
||||
{
|
||||
size_t stream_aligned(const mutable_buffer &dst, const const_buffer &src);
|
||||
}
|
||||
|
||||
/// Non-temporal copy. This copies from an aligned source to an aligned
|
||||
/// destination without the data cycling through the d-cache. The alignment
|
||||
/// requirements are currently very strict. The source and destination buffers
|
||||
/// must begin at a cache-line alignment and the size of the buffers must be
|
||||
/// a multiple of something we'll call "register-file size" which is the size
|
||||
/// of all named multimedia registers (256 for SSE, 512 for AVX, 2048 for
|
||||
/// AVX512) so it's safe to say buffers should just be aligned and padded out
|
||||
/// to 4K page-size to be safe. The size of the src argument itself can be an
|
||||
/// arbitrary size and this function will return that size, but its backing
|
||||
/// buffer must be padded out to alignment.
|
||||
///
|
||||
inline size_t
|
||||
ircd::buffer::stream_aligned(const mutable_buffer &dst,
|
||||
const const_buffer &src)
|
||||
{
|
||||
// Platforms that have non-temporal store support; this is all of x86_64
|
||||
constexpr bool has_nontemporal_store
|
||||
{
|
||||
#if defined(__SSE2__) && !defined(RB_GENERIC)
|
||||
true
|
||||
#else
|
||||
false
|
||||
#endif
|
||||
};
|
||||
|
||||
// Platforms that have non-temporal load support; sidenote SSE4.1 can do
|
||||
// 16 byte loads and AVX2 can do 32 byte loads; SSE2 cannot do loads.
|
||||
constexpr bool has_nontemporal_load
|
||||
{
|
||||
#if defined(__AVX__) && !defined(RB_GENERIC)
|
||||
true
|
||||
#else
|
||||
false
|
||||
#endif
|
||||
};
|
||||
|
||||
// Use the AVX512 vector type unconditionally as it conveniently matches
|
||||
// the cache-line size on the relevant platforms and simplifies our syntax
|
||||
// below by being a single object. On those w/o AVX512 it uses an
|
||||
// isomorphic configuration of the best available regs.
|
||||
using block_type = u512x1;
|
||||
|
||||
// The number of cache lines we'll have "in flight" which is basically
|
||||
// just a gimmick to unroll the loop such that each iteration covers
|
||||
// the full register file. On SSE with 256 bytes of register file we can
|
||||
// name 4 cache lines at once; on AVX with 512 bytes we can name 8, etc.
|
||||
constexpr size_t file_lines
|
||||
{
|
||||
#if defined(__AVX512F__)
|
||||
32
|
||||
#elif defined(__AVX__)
|
||||
8
|
||||
#else
|
||||
4
|
||||
#endif
|
||||
};
|
||||
|
||||
// Configurable magic number only relevant to SSE2 systems which don't have
|
||||
// non-temporal load instructions. On these platforms we'll conduct a
|
||||
// prefetch loop and mark the lines NTA.
|
||||
constexpr size_t latency
|
||||
{
|
||||
16
|
||||
};
|
||||
|
||||
// When the constexpr conditions aren't favorable we can fallback to
|
||||
// normal copy here.
|
||||
if constexpr(!has_nontemporal_store && !has_nontemporal_load)
|
||||
return copy(dst, src);
|
||||
|
||||
// Assert valid arguments
|
||||
assert(!overlap(src, dst));
|
||||
assert(aligned(data(src), sizeof(block_type)));
|
||||
assert(aligned(data(dst), sizeof(block_type)));
|
||||
assert(size(dst) % (sizeof(block_type) * file_lines));
|
||||
|
||||
// Size in bytes to be copied
|
||||
const size_t copy_size
|
||||
{
|
||||
std::min(size(src), size(dst))
|
||||
};
|
||||
|
||||
// Number of lines to be copied.
|
||||
const size_t copy_lines
|
||||
{
|
||||
(copy_size / sizeof(block_type)) + bool(copy_size % sizeof(block_type))
|
||||
};
|
||||
|
||||
// destination base ptr
|
||||
block_type *const __restrict__ out
|
||||
{
|
||||
reinterpret_cast<block_type *__restrict__>(data(dst))
|
||||
};
|
||||
|
||||
// source base ptr
|
||||
const block_type *const __restrict__ in
|
||||
{
|
||||
reinterpret_cast<const block_type *__restrict__>(data(src))
|
||||
};
|
||||
|
||||
if constexpr(!has_nontemporal_load)
|
||||
#pragma clang loop unroll(disable)
|
||||
for(size_t i(0); i < latency; ++i)
|
||||
__builtin_prefetch(in + i, 0, 0);
|
||||
|
||||
for(size_t i(0); i < copy_lines; i += file_lines)
|
||||
{
|
||||
if constexpr(!has_nontemporal_load)
|
||||
for(size_t j(0); j < file_lines; ++j)
|
||||
__builtin_prefetch(in + i + latency + j, 0, 0);
|
||||
|
||||
block_type block[file_lines];
|
||||
for(size_t j(0); j < file_lines; ++j)
|
||||
block[j] = __builtin_nontemporal_load(in + i + j);
|
||||
|
||||
for(size_t j(0); j < file_lines; ++j)
|
||||
__builtin_nontemporal_store(block[j], out + i + j);
|
||||
}
|
||||
|
||||
if constexpr(has_nontemporal_store)
|
||||
asm volatile ("sfence");
|
||||
|
||||
return copy_size;
|
||||
}
|
Loading…
Reference in a new issue