0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-11-26 00:32:35 +01:00

ircd::fs: Support various RWF flags for operations; also preadv2().

This commit is contained in:
Jason Volk 2019-03-13 19:33:58 -07:00
parent 56092db18c
commit 87144cef77
6 changed files with 226 additions and 19 deletions

View file

@ -711,6 +711,7 @@ AC_CHECK_FUNCS([ \
vsnprintf \ vsnprintf \
posix_fadvise \ posix_fadvise \
pwritev2 \ pwritev2 \
preadv2 \
]) ])
AC_SEARCH_LIBS(dlinfo, dl, AC_DEFINE(HAVE_DLINFO, 1, [Define if you have dlinfo])) AC_SEARCH_LIBS(dlinfo, dl, AC_DEFINE(HAVE_DLINFO, 1, [Define if you have dlinfo]))

View file

@ -30,6 +30,9 @@ namespace ircd::fs::aio
extern const bool support_fdsync; extern const bool support_fdsync;
extern const bool support_append; extern const bool support_append;
extern const bool support_nowait; extern const bool support_nowait;
extern const bool support_hipri;
extern const bool support_sync;
extern const bool support_dsync;
extern const size_t MAX_EVENTS; extern const size_t MAX_EVENTS;
extern const size_t MAX_REQPRIO; extern const size_t MAX_REQPRIO;

View file

@ -21,10 +21,15 @@ namespace ircd::fs
/// Options common to all operations /// Options common to all operations
struct ircd::fs::opts struct ircd::fs::opts
{ {
/// Offset in the file. static const int highest_priority;
off_t offset {0};
/// Request priority. Lower value takes priority over higher. /// Offset in the file. If this is -1, for writes, it indicates an append
/// at the end of the file (RWF_APPEND or legacy non-atomic lseek()).
off_t offset {0};
/// Request priority. Lower value takes priority over higher. The lowest
/// possible priority value is special, on supporting platforms (RWF_HIPRI).
/// One can either simply set the integer minimum or use the extern value.
int8_t priority {0}; int8_t priority {0};
/// Submits the I/O request immediately rather than allowing IRCd to /// Submits the I/O request immediately rather than allowing IRCd to
@ -32,6 +37,13 @@ struct ircd::fs::opts
/// (only relevant to aio). /// (only relevant to aio).
bool nodelay {false}; bool nodelay {false};
/// Setting this to false enables non-blocking behavior. If the operation
/// would block, EAGAIN is returned. This is only available with RWF_NOWAIT
/// on newer systems, otherwise this value is ignored and is always true.
/// This feature makes up for the fact that O_NONBLOCK when opening the
/// file is ineffective for regular files.
bool blocking {true};
/// Determines whether this operation is conducted via AIO. If not, a /// Determines whether this operation is conducted via AIO. If not, a
/// direct syscall is made. Using AIO will only block one ircd::ctx while /// direct syscall is made. Using AIO will only block one ircd::ctx while
/// a direct syscall will block the thread (all contexts). If AIO is not /// a direct syscall will block the thread (all contexts). If AIO is not

View file

@ -66,6 +66,23 @@ struct ircd::fs::write_opts
/// in the useful propagation of an exception for this event. /// in the useful propagation of an exception for this event.
bool interruptible {true}; bool interruptible {true};
/// Whether to update the fd's offset on appends. This happens naturally
/// when the file is opened in append mode. If not, we get the same per-
/// write atomic seek behavior if RWF_APPEND is supported. In the latter
/// case, this option determines whether the fd's offset is affected.
bool update_offset {true};
/// Whether to RWF_SYNC or RWF_DSYNC depending on the metadata option. This
/// is a range-sync, it only covers the offset and size of the write;
/// perhaps a worthy replacement for sync_file_range(2).
bool sync {false};
/// When sync is true: if metadata is true RWF_SYNC (like fsync(2)) is used,
/// otherwise RWF_DSYNC (like fdatasync(2)) is used. This applies only where
/// available. Careful: if it is not available you are responsible for
/// following the write with fsync(2)/fdatasync(2) yourself.
bool metadata {false};
write_opts(const off_t &); write_opts(const off_t &);
write_opts() = default; write_opts() = default;
}; };

View file

@ -408,6 +408,7 @@ ircd::fs::flush(const fd &fd,
namespace ircd::fs namespace ircd::fs
{ {
static size_t _read(const fd &, const const_iovec_view &, const read_opts &);
static size_t read(const fd &, const const_iovec_view &, const read_opts &); static size_t read(const fd &, const const_iovec_view &, const read_opts &);
} }
@ -568,7 +569,7 @@ ircd::fs::read(const fd &fd,
} }
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
/// Lowest-level read() call. This call only conducts a single operation /// Lowest-level'ish read() call. This call only conducts a single operation
/// (no looping) and can return a partial read(). It does have branches /// (no looping) and can return a partial read(). It does have branches
/// for various read_opts. The arguments involve `struct ::iovec` which /// for various read_opts. The arguments involve `struct ::iovec` which
/// we do not expose to the ircd.h API; thus this function is internal to /// we do not expose to the ircd.h API; thus this function is internal to
@ -584,6 +585,38 @@ ircd::fs::read(const fd &fd,
return aio::read(fd, iov, opts); return aio::read(fd, iov, opts);
#endif #endif
return _read(fd, iov, opts);
}
#ifdef HAVE_PREADV2
/// Issues exactly one preadv2() on `fd` for the vectors in `iov` at
/// `opts.offset` and returns the syscall's result converted to size_t.
/// The per-call RWF_* flags are derived from `opts`, each gated on the
/// matching aio::support_* capability value.
size_t
ircd::fs::_read(const fd &fd,
                const const_iovec_view &iov,
                const read_opts &opts)
{
	// RWF_HIPRI is requested only for the special highest priority value.
	const bool hipri
	{
		aio::support_hipri &&
		reqprio(opts.priority) == reqprio(opts::highest_priority)
	};

	// RWF_NOWAIT is requested when the caller has opted out of blocking.
	const bool nowait
	{
		aio::support_nowait && !opts.blocking
	};

	const int flags = (hipri? RWF_HIPRI : 0) | (nowait? RWF_NOWAIT : 0);

	// The interruptible option selects between the two syscall wrappers.
	if(opts.interruptible)
		return size_t(syscall(::preadv2, fd, iov.data(), iov.size(), opts.offset, flags));

	return size_t(syscall_nointr(::preadv2, fd, iov.data(), iov.size(), opts.offset, flags));
}
#else
size_t
ircd::fs::_read(const fd &fd,
const const_iovec_view &iov,
const read_opts &opts)
{
const auto ret const auto ret
{ {
opts.interruptible? opts.interruptible?
@ -593,6 +626,7 @@ ircd::fs::read(const fd &fd,
return size_t(ret); return size_t(ret);
} }
#endif // HAVE_PREADV2
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// //
@ -738,17 +772,39 @@ ircd::fs::append(const string_view &path,
return append(fd, bufs, opts); return append(fd, bufs, opts);
} }
// When we have pwritev2() we can use RWF_APPEND indicated by
// the -1. Otherwise, we don't keep flags in userspace and we
// don't check the fd for whether it was opened with O_APPEND
// so the user may just have to eat the cost of an extra lseek().
#ifdef HAVE_PWRITEV2
size_t size_t
ircd::fs::append(const fd &fd, ircd::fs::append(const fd &fd,
const const_buffers &bufs, const const_buffers &bufs,
const write_opts &opts_) const write_opts &opts_)
{ {
auto opts(opts_); auto opts(opts_);
if(!opts.offset) if(!aio::support_append)
{
if(!opts.offset)
opts.offset = syscall(::lseek, fd, 0, SEEK_END);
}
else opts.offset = -1;
return write(fd, bufs, opts);
}
#else
size_t
ircd::fs::append(const fd &fd,
const const_buffers &bufs,
const write_opts &opts_)
{
auto opts(opts_);
if(!opts.offset || opts.offset == -1)
opts.offset = syscall(::lseek, fd, 0, SEEK_END); opts.offset = syscall(::lseek, fd, 0, SEEK_END);
return write(fd, bufs, opts); return write(fd, bufs, opts);
} }
#endif // HAVE_PWRITEV2
ircd::const_buffer ircd::const_buffer
ircd::fs::write(const string_view &path, ircd::fs::write(const string_view &path,
@ -812,8 +868,8 @@ ircd::fs::write(const fd &fd,
info::iov_max info::iov_max
}; };
size_t off(0);
write_opts opts(opts_); write_opts opts(opts_);
size_t off(opts.offset - opts_.offset);
assert(bufs.size() <= info::iov_max); assert(bufs.size() <= info::iov_max);
struct ::iovec iovbuf[bufs.size()]; do struct ::iovec iovbuf[bufs.size()]; do
{ {
@ -825,9 +881,8 @@ ircd::fs::write(const fd &fd,
opts.offset += write(fd, iov, opts); opts.offset += write(fd, iov, opts);
assert(opts.offset >= opts_.offset); assert(opts.offset >= opts_.offset);
off = opts.offset - opts_.offset; off = opts.offset - opts_.offset;
assert(off <= buffers::size(bufs));
} }
while(opts.all && off < buffers::size(bufs)); while(opts.all && opts_.offset >= 0 && off < buffers::size(bufs));
assert(opts.offset >= opts_.offset); assert(opts.offset >= opts_.offset);
assert(ssize_t(off) == opts.offset - opts_.offset); assert(ssize_t(off) == opts.offset - opts_.offset);
assert(!opts.all || off == buffers::size(bufs)); assert(!opts.all || off == buffers::size(bufs));
@ -862,10 +917,34 @@ ircd::fs::_write(const fd &fd,
{ {
int flags{0}; int flags{0};
assert(opts.offset >= 0 || aio::support_append);
if(aio::support_append && opts.offset == -1)
flags |= RWF_APPEND;
if(aio::support_hipri && reqprio(opts.priority) == reqprio(opts::highest_priority))
flags |= RWF_HIPRI;
if(aio::support_nowait && !opts.blocking)
flags |= RWF_NOWAIT;
if(aio::support_dsync && opts.sync && !opts.metadata)
flags |= RWF_DSYNC;
if(aio::support_sync && opts.sync && opts.metadata)
flags |= RWF_SYNC;
// The manpage says that when appending with RWF_APPEND, the offset has no
// effect on the write; but if the value of the offset is -1 then the
// fd's offset is updated, otherwise it is not.
const off_t &offset
{
(flags & RWF_APPEND) && !opts.update_offset? 0 : opts.offset
};
return return
opts.interruptible? opts.interruptible?
syscall(::pwritev2, fd, iov.data(), iov.size(), opts.offset, flags): syscall(::pwritev2, fd, iov.data(), iov.size(), offset, flags):
syscall_nointr(::pwritev2, fd, iov.data(), iov.size(), opts.offset, flags); syscall_nointr(::pwritev2, fd, iov.data(), iov.size(), offset, flags);
} }
#else #else
size_t size_t
@ -894,21 +973,61 @@ decltype(ircd::fs::aio::support)
extern __attribute__((weak)) extern __attribute__((weak))
ircd::fs::aio::support; ircd::fs::aio::support;
decltype(ircd::fs::aio::support_fsync) decltype(ircd::fs::aio::support_sync)
extern __attribute__((weak)) extern __attribute__((weak))
ircd::fs::aio::support_fsync; ircd::fs::aio::support_sync
{
info::kversion[0] >= 4 &&
info::kversion[1] >= 7
};
decltype(ircd::fs::aio::support_fdsync) decltype(ircd::fs::aio::support_dsync)
extern __attribute__((weak)) extern __attribute__((weak))
ircd::fs::aio::support_fdsync; ircd::fs::aio::support_dsync
{
info::kversion[0] >= 4 &&
info::kversion[1] >= 7
};
decltype(ircd::fs::aio::support_append) decltype(ircd::fs::aio::support_hipri)
extern __attribute__((weak)) extern __attribute__((weak))
ircd::fs::aio::support_append; ircd::fs::aio::support_hipri
{
info::kversion[0] >= 4 &&
info::kversion[1] >= 6
};
decltype(ircd::fs::aio::support_nowait) decltype(ircd::fs::aio::support_nowait)
extern __attribute__((weak)) extern __attribute__((weak))
ircd::fs::aio::support_nowait; ircd::fs::aio::support_nowait
{
info::kversion[0] >= 4 &&
info::kversion[1] >= 14
};
/// Weak definition of the RWF_APPEND capability value, computed from the
/// running kernel's version: true for 4.16 and anything newer.
/// NOTE(review): presumably the strong definition in the AIO unit takes
/// precedence when that unit is linked -- confirm.
decltype(ircd::fs::aio::support_append)
extern __attribute__((weak))
ircd::fs::aio::support_append
{
	// Ordered (major, minor) comparison. Testing each component on its own
	// would wrongly report no support on e.g. 5.0, whose minor is below 16.
	info::kversion[0] > 4 ||
	(info::kversion[0] == 4 && info::kversion[1] >= 16)
};
/// Weak definition of the support_fsync capability value, computed from the
/// running kernel's version: true for 4.18 and anything newer.
/// NOTE(review): presumably the strong definition in the AIO unit takes
/// precedence when that unit is linked -- confirm.
decltype(ircd::fs::aio::support_fsync)
extern __attribute__((weak))
ircd::fs::aio::support_fsync
{
	// Ordered (major, minor) comparison. Testing each component on its own
	// would wrongly report no support on e.g. 5.0, whose minor is below 18.
	info::kversion[0] > 4 ||
	(info::kversion[0] == 4 && info::kversion[1] >= 18)
};
/// Weak definition of the support_fdsync capability value, computed from the
/// running kernel's version: true for 4.18 and anything newer.
/// NOTE(review): presumably the strong definition in the AIO unit takes
/// precedence when that unit is linked -- confirm.
decltype(ircd::fs::aio::support_fdsync)
extern __attribute__((weak))
ircd::fs::aio::support_fdsync
{
	// Ordered (major, minor) comparison. Testing each component on its own
	// would wrongly report no support on e.g. 5.0, whose minor is below 18.
	info::kversion[0] > 4 ||
	(info::kversion[0] == 4 && info::kversion[1] >= 18)
};
decltype(ircd::fs::aio::MAX_EVENTS) decltype(ircd::fs::aio::MAX_EVENTS)
extern __attribute__((weak)) extern __attribute__((weak))
@ -1245,6 +1364,12 @@ decltype(ircd::fs::opts_default)
ircd::fs::opts_default ircd::fs::opts_default
{}; {};
/// The special "highest priority" request value: the minimum representable
/// value of the opts::priority field's type. The read/write paths compare a
/// request's priority against it when deciding whether to set RWF_HIPRI.
decltype(ircd::fs::opts::highest_priority)
ircd::fs::opts::highest_priority =
	std::numeric_limits<decltype(priority)>::min();
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// //
// fs/iov.h // fs/iov.h
@ -1255,6 +1380,7 @@ ircd::fs::make_iov(const iovec_view &iov,
const mutable_buffers &bufs, const mutable_buffers &bufs,
const size_t &offset) const size_t &offset)
{ {
assert(offset <= buffers::size(bufs));
const size_t max const size_t max
{ {
std::min(iov.size(), bufs.size()) std::min(iov.size(), bufs.size())
@ -1293,6 +1419,7 @@ ircd::fs::make_iov(const iovec_view &iov,
const const_buffers &bufs, const const_buffers &bufs,
const size_t &offset) const size_t &offset)
{ {
assert(offset <= buffers::size(bufs));
const size_t max const size_t max
{ {
std::min(iov.size(), bufs.size()) std::min(iov.size(), bufs.size())

View file

@ -28,7 +28,31 @@ ircd::fs::aio::support
true true
}; };
/// True if RWF_NOWAIT is support by AIO. /// True if RWF_SYNC is supported by AIO.
decltype(ircd::fs::aio::support_sync)
ircd::fs::aio::support_sync
{
	// RWF_SYNC capability: kernel 4.7 or anything newer. This is an ordered
	// (major, minor) comparison; testing each component on its own would
	// wrongly report no support on e.g. 5.0, whose minor is below 7.
	info::kversion[0] > 4 ||
	(info::kversion[0] == 4 && info::kversion[1] >= 7)
};
/// True if RWF_DSYNC is supported by AIO; derived from the running kernel's
/// version (4.7 or anything newer).
decltype(ircd::fs::aio::support_dsync)
ircd::fs::aio::support_dsync
{
	// Ordered (major, minor) comparison. Testing each component on its own
	// would wrongly report no support on e.g. 5.0, whose minor is below 7.
	info::kversion[0] > 4 ||
	(info::kversion[0] == 4 && info::kversion[1] >= 7)
};
/// True if RWF_HIPRI is supported by AIO; derived from the running kernel's
/// version (4.6 or anything newer).
decltype(ircd::fs::aio::support_hipri)
ircd::fs::aio::support_hipri
{
	// Ordered (major, minor) comparison. Testing each component on its own
	// would wrongly report no support on e.g. 5.0, whose minor is below 6.
	info::kversion[0] > 4 ||
	(info::kversion[0] == 4 && info::kversion[1] >= 6)
};
/// True if RWF_NOWAIT is supported by AIO.
decltype(ircd::fs::aio::support_nowait) decltype(ircd::fs::aio::support_nowait)
ircd::fs::aio::support_nowait ircd::fs::aio::support_nowait
{ {
@ -36,7 +60,7 @@ ircd::fs::aio::support_nowait
info::kversion[1] >= 14 info::kversion[1] >= 14
}; };
/// True if RWF_APPEND is support by AIO. /// True if RWF_APPEND is supported by AIO.
decltype(ircd::fs::aio::support_append) decltype(ircd::fs::aio::support_append)
ircd::fs::aio::support_append ircd::fs::aio::support_append
{ {
@ -215,6 +239,21 @@ ircd::fs::aio::request::write::write(const int &fd,
aio_buf = uintptr_t(iov.data()); aio_buf = uintptr_t(iov.data());
aio_nbytes = iov.size(); aio_nbytes = iov.size();
aio_offset = opts.offset; aio_offset = opts.offset;
#ifdef HAVE_PWRITEV2
if(aio::support_append && opts.offset == -1)
{
// AIO departs from pwritev2() behavior and EINVAL's on -1.
aio_offset = 0;
aio_rw_flags |= RWF_APPEND;
}
if(aio::support_dsync && opts.sync && !opts.metadata)
aio_rw_flags |= RWF_DSYNC;
if(aio::support_sync && opts.sync && opts.metadata)
aio_rw_flags |= RWF_SYNC;
#endif
} }
size_t size_t
@ -287,6 +326,14 @@ ircd::fs::aio::request::request(const int &fd,
aio_resfd = system->resfd.native_handle(); aio_resfd = system->resfd.native_handle();
aio_fildes = fd; aio_fildes = fd;
aio_data = uintptr_t(this); aio_data = uintptr_t(this);
#if defined(HAVE_PWRITEV2) && defined(HAVE_PREADV2)
if(aio::support_hipri && reqprio(opts->priority) == reqprio(opts::highest_priority))
aio_rw_flags |= RWF_HIPRI;
if(aio::support_nowait && !opts->blocking)
aio_rw_flags |= RWF_NOWAIT;
#endif
} }
ircd::fs::aio::request::~request() ircd::fs::aio::request::~request()