From 87144cef77cac5e70bf5ead03914f20e5bd89564 Mon Sep 17 00:00:00 2001 From: Jason Volk Date: Wed, 13 Mar 2019 19:33:58 -0700 Subject: [PATCH] ircd::fs: Support various RWF flags for operations; also preadv2(). --- configure.ac | 1 + include/ircd/fs/aio.h | 3 + include/ircd/fs/opts.h | 18 ++++- include/ircd/fs/write.h | 17 +++++ ircd/fs.cc | 155 ++++++++++++++++++++++++++++++++++++---- ircd/fs_aio.cc | 51 ++++++++++++- 6 files changed, 226 insertions(+), 19 deletions(-) diff --git a/configure.ac b/configure.ac index be6529b27..54d14b1f2 100644 --- a/configure.ac +++ b/configure.ac @@ -711,6 +711,7 @@ AC_CHECK_FUNCS([ \ vsnprintf \ posix_fadvise \ pwritev2 \ + preadv2 \ ]) AC_SEARCH_LIBS(dlinfo, dl, AC_DEFINE(HAVE_DLINFO, 1, [Define if you have dlinfo])) diff --git a/include/ircd/fs/aio.h b/include/ircd/fs/aio.h index dcc448906..f9d06f9c6 100644 --- a/include/ircd/fs/aio.h +++ b/include/ircd/fs/aio.h @@ -30,6 +30,9 @@ namespace ircd::fs::aio extern const bool support_fdsync; extern const bool support_append; extern const bool support_nowait; + extern const bool support_hipri; + extern const bool support_sync; + extern const bool support_dsync; extern const size_t MAX_EVENTS; extern const size_t MAX_REQPRIO; diff --git a/include/ircd/fs/opts.h b/include/ircd/fs/opts.h index b435ae66b..859b23a58 100644 --- a/include/ircd/fs/opts.h +++ b/include/ircd/fs/opts.h @@ -21,10 +21,15 @@ namespace ircd::fs /// Options common to all operations struct ircd::fs::opts { - /// Offset in the file. - off_t offset {0}; + static const int highest_priority; - /// Request priority. Lower value takes priority over higher. + /// Offset in the file. If this is -1, for writes, it indicates an append + /// at the end of the file (RWF_APPEND or legacy non-atomic lseek()). + off_t offset {0}; + + /// Request priority. Lower value takes priority over higher. The lowest + /// possible priority value is special, on supporting platforms (RWF_HIPRI). 
+ /// One can either simply set the integer minimum or use the extern value. int8_t priority {0}; /// Submits the I/O request immediately rather than allowing IRCd to @@ -32,6 +37,13 @@ struct ircd::fs::opts /// (only relevant to aio). bool nodelay {false}; + /// Setting this to false enables non-blocking behavior. If the operation + /// would block, EAGAIN is returned. This is only available with RWF_NOWAIT + /// on newer systems, otherwise this value is ignored and is always true. + /// This feature makes up for the fact that O_NONBLOCK when opening the + /// file is ineffective for regular files. + bool blocking {true}; + /// Determines whether this operation is conducted via AIO. If not, a /// direct syscall is made. Using AIO will only block one ircd::ctx while /// a direct syscall will block the thread (all contexts). If AIO is not diff --git a/include/ircd/fs/write.h b/include/ircd/fs/write.h index 13c98f7d9..101a8fd3b 100644 --- a/include/ircd/fs/write.h +++ b/include/ircd/fs/write.h @@ -66,6 +66,23 @@ struct ircd::fs::write_opts /// in the useful propagation of an exception for this event. bool interruptible {true}; + /// Whether to update the fd's offset on appends. This happens naturally + /// when the file is opened in append mode. If not, we get the same per- + /// write atomic seek behavior if RWF_APPEND is supported. In the latter + /// case, this option determines whether the fd's offset is affected. + bool update_offset {true}; + + /// Whether to RWF_SYNC or RWF_DSYNC depending on the metadata option. This + /// is a range-sync, it only covers the offset and size of the write; + /// perhaps a worthy replacement for sync_file_range(2). + bool sync {false}; + + /// When sync is true: if metadata is true RWF_SYNC (like fsync(2)) is used, + /// otherwise RWF_DSYNC (like fdatasync(2)) is used. This is only if available. + /// Careful: if it is not available you are responsible for following the + /// write with fsync(2)/fdatasync(2) yourself. 
+ bool metadata {false}; + write_opts(const off_t &); write_opts() = default; }; diff --git a/ircd/fs.cc b/ircd/fs.cc index 22f5a8ed1..b65905dd3 100644 --- a/ircd/fs.cc +++ b/ircd/fs.cc @@ -408,6 +408,7 @@ ircd::fs::flush(const fd &fd, namespace ircd::fs { + static size_t _read(const fd &, const const_iovec_view &, const read_opts &); static size_t read(const fd &, const const_iovec_view &, const read_opts &); } @@ -568,7 +569,7 @@ ircd::fs::read(const fd &fd, } #pragma GCC diagnostic pop -/// Lowest-level read() call. This call only conducts a single operation +/// Lowest-level'ish read() call. This call only conducts a single operation /// (no looping) and can return a partial read(). It does have branches /// for various read_opts. The arguments involve `struct ::iovec` which /// we do not expose to the ircd.h API; thus this function is internal to @@ -584,6 +585,38 @@ ircd::fs::read(const fd &fd, return aio::read(fd, iov, opts); #endif + return _read(fd, iov, opts); +} + +#ifdef HAVE_PREADV2 +size_t +ircd::fs::_read(const fd &fd, + const const_iovec_view &iov, + const read_opts &opts) +{ + int flags{0}; + + if(aio::support_hipri && reqprio(opts.priority) == reqprio(opts::highest_priority)) + flags |= RWF_HIPRI; + + if(aio::support_nowait && !opts.blocking) + flags |= RWF_NOWAIT; + + const auto ret + { + opts.interruptible? + syscall(::preadv2, fd, iov.data(), iov.size(), opts.offset, flags): + syscall_nointr(::preadv2, fd, iov.data(), iov.size(), opts.offset, flags) + }; + + return size_t(ret); +} +#else +size_t +ircd::fs::_read(const fd &fd, + const const_iovec_view &iov, + const read_opts &opts) +{ const auto ret { opts.interruptible? 
@@ -593,6 +626,7 @@ ircd::fs::read(const fd &fd, return size_t(ret); } +#endif // HAVE_PREADV2 /////////////////////////////////////////////////////////////////////////////// // @@ -738,17 +772,39 @@ ircd::fs::append(const string_view &path, return append(fd, bufs, opts); } +// When we have pwritev2() we can use RWF_APPEND indicated by +// the -1. Otherwise, we don't keep flags in userspace and we +// don't check the fd for whether it was opened with O_APPEND +// so the user may just have to eat the cost of an extra lseek(). +#ifdef HAVE_PWRITEV2 size_t ircd::fs::append(const fd &fd, const const_buffers &bufs, const write_opts &opts_) { auto opts(opts_); - if(!opts.offset) + if(!aio::support_append) + { + if(!opts.offset || opts.offset == -1) + opts.offset = syscall(::lseek, fd, 0, SEEK_END); + } + else opts.offset = -1; + + return write(fd, bufs, opts); +} +#else +size_t +ircd::fs::append(const fd &fd, + const const_buffers &bufs, + const write_opts &opts_) +{ + auto opts(opts_); + if(!opts.offset || opts.offset == -1) opts.offset = syscall(::lseek, fd, 0, SEEK_END); return write(fd, bufs, opts); } +#endif // HAVE_PWRITEV2 ircd::const_buffer ircd::fs::write(const string_view &path, @@ -812,8 +868,8 @@ ircd::fs::write(const fd &fd, info::iov_max }; + size_t off(0); write_opts opts(opts_); - size_t off(opts.offset - opts_.offset); assert(bufs.size() <= info::iov_max); struct ::iovec iovbuf[bufs.size()]; do { @@ -825,9 +881,8 @@ ircd::fs::write(const fd &fd, opts.offset += write(fd, iov, opts); assert(opts.offset >= opts_.offset); off = opts.offset - opts_.offset; - assert(off <= buffers::size(bufs)); } - while(opts.all && off < buffers::size(bufs)); + while(opts.all && opts_.offset >= 0 && off < buffers::size(bufs)); assert(opts.offset >= opts_.offset); assert(ssize_t(off) == opts.offset - opts_.offset); assert(!opts.all || off == buffers::size(bufs)); @@ -862,10 +917,34 @@ ircd::fs::_write(const fd &fd, { int flags{0}; + assert(opts.offset >= 0 || aio::support_append); + 
if(aio::support_append && opts.offset == -1) + flags |= RWF_APPEND; + + if(aio::support_hipri && reqprio(opts.priority) == reqprio(opts::highest_priority)) + flags |= RWF_HIPRI; + + if(aio::support_nowait && !opts.blocking) + flags |= RWF_NOWAIT; + + if(aio::support_dsync && opts.sync && !opts.metadata) + flags |= RWF_DSYNC; + + if(aio::support_sync && opts.sync && opts.metadata) + flags |= RWF_SYNC; + + // The manpage says that when appending with RWF_APPEND, the offset has no + // effect on the write; but if the value of the offset is -1 then the + // fd's offset is updated, otherwise it is not. + const off_t offset + { + (flags & RWF_APPEND) && !opts.update_offset? 0 : opts.offset + }; + return opts.interruptible? - syscall(::pwritev2, fd, iov.data(), iov.size(), opts.offset, flags): - syscall_nointr(::pwritev2, fd, iov.data(), iov.size(), opts.offset, flags); + syscall(::pwritev2, fd, iov.data(), iov.size(), offset, flags): + syscall_nointr(::pwritev2, fd, iov.data(), iov.size(), offset, flags); } #else size_t @@ -894,21 +973,61 @@ decltype(ircd::fs::aio::support) extern __attribute__((weak)) ircd::fs::aio::support; -decltype(ircd::fs::aio::support_fsync) +decltype(ircd::fs::aio::support_sync) extern __attribute__((weak)) -ircd::fs::aio::support_fsync; +ircd::fs::aio::support_sync +{ + info::kversion[0] >= 4 && + (info::kversion[0] > 4 || info::kversion[1] >= 7) +}; -decltype(ircd::fs::aio::support_fdsync) +decltype(ircd::fs::aio::support_dsync) extern __attribute__((weak)) -ircd::fs::aio::support_fdsync; +ircd::fs::aio::support_dsync +{ + info::kversion[0] >= 4 && + (info::kversion[0] > 4 || info::kversion[1] >= 7) +}; -decltype(ircd::fs::aio::support_append) +decltype(ircd::fs::aio::support_hipri) extern __attribute__((weak)) -ircd::fs::aio::support_append; +ircd::fs::aio::support_hipri +{ + info::kversion[0] >= 4 && + (info::kversion[0] > 4 || info::kversion[1] >= 6) +}; decltype(ircd::fs::aio::support_nowait) extern __attribute__((weak)) -ircd::fs::aio::support_nowait; +ircd::fs::aio::support_nowait +{ + info::kversion[0] >= 4 && + 
(info::kversion[0] > 4 || info::kversion[1] >= 14) +}; + +decltype(ircd::fs::aio::support_append) +extern __attribute__((weak)) +ircd::fs::aio::support_append +{ + info::kversion[0] >= 4 && + (info::kversion[0] > 4 || info::kversion[1] >= 16) +}; + +decltype(ircd::fs::aio::support_fsync) +extern __attribute__((weak)) +ircd::fs::aio::support_fsync +{ + info::kversion[0] >= 4 && + (info::kversion[0] > 4 || info::kversion[1] >= 18) +}; + +decltype(ircd::fs::aio::support_fdsync) +extern __attribute__((weak)) +ircd::fs::aio::support_fdsync +{ + info::kversion[0] >= 4 && + (info::kversion[0] > 4 || info::kversion[1] >= 18) +}; decltype(ircd::fs::aio::MAX_EVENTS) extern __attribute__((weak)) @@ -1245,6 +1364,12 @@ decltype(ircd::fs::opts_default) ircd::fs::opts_default {}; +decltype(ircd::fs::opts::highest_priority) +ircd::fs::opts::highest_priority +{ + std::numeric_limits<decltype(priority)>::min() +}; + /////////////////////////////////////////////////////////////////////////////// // // fs/iov.h @@ -1255,6 +1380,7 @@ ircd::fs::make_iov(const iovec_view &iov, const mutable_buffers &bufs, const size_t &offset) { + assert(offset <= buffers::size(bufs)); const size_t max { std::min(iov.size(), bufs.size()) @@ -1293,6 +1419,7 @@ ircd::fs::make_iov(const iovec_view &iov, const const_buffers &bufs, const size_t &offset) { + assert(offset <= buffers::size(bufs)); const size_t max { std::min(iov.size(), bufs.size()) diff --git a/ircd/fs_aio.cc b/ircd/fs_aio.cc index 293bfe088..3d7eeb453 100644 --- a/ircd/fs_aio.cc +++ b/ircd/fs_aio.cc @@ -28,7 +28,31 @@ ircd::fs::aio::support true }; -/// True if RWF_NOWAIT is support by AIO. +/// True if RWF_SYNC is supported by AIO. +decltype(ircd::fs::aio::support_sync) +ircd::fs::aio::support_sync +{ + info::kversion[0] >= 4 && + (info::kversion[0] > 4 || info::kversion[1] >= 7) +}; + +/// True if RWF_DSYNC is supported by AIO. +decltype(ircd::fs::aio::support_dsync) +ircd::fs::aio::support_dsync +{ + info::kversion[0] >= 4 && + (info::kversion[0] > 4 || info::kversion[1] >= 7) +}; + +/// True if RWF_HIPRI is supported by AIO. 
+decltype(ircd::fs::aio::support_hipri) +ircd::fs::aio::support_hipri +{ + info::kversion[0] >= 4 && + (info::kversion[0] > 4 || info::kversion[1] >= 6) +}; + +/// True if RWF_NOWAIT is supported by AIO. decltype(ircd::fs::aio::support_nowait) ircd::fs::aio::support_nowait { @@ -36,7 +60,7 @@ ircd::fs::aio::support_nowait info::kversion[1] >= 14 }; -/// True if RWF_APPEND is support by AIO. +/// True if RWF_APPEND is supported by AIO. decltype(ircd::fs::aio::support_append) ircd::fs::aio::support_append { @@ -215,6 +239,21 @@ ircd::fs::aio::request::write::write(const int &fd, aio_buf = uintptr_t(iov.data()); aio_nbytes = iov.size(); aio_offset = opts.offset; + + #ifdef HAVE_PWRITEV2 + if(aio::support_append && opts.offset == -1) + { + // AIO departs from pwritev2() behavior and EINVAL's on -1. + aio_offset = 0; + aio_rw_flags |= RWF_APPEND; + } + + if(aio::support_dsync && opts.sync && !opts.metadata) + aio_rw_flags |= RWF_DSYNC; + + if(aio::support_sync && opts.sync && opts.metadata) + aio_rw_flags |= RWF_SYNC; + #endif } size_t @@ -287,6 +326,14 @@ ircd::fs::aio::request::request(const int &fd, aio_resfd = system->resfd.native_handle(); aio_fildes = fd; aio_data = uintptr_t(this); + + #if defined(HAVE_PWRITEV2) && defined(HAVE_PREADV2) + if(aio::support_hipri && reqprio(opts->priority) == reqprio(opts::highest_priority)) + aio_rw_flags |= RWF_HIPRI; + + if(aio::support_nowait && !opts->blocking) + aio_rw_flags |= RWF_NOWAIT; + #endif } ircd::fs::aio::request::~request()