From 0e6a6ea0b1dedc97c6a58fdc112d94df30fcbca2 Mon Sep 17 00:00:00 2001 From: Jason Volk Date: Fri, 3 Jul 2020 13:20:26 -0700 Subject: [PATCH] ircd::json: Replace char generator w/ new vectorized string streaming. (fixes #158) (fixes #159) --- include/ircd/json/string.h | 10 + ircd/json.cc | 733 ++++++++++++++++++++++++++++++------- 2 files changed, 617 insertions(+), 126 deletions(-) diff --git a/include/ircd/json/string.h b/include/ircd/json/string.h index 1b100be7c..b99d2e536 100644 --- a/include/ircd/json/string.h +++ b/include/ircd/json/string.h @@ -28,6 +28,16 @@ namespace ircd::json struct ircd::json::string :string_view { + // Note that the input argument is not a json::string; the caller must + // strip surrounding quotes from the view otherwise they will be counted + // with their missing escapes in the return value. This is by design to + // avoid unintentionally stripping quotes from actual payloads. + static size_t serialized(const string_view &in) noexcept; + + // Transform input into canonical string content. The output buffer must + // be at least the size reported by serialized() on the same input. + static size_t stringify(const mutable_buffer &out, const string_view &in) noexcept; + string() = default; string(json::string &&) = default; string(const json::string &) = default; diff --git a/ircd/json.cc b/ircd/json.cc index 682332f55..dacfabfed 100644 --- a/ircd/json.cc +++ b/ircd/json.cc @@ -281,24 +281,11 @@ ircd::json::printer }; // string - struct string_state; - struct character_state; - using character_prototype = char(const string_view &, string_state &); - template static void character_dfa(char &__restrict__, context &, bool &) noexcept; - const rule> character + using string_context = boost::spirit::context, fusion::vector<>>; + static void string_generate(unused_type, string_context &, bool &) noexcept; + const rule string { - repeat[char_[([](auto &out, auto &gen, auto &ret) - { - character_dfa(out, gen, ret); - })]] - ,"character" - }; - - _a_type _string_state; - _val_type _string_input; - const rule> string - { - quote << *(character(_string_input, _string_state)) << quote + quote << eps[std::bind(&printer::string_generate, ph::_1, ph::_2, ph::_3)] << quote ,"string" }; @@ -359,128 +346,46 @@ ircd::json::printer } const ircd::json::printer; -struct ircd::json::printer::string_state -{ - uint32_t pos {0}; - bool escaped {0}; -}; - -struct ircd::json::printer::character_state -{ - static const char ctrl_tab[0x20][8]; - - enum mode - { - PASS, - LEAVE, - CTRL, - QUOTE, - ESCAPE, - ESCAPED, - } - mode {PASS}; - uint8_t pos {0}; -}; - -decltype(ircd::json::printer::character_state::ctrl_tab) -ircd::json::printer::character_state::ctrl_tab -{ - "\\0", - "\\u0001", "\\u0002", "\\u0003", - "\\u0004", "\\u0005", "\\u0006", - "\\u0007", - "\\b", - "\\t", - "\\n", - "\\u000B", - "\\f", - "\\r", - "\\u000E", "\\u000F", "\\u0010", - "\\u0011", "\\u0012", "\\u0013", - "\\u0014", "\\u0015", "\\u0016", - "\\u0017", "\\u0018", "\\u0019", - "\\u001A", "\\u001B", "\\u001C", - "\\u001D", "\\u001E", "\\u001F", -}; - -template inline void -ircd::json::printer::character_dfa(char &__restrict__ out, - gen &g, - bool &ret) +ircd::json::printer::string_generate(unused_type, + string_context &g, + bool &ret) noexcept { - using mode = decltype(character_state::mode); - #if __has_builtin(__builtin_assume) __builtin_assume(ret == true); #endif - const string_view &str(attr_at<1>(g)); // Whole input string. - const uint8_t &in(attr_at<0>(g)); // Current character in input. - - string_state &sst(attr_at<2>(g)); // Whole input string state. - auto &st(local_at<0>(g)); // Current character state. - - out = in; - st.mode = - st.mode != mode::PASS? st.mode: - sst.escaped? mode::ESCAPED: - in < 0x20U? mode::CTRL: - in == '"'? mode::QUOTE: - in == '\\'? mode::ESCAPE: - mode::PASS; - switch(st.mode) + assert(generator_state); + auto &state { - [[likely]] - case mode::PASS: - st.mode = mode::LEAVE; - break; // mode::PASS + *generator_state + }; - [[likely]] - case mode::LEAVE: - ret = false; - sst.pos++; - assert(sst.pos <= str.size()); - break; // mode::LEAVE + const string_view &input + { + attr_at<0>(g) + }; - case mode::CTRL: - out = st.ctrl_tab[in][st.pos++]; - ret &= out != '\0'; // break loop at this iteration - sst.pos += !ret; - assert(st.pos <= 8); - break; // mode::CTRL + const size_t output_length + { + json::string::stringify(state.out, input) + }; - case mode::QUOTE: - out = "\\\""_sv[st.pos++]; - ret &= out != '\0'; // break loop at this iteration - sst.pos += !ret; - assert(st.pos <= 8); - break; // mode::QUOTE + const size_t consumed + { + std::min(output_length, size(state.out)) + }; - case mode::ESCAPE: - st.mode = sst.pos + 1 < str.size()? - mode::LEAVE: - mode::PASS; // must spin if last char of string is esc - sst.escaped = true; - assert(sst.pos < str.size()); - break; // mode::ESCAPE + const size_t overflow + { + output_length - consumed + }; - case mode::ESCAPED: - { - const auto ok - { - (in == 'u') | (in == '"') | (in == '\\') - }; - - sst.escaped = false; - out = ok? out: '\\'; - st.mode = ok? - mode::LEAVE: - mode::PASS; - break; // mode::ESCAPED - } - } + state.consumed += consume(state.out, consumed); + state.generated += output_length; + state.overflow += overflow; + ret = !overflow; } template(ircd::data(buf) + count[1]) + }; + + const auto si + { + reinterpret_cast(input.data() + count[0]) + }; + + block_t block + { + _mm_loadu_si128(si) + }; + + const u64x2 consume + { + string_stringify(block, mask) + }; + + _mm_storeu_si128(di, block); + count += consume; + } + + while(count[0] < input.size()) + { + const size_t remain + { + input.size() - count[0] + }; + + size_t j(0); + block_t block{0}, mask{0}; + for(; j < remain && j < sizeof(block_t); ++j) + { + block[j] = input[count[0] + j]; + mask[j] = 0xff; + } + + const u64x2 consume + { + string_stringify(block, mask) + }; + + char *const __restrict__ di + { + ircd::data(buf) + count[1] + }; + + block_t di_mask{0}; + for(size_t i(0); i < consume[1] && i + count[1] < ircd::size(buf); ++i) + di_mask[i] = 0xff; + + _mm_maskmoveu_si128(block, di_mask, di); + count += consume; + } + + return count[1]; +} + +/// Returns two addends to the outer loop. The first advances the input string +/// pointer any number of bytes; the block for the next invocation will start +/// at the new offset. This function may want to advance the input less than +/// the full block width if there's a possibility something important is being +/// split between blocks (i.e. an escaped utf-16 surrogate pair of 12 chars); +/// next invocation will then encounter the contiguous sequence without issue. +/// The second value is added to the final return count to indicate the length +/// of the input string in serialized form after correction. Partial sequences +/// trailing off the block are not counted here so they can be pushed over to +/// the next invocation. +/// +/// The input is a block of characters from the original string. When the block +/// cannot be filled at the end of a string (or a short string) the block_mask +/// will indicate 0 for any bytes past the end, otherwise -1 for valid chars; +/// note that null characters in the string are valid which we will escape. +/// +ircd::u64x2 +ircd::json::string_stringify(u8x16 &__restrict__ block, + const u8x16 block_mask) +{ + const u8x16 is_esc + { + block == '\\' + }; + + const u8x16 is_quote + { + block == '"' + }; + + const u8x16 is_ctrl + { + block < 0x20 + }; + + const u8x16 is_special + { + is_esc | is_quote | is_ctrl + }; + + // Count the number of uninteresting characters from the front of the + // block. With the special characters masked, we count leading zeroes. + // The inverted block_mask generates non-zero bits to terminate any + // counting past the end of the string. + const u64 regular_prefix_count + { + simd::clz(u64x2(is_special | ~block_mask)) / 8 + }; + + // Fast-path; backward branch to count and consume uninteresting characters + // from the front of the input. + if(likely(regular_prefix_count)) + return u64x2 + { + regular_prefix_count, regular_prefix_count, + }; + + // Slow-path; decide what to do based on the next character. + switch(block[0]) + { + // Covers the ctrl 0x00-0x20 range only; no other character here. + default: + { + assert(block[0] < 0x20); + __builtin_assume(block[0] < 0x20); + + const u8 idx{block[0]}; + block = *reinterpret_cast(ctrl_tab + idx); + return u64x2 + { + 1, u64(ctrl_tab_len[idx]) + }; + } + + // Unescaped quote + case '"': + block[0] = '\\'; + block[1] = '"'; + return u64x2 + { + 1, 2 + }; + + // Escape sequence + case '\\': switch(block[1] & block_mask[1]) + { + // Legitimately escaped single chars + case '\\': + case '"': + case 'b': + case 't': + case 'n': + case 'f': + case 'r': + case '0': + block[0] = '\\'; + block[1] = block[1]; + return u64x2 + { + 2, 2 + }; + + case 'u': // Possible utf-16 surrogate(s) + return string_stringify_utf16(block, block_mask); + + // Unnecessary escape; unless it's the last char. + default: + block[0] = '\\'; + block[1] = '\\'; + return u64x2 + { + 1, block_mask[1]? 0UL: 2UL + }; + }; + } +} + +ircd::u64x2 +ircd::json::string_stringify_utf16(u8x16 &__restrict__ block, + const u8x16 block_mask) +{ + const u32x4 unicode + { + utf16::decode_surrogate_aligned_next(block & block_mask) + }; + + const u32x4 is_surrogate + { + utf16::find_surrogate(block & block_mask) + }; + + const u32x4 is_ctrl + { + unicode < 0x20 + }; + + const u32x4 length_encoded + { + utf8::length(unicode) & ~is_ctrl + }; + + const u32x4 ctrl_idx + { + unicode & is_ctrl + }; + + const u32x4 length_surrogate + { + u32(ctrl_tab_len[ctrl_idx[0]]), + u32(ctrl_tab_len[ctrl_idx[1]]), + }; + + const u32x4 length + { + (length_encoded | length_surrogate) & is_surrogate + }; + + const u32x4 encoded_sparse + { + utf8::encode(unicode) + }; + + const u8x16 encoded + { + encoded_sparse + }; + + size_t di(0); + for(size_t i(0); i < 2; ++i) + for(size_t j(0); j < length[i]; ++j) + block[di++] = is_ctrl[i]? + ctrl_tab[ctrl_idx[i]][j]: + encoded[i * 4 + j]; + + const auto total_decoded + { + 6UL * ((is_surrogate[0] & 1) + (is_surrogate[1] & 1)) + }; + + assert(di == length[0] + length[1]); + return u64x2 + { + total_decoded, di + }; +} + +size_t +ircd::json::string::serialized(const string_view &input) +noexcept +{ + using block_t = u8x16; + using vec_t = u128x1; + using unaligned_vec_t = u128x1_u; + static_assert(sizeof(block_t) == sizeof(vec_t)); + + u8x16 block; + u64x2 count{0}; // input pos, return value + while(count[0] + sizeof(block_t) <= input.size()) + { + const auto ptr + { + reinterpret_cast(input.data() + count[0]) + }; + + block = _mm_loadu_si128(ptr); + count += string_serialized(block, ~u8x16{0}); + } + + while(count[0] < input.size()) + { + const size_t remain(input.size() - count[0]); + assert(remain < sizeof(block_t)); + + size_t j(0); + u8x16 mask{0}; + for(; count[0] + j < input.size(); ++j) + { + block[j] = input[count[0] + j]; + mask[j] = 0xff; + } + + count += string_serialized(block, mask); + } + + return count[1]; +} + +ircd::u64x2 +ircd::json::string_serialized(const u8x16 block, + const u8x16 block_mask) +{ + const u8x16 is_esc + { + block == '\\' + }; + + const u8x16 is_quote + { + block == '"' + }; + + const u8x16 is_ctrl + { + block < 0x20 + }; + + const u8x16 is_special + { + is_esc | is_quote | is_ctrl + }; + + const u64 regular_prefix_count + { + simd::clz(u64x2(is_special | ~block_mask)) / 8 + }; + + // Fast-path; backward branch to count and consume uninteresting characters + // from the front of the input. + if(likely(regular_prefix_count)) + return u64x2 + { + regular_prefix_count, regular_prefix_count, + }; + + // Slow-path; decide what to do based on the next character. + switch(block[0]) + { + // Covers the ctrl 0x00-0x20 range only; no other character here. + default: + assert(block[0] < 0x20); + __builtin_assume(block[0] < 0x20); + return string_serialized_ctrl(block, block_mask, is_ctrl); + + // Unescaped quote: +1 + case '"': + return u64x2 + { + 1, 2 + }; + + // Escape sequence + case '\\': switch(block[1] & block_mask[1]) + { + // Legitimately escaped single chars + case '"': + case 'b': + case 't': + case 'n': + case 'f': + case 'r': + case '0': + return u64x2 + { + 2, 2 + }; + + case 'u': // Possible utf-16 surrogate(s) + return string_serialized_utf16(block, block_mask); + + // Unnecessary escape; unless it's the last char: -1 + default: + return u64x2 + { + 1, block_mask[1]? 0UL: 2UL + }; + }; + } +} + +ircd::u64x2 +ircd::json::string_serialized_utf16(const u8x16 block, + const u8x16 block_mask) +{ + const u32x4 is_surrogate + { + utf16::find_surrogate(block & block_mask) + }; + + const u32x4 unicode + { + utf16::decode_surrogate_aligned_next(block & block_mask) + }; + + const u32x4 is_ctrl + { + unicode < 0x20 + }; + + // Determine the utf-8 encoding length for each codepoint but + // null out codepoints in the control character range. + const u32x4 length_encoded + { + utf8::length(unicode) & ~is_ctrl + }; + + const u32x4 ctrl_idx + { + unicode & is_ctrl + }; + + const i32x4 surrogate_len + { + ctrl_tab_len[ctrl_idx[0]], + ctrl_tab_len[ctrl_idx[1]], + }; + + // Supplement the escaped surrogate length for excluded codepoints. + const u32x4 length + { + (length_encoded | (is_ctrl & surrogate_len)) & is_surrogate + }; + + const auto total_length + { + length[0] + length[1] + }; + + const auto total_decoded + { + 6UL * ((is_surrogate[0] & 1) + (is_surrogate[1] & 1)) + }; + + return u64x2 + { + total_decoded, total_length, + }; +} + +ircd::u64x2 +ircd::json::string_serialized_ctrl(const u8x16 block, + const u8x16 block_mask, + const u8x16 is_ctrl) +{ + assert(block[0] < 0x20); + const u8x16 ctrl_esc_len + { + lookup_ctrl_tab_len(block & is_ctrl) + }; + + const u64 ctrl_prefix_count + { + simd::clz(u64x2(~is_ctrl | ~block_mask)) / 8 + }; + + u64 ret(0); + for(size_t i(0); i < ctrl_prefix_count; ++i) + ret += ctrl_esc_len[i]; + + return u64x2 + { + ctrl_prefix_count, ret + }; +} + +/// Performs a parallel transform of control characters in the input into +/// the length of their escape surrogate. The input character must be in +/// the control character range. +ircd::u8x16 +ircd::json::lookup_ctrl_tab_len(const u8x16 in) +{ + static const int32_t *const tab + { + ctrl_tab_len + }; + + size_t k(0); + i32x4 idx[4] + { + { in[k++], in[k++], in[k++], in[k++] }, + { in[k++], in[k++], in[k++], in[k++] }, + { in[k++], in[k++], in[k++], in[k++] }, + { in[k++], in[k++], in[k++], in[k++] }, + }; + + size_t i, j; + i32x8 res[2]; + for(i = 0; i < 2; ++i) + for(j = 0; j < 8; ++j) + res[i][j] = tab[idx[i][j]]; + + i8x16 ret; + k = 0; + for(i = 0; i < 2; ++i) + for(j = 0; j < 8; ++j) + ret[k++] = res[i][j]; + + return ret; +} + /////////////////////////////////////////////////////////////////////////////// // // json/value.h