0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-12-25 23:14:13 +01:00

ircd::json: Replace char generator w/ new vectorized string streaming. (fixes #158) (fixes #159)

This commit is contained in:
Jason Volk 2020-07-03 13:20:26 -07:00
parent 76a16469fb
commit 0e6a6ea0b1
2 changed files with 617 additions and 126 deletions

View file

@ -28,6 +28,16 @@ namespace ircd::json
struct ircd::json::string
:string_view
{
// Note that the input argument is not a json::string; the caller must
// strip surrounding quotes from the view otherwise they will be counted
// with their missing escapes in the return value. This is by design to
// avoid unintentionally stripping quotes from actual payloads.
static size_t serialized(const string_view &in) noexcept;
// Transform input into canonical string content. The output buffer must
// be at least the size reported by serialized() on the same input.
static size_t stringify(const mutable_buffer &out, const string_view &in) noexcept;
string() = default;
string(json::string &&) = default;
string(const json::string &) = default;

View file

@ -281,24 +281,11 @@ ircd::json::printer
};
// string
struct string_state;
struct character_state;
using character_prototype = char(const string_view &, string_state &);
template<class context> static void character_dfa(char &__restrict__, context &, bool &) noexcept;
const rule<character_prototype, locals<character_state>> character
using string_context = boost::spirit::context<fusion::cons<const string_view &>, fusion::vector<>>;
static void string_generate(unused_type, string_context &, bool &) noexcept;
const rule<string_view()> string
{
repeat[char_[([](auto &out, auto &gen, auto &ret)
{
character_dfa(out, gen, ret);
})]]
,"character"
};
_a_type _string_state;
_val_type _string_input;
const rule<string_view(), locals<string_state>> string
{
quote << *(character(_string_input, _string_state)) << quote
quote << eps[std::bind(&printer::string_generate, ph::_1, ph::_2, ph::_3)] << quote
,"string"
};
@ -359,128 +346,46 @@ ircd::json::printer
}
const ircd::json::printer;
/// Generator state shared by every character of a single input string.
struct ircd::json::printer::string_state
{
	/// Position of the input character currently being generated.
	uint32_t pos {0};

	/// Raised after a backslash is seen in the input; cleared once the
	/// character following it has been examined.
	bool escaped {false};
};
/// Generator state for one input character, which may expand into several
/// output characters.
struct ircd::json::printer::character_state
{
	/// Escape sequences for the control characters 0x00 through 0x1F.
	static const char ctrl_tab[0x20][8];

	/// What the generator is doing with the current input character.
	enum mode
	{
		PASS,      // Not yet classified; emit the character as-is.
		LEAVE,     // Finished with this character; advance the input.
		CTRL,      // Emitting the escape sequence of a control character.
		QUOTE,     // Emitting the escape sequence of an unescaped quote.
		ESCAPE,    // Input character is a backslash.
		ESCAPED,   // Input character follows a backslash.
	}
	mode {PASS};

	/// Offset into the escape sequence currently being emitted.
	uint8_t pos {0};
};
/// Escape sequence for each control character, indexed by character value.
decltype(ircd::json::printer::character_state::ctrl_tab)
ircd::json::printer::character_state::ctrl_tab
{
	"\\0",       // 0x00
	"\\u0001",   // 0x01
	"\\u0002",   // 0x02
	"\\u0003",   // 0x03
	"\\u0004",   // 0x04
	"\\u0005",   // 0x05
	"\\u0006",   // 0x06
	"\\u0007",   // 0x07
	"\\b",       // 0x08
	"\\t",       // 0x09
	"\\n",       // 0x0A
	"\\u000B",   // 0x0B
	"\\f",       // 0x0C
	"\\r",       // 0x0D
	"\\u000E",   // 0x0E
	"\\u000F",   // 0x0F
	"\\u0010",   // 0x10
	"\\u0011",   // 0x11
	"\\u0012",   // 0x12
	"\\u0013",   // 0x13
	"\\u0014",   // 0x14
	"\\u0015",   // 0x15
	"\\u0016",   // 0x16
	"\\u0017",   // 0x17
	"\\u0018",   // 0x18
	"\\u0019",   // 0x19
	"\\u001A",   // 0x1A
	"\\u001B",   // 0x1B
	"\\u001C",   // 0x1C
	"\\u001D",   // 0x1D
	"\\u001E",   // 0x1E
	"\\u001F",   // 0x1F
};
template<class gen>
inline void
ircd::json::printer::character_dfa(char &__restrict__ out,
gen &g,
bool &ret)
ircd::json::printer::string_generate(unused_type,
string_context &g,
bool &ret)
noexcept
{
using mode = decltype(character_state::mode);
#if __has_builtin(__builtin_assume)
__builtin_assume(ret == true);
#endif
const string_view &str(attr_at<1>(g)); // Whole input string.
const uint8_t &in(attr_at<0>(g)); // Current character in input.
string_state &sst(attr_at<2>(g)); // Whole input string state.
auto &st(local_at<0>(g)); // Current character state.
out = in;
st.mode =
st.mode != mode::PASS? st.mode:
sst.escaped? mode::ESCAPED:
in < 0x20U? mode::CTRL:
in == '"'? mode::QUOTE:
in == '\\'? mode::ESCAPE:
mode::PASS;
switch(st.mode)
assert(generator_state);
auto &state
{
[[likely]]
case mode::PASS:
st.mode = mode::LEAVE;
break; // mode::PASS
*generator_state
};
[[likely]]
case mode::LEAVE:
ret = false;
sst.pos++;
assert(sst.pos <= str.size());
break; // mode::LEAVE
const string_view &input
{
attr_at<0>(g)
};
case mode::CTRL:
out = st.ctrl_tab[in][st.pos++];
ret &= out != '\0'; // break loop at this iteration
sst.pos += !ret;
assert(st.pos <= 8);
break; // mode::CTRL
const size_t output_length
{
json::string::stringify(state.out, input)
};
case mode::QUOTE:
out = "\\\""_sv[st.pos++];
ret &= out != '\0'; // break loop at this iteration
sst.pos += !ret;
assert(st.pos <= 8);
break; // mode::QUOTE
const size_t consumed
{
std::min(output_length, size(state.out))
};
case mode::ESCAPE:
st.mode = sst.pos + 1 < str.size()?
mode::LEAVE:
mode::PASS; // must spin if last char of string is esc
sst.escaped = true;
assert(sst.pos < str.size());
break; // mode::ESCAPE
const size_t overflow
{
output_length - consumed
};
case mode::ESCAPED:
{
const auto ok
{
(in == 'u') | (in == '"') | (in == '\\')
};
sst.escaped = false;
out = ok? out: '\\';
st.mode = ok?
mode::LEAVE:
mode::PASS;
break; // mode::ESCAPED
}
}
state.consumed += consume(state.out, consumed);
state.generated += output_length;
state.overflow += overflow;
ret = !overflow;
}
template<class gen,
@ -3284,6 +3189,66 @@ ircd::json::operator==(const member &a, const string_view &b)
// json/string.h
//
// Internal tables and block-wise helpers backing json::string::serialized()
// and json::string::stringify(). Each helper operates on one 16-byte block
// of input; the u64x2 results carry {input bytes consumed, output length}.
namespace ircd::json
{
// Escape sequence text for each control character 0x00-0x1F; rows are 16
// bytes wide.
extern const char ctrl_tab[0x20][16];
// Length in characters of each row of ctrl_tab.
extern const int32_t ctrl_tab_len[0x20];
// Maps each lane holding a control character to its ctrl_tab_len entry.
static u8x16 lookup_ctrl_tab_len(const u8x16 block);
// Length accounting for a run of control characters at the front of block.
static u64x2 string_serialized_ctrl(const u8x16 block, const u8x16 mask, const u8x16 ctrl_mask);
// Length accounting for a \u escape at the front of block.
static u64x2 string_serialized_utf16(const u8x16 block, const u8x16 mask);
// Length accounting for whatever is at the front of block.
static u64x2 string_serialized(const u8x16 block, const u8x16 mask);
// Rewrites block in place for a \u escape at the front of block.
static u64x2 string_stringify_utf16(u8x16 &__restrict__ block, const u8x16 mask);
// Rewrites block in place for whatever is at the front of block.
static u64x2 string_stringify(u8x16 &__restrict__ block, const u8x16 mask);
}
/// Escaped control character LUT. Indexed by the value of the control
/// character; every row is 16 bytes wide and the table is 32-byte aligned.
decltype(ircd::json::ctrl_tab)
ircd::json::ctrl_tab
alignas(32)
{
	"\\0",       // 0x00
	"\\u0001",   // 0x01
	"\\u0002",   // 0x02
	"\\u0003",   // 0x03
	"\\u0004",   // 0x04
	"\\u0005",   // 0x05
	"\\u0006",   // 0x06
	"\\u0007",   // 0x07
	"\\b",       // 0x08
	"\\t",       // 0x09
	"\\n",       // 0x0A
	"\\u000B",   // 0x0B
	"\\f",       // 0x0C
	"\\r",       // 0x0D
	"\\u000E",   // 0x0E
	"\\u000F",   // 0x0F
	"\\u0010",   // 0x10
	"\\u0011",   // 0x11
	"\\u0012",   // 0x12
	"\\u0013",   // 0x13
	"\\u0014",   // 0x14
	"\\u0015",   // 0x15
	"\\u0016",   // 0x16
	"\\u0017",   // 0x17
	"\\u0018",   // 0x18
	"\\u0019",   // 0x19
	"\\u001A",   // 0x1A
	"\\u001B",   // 0x1B
	"\\u001C",   // 0x1C
	"\\u001D",   // 0x1D
	"\\u001E",   // 0x1E
	"\\u001F",   // 0x1F
};
/// Escaped control character LUT length hints. Entry N is the number of
/// characters in row N of ctrl_tab: 2 for the short escapes, 6 for the
/// \uXXXX form.
decltype(ircd::json::ctrl_tab_len)
ircd::json::ctrl_tab_len
alignas(32)
{
	2, 6, 6, 6, 6, 6, 6, 6,   // 0x00 - 0x07
	2, 2, 2, 6, 2, 2, 6, 6,   // 0x08 - 0x0F
	6, 6, 6, 6, 6, 6, 6, 6,   // 0x10 - 0x17
	6, 6, 6, 6, 6, 6, 6, 6,   // 0x18 - 0x1F
};
ircd::const_buffer
ircd::json::unescape(const mutable_buffer &buf,
const string &in)
@ -3305,6 +3270,522 @@ ircd::json::escape(const mutable_buffer &buf,
return ret;
}
/// Streaming transform for canonical JSON strings. This function takes
/// virtually any input and "always makes it right" i.e. always outputs
/// the application's so-called canonical JSON.
///
/// This involves a variable-length transformation where the output might
/// end up as significantly longer or shorter than the input; neither will
/// have any hope for aligned access, and most of the inputs are short and
/// already canonical. This is all tricky.
///
size_t
ircd::json::string::stringify(const mutable_buffer &buf,
const string_view &input)
noexcept
{
using block_t = u8x16;
using vec_t = u128x1;
using unaligned_vec_t = u128x1_u;
static_assert(sizeof(block_t) == sizeof(vec_t));
u64x2 count{0}; // input pos, return value
while(count[0] + sizeof(block_t) <= input.size() && count[1] + sizeof(block_t) <= ircd::size(buf))
{
static const auto mask
{
~block_t{0}
};
const auto di
{
reinterpret_cast<u128x1_u *__restrict__>(ircd::data(buf) + count[1])
};
const auto si
{
reinterpret_cast<const u128x1_u *__restrict__>(input.data() + count[0])
};
block_t block
{
_mm_loadu_si128(si)
};
const u64x2 consume
{
string_stringify(block, mask)
};
_mm_storeu_si128(di, block);
count += consume;
}
while(count[0] < input.size())
{
const size_t remain
{
input.size() - count[0]
};
size_t j(0);
block_t block{0}, mask{0};
for(; j < remain && j < sizeof(block_t); ++j)
{
block[j] = input[count[0] + j];
mask[j] = 0xff;
}
const u64x2 consume
{
string_stringify(block, mask)
};
char *const __restrict__ di
{
ircd::data(buf) + count[1]
};
block_t di_mask{0};
for(size_t i(0); i < consume[1] && i + count[1] < ircd::size(buf); ++i)
di_mask[i] = 0xff;
_mm_maskmoveu_si128(block, di_mask, di);
count += consume;
}
return count[1];
}
/// Transforms the front of one block in place and reports two addends for
/// the caller's counters: [0] is how far to advance the input, [1] is how
/// many bytes at the front of `block` are now valid output.
///
/// The input advance may be less than the block width. When a sequence could
/// straddle two blocks (i.e. an escaped utf-16 surrogate pair of 12 chars)
/// the caller re-presents it at the front of the next block so it is seen
/// contiguously. Partial sequences trailing off the block are not counted.
///
/// `block_mask` holds -1 in every lane backed by real input and 0 in lanes
/// past the end of the string. Null characters inside the string are valid
/// input and get escaped.
///
ircd::u64x2
ircd::json::string_stringify(u8x16 &__restrict__ block,
                             const u8x16 block_mask)
{
	const u8x16 backslashes
	{
		block == '\\'
	};

	const u8x16 quotes
	{
		block == '"'
	};

	const u8x16 controls
	{
		block < 0x20
	};

	const u8x16 specials
	{
		backslashes | quotes | controls
	};

	// Length of the run of ordinary characters at the front of the block.
	// Lanes past the end of the string are forced non-zero through the
	// inverted mask so the count cannot run beyond the input.
	const u64 plain_run
	{
		simd::clz(u64x2(specials | ~block_mask)) / 8
	};

	// Fast-path; ordinary characters pass through untouched.
	if(likely(plain_run))
		return u64x2
		{
			plain_run, plain_run,
		};

	// Slow-path; the first character needs attention.
	const u8 head(block[0]);

	// Unescaped quote gains a backslash.
	if(head == '"')
	{
		block[0] = '\\';
		block[1] = '"';
		return u64x2
		{
			1, 2
		};
	}

	// Anything else which is not a backslash is a control character in the
	// 0x00-0x1F range; replace the block with its table entry.
	if(head != '\\')
	{
		assert(head < 0x20);
		__builtin_assume(head < 0x20);
		block = *reinterpret_cast<const u128x1 *>(ctrl_tab + head);
		return u64x2
		{
			1, u64(ctrl_tab_len[head])
		};
	}

	// Escape sequence; the character after the backslash decides. A masked
	// off second lane reads as zero here.
	const u8 next(block[1] & block_mask[1]);
	switch(next)
	{
		// Legitimately escaped single chars; the block already holds the
		// two-character sequence verbatim.
		case '\\':
		case '"':
		case 'b':
		case 't':
		case 'n':
		case 'f':
		case 'r':
		case '0':
			return u64x2
			{
				2, 2
			};

		// Possible utf-16 surrogate(s)
		case 'u':
			return string_stringify_utf16(block, block_mask);

		// Unnecessary escape is dropped by consuming the backslash and
		// emitting nothing; unless it's the last char of the string, in
		// which case an escaped backslash is emitted.
		default:
			block[0] = '\\';
			block[1] = '\\';
			return u64x2
			{
				1, block_mask[1]? 0UL: 2UL
			};
	}
}
/// Handles a `\u` escape at the front of the block. Up to two escaped
/// utf-16 code units are decoded; each resulting codepoint is written to the
/// front of `block` either as its ctrl_tab escape sequence (when it is a
/// control character below 0x20) or as its utf-8 encoding.
///
/// Returns {input characters consumed, output bytes written}. Six input
/// characters are consumed for every lane reported by find_surrogate().
///
/// NOTE(review): if find_surrogate() reports nothing in either lane this
/// returns {0, 0}; confirm the callers' loops cannot spin on that result.
///
ircd::u64x2
ircd::json::string_stringify_utf16(u8x16 &__restrict__ block,
                                   const u8x16 block_mask)
{
	const u32x4 unicode
	{
		utf16::decode_surrogate_aligned_next(block & block_mask)
	};

	const u32x4 is_surrogate
	{
		utf16::find_surrogate(block & block_mask)
	};

	const u32x4 is_ctrl
	{
		unicode < 0x20
	};

	// utf-8 encoding length for each codepoint, nulled for codepoints in
	// the control character range.
	const u32x4 length_encoded
	{
		utf8::length(unicode) & ~is_ctrl
	};

	// Table index for control codepoints; zero for everything else.
	const u32x4 ctrl_idx
	{
		unicode & is_ctrl
	};

	const u32x4 length_surrogate
	{
		u32(ctrl_tab_len[ctrl_idx[0]]),
		u32(ctrl_tab_len[ctrl_idx[1]]),
	};

	// The table length only applies to control codepoints. For any other
	// codepoint ctrl_idx is zero and the lookup yields ctrl_tab_len[0],
	// which must not be merged into the utf-8 length; mask it with is_ctrl
	// exactly as string_serialized_utf16() does so both agree on the size.
	const u32x4 length
	{
		(length_encoded | (length_surrogate & is_ctrl)) & is_surrogate
	};

	const u32x4 encoded_sparse
	{
		utf8::encode(unicode)
	};

	const u8x16 encoded
	{
		encoded_sparse
	};

	// Pack the per-codepoint output contiguously at the front of the block.
	size_t di(0);
	for(size_t i(0); i < 2; ++i)
		for(size_t j(0); j < length[i]; ++j)
			block[di++] = is_ctrl[i]?
				ctrl_tab[ctrl_idx[i]][j]:
				encoded[i * 4 + j];

	const auto total_decoded
	{
		6UL * ((is_surrogate[0] & 1) + (is_surrogate[1] & 1))
	};

	assert(di == length[0] + length[1]);
	return u64x2
	{
		total_decoded, di
	};
}
size_t
ircd::json::string::serialized(const string_view &input)
noexcept
{
using block_t = u8x16;
using vec_t = u128x1;
using unaligned_vec_t = u128x1_u;
static_assert(sizeof(block_t) == sizeof(vec_t));
u8x16 block;
u64x2 count{0}; // input pos, return value
while(count[0] + sizeof(block_t) <= input.size())
{
const auto ptr
{
reinterpret_cast<const u128x1_u *>(input.data() + count[0])
};
block = _mm_loadu_si128(ptr);
count += string_serialized(block, ~u8x16{0});
}
while(count[0] < input.size())
{
const size_t remain(input.size() - count[0]);
assert(remain < sizeof(block_t));
size_t j(0);
u8x16 mask{0};
for(; count[0] + j < input.size(); ++j)
{
block[j] = input[count[0] + j];
mask[j] = 0xff;
}
count += string_serialized(block, mask);
}
return count[1];
}
/// Length-only counterpart of string_stringify(). Examines the front of one
/// block and returns {input characters to advance, serialized length those
/// characters contribute}. `block_mask` holds -1 in lanes backed by real
/// input and 0 in lanes past the end of the string.
///
/// The decisions made here must mirror string_stringify() case for case;
/// otherwise serialized() reports a size which differs from what
/// stringify() actually generates.
///
ircd::u64x2
ircd::json::string_serialized(const u8x16 block,
                              const u8x16 block_mask)
{
	const u8x16 is_esc
	{
		block == '\\'
	};

	const u8x16 is_quote
	{
		block == '"'
	};

	const u8x16 is_ctrl
	{
		block < 0x20
	};

	const u8x16 is_special
	{
		is_esc | is_quote | is_ctrl
	};

	// Count the ordinary characters at the front of the block. The inverted
	// block_mask terminates the count at the end of the string.
	const u64 regular_prefix_count
	{
		simd::clz(u64x2(is_special | ~block_mask)) / 8
	};

	// Fast-path; backward branch to count and consume uninteresting characters
	// from the front of the input.
	if(likely(regular_prefix_count))
		return u64x2
		{
			regular_prefix_count, regular_prefix_count,
		};

	// Slow-path; decide what to do based on the next character.
	switch(block[0])
	{
		// Covers the ctrl 0x00-0x1F range only; no other character here.
		default:
			assert(block[0] < 0x20);
			__builtin_assume(block[0] < 0x20);
			return string_serialized_ctrl(block, block_mask, is_ctrl);

		// Unescaped quote: +1
		case '"':
			return u64x2
			{
				1, 2
			};

		// Escape sequence
		case '\\': switch(block[1] & block_mask[1])
		{
			// Legitimately escaped single chars. The escaped backslash has
			// to be listed here just as in string_stringify(); without it
			// the pair is consumed one backslash at a time, the second
			// backslash is then treated as the start of a new escape, and
			// the reported length falls short of what is generated.
			case '\\':
			case '"':
			case 'b':
			case 't':
			case 'n':
			case 'f':
			case 'r':
			case '0':
				return u64x2
				{
					2, 2
				};

			case 'u': // Possible utf-16 surrogate(s)
				return string_serialized_utf16(block, block_mask);

			// Unnecessary escape; unless it's the last char: -1
			default:
				return u64x2
				{
					1, block_mask[1]? 0UL: 2UL
				};
		};
	}
}
/// Length-only handling of a `\u` escape at the front of the block. Returns
/// {input characters consumed, serialized length}. Six input characters are
/// consumed for each lane reported by find_surrogate(); each codepoint
/// contributes either its ctrl_tab escape length (control characters below
/// 0x20) or its utf-8 encoded length.
///
ircd::u64x2
ircd::json::string_serialized_utf16(const u8x16 block,
                                    const u8x16 block_mask)
{
	// Only lanes backed by real input take part.
	const u8x16 valid
	{
		block & block_mask
	};

	const u32x4 found
	{
		utf16::find_surrogate(valid)
	};

	const u32x4 codepoint
	{
		utf16::decode_surrogate_aligned_next(valid)
	};

	const u32x4 ctrl
	{
		codepoint < 0x20
	};

	// utf-8 length per codepoint; zero where the codepoint is a control
	// character because those are emitted escaped instead.
	const u32x4 utf8_len
	{
		utf8::length(codepoint) & ~ctrl
	};

	// Table index per codepoint; zero where it is not a control character.
	const u32x4 tab_idx
	{
		codepoint & ctrl
	};

	const i32x4 escape_len
	{
		ctrl_tab_len[tab_idx[0]],
		ctrl_tab_len[tab_idx[1]],
	};

	// Merge both length sources and discard lanes without a surrogate.
	const u32x4 out_len
	{
		(utf8_len | (ctrl & escape_len)) & found
	};

	const auto consumed
	{
		6UL * ((found[0] & 1) + (found[1] & 1))
	};

	const auto generated
	{
		out_len[0] + out_len[1]
	};

	return u64x2
	{
		consumed, generated,
	};
}
/// Length accounting for a run of control characters at the front of the
/// block. Returns {number of control characters in the run, sum of their
/// escape sequence lengths}. The run ends at the first lane which is not a
/// control character or lies past the end of the string.
///
ircd::u64x2
ircd::json::string_serialized_ctrl(const u8x16 block,
                                   const u8x16 block_mask,
                                   const u8x16 is_ctrl)
{
	assert(block[0] < 0x20);

	// Escape length for every lane; non-control lanes are zeroed before the
	// lookup so they remain valid table indexes.
	const u8x16 esc_len
	{
		lookup_ctrl_tab_len(block & is_ctrl)
	};

	const u64 run
	{
		simd::clz(u64x2(~is_ctrl | ~block_mask)) / 8
	};

	u64 total(0);
	for(size_t i(run); i > 0; --i)
		total += esc_len[i - 1];

	return u64x2
	{
		run, total
	};
}
/// Transforms each control character of the input into the length of its
/// escape sequence, lane for lane: output lane N is ctrl_tab_len[in[N]].
/// Every input lane must be in the control character range (below 0x20);
/// callers zero the lanes which are not.
///
/// The lookup is done directly per lane. Staging the indexes through
/// four-lane vectors and then reading them back eight lanes at a time
/// indexes past the end of a vector and derives output lanes 8-15 from
/// input bytes 4-11 instead of 8-15.
ircd::u8x16
ircd::json::lookup_ctrl_tab_len(const u8x16 in)
{
	u8x16 ret{0};
	for(size_t i(0); i < sizeof(u8x16); ++i)
	{
		assert(in[i] < 0x20);
		ret[i] = ctrl_tab_len[in[i]];
	}

	return ret;
}
///////////////////////////////////////////////////////////////////////////////
//
// json/value.h