0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-12-29 08:54:02 +01:00
construct/ircd/rfc3986.cc
Jason Volk af9629324e ircd::rfc3986: Refactor uri encoder/decoder related; reorg.
ircd::rfc3986::encoder: Proper unreserved character selection.

ircd::rfc3986::decoder: Split into safe and unsafe grammars and interface.

ircd::rfc3986::decoder: Use expectation_failure template exception.
2020-03-28 23:39:13 -07:00

808 lines
16 KiB
C++

// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2018 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
namespace ircd { namespace rfc3986
__attribute__((visibility("hidden")))
{
using namespace ircd::spirit;
struct encoder extern const encoder;
struct decoder extern const decoder;
}}
namespace ircd { namespace rfc3986 { namespace parser
__attribute__((visibility("hidden")))
{
using namespace ircd::spirit;
}}}
decltype(ircd::rfc3986::parser::sub_delims)
ircd::rfc3986::parser::sub_delims
{
lit('!') | lit('$') | lit('&') | lit('\'') |
lit('(') | lit(')') | lit('*') | lit('+') |
lit(',') | lit(';') | lit('=')
,"sub-delims"
};
decltype(ircd::rfc3986::parser::gen_delims)
ircd::rfc3986::parser::gen_delims
{
lit(':') | lit('/') | lit('?') | lit('#') |
lit('[') | lit(']') | lit('@')
,"gen-delims"
};
decltype(ircd::rfc3986::parser::unreserved)
ircd::rfc3986::parser::unreserved
{
ascii::alpha | ascii::digit |
lit('-') | lit('.') | lit('_') | lit('~')
,"unreserved"
};
decltype(ircd::rfc3986::parser::reserved)
ircd::rfc3986::parser::reserved
{
gen_delims | sub_delims
,"reserved"
};
decltype(ircd::rfc3986::parser::pct_encoded)
ircd::rfc3986::parser::pct_encoded
{
lit('%') >> repeat(2)[ascii::xdigit]
,"pct-encoded"
};
decltype(ircd::rfc3986::parser::pchar)
ircd::rfc3986::parser::pchar
{
unreserved | pct_encoded | sub_delims | lit(':') | lit('@')
,"pchar"
};
decltype(ircd::rfc3986::parser::query)
ircd::rfc3986::parser::query
{
*(pchar | lit('/') | lit('?'))
,"query"
};
decltype(ircd::rfc3986::parser::fragment)
ircd::rfc3986::parser::fragment
{
*(pchar | lit('/') | lit('?'))
,"fragment"
};
decltype(ircd::rfc3986::parser::segment_nz_nc)
ircd::rfc3986::parser::segment_nz_nc
{
+(unreserved | pct_encoded | sub_delims | lit('@'))
,"segment-nz-nc"
};
decltype(ircd::rfc3986::parser::segment_nz)
ircd::rfc3986::parser::segment_nz
{
+pchar
,"segment-nz"
};
decltype(ircd::rfc3986::parser::segment)
ircd::rfc3986::parser::segment
{
*pchar
,"segment"
};
decltype(ircd::rfc3986::parser::path_abempty)
ircd::rfc3986::parser::path_abempty
{
*(lit('/') >> segment)
,"path-abempty"
};
decltype(ircd::rfc3986::parser::path_noscheme)
ircd::rfc3986::parser::path_noscheme
{
segment_nz_nc >> *(lit('/') >> segment)
,"path-noscheme"
};
decltype(ircd::rfc3986::parser::path_rootless)
ircd::rfc3986::parser::path_rootless
{
segment_nz >> *(lit('/') >> segment)
,"path-rootless"
};
decltype(ircd::rfc3986::parser::path_absolute)
ircd::rfc3986::parser::path_absolute
{
lit('/') >> -(path_rootless)
,"path-absolute"
};
decltype(ircd::rfc3986::parser::path)
ircd::rfc3986::parser::path
{
-(path_abempty | path_absolute | path_noscheme | path_rootless)
,"path"
};
decltype(ircd::rfc3986::parser::reg_name)
ircd::rfc3986::parser::reg_name
{
*(unreserved | pct_encoded | sub_delims)
,"reg-name"
};
decltype(ircd::rfc3986::parser::userinfo)
ircd::rfc3986::parser::userinfo
{
*(unreserved | pct_encoded | sub_delims | lit(':'))
,"userinfo"
};
decltype(ircd::rfc3986::parser::port)
ircd::rfc3986::parser::port
{
ushort_
,"port number"
};
decltype(ircd::rfc3986::parser::ip4_octet)
ircd::rfc3986::parser::ip4_octet
{
repeat(1,3)[char_("0-9")]
,"IPv4 octet"
};
decltype(ircd::rfc3986::parser::ip4_address)
ircd::rfc3986::parser::ip4_address
{
repeat(3)[ip4_octet >> '.'] >> ip4_octet
,"IPv4 address"
};
decltype(ircd::rfc3986::parser::ip4_literal)
ircd::rfc3986::parser::ip4_literal
{
ip4_address
,"IPv4 literal"
};
decltype(ircd::rfc3986::parser::ip4_remote)
ircd::rfc3986::parser::ip4_remote
{
ip4_literal >> -(':' > port)
,"IPv4 remote"
};
decltype(ircd::rfc3986::parser::ip6_char)
ircd::rfc3986::parser::ip6_char
{
char_("0-9a-fA-F")
,"IPv6 character"
};
decltype(ircd::rfc3986::parser::ip6_h16)
ircd::rfc3986::parser::ip6_h16
{
repeat(1,4)[ip6_char]
,"IPv6 hexdigit"
};
decltype(ircd::rfc3986::parser::ip6_piece)
ircd::rfc3986::parser::ip6_piece
{
ip6_h16 >> ':'
,"IPv6 address piece"
};
// This is reversed from the BNF in the RFC otherwise it requires
// backtracking during the repeat[]; parsers are adjusted accordingly.
decltype(ircd::rfc3986::parser::ip6_ipiece)
ircd::rfc3986::parser::ip6_ipiece
{
':' >> ip6_h16
,"IPv6 address piece"
};
decltype(ircd::rfc3986::parser::ip6_ls32)
ircd::rfc3986::parser::ip6_ls32
{
(ip6_h16 >> ':' >> ip6_h16) | ip4_address
};
/// https://tools.ietf.org/html/rfc3986 Appendix A
decltype(ircd::rfc3986::parser::ip6_addr)
ircd::rfc3986::parser::ip6_addr
{
{ repeat(6)[ip6_piece] >> ip6_ls32 },
{ lit("::") >> repeat(5)[ip6_piece] >> ip6_ls32 },
{ ip6_h16 >> lit("::") >> repeat(4)[ip6_piece] >> ip6_ls32 },
{ ip6_h16 >> repeat(0,1)[ip6_ipiece] >> lit("::") >> repeat(3)[ip6_piece] >> ip6_ls32 },
{ ip6_h16 >> repeat(0,2)[ip6_ipiece] >> lit("::") >> repeat(2)[ip6_piece] >> ip6_ls32 },
{ ip6_h16 >> repeat(0,3)[ip6_ipiece] >> lit("::") >> ip6_piece >> ip6_ls32 },
{ ip6_h16 >> repeat(0,4)[ip6_ipiece] >> lit("::") >> ip6_ls32 },
{ ip6_h16 >> repeat(0,5)[ip6_ipiece] >> lit("::") >> -ip6_h16 },
{ lit("::") >> -ip6_h16 },
};
decltype(ircd::rfc3986::parser::ip6_address)
ircd::rfc3986::parser::ip6_address
{
ip6_addr[0] | ip6_addr[1] | ip6_addr[2] |
ip6_addr[3] | ip6_addr[4] | ip6_addr[5] |
ip6_addr[6] | ip6_addr[7] | ip6_addr[8]
,"IPv6 address"
};
decltype(ircd::rfc3986::parser::ip6_literal)
ircd::rfc3986::parser::ip6_literal
{
'[' >> ip6_address >> ']'
,"IPv6 literal"
};
decltype(ircd::rfc3986::parser::ip6_remote)
ircd::rfc3986::parser::ip6_remote
{
ip6_literal >> -(':' > port)
,"IPv6 literal"
};
decltype(ircd::rfc3986::parser::ip_address)
ircd::rfc3986::parser::ip_address
{
ip4_address | ip6_address
,"IP address"
};
decltype(ircd::rfc3986::parser::ip_literal)
ircd::rfc3986::parser::ip_literal
{
ip6_literal | ip4_literal
,"IP literal"
};
decltype(ircd::rfc3986::parser::ip_remote)
ircd::rfc3986::parser::ip_remote
{
ip_literal >> -(':' > port)
,"IP literal"
};
decltype(ircd::rfc3986::parser::hostname)
ircd::rfc3986::parser::hostname
{
char_("A-Za-z0-9") >> *(char_("A-Za-z0-9\x2D")) // x2D is '-'
,"hostname"
};
decltype(ircd::rfc3986::parser::domain)
ircd::rfc3986::parser::domain
{
hostname % '.'
,"domain"
};
decltype(ircd::rfc3986::parser::hostport)
ircd::rfc3986::parser::hostport
{
domain >> -(':' > port)
,"hostport"
};
decltype(ircd::rfc3986::parser::host)
ircd::rfc3986::parser::host
{
ip_address | domain
,"host"
};
decltype(ircd::rfc3986::parser::host_literal)
ircd::rfc3986::parser::host_literal
{
ip_literal | domain
,"host literal"
};
decltype(ircd::rfc3986::parser::remote)
ircd::rfc3986::parser::remote
{
ip_remote | hostport
,"remote"
};
decltype(ircd::rfc3986::parser::authority)
ircd::rfc3986::parser::authority
{
-(userinfo >> lit('@')) >> remote
,"authority"
};
decltype(ircd::rfc3986::parser::scheme)
ircd::rfc3986::parser::scheme
{
ascii::alpha >> *(ascii::alnum | lit('+') | lit('-') | lit('.'))
,"scheme"
};
decltype(ircd::rfc3986::parser::hier_part)
ircd::rfc3986::parser::hier_part
{
-((lit("//") >> authority >> path_abempty) | path_absolute | path_rootless)
,"hier_part"
};
decltype(ircd::rfc3986::parser::relative_part)
ircd::rfc3986::parser::relative_part
{
-((lit("//") >> authority >> path_abempty) | path_absolute | path_noscheme)
,"relative-part"
};
decltype(ircd::rfc3986::parser::relative_ref)
ircd::rfc3986::parser::relative_ref
{
relative_part >> -(lit('?') >> query) >> -(lit('#') >> fragment)
,"relative-ref"
};
decltype(ircd::rfc3986::parser::absolute_uri)
ircd::rfc3986::parser::absolute_uri
{
scheme >> lit(':') >> hier_part >> -(lit('?') >> query)
,"absolute-URI"
};
decltype(ircd::rfc3986::parser::uri)
ircd::rfc3986::parser::uri
{
scheme >> lit(':') >> hier_part >> -(lit('?') >> query) >> -(lit('#') >> fragment)
,"URI"
};
decltype(ircd::rfc3986::parser::uri_ref)
ircd::rfc3986::parser::uri_ref
{
uri | relative_ref
,"URI-reference"
};
//
// uri decompose
//
BOOST_FUSION_ADAPT_STRUCT
(
ircd::rfc3986::uri,
( decltype(ircd::rfc3986::uri::scheme), scheme )
( decltype(ircd::rfc3986::uri::user), user )
( decltype(ircd::rfc3986::uri::remote), remote )
( decltype(ircd::rfc3986::uri::path), path )
( decltype(ircd::rfc3986::uri::query), query )
( decltype(ircd::rfc3986::uri::fragment), fragment )
)
ircd::rfc3986::uri::uri(const string_view &input)
{
static const parser::rule<rfc3986::uri> rule
{
raw[parser::scheme] >> lit("://")
>> -raw[parser::userinfo >> lit('@')]
>> raw[parser::remote]
>> raw[parser::path_abempty]
>> -raw[lit('?') >> parser::query]
>> -raw[lit('#') >> parser::fragment]
};
const char *start(begin(input)), *const stop(end(input));
qi::parse(start, stop, eps > rule, *this);
//TODO: XXX Can this go?
this->user = rstrip(this->user, '@');
this->query = lstrip(this->query, '?');
this->fragment = lstrip(this->fragment, '#');
}
//
// uri decoding
//
struct ircd::rfc3986::decoder
:qi::grammar<const char *, mutable_buffer>
{
template<class R = unused_type,
class... S>
using rule = qi::rule<const char *, R, S...>;
[[noreturn]] static void throw_unsafe()
{
throw decoding_error
{
"Unsafe characters in decoding."
};
}
const rule<char()> decode_char
{
lit('%') > qi::uint_parser<char, 16, 2, 2>{}
,"url decodable character"
};
const rule<char()> unreserved_char
{
// unreserved characters and !$+*'(),
char_("A-Za-z0-9._~!$+*'(),-")
,"url unreserved characters"
};
const rule<mutable_buffer> decode_unsafe
{
*((char_ - '%') | decode_char)
,"url unsafe decode"
};
rule<mutable_buffer> decode_safe
{
rule<mutable_buffer>{}
,"url safe decode"
};
decoder()
:decoder::base_type{decode_safe}
{
//TODO: XXX this never reports failure to throw; it just stops parsing
decode_safe %= *(unreserved_char | decode_char[_pass = (local::_1 > 0x1F)]);
}
}
const ircd::rfc3986::decoder;
ircd::const_buffer
ircd::rfc3986::decode_unsafe(const mutable_buffer &buf,
const string_view &url)
try
{
const char *start(url.data()), *const stop
{
start + std::min(size(url), size(buf))
};
mutable_buffer mb
{
data(buf), size_t(0)
};
const bool ok
{
qi::parse(start, stop, decoder.decode_unsafe, mb)
};
assert(size(mb) <= size(url));
return string_view
{
data(mb), size(mb)
};
}
catch(const qi::expectation_failure<const char *> &e)
{
throw expectation_failure<decoding_error>{e};
}
ircd::string_view
ircd::rfc3986::decode(const mutable_buffer &buf,
const string_view &url)
try
{
const char *start(url.data()), *const stop
{
start + std::min(size(url), size(buf))
};
mutable_buffer mb
{
data(buf), size_t(0)
};
const bool ok
{
qi::parse(start, stop, decoder.decode_safe, mb)
};
assert(size(mb) <= size(url));
return string_view
{
data(mb), size(mb)
};
}
catch(const qi::expectation_failure<const char *> &e)
{
throw expectation_failure<decoding_error>{e};
}
//
// uri encoding
//
struct ircd::rfc3986::encoder
:karma::grammar<char *, const string_view>
{
template<class R = unused_type,
class... S>
using rule = karma::rule<char *, R, S...>;
const rule<char()> unreserved
{
char_("A-Za-z0-9._~-")
,"url unencoded"
};
const rule<const string_view> encode
{
*(unreserved | (lit('%') << karma::right_align(2, '0')[karma::hex]))
,"url encode"
};
encoder()
:encoder::base_type{encode}
{}
}
const ircd::rfc3986::encoder;
ircd::string_view
ircd::rfc3986::encode(const mutable_buffer &out,
const json::members &members)
{
window_buffer buf{out};
const auto append{[&buf](const json::member &member)
{
assert(type(member.first) == json::STRING);
if(unlikely(!member.second.serial && type(member.second) != json::STRING))
throw panic
{
"Cannot encode non-serial json::member type '%s'",
reflect(type(member.second))
};
consume(buf, size(encode(buf, member.first)));
consume(buf, copy(buf, "="_sv));
consume(buf, size(encode(buf, member.second)));
}};
auto it(begin(members));
if(it != end(members))
{
append(*it);
for(++it; it != end(members); ++it)
{
consume(buf, copy(buf, "&"_sv));
append(*it);
}
}
return buf.completed();
}
ircd::string_view
ircd::rfc3986::encode(const mutable_buffer &buf,
const string_view &url)
{
char *out(data(buf));
const bool ok
{
karma::generate(out, maxwidth(size(buf))[encoder], url)
};
return string_view
{
data(buf), size_t(std::distance(data(buf), out))
};
}
//
// general interface
//
ircd::string_view
ircd::rfc3986::host(const string_view &str)
try
{
static const parser::rule<string_view> literal
{
parser::ip6_address
};
static const parser::rule<string_view> non_literal
{
parser::ip6_address | parser::ip4_address | parser::domain
};
static const parser::rule<string_view> rule
{
(lit('[') >> raw[literal]) | raw[non_literal]
,"host"
};
string_view ret;
const char *start(str.data()), *const stop(start + str.size());
qi::parse(start, stop, eps > rule, ret);
return ret;
}
catch(const qi::expectation_failure<const char *> &e)
{
throw expectation_failure<error>{e};
}
uint16_t
ircd::rfc3986::port(const string_view &str)
try
{
static const parser::rule<> literal
{
parser::ip6_literal | parser::ip4_literal | parser::domain
};
static const parser::rule<> non_literal
{
parser::ip6_address
};
static const parser::rule<uint16_t> rule
{
non_literal | (literal >> -(lit(':') >> parser::port) >> eoi)
,"port"
};
uint16_t ret(0);
const char *start(str.data()), *const stop(start + str.size());
qi::parse(start, stop, rule, ret);
return ret;
}
catch(const qi::expectation_failure<const char *> &e)
{
throw expectation_failure<error>{e};
}
//
// validators
//
bool
ircd::rfc3986::valid_remote(std::nothrow_t,
const string_view &str)
{
if(str.size() > DOMAIN_MAX + 6)
return false;
return valid(std::nothrow, parser::remote, str);
}
void
ircd::rfc3986::valid_remote(const string_view &str)
{
if(str.size() > DOMAIN_MAX + 6)
throw error
{
"String length %zu exceeds maximum of %zu characters",
size(str),
DOMAIN_MAX + 6
};
valid(parser::remote, str);
}
bool
ircd::rfc3986::valid_domain(std::nothrow_t,
const string_view &str)
{
if(str.size() > DOMAIN_MAX)
return false;
return valid(std::nothrow, parser::domain, str);
}
void
ircd::rfc3986::valid_domain(const string_view &str)
{
if(str.size() > DOMAIN_MAX)
throw error
{
"String length %zu exceeds maximum of %zu characters",
size(str),
DOMAIN_MAX
};
valid(parser::domain, str);
}
bool
ircd::rfc3986::valid_host(std::nothrow_t,
const string_view &str)
{
if(str.size() > DOMAIN_MAX)
return false;
return valid(std::nothrow, parser::host, str);
}
void
ircd::rfc3986::valid_host(const string_view &str)
{
if(str.size() > DOMAIN_MAX)
throw error
{
"String length %zu exceeds maximum of %zu characters",
size(str),
DOMAIN_MAX
};
valid(parser::host, str);
}
bool
ircd::rfc3986::valid_hostname(std::nothrow_t,
const string_view &str)
{
if(str.size() > HOSTNAME_MAX)
return false;
return valid(std::nothrow, parser::hostname, str);
}
void
ircd::rfc3986::valid_hostname(const string_view &str)
{
if(str.size() > HOSTNAME_MAX)
throw error
{
"String length %zu exceeds maximum of %zu characters",
size(str),
HOSTNAME_MAX
};
valid(parser::hostname, str);
}
bool
ircd::rfc3986::valid(std::nothrow_t,
const parser::rule<> &rule,
const string_view &str)
{
const parser::rule<> only_rule
{
rule >> eoi
};
const char *start(str.data()), *const stop(start + str.size());
return qi::parse(start, stop, only_rule);
}
void
ircd::rfc3986::valid(const parser::rule<> &rule,
const string_view &str)
try
{
const parser::rule<> only_rule
{
rule >> eoi
};
const char *start(str.data()), *const stop(start + str.size());
qi::parse(start, stop, eps > only_rule);
}
catch(const qi::expectation_failure<const char *> &e)
{
throw expectation_failure<error>{e};
}