From 1e08339955e0f0bed7e7d56c6b1b1ca38fd3bc90 Mon Sep 17 00:00:00 2001 From: Jason Volk Date: Mon, 12 Apr 2021 12:24:48 -0700 Subject: [PATCH] ircd::gpt::vocab: Fixes for additional missing cases. --- ircd/gpt_vocab.cc | 51 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/ircd/gpt_vocab.cc b/ircd/gpt_vocab.cc index e7542b230..01fe6fef9 100644 --- a/ircd/gpt_vocab.cc +++ b/ircd/gpt_vocab.cc @@ -339,17 +339,30 @@ ircd::gpt::vocab::pre_tokenize(u8x16 (&token)[16], const u8x16 is_ascii_letter ( - (in >= 'a' && in <= 'z') || (in >= 'A' && in <= 'Z') + (in >= 'a' && in <= 'z') || + (in >= 'A' && in <= 'Z') ); - const u8x16 ascii_identified + const u8x16 is_ascii_punct ( - is_ascii_ctrl | is_ascii_space | is_ascii_number | is_ascii_letter + (in >= '!' && in <= '/') || + (in >= ':' && in <= '@') || + (in >= '[' && in <= '`') || + (in >= '{' && in <= '~') + ); + + const u8x16 ascii_categorized + (0 + | is_ascii_ctrl + | is_ascii_space + | is_ascii_punct + | is_ascii_letter + | is_ascii_number ); const u8x16 maybe_notascii ( - ~ascii_identified & in_mask + ~ascii_categorized & in_mask ); const u32x16 ch @@ -357,6 +370,11 @@ ircd::gpt::vocab::pre_tokenize(u8x16 (&token)[16], utf8::decode(in) ); + const u32x16 ch_mask + ( + lane_cast(in_mask) != 0 + ); + const u32x16 uc_cat ( icu::category(ch & (lane_cast(maybe_notascii) != 0)) @@ -385,37 +403,40 @@ ircd::gpt::vocab::pre_tokenize(u8x16 (&token)[16], | (lane_cast(is_ascii_ctrl) != 0) ); + const u32x16 is_punct + (0 + | (lane_cast(is_ascii_punct) != 0) + ); + + // Decide characters which do not start a new token based on the + // preceding character. const u32x16 is_trail (0 | (is_L & shl<32>(is_L)) | (is_N & shl<32>(is_N)) | (is_Z & shl<32>(is_Z)) + | (is_L & shl<32>(is_punct)) ); - const u32x16 fat_mask - ( - lane_cast(in_mask) != 0 - ); - - // mask candidate start of token + // Decide characters which may start a token. const u32x16 is_head ( - (~is_trail | is_C0) & fat_mask + (~is_trail | is_C0) & ch_mask ); - // mask if token is preceded by a space + // Decide if candidate token is preceded by a space. const u32x16 leading_space ( is_head & shl<32>(is_Z) ); - // mask if next char is also the same char + // Mask if next char is also the same char. const u32x16 is_rep ( is_head & (shl<32>(ch) == ch) ); - // zero or one preceding space becomes prefixed to the next token + // Decide the starting character of each token. const u32x16 tok_head (0 | (is_head & ~leading_space & ~is_rep) @@ -434,7 +455,7 @@ ircd::gpt::vocab::pre_tokenize(u8x16 (&token)[16], const auto ret { - pre_tokenize_split(token, ch, fat_mask, tok_mask) + pre_tokenize_split(token, ch, ch_mask, tok_mask) }; return ret;