Mirror of https://github.com/matrix-construct/construct, synced 2024-12-28 08:24:08 +01:00
ircd::gpt::vocab: Fixes for additional missing cases.
Parent: 8c65e75046
Commit: 1e08339955
1 changed file with 36 additions and 15 deletions
@@ -339,17 +339,30 @@ ircd::gpt::vocab::pre_tokenize(u8x16 (&token)[16],
 	const u8x16 is_ascii_letter
 	(
-		(in >= 'a' && in <= 'z') || (in >= 'A' && in <= 'Z')
+		(in >= 'a' && in <= 'z') ||
+		(in >= 'A' && in <= 'Z')
 	);
 
-	const u8x16 ascii_identified
+	const u8x16 is_ascii_punct
 	(
-		is_ascii_ctrl | is_ascii_space | is_ascii_number | is_ascii_letter
+		(in >= '!' && in <= '/') ||
+		(in >= ':' && in <= '@') ||
+		(in >= '[' && in <= '`') ||
+		(in >= '{' && in <= '~')
+	);
+
+	const u8x16 ascii_categorized
+	(0
+		| is_ascii_ctrl
+		| is_ascii_space
+		| is_ascii_punct
+		| is_ascii_letter
+		| is_ascii_number
 	);
 
 	const u8x16 maybe_notascii
 	(
-		~ascii_identified & in_mask
+		~ascii_categorized & in_mask
 	);
 
 	const u32x16 ch
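The four range tests added here are exactly the ASCII punctuation blocks ('!'..'/', ':'..'@', '['..'`', '{'..'~'), so ascii_categorized now accounts for control, space, punctuation, letter and number bytes before any character is handed to ICU. A scalar sketch of the per-byte test the vector code performs, for reference only (not part of the commit):

	// Illustrative scalar equivalent of the new is_ascii_punct test; the
	// commit computes this per lane over a u8x16 vector.
	#include <cstdint>

	static bool
	scalar_is_ascii_punct(const uint8_t in)
	{
		return (in >= '!' && in <= '/')    // ! " # $ % & ' ( ) * + , - . /
		    || (in >= ':' && in <= '@')    // : ; < = > ? @
		    || (in >= '[' && in <= '`')    // [ \ ] ^ _ `
		    || (in >= '{' && in <= '~');   // { | } ~
	}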
@@ -357,6 +370,11 @@ ircd::gpt::vocab::pre_tokenize(u8x16 (&token)[16],
 		utf8::decode(in)
 	);
 
+	const u32x16 ch_mask
+	(
+		lane_cast<u32x16>(in_mask) != 0
+	);
+
 	const u32x16 uc_cat
 	(
 		icu::category(ch & (lane_cast<u32x16>(maybe_notascii) != 0))
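ch_mask widens the 8-bit input mask to one 32-bit lane per decoded character, taking over the role of the fat_mask that was previously computed further down. A rough scalar sketch of the widening, assuming plain arrays in place of the u8x16/u32x16 vector types (illustrative only):

	// Illustrative: widen a per-byte validity mask to a per-character u32
	// mask. In the commit this is lane_cast<u32x16>(in_mask) != 0.
	#include <array>
	#include <cstddef>
	#include <cstdint>

	static std::array<uint32_t, 16>
	widen_mask(const std::array<uint8_t, 16> &in_mask)
	{
		std::array<uint32_t, 16> ch_mask;
		for(std::size_t i(0); i < 16; ++i)
			ch_mask[i] = in_mask[i] != 0? ~0U: 0U;  // all-ones or all-zeros lane

		return ch_mask;
	}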
@@ -385,37 +403,40 @@ ircd::gpt::vocab::pre_tokenize(u8x16 (&token)[16],
 		| (lane_cast<u32x16>(is_ascii_ctrl) != 0)
 	);
 
+	const u32x16 is_punct
+	(0
+		| (lane_cast<u32x16>(is_ascii_punct) != 0)
+	);
+
+	// Decide characters which do not start a new token based on the
+	// preceding character.
 	const u32x16 is_trail
 	(0
 		| (is_L & shl<32>(is_L))
 		| (is_N & shl<32>(is_N))
 		| (is_Z & shl<32>(is_Z))
+		| (is_L & shl<32>(is_punct))
 	);
 
-	const u32x16 fat_mask
-	(
-		lane_cast<u32x16>(in_mask) != 0
-	);
-
-	// mask candidate start of token
+	// Decide characters which may start a token.
 	const u32x16 is_head
 	(
-		(~is_trail | is_C0) & fat_mask
+		(~is_trail | is_C0) & ch_mask
 	);
 
-	// mask if token is preceded by a space
+	// Decide if candidate token is preceded by a space.
 	const u32x16 leading_space
 	(
 		is_head & shl<32>(is_Z)
 	);
 
-	// mask if next char is also the same char
+	// Mask if next char is also the same char.
 	const u32x16 is_rep
 	(
 		is_head & (shl<32>(ch) == ch)
 	);
 
-	// zero or one preceding space becomes prefixed to the next token
+	// Decide the starting character of each token.
 	const u32x16 tok_head
 	(0
 		| (is_head & ~leading_space & ~is_rep)
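shl<32>(x) shifts the vector by one 32-bit lane, so each lane is compared against the preceding character's mask: a character trails when it continues a run of letters, numbers or spaces, and, with the new is_punct term, also when a letter follows punctuation. A scalar sketch of the head/trail decision under those assumptions (the char_class helper below is hypothetical, not repo code):

	// Illustrative scalar version of the head/trail decision; each field
	// stands in for one of the vector category masks.
	struct char_class { bool L, N, Z, punct, C0; };

	static bool
	is_trail(const char_class &cur, const char_class &prev)
	{
		return (cur.L && prev.L)        // letter continues a letter run
		    || (cur.N && prev.N)        // number continues a number run
		    || (cur.Z && prev.Z)        // space continues a space run
		    || (cur.L && prev.punct);   // new case: letter following punctuation
	}

	static bool
	is_head(const char_class &cur, const char_class &prev, const bool in_range)
	{
		// mirrors (~is_trail | is_C0) & ch_mask
		return (!is_trail(cur, prev) || cur.C0) && in_range;
	}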
@@ -434,7 +455,7 @@ ircd::gpt::vocab::pre_tokenize(u8x16 (&token)[16],
 
 	const auto ret
 	{
-		pre_tokenize_split(token, ch, fat_mask, tok_mask)
+		pre_tokenize_split(token, ch, ch_mask, tok_mask)
 	};
 
 	return ret;