[TextServer] Implement ICU/UAX 31 based is_valid_identifier
function.
This commit is contained in:
parent
e317e34c15
commit
9c3ec25dce
|
@ -21,8 +21,8 @@
|
|||
<argument index="1" name="orientation" type="int" enum="TextServer.Orientation" default="0" />
|
||||
<description>
|
||||
Creates new buffer for complex text layout, with the given [code]direction[/code] and [code]orientation[/code]. To free the resulting buffer, use [method free_rid] method.
|
||||
[b]Note:[/b] Direction is ignored if server does not support [code]FEATURE_BIDI_LAYOUT[/code] feature.
|
||||
[b]Note:[/b] Orientation is ignored if server does not support [code]FEATURE_VERTICAL_LAYOUT[/code] feature.
|
||||
[b]Note:[/b] Direction is ignored if server does not support [constant FEATURE_BIDI_LAYOUT] feature.
|
||||
[b]Note:[/b] Orientation is ignored if server does not support [constant FEATURE_VERTICAL_LAYOUT] feature.
|
||||
</description>
|
||||
</method>
|
||||
<method name="draw_hex_code_box" qualifiers="const">
|
||||
|
@ -843,6 +843,21 @@
|
|||
Returns [code]true[/code] if locale is right-to-left.
|
||||
</description>
|
||||
</method>
|
||||
<method name="is_valid_identifier" qualifiers="const">
|
||||
<return type="bool" />
|
||||
<argument index="0" name="string" type="String" />
|
||||
<description>
|
||||
Returns [code]true[/code] if [code]string[/code] is a valid identifier.
|
||||
If the text server supports the [constant FEATURE_UNICODE_IDENTIFIERS] feature, a valid identifier must:
|
||||
- Conform to normalization form C.
|
||||
- Begin with a Unicode character of class XID_Start or [code]"_"[/code].
|
||||
- Contain only Unicode characters of class XID_Continue in the other positions.
|
||||
- Use UAX #31 recommended scripts only (mixed scripts are allowed).
|
||||
If the [constant FEATURE_UNICODE_IDENTIFIERS] feature is not supported, a valid identifier must:
|
||||
- Begin with a Basic Latin letter character or [code]"_"[/code].
|
||||
- Contain only Basic Latin letter characters, digits or [code]"_"[/code] in the other positions.
|
||||
</description>
|
||||
</method>
|
||||
<method name="load_support_data">
|
||||
<return type="bool" />
|
||||
<argument index="0" name="filename" type="String" />
|
||||
|
@ -1233,7 +1248,7 @@
|
|||
<argument index="1" name="direction" type="int" enum="TextServer.Direction" default="0" />
|
||||
<description>
|
||||
Sets desired text direction. If set to [code]TEXT_DIRECTION_AUTO[/code], direction will be detected based on the buffer contents and current locale.
|
||||
[b]Note:[/b] Direction is ignored if server does not support [code]FEATURE_BIDI_LAYOUT[/code] feature.
|
||||
[b]Note:[/b] Direction is ignored if server does not support [constant FEATURE_BIDI_LAYOUT] feature.
|
||||
</description>
|
||||
</method>
|
||||
<method name="shaped_text_set_orientation">
|
||||
|
@ -1242,7 +1257,7 @@
|
|||
<argument index="1" name="orientation" type="int" enum="TextServer.Orientation" default="0" />
|
||||
<description>
|
||||
Sets desired text orientation.
|
||||
[b]Note:[/b] Orientation is ignored if server does not support [code]FEATURE_VERTICAL_LAYOUT[/code] feature.
|
||||
[b]Note:[/b] Orientation is ignored if server does not support [constant FEATURE_VERTICAL_LAYOUT] feature.
|
||||
</description>
|
||||
</method>
|
||||
<method name="shaped_text_set_preserve_control">
|
||||
|
@ -1439,7 +1454,10 @@
|
|||
TextServer supports variable fonts.
|
||||
</constant>
|
||||
<constant name="FEATURE_USE_SUPPORT_DATA" value="128" enum="Feature">
|
||||
TextServer require external data file for some features.
|
||||
TextServer requires an external data file for some features, see [method load_support_data].
|
||||
</constant>
|
||||
<constant name="FEATURE_UNICODE_IDENTIFIERS" value="256" enum="Feature">
|
||||
TextServer supports UAX #31 identifier validation, see [method is_valid_identifier].
|
||||
</constant>
|
||||
<constant name="CONTOUR_CURVE_TAG_ON" value="1" enum="ContourPointTag">
|
||||
Contour point is on the curve.
|
||||
|
|
|
@ -320,7 +320,7 @@ _FORCE_INLINE_ bool is_underscore(char32_t p_char) {
|
|||
/*************************************************************************/
|
||||
|
||||
String TextServerAdvanced::interface_name = "ICU / HarfBuzz / Graphite";
|
||||
uint32_t TextServerAdvanced::interface_features = FEATURE_BIDI_LAYOUT | FEATURE_VERTICAL_LAYOUT | FEATURE_SHAPING | FEATURE_KASHIDA_JUSTIFICATION | FEATURE_BREAK_ITERATORS | FEATURE_USE_SUPPORT_DATA | FEATURE_FONT_VARIABLE;
|
||||
uint32_t TextServerAdvanced::interface_features = FEATURE_BIDI_LAYOUT | FEATURE_VERTICAL_LAYOUT | FEATURE_SHAPING | FEATURE_KASHIDA_JUSTIFICATION | FEATURE_BREAK_ITERATORS | FEATURE_USE_SUPPORT_DATA | FEATURE_FONT_VARIABLE | FEATURE_UNICODE_IDENTIFIERS;
|
||||
|
||||
bool TextServerAdvanced::has_feature(Feature p_feature) const {
|
||||
return (interface_features & p_feature) == p_feature;
|
||||
|
@ -5068,6 +5068,191 @@ String TextServerAdvanced::strip_diacritics(const String &p_string) const {
|
|||
return result;
|
||||
}
|
||||
|
||||
bool TextServerAdvanced::is_valid_identifier(const String &p_string) const {
|
||||
enum UAX31SequenceStatus {
|
||||
SEQ_NOT_STARTED,
|
||||
SEQ_STARTED,
|
||||
SEQ_STARTED_VIR,
|
||||
SEQ_NEAR_END,
|
||||
};
|
||||
|
||||
const char32_t *str = p_string.ptr();
|
||||
int len = p_string.length();
|
||||
|
||||
if (len == 0) {
|
||||
return false; // Empty string.
|
||||
}
|
||||
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
Char16String utf16 = p_string.utf16();
|
||||
const UNormalizer2 *norm_c = unorm2_getNFCInstance(&err);
|
||||
if (U_FAILURE(err)) {
|
||||
return false; // Failed to load normalizer.
|
||||
}
|
||||
bool isnurom = unorm2_isNormalized(norm_c, utf16.ptr(), utf16.length(), &err);
|
||||
if (U_FAILURE(err) || !isnurom) {
|
||||
return false; // Do not conform to Normalization Form C.
|
||||
}
|
||||
|
||||
UAX31SequenceStatus A1_sequence_status = SEQ_NOT_STARTED;
|
||||
UScriptCode A1_scr = USCRIPT_INHERITED;
|
||||
UAX31SequenceStatus A2_sequence_status = SEQ_NOT_STARTED;
|
||||
UScriptCode A2_scr = USCRIPT_INHERITED;
|
||||
UAX31SequenceStatus B_sequence_status = SEQ_NOT_STARTED;
|
||||
UScriptCode B_scr = USCRIPT_INHERITED;
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
err = U_ZERO_ERROR;
|
||||
UScriptCode scr = uscript_getScript(str[i], &err);
|
||||
if (U_FAILURE(err)) {
|
||||
return false; // Invalid script.
|
||||
}
|
||||
if (uscript_getUsage(scr) != USCRIPT_USAGE_RECOMMENDED) {
|
||||
return false; // Not a recommended script.
|
||||
}
|
||||
uint8_t cat = u_charType(str[i]);
|
||||
int32_t jt = u_getIntPropertyValue(str[i], UCHAR_JOINING_TYPE);
|
||||
|
||||
// UAX #31 section 2.3 subsections A1, A2 and B, check ZWNJ and ZWJ usage.
|
||||
switch (A1_sequence_status) {
|
||||
case SEQ_NEAR_END: {
|
||||
if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) {
|
||||
return false; // Mixed script.
|
||||
}
|
||||
if (jt == U_JT_RIGHT_JOINING || jt == U_JT_DUAL_JOINING) {
|
||||
A1_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
|
||||
} else if (jt != U_JT_TRANSPARENT) {
|
||||
return false; // Invalid end of sequence.
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED: {
|
||||
if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) {
|
||||
A1_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (jt != U_JT_TRANSPARENT) {
|
||||
if (str[i] == 0x200C /*ZWNJ*/) {
|
||||
A1_sequence_status = SEQ_NEAR_END;
|
||||
continue;
|
||||
} else {
|
||||
A1_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (A1_sequence_status == SEQ_NOT_STARTED) {
|
||||
if (jt == U_JT_LEFT_JOINING || jt == U_JT_DUAL_JOINING) {
|
||||
A1_sequence_status = SEQ_STARTED;
|
||||
A1_scr = scr;
|
||||
}
|
||||
};
|
||||
|
||||
switch (A2_sequence_status) {
|
||||
case SEQ_NEAR_END: {
|
||||
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
|
||||
return false; // Mixed script.
|
||||
}
|
||||
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
|
||||
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
|
||||
return false; // Invalid end of sequence.
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED_VIR: {
|
||||
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (str[i] == 0x200C /*ZWNJ*/) {
|
||||
A2_sequence_status = SEQ_NEAR_END;
|
||||
continue;
|
||||
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED: {
|
||||
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) {
|
||||
A2_sequence_status = SEQ_STARTED_VIR;
|
||||
} else if (cat != U_MODIFIER_LETTER) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (A2_sequence_status == SEQ_NOT_STARTED) {
|
||||
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
|
||||
A2_sequence_status = SEQ_STARTED;
|
||||
A2_scr = scr;
|
||||
}
|
||||
}
|
||||
|
||||
switch (B_sequence_status) {
|
||||
case SEQ_NEAR_END: {
|
||||
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
|
||||
return false; // Mixed script.
|
||||
}
|
||||
if (u_getIntPropertyValue(str[i], UCHAR_INDIC_SYLLABIC_CATEGORY) != U_INSC_VOWEL_DEPENDENT) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
|
||||
} else {
|
||||
return false; // Invalid end of sequence.
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED_VIR: {
|
||||
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (str[i] == 0x200D /*ZWJ*/) {
|
||||
B_sequence_status = SEQ_NEAR_END;
|
||||
continue;
|
||||
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED: {
|
||||
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) {
|
||||
B_sequence_status = SEQ_STARTED_VIR;
|
||||
} else if (cat != U_MODIFIER_LETTER) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (B_sequence_status == SEQ_NOT_STARTED) {
|
||||
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
|
||||
B_sequence_status = SEQ_STARTED;
|
||||
B_scr = scr;
|
||||
}
|
||||
}
|
||||
|
||||
if (u_hasBinaryProperty(str[i], UCHAR_PATTERN_SYNTAX) || u_hasBinaryProperty(str[i], UCHAR_PATTERN_WHITE_SPACE) || u_hasBinaryProperty(str[i], UCHAR_NONCHARACTER_CODE_POINT)) {
|
||||
return false; // Not a XID_Start or XID_Continue character.
|
||||
}
|
||||
if (i == 0) {
|
||||
if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || str[0] == 0x2118 || str[0] == 0x212E || str[0] == 0x309B || str[0] == 0x309C || str[0] == 0x005F)) {
|
||||
return false; // Not a XID_Start character.
|
||||
}
|
||||
} else {
|
||||
if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || cat == U_NON_SPACING_MARK || cat == U_COMBINING_SPACING_MARK || cat == U_DECIMAL_DIGIT_NUMBER || cat == U_CONNECTOR_PUNCTUATION || str[i] == 0x2118 || str[i] == 0x212E || str[i] == 0x309B || str[i] == 0x309C || str[i] == 0x1369 || str[i] == 0x1371 || str[i] == 0x00B7 || str[i] == 0x0387 || str[i] == 0x19DA || str[i] == 0x0E33 || str[i] == 0x0EB3 || str[i] == 0xFF9E || str[i] == 0xFF9F)) {
|
||||
return false; // Not a XID_Continue character.
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
TextServerAdvanced::TextServerAdvanced() {
|
||||
_insert_num_systems_lang();
|
||||
_insert_feature_sets();
|
||||
|
|
|
@ -519,6 +519,7 @@ public:
|
|||
virtual String percent_sign(const String &p_language = "") const override;
|
||||
|
||||
virtual String strip_diacritics(const String &p_string) const override;
|
||||
virtual bool is_valid_identifier(const String &p_string) const override;
|
||||
|
||||
TextServerAdvanced();
|
||||
~TextServerAdvanced();
|
||||
|
|
|
@ -416,6 +416,7 @@ void TextServer::_bind_methods() {
|
|||
ClassDB::bind_method(D_METHOD("percent_sign", "language"), &TextServer::percent_sign, DEFVAL(""));
|
||||
|
||||
ClassDB::bind_method(D_METHOD("strip_diacritics", "string"), &TextServer::strip_diacritics);
|
||||
ClassDB::bind_method(D_METHOD("is_valid_identifier", "string"), &TextServer::is_valid_identifier);
|
||||
|
||||
/* Direction */
|
||||
BIND_ENUM_CONSTANT(DIRECTION_AUTO);
|
||||
|
@ -476,6 +477,7 @@ void TextServer::_bind_methods() {
|
|||
BIND_ENUM_CONSTANT(FEATURE_FONT_SYSTEM);
|
||||
BIND_ENUM_CONSTANT(FEATURE_FONT_VARIABLE);
|
||||
BIND_ENUM_CONSTANT(FEATURE_USE_SUPPORT_DATA);
|
||||
BIND_ENUM_CONSTANT(FEATURE_UNICODE_IDENTIFIERS);
|
||||
|
||||
/* FT Contour Point Types */
|
||||
BIND_ENUM_CONSTANT(CONTOUR_CURVE_TAG_ON);
|
||||
|
|
|
@ -109,7 +109,8 @@ public:
|
|||
FEATURE_BREAK_ITERATORS = 1 << 4,
|
||||
FEATURE_FONT_SYSTEM = 1 << 5,
|
||||
FEATURE_FONT_VARIABLE = 1 << 6,
|
||||
FEATURE_USE_SUPPORT_DATA = 1 << 7
|
||||
FEATURE_USE_SUPPORT_DATA = 1 << 7,
|
||||
FEATURE_UNICODE_IDENTIFIERS = 1 << 8,
|
||||
};
|
||||
|
||||
enum ContourPointTag {
|
||||
|
@ -451,6 +452,7 @@ public:
|
|||
virtual String percent_sign(const String &p_language = "") const { return "%"; };
|
||||
|
||||
virtual String strip_diacritics(const String &p_string) const;
|
||||
// Default implementation: forwards to String::is_valid_identifier().
virtual bool is_valid_identifier(const String &p_string) const {
	return p_string.is_valid_identifier();
}
|
||||
|
||||
TextServer();
|
||||
~TextServer();
|
||||
|
|
|
@ -266,6 +266,27 @@ TEST_SUITE("[[TextServer]") {
|
|||
}
|
||||
}
|
||||
|
||||
SUBCASE("[TextServer] Unicode identifiers") {
|
||||
for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
|
||||
Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);
|
||||
TEST_FAIL_COND(ts.is_null(), "Invalid TS interface.");
|
||||
|
||||
if (ts->has_feature(TextServer::FEATURE_UNICODE_IDENTIFIERS)) {
|
||||
static const char32_t *data[19] = { U"-30", U"100", U"10.1", U"10,1", U"1e2", U"1e-2", U"1e2e3", U"0xAB", U"AB", U"Test1", U"1Test", U"Test*1", U"test_testeT", U"test_tes teT", U"عَلَيْكُمْ", U"عَلَيْكُمْTest", U"ӒӖӚӜ", U"_test", U"ÂÃÄÅĀĂĄÇĆĈĊ" };
|
||||
static bool isid[19] = { false, false, false, false, false, false, false, false, true, true, false, false, true, false, true, true, true, true, true };
|
||||
for (int j = 0; j < 19; j++) {
|
||||
String s = String(data[j]);
|
||||
CHECK(ts->is_valid_identifier(s) == isid[j]);
|
||||
}
|
||||
|
||||
// Test UAX #31 section 2.3 ZW(N)J usage.
|
||||
CHECK(ts->is_valid_identifier(U"\u0646\u0627\u0645\u0647\u200C\u0627\u06CC"));
|
||||
CHECK(ts->is_valid_identifier(U"\u0D26\u0D43\u0D15\u0D4D\u200C\u0D38\u0D3E\u0D15\u0D4D\u0D37\u0D3F"));
|
||||
CHECK(ts->is_valid_identifier(U"\u0DC1\u0DCA\u200D\u0DBB\u0DD3"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SUBCASE("[TextServer] Strip Diacritics") {
|
||||
for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
|
||||
Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);
|
||||
|
|
Loading…
Reference in a new issue