[TextServer] Implement ICU/UAX 31 based is_valid_identifier function.

This commit is contained in:
bruvzg 2021-10-18 15:07:11 +03:00
parent e317e34c15
commit 9c3ec25dce
6 changed files with 236 additions and 7 deletions

View file

@ -21,8 +21,8 @@
<argument index="1" name="orientation" type="int" enum="TextServer.Orientation" default="0" />
<description>
Creates new buffer for complex text layout, with the given [code]direction[/code] and [code]orientation[/code]. To free the resulting buffer, use [method free_rid] method.
[b]Note:[/b] Direction is ignored if server does not support [code]FEATURE_BIDI_LAYOUT[/code] feature.
[b]Note:[/b] Orientation is ignored if server does not support [code]FEATURE_VERTICAL_LAYOUT[/code] feature.
[b]Note:[/b] Direction is ignored if server does not support [constant FEATURE_BIDI_LAYOUT] feature.
[b]Note:[/b] Orientation is ignored if server does not support [constant FEATURE_VERTICAL_LAYOUT] feature.
</description>
</method>
<method name="draw_hex_code_box" qualifiers="const">
@ -843,6 +843,21 @@
Returns [code]true[/code] if locale is right-to-left.
</description>
</method>
<method name="is_valid_identifier" qualifiers="const">
<return type="bool" />
<argument index="0" name="string" type="String" />
<description>
Returns [code]true[/code] if [code]string[/code] is a valid identifier.
If the text server supports the [constant FEATURE_UNICODE_IDENTIFIERS] feature, a valid identifier must:
- Conform to normalization form C.
- Begin with a Unicode character of class XID_Start or [code]"_"[/code].
- Contain only Unicode characters of class XID_Continue in the other positions.
- Use UAX #31 recommended scripts only (mixed scripts are allowed).
If the [constant FEATURE_UNICODE_IDENTIFIERS] feature is not supported, a valid identifier must:
- Begin with a Basic Latin letter character or [code]"_"[/code].
- Contain only Basic Latin letter characters, digits or [code]"_"[/code] in the other positions.
</description>
</method>
<method name="load_support_data">
<return type="bool" />
<argument index="0" name="filename" type="String" />
@ -1233,7 +1248,7 @@
<argument index="1" name="direction" type="int" enum="TextServer.Direction" default="0" />
<description>
Sets desired text direction. If set to [code]TEXT_DIRECTION_AUTO[/code], direction will be detected based on the buffer contents and current locale.
[b]Note:[/b] Direction is ignored if server does not support [code]FEATURE_BIDI_LAYOUT[/code] feature.
[b]Note:[/b] Direction is ignored if server does not support [constant FEATURE_BIDI_LAYOUT] feature.
</description>
</method>
<method name="shaped_text_set_orientation">
@ -1242,7 +1257,7 @@
<argument index="1" name="orientation" type="int" enum="TextServer.Orientation" default="0" />
<description>
Sets desired text orientation.
[b]Note:[/b] Orientation is ignored if server does not support [code]FEATURE_VERTICAL_LAYOUT[/code] feature.
[b]Note:[/b] Orientation is ignored if server does not support [constant FEATURE_VERTICAL_LAYOUT] feature.
</description>
</method>
<method name="shaped_text_set_preserve_control">
@ -1439,7 +1454,10 @@
TextServer supports variable fonts.
</constant>
<constant name="FEATURE_USE_SUPPORT_DATA" value="128" enum="Feature">
TextServer require external data file for some features.
TextServer requires external data file for some features, see [method load_support_data].
</constant>
<constant name="FEATURE_UNICODE_IDENTIFIERS" value="256" enum="Feature">
TextServer supports UAX #31 identifier validation, see [method is_valid_identifier].
</constant>
<constant name="CONTOUR_CURVE_TAG_ON" value="1" enum="ContourPointTag">
Contour point is on the curve.

View file

@ -320,7 +320,7 @@ _FORCE_INLINE_ bool is_underscore(char32_t p_char) {
/*************************************************************************/
String TextServerAdvanced::interface_name = "ICU / HarfBuzz / Graphite";
uint32_t TextServerAdvanced::interface_features = FEATURE_BIDI_LAYOUT | FEATURE_VERTICAL_LAYOUT | FEATURE_SHAPING | FEATURE_KASHIDA_JUSTIFICATION | FEATURE_BREAK_ITERATORS | FEATURE_USE_SUPPORT_DATA | FEATURE_FONT_VARIABLE;
uint32_t TextServerAdvanced::interface_features = FEATURE_BIDI_LAYOUT | FEATURE_VERTICAL_LAYOUT | FEATURE_SHAPING | FEATURE_KASHIDA_JUSTIFICATION | FEATURE_BREAK_ITERATORS | FEATURE_USE_SUPPORT_DATA | FEATURE_FONT_VARIABLE | FEATURE_UNICODE_IDENTIFIERS;
bool TextServerAdvanced::has_feature(Feature p_feature) const {
return (interface_features & p_feature) == p_feature;
@ -5068,6 +5068,191 @@ String TextServerAdvanced::strip_diacritics(const String &p_string) const {
return result;
}
bool TextServerAdvanced::is_valid_identifier(const String &p_string) const {
enum UAX31SequenceStatus {
SEQ_NOT_STARTED,
SEQ_STARTED,
SEQ_STARTED_VIR,
SEQ_NEAR_END,
};
const char32_t *str = p_string.ptr();
int len = p_string.length();
if (len == 0) {
return false; // Empty string.
}
UErrorCode err = U_ZERO_ERROR;
Char16String utf16 = p_string.utf16();
const UNormalizer2 *norm_c = unorm2_getNFCInstance(&err);
if (U_FAILURE(err)) {
return false; // Failed to load normalizer.
}
bool isnurom = unorm2_isNormalized(norm_c, utf16.ptr(), utf16.length(), &err);
if (U_FAILURE(err) || !isnurom) {
return false; // Do not conform to Normalization Form C.
}
UAX31SequenceStatus A1_sequence_status = SEQ_NOT_STARTED;
UScriptCode A1_scr = USCRIPT_INHERITED;
UAX31SequenceStatus A2_sequence_status = SEQ_NOT_STARTED;
UScriptCode A2_scr = USCRIPT_INHERITED;
UAX31SequenceStatus B_sequence_status = SEQ_NOT_STARTED;
UScriptCode B_scr = USCRIPT_INHERITED;
for (int i = 0; i < len; i++) {
err = U_ZERO_ERROR;
UScriptCode scr = uscript_getScript(str[i], &err);
if (U_FAILURE(err)) {
return false; // Invalid script.
}
if (uscript_getUsage(scr) != USCRIPT_USAGE_RECOMMENDED) {
return false; // Not a recommended script.
}
uint8_t cat = u_charType(str[i]);
int32_t jt = u_getIntPropertyValue(str[i], UCHAR_JOINING_TYPE);
// UAX #31 section 2.3 subsections A1, A2 and B, check ZWNJ and ZWJ usage.
switch (A1_sequence_status) {
case SEQ_NEAR_END: {
if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) {
return false; // Mixed script.
}
if (jt == U_JT_RIGHT_JOINING || jt == U_JT_DUAL_JOINING) {
A1_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
} else if (jt != U_JT_TRANSPARENT) {
return false; // Invalid end of sequence.
}
} break;
case SEQ_STARTED: {
if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) {
A1_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
if (jt != U_JT_TRANSPARENT) {
if (str[i] == 0x200C /*ZWNJ*/) {
A1_sequence_status = SEQ_NEAR_END;
continue;
} else {
A1_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
}
} break;
default:
break;
}
if (A1_sequence_status == SEQ_NOT_STARTED) {
if (jt == U_JT_LEFT_JOINING || jt == U_JT_DUAL_JOINING) {
A1_sequence_status = SEQ_STARTED;
A1_scr = scr;
}
};
switch (A2_sequence_status) {
case SEQ_NEAR_END: {
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
return false; // Mixed script.
}
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
A2_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
return false; // Invalid end of sequence.
}
} break;
case SEQ_STARTED_VIR: {
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
if (str[i] == 0x200C /*ZWNJ*/) {
A2_sequence_status = SEQ_NEAR_END;
continue;
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
} break;
case SEQ_STARTED: {
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) {
A2_sequence_status = SEQ_STARTED_VIR;
} else if (cat != U_MODIFIER_LETTER) {
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
} break;
default:
break;
}
if (A2_sequence_status == SEQ_NOT_STARTED) {
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
A2_sequence_status = SEQ_STARTED;
A2_scr = scr;
}
}
switch (B_sequence_status) {
case SEQ_NEAR_END: {
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
return false; // Mixed script.
}
if (u_getIntPropertyValue(str[i], UCHAR_INDIC_SYLLABIC_CATEGORY) != U_INSC_VOWEL_DEPENDENT) {
B_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
} else {
return false; // Invalid end of sequence.
}
} break;
case SEQ_STARTED_VIR: {
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
B_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
if (str[i] == 0x200D /*ZWJ*/) {
B_sequence_status = SEQ_NEAR_END;
continue;
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
B_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
} break;
case SEQ_STARTED: {
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
B_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) {
B_sequence_status = SEQ_STARTED_VIR;
} else if (cat != U_MODIFIER_LETTER) {
B_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
} break;
default:
break;
}
if (B_sequence_status == SEQ_NOT_STARTED) {
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
B_sequence_status = SEQ_STARTED;
B_scr = scr;
}
}
if (u_hasBinaryProperty(str[i], UCHAR_PATTERN_SYNTAX) || u_hasBinaryProperty(str[i], UCHAR_PATTERN_WHITE_SPACE) || u_hasBinaryProperty(str[i], UCHAR_NONCHARACTER_CODE_POINT)) {
return false; // Not a XID_Start or XID_Continue character.
}
if (i == 0) {
if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || str[0] == 0x2118 || str[0] == 0x212E || str[0] == 0x309B || str[0] == 0x309C || str[0] == 0x005F)) {
return false; // Not a XID_Start character.
}
} else {
if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || cat == U_NON_SPACING_MARK || cat == U_COMBINING_SPACING_MARK || cat == U_DECIMAL_DIGIT_NUMBER || cat == U_CONNECTOR_PUNCTUATION || str[i] == 0x2118 || str[i] == 0x212E || str[i] == 0x309B || str[i] == 0x309C || str[i] == 0x1369 || str[i] == 0x1371 || str[i] == 0x00B7 || str[i] == 0x0387 || str[i] == 0x19DA || str[i] == 0x0E33 || str[i] == 0x0EB3 || str[i] == 0xFF9E || str[i] == 0xFF9F)) {
return false; // Not a XID_Continue character.
}
}
}
return true;
}
TextServerAdvanced::TextServerAdvanced() {
_insert_num_systems_lang();
_insert_feature_sets();

View file

@ -519,6 +519,7 @@ public:
virtual String percent_sign(const String &p_language = "") const override;
virtual String strip_diacritics(const String &p_string) const override;
virtual bool is_valid_identifier(const String &p_string) const override;
TextServerAdvanced();
~TextServerAdvanced();

View file

@ -416,6 +416,7 @@ void TextServer::_bind_methods() {
ClassDB::bind_method(D_METHOD("percent_sign", "language"), &TextServer::percent_sign, DEFVAL(""));
ClassDB::bind_method(D_METHOD("strip_diacritics", "string"), &TextServer::strip_diacritics);
ClassDB::bind_method(D_METHOD("is_valid_identifier", "string"), &TextServer::is_valid_identifier);
/* Direction */
BIND_ENUM_CONSTANT(DIRECTION_AUTO);
@ -476,6 +477,7 @@ void TextServer::_bind_methods() {
BIND_ENUM_CONSTANT(FEATURE_FONT_SYSTEM);
BIND_ENUM_CONSTANT(FEATURE_FONT_VARIABLE);
BIND_ENUM_CONSTANT(FEATURE_USE_SUPPORT_DATA);
BIND_ENUM_CONSTANT(FEATURE_UNICODE_IDENTIFIERS);
/* FT Contour Point Types */
BIND_ENUM_CONSTANT(CONTOUR_CURVE_TAG_ON);

View file

@ -109,7 +109,8 @@ public:
FEATURE_BREAK_ITERATORS = 1 << 4,
FEATURE_FONT_SYSTEM = 1 << 5,
FEATURE_FONT_VARIABLE = 1 << 6,
FEATURE_USE_SUPPORT_DATA = 1 << 7
FEATURE_USE_SUPPORT_DATA = 1 << 7,
FEATURE_UNICODE_IDENTIFIERS = 1 << 8,
};
enum ContourPointTag {
@ -451,6 +452,7 @@ public:
virtual String percent_sign(const String &p_language = "") const { return "%"; };
virtual String strip_diacritics(const String &p_string) const;
virtual bool is_valid_identifier(const String &p_string) const { return p_string.is_valid_identifier(); };
TextServer();
~TextServer();

View file

@ -266,6 +266,27 @@ TEST_SUITE("[[TextServer]") {
}
}
// Checks is_valid_identifier() on every registered text server interface that
// reports FEATURE_UNICODE_IDENTIFIERS; interfaces without the feature are skipped.
SUBCASE("[TextServer] Unicode identifiers") {
for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);
TEST_FAIL_COND(ts.is_null(), "Invalid TS interface.");
if (ts->has_feature(TextServer::FEATURE_UNICODE_IDENTIFIERS)) {
// Parallel tables: isid[j] is the expected result for data[j]. Keep both at 19 entries and in the same order.
static const char32_t *data[19] = { U"-30", U"100", U"10.1", U"10,1", U"1e2", U"1e-2", U"1e2e3", U"0xAB", U"AB", U"Test1", U"1Test", U"Test*1", U"test_testeT", U"test_tes teT", U"عَلَيْكُمْ", U"عَلَيْكُمْTest", U"ӒӖӚӜ", U"_test", U"ÂÃÄÅĀĂĄÇĆĈĊ" };
static bool isid[19] = { false, false, false, false, false, false, false, false, true, true, false, false, true, false, true, true, true, true, true };
for (int j = 0; j < 19; j++) {
String s = String(data[j]);
CHECK(ts->is_valid_identifier(s) == isid[j]);
}
// Test UAX #31 section 2.3 ZW(N)J usage.
// Arabic-script string containing ZWNJ (U+200C).
CHECK(ts->is_valid_identifier(U"\u0646\u0627\u0645\u0647\u200C\u0627\u06CC"));
// Malayalam string with ZWNJ (U+200C) right after U+0D4D.
CHECK(ts->is_valid_identifier(U"\u0D26\u0D43\u0D15\u0D4D\u200C\u0D38\u0D3E\u0D15\u0D4D\u0D37\u0D3F"));
// Sinhala string with ZWJ (U+200D) right after U+0DCA.
CHECK(ts->is_valid_identifier(U"\u0DC1\u0DCA\u200D\u0DBB\u0DD3"));
}
}
}
SUBCASE("[TextServer] Strip Diacritics") {
for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);