From 1df3182865fb089bd653763cd0abbea811545365 Mon Sep 17 00:00:00 2001 From: "Dustin L. Howett" Date: Tue, 27 Oct 2020 10:36:28 -0700 Subject: [PATCH] Fully regenerate CodepointWidthDetector from Unicode 13.0 (#8035) This commit also adds an override UCD and migrates all of the overrides from GetQuickCharWidth into it. GetQuickCharWidth ----------------- The removal of overrides from GQCW reduces the number of comparisons required for looking up a single character's width from 41 (32 individual ranged comparisons from GQCW + 8+1 from the binary search in CPWD) to 11 (2 from GQCW, 8+1 from CPWD). GQCW also incorrectly marked 67 reserved codepoints as `Wide` when they should have been `Narrow`. The codepoints whose definitions have changed from `Wide` to `Narrow` are: ``` 2E9A 2EF4 2EF5 2EF6 2EF7 2EF8 2EF9 2EFA 2EFB 2EFC 2EFD 2EFE 2EFF 2FD6 2FD7 2FD8 2FD9 2FDA 2FDB 2FDC 2FDD 2FDE 2FDF 2FE0 2FE1 2FE2 2FE3 2FE4 2FE5 2FE6 2FE7 2FE8 2FE9 2FEA 2FEB 2FEC 2FED 2FEE 2FEF 2FFC 2FFD 2FFE 2FFF 31E4 31E5 31E6 31E7 31E8 31E9 31EA 31EB 31EC 31ED 31EE 31EF 321F A48D A48E A48F FE1A FE1B FE1C FE1D FE1E FE1F FE53 FE67 ``` All of them are reserved, but those reserved regions are marked as narrow in the UCD. This change also offers us the chance to document exactly why we're overriding a specific character range. Comments from the override document will be copied to the generated CPWD table. New in Unicode 13.0 ------------------ Some widths have changed due to previously-reserved characters becoming _used_ such as U+32FF SQUARE ERA NAME REIWA, the Tangut components 756-768, the entire Khitan Small Script character set, and the Tangut Ideographs. A number of the changes in this diff are due to better/worse comment tracking and the removal of the Emoji/EPres comments. The script once mistakenly applied comments to packed regions (and it has been updated to not do so.) 
Validation ---------- I built a test application that compared codepoints 0-FFFF for GQCW against their new registered widths. --- .github/actions/spell-check/expect/expect.txt | 1 + src/types/CodepointWidthDetector.cpp | 220 ++++++++---------- src/types/convert.cpp | 169 +------------- src/types/unicode_width_overrides.xml | 9 + tools/Generate-CodepointWidthsFromUCD.ps1 | 6 + 5 files changed, 116 insertions(+), 289 deletions(-) create mode 100644 src/types/unicode_width_overrides.xml diff --git a/.github/actions/spell-check/expect/expect.txt b/.github/actions/spell-check/expect/expect.txt index bee163b1b..a3f0ed7ba 100644 --- a/.github/actions/spell-check/expect/expect.txt +++ b/.github/actions/spell-check/expect/expect.txt @@ -601,6 +601,7 @@ devops Dext df DFactory +DFF DFMT dh dhandler diff --git a/src/types/CodepointWidthDetector.cpp b/src/types/CodepointWidthDetector.cpp index 9eb637d00..de29ba196 100644 --- a/src/types/CodepointWidthDetector.cpp +++ b/src/types/CodepointWidthDetector.cpp @@ -19,39 +19,12 @@ namespace return range.upperBound < searchTerm; } - static constexpr std::array s_wideAndAmbiguousTable{ - // generated from http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt - // anything not present here is presumed to be Narrow. - // - // GH #900 - Supplemented with emoji codepoints from https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt - // Emojis in 0x2010 - 0x2B59 used to be marked as Ambiguous in GetQuickCharWidth() in order to - // force a font lookup, but since we default all Ambiguous width to Narrow, those emojis always - // came out looking squished/tiny. They've been moved into this table and marked as Wide. 
- // - // === UCD Definitions === - // EA - EastAsianWidth - // Emoji - Emoji - // EPres - Emoji Presentation - // ======================= - // - // This table has been partially regenerated from the Unicode Character Database as of 13.0, with - // the following rules: - // Codepoints whose EA is "W", "F" are Wide - // Codepoints whose EA is "A" are Ambiguous - // Codepoints where Emoji=Y and EPres=Y are Emoji, therefore Wide - // - - // Codepoints where Emoji=Y but EPres=*N* are only Emoji when followed - // by U+FE0F variation selector 15. - // - // There are a couple of codepoints that Microsoft specifically gave an emoji representation - // even if it's not specified as an emoji in the standard. I'll list the ones I'm aware of in this comment in case - // we decide to add them in the future: - // 0x261A-0x261C, 0x261E-0x261F - // 0x2661, - // 0x2662, - // 0x2664, - // 0x2666 0x2710, - // 0x270E 0x2765 0x1f000 - 0x1f02b except 0x1f004 0x1f594 + // Generated by Generate-CodepointWidthsFromUCD.ps1 -Pack:True -Full:False -NoOverrides:False + // on 10/25/2020 7:32:04 AM (UTC) from Unicode 13.0.0. + // 321205 (0x4E6B5) codepoints covered. + // 240 (0xF0) codepoints overridden. 
+ // Override path: .\src\types\unicode_width_overrides.xml + static constexpr std::array s_wideAndAmbiguousTable{ UnicodeRange{ 0xa1, 0xa1, CodepointWidth::Ambiguous }, UnicodeRange{ 0xa4, 0xa4, CodepointWidth::Ambiguous }, UnicodeRange{ 0xa7, 0xa8, CodepointWidth::Ambiguous }, @@ -176,16 +149,14 @@ namespace UnicodeRange{ 0x22a5, 0x22a5, CodepointWidth::Ambiguous }, UnicodeRange{ 0x22bf, 0x22bf, CodepointWidth::Ambiguous }, UnicodeRange{ 0x2312, 0x2312, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x231a, 0x231b, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x231a, 0x231b, CodepointWidth::Wide }, UnicodeRange{ 0x2329, 0x232a, CodepointWidth::Wide }, - UnicodeRange{ 0x23e9, 0x23ec, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x23f0, 0x23f0, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x23f3, 0x23f3, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x23e9, 0x23ec, CodepointWidth::Wide }, + UnicodeRange{ 0x23f0, 0x23f0, CodepointWidth::Wide }, + UnicodeRange{ 0x23f3, 0x23f3, CodepointWidth::Wide }, UnicodeRange{ 0x2460, 0x24e9, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x24eb, 0x254b, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x2550, 0x2573, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x2580, 0x258f, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x2592, 0x2595, CodepointWidth::Ambiguous }, + UnicodeRange{ 0x24eb, 0x24ff, CodepointWidth::Ambiguous }, + UnicodeRange{ 0x2500, 0x259f, CodepointWidth::Narrow }, // box-drawing and block elements require 1-cell alignment UnicodeRange{ 0x25a0, 0x25a1, CodepointWidth::Ambiguous }, UnicodeRange{ 0x25a3, 0x25a9, CodepointWidth::Ambiguous }, UnicodeRange{ 0x25b2, 0x25b3, CodepointWidth::Ambiguous }, @@ -197,61 +168,61 @@ namespace UnicodeRange{ 0x25ce, 0x25d1, CodepointWidth::Ambiguous }, UnicodeRange{ 0x25e2, 0x25e5, CodepointWidth::Ambiguous }, UnicodeRange{ 0x25ef, 0x25ef, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x25fd, 0x25fe, CodepointWidth::Wide }, // 
Emoji=Y EPres=Y + UnicodeRange{ 0x25fd, 0x25fe, CodepointWidth::Wide }, UnicodeRange{ 0x2605, 0x2606, CodepointWidth::Ambiguous }, UnicodeRange{ 0x2609, 0x2609, CodepointWidth::Ambiguous }, UnicodeRange{ 0x260e, 0x260f, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x2614, 0x2615, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x2614, 0x2615, CodepointWidth::Wide }, UnicodeRange{ 0x261c, 0x261c, CodepointWidth::Ambiguous }, UnicodeRange{ 0x261e, 0x261e, CodepointWidth::Ambiguous }, UnicodeRange{ 0x2640, 0x2640, CodepointWidth::Ambiguous }, UnicodeRange{ 0x2642, 0x2642, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x2648, 0x2653, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x2648, 0x2653, CodepointWidth::Wide }, UnicodeRange{ 0x2660, 0x2661, CodepointWidth::Ambiguous }, UnicodeRange{ 0x2663, 0x2665, CodepointWidth::Ambiguous }, UnicodeRange{ 0x2667, 0x266a, CodepointWidth::Ambiguous }, UnicodeRange{ 0x266c, 0x266d, CodepointWidth::Ambiguous }, UnicodeRange{ 0x266f, 0x266f, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x267f, 0x267f, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x2693, 0x2693, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x267f, 0x267f, CodepointWidth::Wide }, + UnicodeRange{ 0x2693, 0x2693, CodepointWidth::Wide }, UnicodeRange{ 0x269e, 0x269f, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x26a1, 0x26a1, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x26aa, 0x26ab, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x26bd, 0x26be, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x26a1, 0x26a1, CodepointWidth::Wide }, + UnicodeRange{ 0x26aa, 0x26ab, CodepointWidth::Wide }, + UnicodeRange{ 0x26bd, 0x26be, CodepointWidth::Wide }, UnicodeRange{ 0x26bf, 0x26bf, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x26c4, 0x26c5, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x26c4, 0x26c5, CodepointWidth::Wide }, UnicodeRange{ 0x26c6, 0x26cd, 
CodepointWidth::Ambiguous }, - UnicodeRange{ 0x26ce, 0x26ce, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x26ce, 0x26ce, CodepointWidth::Wide }, UnicodeRange{ 0x26cf, 0x26d3, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x26d4, 0x26d4, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x26d4, 0x26d4, CodepointWidth::Wide }, UnicodeRange{ 0x26d5, 0x26e1, CodepointWidth::Ambiguous }, UnicodeRange{ 0x26e3, 0x26e3, CodepointWidth::Ambiguous }, UnicodeRange{ 0x26e8, 0x26e9, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x26ea, 0x26ea, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x26ea, 0x26ea, CodepointWidth::Wide }, UnicodeRange{ 0x26eb, 0x26f1, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x26f2, 0x26f3, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x26f2, 0x26f3, CodepointWidth::Wide }, UnicodeRange{ 0x26f4, 0x26f4, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x26f5, 0x26f5, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x26f5, 0x26f5, CodepointWidth::Wide }, UnicodeRange{ 0x26f6, 0x26f9, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x26fa, 0x26fa, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x26fa, 0x26fa, CodepointWidth::Wide }, UnicodeRange{ 0x26fb, 0x26fc, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x26fd, 0x26fd, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x26fd, 0x26fd, CodepointWidth::Wide }, UnicodeRange{ 0x26fe, 0x26ff, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x2705, 0x2705, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x270a, 0x270b, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x2728, 0x2728, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x2705, 0x2705, CodepointWidth::Wide }, + UnicodeRange{ 0x270a, 0x270b, CodepointWidth::Wide }, + UnicodeRange{ 0x2728, 0x2728, CodepointWidth::Wide }, UnicodeRange{ 0x273d, 0x273d, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x274c, 0x274c, CodepointWidth::Wide }, // Emoji=Y 
EPres=Y - UnicodeRange{ 0x274e, 0x274e, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x2753, 0x2755, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x2757, 0x2757, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x274c, 0x274c, CodepointWidth::Wide }, + UnicodeRange{ 0x274e, 0x274e, CodepointWidth::Wide }, + UnicodeRange{ 0x2753, 0x2755, CodepointWidth::Wide }, + UnicodeRange{ 0x2757, 0x2757, CodepointWidth::Wide }, UnicodeRange{ 0x2776, 0x277f, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x2795, 0x2797, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x27b0, 0x27b0, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x27bf, 0x27bf, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x2b1b, 0x2b1c, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x2b50, 0x2b50, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x2b55, 0x2b55, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x2795, 0x2797, CodepointWidth::Wide }, + UnicodeRange{ 0x27b0, 0x27b0, CodepointWidth::Wide }, + UnicodeRange{ 0x27bf, 0x27bf, CodepointWidth::Wide }, + UnicodeRange{ 0x2b1b, 0x2b1c, CodepointWidth::Wide }, + UnicodeRange{ 0x2b50, 0x2b50, CodepointWidth::Wide }, + UnicodeRange{ 0x2b55, 0x2b55, CodepointWidth::Wide }, UnicodeRange{ 0x2b56, 0x2b59, CodepointWidth::Ambiguous }, UnicodeRange{ 0x2e80, 0x2e99, CodepointWidth::Wide }, UnicodeRange{ 0x2e9b, 0x2ef3, CodepointWidth::Wide }, @@ -260,15 +231,14 @@ namespace UnicodeRange{ 0x3000, 0x303e, CodepointWidth::Wide }, UnicodeRange{ 0x3041, 0x3096, CodepointWidth::Wide }, UnicodeRange{ 0x3099, 0x30ff, CodepointWidth::Wide }, - UnicodeRange{ 0x3105, 0x312e, CodepointWidth::Wide }, + UnicodeRange{ 0x3105, 0x312f, CodepointWidth::Wide }, UnicodeRange{ 0x3131, 0x318e, CodepointWidth::Wide }, - UnicodeRange{ 0x3190, 0x31ba, CodepointWidth::Wide }, - UnicodeRange{ 0x31c0, 0x31e3, CodepointWidth::Wide }, + UnicodeRange{ 0x3190, 0x31e3, CodepointWidth::Wide }, 
UnicodeRange{ 0x31f0, 0x321e, CodepointWidth::Wide }, UnicodeRange{ 0x3220, 0x3247, CodepointWidth::Wide }, UnicodeRange{ 0x3248, 0x324f, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x3250, 0x32fe, CodepointWidth::Wide }, - UnicodeRange{ 0x3300, 0x4dbf, CodepointWidth::Wide }, + UnicodeRange{ 0x3250, 0x4dbf, CodepointWidth::Wide }, + UnicodeRange{ 0x4dc0, 0x4dff, CodepointWidth::Narrow }, // hexagrams are historically narrow UnicodeRange{ 0x4e00, 0xa48c, CodepointWidth::Wide }, UnicodeRange{ 0xa490, 0xa4c6, CodepointWidth::Wide }, UnicodeRange{ 0xa960, 0xa97c, CodepointWidth::Wide }, @@ -277,75 +247,79 @@ namespace UnicodeRange{ 0xf900, 0xfaff, CodepointWidth::Wide }, UnicodeRange{ 0xfe00, 0xfe0f, CodepointWidth::Ambiguous }, UnicodeRange{ 0xfe10, 0xfe19, CodepointWidth::Wide }, + UnicodeRange{ 0xfe20, 0xfe2f, CodepointWidth::Narrow }, // narrow combining ligatures (split into left/right halves, which take 2 columns together) UnicodeRange{ 0xfe30, 0xfe52, CodepointWidth::Wide }, UnicodeRange{ 0xfe54, 0xfe66, CodepointWidth::Wide }, UnicodeRange{ 0xfe68, 0xfe6b, CodepointWidth::Wide }, UnicodeRange{ 0xff01, 0xff60, CodepointWidth::Wide }, UnicodeRange{ 0xffe0, 0xffe6, CodepointWidth::Wide }, UnicodeRange{ 0xfffd, 0xfffd, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x16fe0, 0x16fe1, CodepointWidth::Wide }, - UnicodeRange{ 0x17000, 0x187ec, CodepointWidth::Wide }, - UnicodeRange{ 0x18800, 0x18af2, CodepointWidth::Wide }, + UnicodeRange{ 0x16fe0, 0x16fe4, CodepointWidth::Wide }, + UnicodeRange{ 0x16ff0, 0x16ff1, CodepointWidth::Wide }, + UnicodeRange{ 0x17000, 0x187f7, CodepointWidth::Wide }, + UnicodeRange{ 0x18800, 0x18cd5, CodepointWidth::Wide }, + UnicodeRange{ 0x18d00, 0x18d08, CodepointWidth::Wide }, UnicodeRange{ 0x1b000, 0x1b11e, CodepointWidth::Wide }, + UnicodeRange{ 0x1b150, 0x1b152, CodepointWidth::Wide }, + UnicodeRange{ 0x1b164, 0x1b167, CodepointWidth::Wide }, UnicodeRange{ 0x1b170, 0x1b2fb, CodepointWidth::Wide }, - UnicodeRange{ 0x1f004, 0x1f004, 
CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f0cf, 0x1f0cf, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x1f004, 0x1f004, CodepointWidth::Wide }, + UnicodeRange{ 0x1f0cf, 0x1f0cf, CodepointWidth::Wide }, UnicodeRange{ 0x1f100, 0x1f10a, CodepointWidth::Ambiguous }, UnicodeRange{ 0x1f110, 0x1f12d, CodepointWidth::Ambiguous }, UnicodeRange{ 0x1f130, 0x1f169, CodepointWidth::Ambiguous }, UnicodeRange{ 0x1f170, 0x1f18d, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x1f18e, 0x1f18e, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x1f18e, 0x1f18e, CodepointWidth::Wide }, UnicodeRange{ 0x1f18f, 0x1f190, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x1f191, 0x1f19a, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x1f191, 0x1f19a, CodepointWidth::Wide }, UnicodeRange{ 0x1f19b, 0x1f1ac, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x1f1e6, 0x1f1ff, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f200, 0x1f202, CodepointWidth::Wide }, + UnicodeRange{ 0x1f1e6, 0x1f202, CodepointWidth::Wide }, UnicodeRange{ 0x1f210, 0x1f23b, CodepointWidth::Wide }, UnicodeRange{ 0x1f240, 0x1f248, CodepointWidth::Wide }, - UnicodeRange{ 0x1f250, 0x1f251, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x1f250, 0x1f251, CodepointWidth::Wide }, UnicodeRange{ 0x1f260, 0x1f265, CodepointWidth::Wide }, - UnicodeRange{ 0x1f300, 0x1f320, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f32d, 0x1f335, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f337, 0x1f37c, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f37e, 0x1f393, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f3a0, 0x1f3ca, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f3cf, 0x1f3d3, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f3e0, 0x1f3f0, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f3f4, 0x1f3f4, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 
0x1f3f8, 0x1f43e, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f440, 0x1f440, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f442, 0x1f4fc, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f4ff, 0x1f53d, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f54b, 0x1f54e, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f550, 0x1f567, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f57a, 0x1f57a, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f595, 0x1f596, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f5a4, 0x1f5a4, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f5fb, 0x1f64f, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f680, 0x1f6c5, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f6cc, 0x1f6cc, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f6d0, 0x1f6d2, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f6d5, 0x1f6d7, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f6eb, 0x1f6ec, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f6f4, 0x1f6fc, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f7e0, 0x1f7eb, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f90c, 0x1f93a, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f93c, 0x1f945, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f947, 0x1f978, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f97a, 0x1f9cb, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1f9cd, 0x1f9ff, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1fa70, 0x1fa74, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1fa78, 0x1fa7a, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1fa80, 0x1fa86, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1fa90, 0x1faa8, CodepointWidth::Wide }, // Emoji=Y EPres=Y - 
UnicodeRange{ 0x1fab0, 0x1fab6, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1fac0, 0x1fac2, CodepointWidth::Wide }, // Emoji=Y EPres=Y - UnicodeRange{ 0x1fad0, 0x1fad6, CodepointWidth::Wide }, // Emoji=Y EPres=Y + UnicodeRange{ 0x1f300, 0x1f320, CodepointWidth::Wide }, + UnicodeRange{ 0x1f32d, 0x1f335, CodepointWidth::Wide }, + UnicodeRange{ 0x1f337, 0x1f37c, CodepointWidth::Wide }, + UnicodeRange{ 0x1f37e, 0x1f393, CodepointWidth::Wide }, + UnicodeRange{ 0x1f3a0, 0x1f3ca, CodepointWidth::Wide }, + UnicodeRange{ 0x1f3cf, 0x1f3d3, CodepointWidth::Wide }, + UnicodeRange{ 0x1f3e0, 0x1f3f0, CodepointWidth::Wide }, + UnicodeRange{ 0x1f3f4, 0x1f3f4, CodepointWidth::Wide }, + UnicodeRange{ 0x1f3f8, 0x1f43e, CodepointWidth::Wide }, + UnicodeRange{ 0x1f440, 0x1f440, CodepointWidth::Wide }, + UnicodeRange{ 0x1f442, 0x1f4fc, CodepointWidth::Wide }, + UnicodeRange{ 0x1f4ff, 0x1f53d, CodepointWidth::Wide }, + UnicodeRange{ 0x1f54b, 0x1f54e, CodepointWidth::Wide }, + UnicodeRange{ 0x1f550, 0x1f567, CodepointWidth::Wide }, + UnicodeRange{ 0x1f57a, 0x1f57a, CodepointWidth::Wide }, + UnicodeRange{ 0x1f595, 0x1f596, CodepointWidth::Wide }, + UnicodeRange{ 0x1f5a4, 0x1f5a4, CodepointWidth::Wide }, + UnicodeRange{ 0x1f5fb, 0x1f64f, CodepointWidth::Wide }, + UnicodeRange{ 0x1f680, 0x1f6c5, CodepointWidth::Wide }, + UnicodeRange{ 0x1f6cc, 0x1f6cc, CodepointWidth::Wide }, + UnicodeRange{ 0x1f6d0, 0x1f6d2, CodepointWidth::Wide }, + UnicodeRange{ 0x1f6d5, 0x1f6d7, CodepointWidth::Wide }, + UnicodeRange{ 0x1f6eb, 0x1f6ec, CodepointWidth::Wide }, + UnicodeRange{ 0x1f6f4, 0x1f6fc, CodepointWidth::Wide }, + UnicodeRange{ 0x1f7e0, 0x1f7eb, CodepointWidth::Wide }, + UnicodeRange{ 0x1f90c, 0x1f93a, CodepointWidth::Wide }, + UnicodeRange{ 0x1f93c, 0x1f945, CodepointWidth::Wide }, + UnicodeRange{ 0x1f947, 0x1f978, CodepointWidth::Wide }, + UnicodeRange{ 0x1f97a, 0x1f9cb, CodepointWidth::Wide }, + UnicodeRange{ 0x1f9cd, 0x1f9ff, CodepointWidth::Wide }, + UnicodeRange{ 0x1fa70, 
0x1fa74, CodepointWidth::Wide }, + UnicodeRange{ 0x1fa78, 0x1fa7a, CodepointWidth::Wide }, + UnicodeRange{ 0x1fa80, 0x1fa86, CodepointWidth::Wide }, + UnicodeRange{ 0x1fa90, 0x1faa8, CodepointWidth::Wide }, + UnicodeRange{ 0x1fab0, 0x1fab6, CodepointWidth::Wide }, + UnicodeRange{ 0x1fac0, 0x1fac2, CodepointWidth::Wide }, + UnicodeRange{ 0x1fad0, 0x1fad6, CodepointWidth::Wide }, UnicodeRange{ 0x20000, 0x2fffd, CodepointWidth::Wide }, UnicodeRange{ 0x30000, 0x3fffd, CodepointWidth::Wide }, UnicodeRange{ 0xe0100, 0xe01ef, CodepointWidth::Ambiguous }, UnicodeRange{ 0xf0000, 0xffffd, CodepointWidth::Ambiguous }, - UnicodeRange{ 0x100000, 0x10fffd, CodepointWidth::Ambiguous } + UnicodeRange{ 0x100000, 0x10fffd, CodepointWidth::Ambiguous }, }; } diff --git a/src/types/convert.cpp b/src/types/convert.cpp index dc7b71908..3e9d8e149 100644 --- a/src/types/convert.cpp +++ b/src/types/convert.cpp @@ -354,179 +354,16 @@ std::deque> SynthesizeNumpadEvents(const wchar_t wch, // May-01-2019 MiNiksa Forced lookup-via-renderer for retroactively recategorized emoji // that used to be narrow but now might be wide. 
(approx x2194-x2b55, not inclusive) // Also forced block characters segment (x2580-x259F) to narrow +// Oct-25-2020 DuHowett Replaced the entire table with a set of overrides that get built into +// CodepointWidthDetector (unicode_width_overrides.xml) CodepointWidth GetQuickCharWidth(const wchar_t wch) noexcept { - // 0x00-0x1F is ambiguous by font if (0x20 <= wch && wch <= 0x7e) { /* ASCII */ return CodepointWidth::Narrow; } - // 0x80 - 0x0451 varies from narrow to ambiguous by character and font (Unicode 9.0) - else if (0x0452 <= wch && wch <= 0x10FF) - { - // From Unicode 9.0, this range is narrow (assorted languages) - return CodepointWidth::Narrow; - } - else if (0x1100 <= wch && wch <= 0x115F) - { - // From Unicode 9.0, Hangul Choseong is wide - return CodepointWidth::Wide; - } - else if (0x1160 <= wch && wch <= 0x200F) - { - // From Unicode 9.0, this range is narrow (assorted languages) - return CodepointWidth::Narrow; - } - // 0x2500 - 0x257F is the box drawing character range - - // Technically, these are ambiguous width characters, but applications that - // use them generally assume that they're narrow to ensure proper alignment. - else if (0x2500 <= wch && wch <= 0x257F) - { - return CodepointWidth::Narrow; - } - // 0x2580 - 0x259F is the block element characters. - // Technically these are ambiguous width, but many many things assume they're narrow. 
- else if (0x2580 <= wch && wch <= 0x259F) - { - return CodepointWidth::Narrow; - } - else if (0x2B5A <= wch && wch <= 0x2E44) - { - // From Unicode 9.0, this range is narrow (assorted languages) - return CodepointWidth::Narrow; - } - else if (0x2E80 <= wch && wch <= 0x303e) - { - // From Unicode 9.0, this range is wide (assorted languages) - return CodepointWidth::Wide; - } - else if (0x3041 <= wch && wch <= 0x3094) - { - /* Hiragana */ - return CodepointWidth::Wide; - } - else if (0x30a1 <= wch && wch <= 0x30f6) - { - /* Katakana */ - return CodepointWidth::Wide; - } - else if (0x3105 <= wch && wch <= 0x312c) - { - /* Bopomofo */ - return CodepointWidth::Wide; - } - else if (0x3131 <= wch && wch <= 0x318e) - { - /* Hangul Elements */ - return CodepointWidth::Wide; - } - else if (0x3190 <= wch && wch <= 0x3247) - { - // From Unicode 9.0, this range is wide - return CodepointWidth::Wide; - } - else if (0x3251 <= wch && wch <= 0xA4C6) - { - // This exception range is narrow width hexagrams. - if (0x4DC0 <= wch && wch <= 0x4DFF) - { - return CodepointWidth::Narrow; - } - else - { - // From Unicode 9.0, this range is wide - // CJK Unified Ideograph and Yi and Reserved. - // Includes Han Ideographic range. - return CodepointWidth::Wide; - } - } - else if (0xA4D0 <= wch && wch <= 0xABF9) - { - // This exception range is wide Hangul Choseong - if (0xA960 <= wch && wch <= 0xA97C) - { - return CodepointWidth::Wide; - } - else - { - // From Unicode 9.0, this range is narrow (assorted languages) - return CodepointWidth::Narrow; - } - } - else if (0xac00 <= wch && wch <= 0xd7a3) - { - /* Korean Hangul Syllables */ - return CodepointWidth::Wide; - } - else if (0xD7B0 <= wch && wch <= 0xD7FB) - { - // From Unicode 9.0, this range is narrow - // Hangul Jungseong and Hangul Jongseong - return CodepointWidth::Narrow; - } - // 0xD800-0xDFFF is reserved for UTF-16 surrogate pairs. - // 0xE000-0xF8FF is reserved for private use characters and is therefore always ambiguous. 
- else if (0xF900 <= wch && wch <= 0xFAFF) - { - // From Unicode 9.0, this range is wide - // CJK Compatibility Ideographs - // Includes Han Compatibility Ideographs - return CodepointWidth::Wide; - } - else if (0xFB00 <= wch && wch <= 0xFDFD) - { - // From Unicode 9.0, this range is narrow (assorted languages) - return CodepointWidth::Narrow; - } - else if (0xFE10 <= wch && wch <= 0xFE6B) - { - // This exception range has narrow combining ligatures - if (0xFE20 <= wch && wch <= 0xFE2F) - { - return CodepointWidth::Narrow; - } - else - { - // From Unicode 9.0, this range is wide - // Presentation forms - return CodepointWidth::Wide; - } - } - else if (0xFE70 <= wch && wch <= 0xFEFF) - { - // From Unicode 9.0, this range is narrow - return CodepointWidth::Narrow; - } - else if (0xff01 <= wch && wch <= 0xff5e) - { - /* Fullwidth ASCII variants */ - return CodepointWidth::Wide; - } - else if (0xff61 <= wch && wch <= 0xff9f) - { - /* Halfwidth Katakana variants */ - return CodepointWidth::Narrow; - } - else if ((0xffa0 <= wch && wch <= 0xffbe) || - (0xffc2 <= wch && wch <= 0xffc7) || - (0xffca <= wch && wch <= 0xffcf) || - (0xffd2 <= wch && wch <= 0xffd7) || - (0xffda <= wch && wch <= 0xffdc)) - { - /* Halfwidth Hangul variants */ - return CodepointWidth::Narrow; - } - else if (0xffe0 <= wch && wch <= 0xffe6) - { - /* Fullwidth symbol variants */ - return CodepointWidth::Wide; - } - // Currently we do not support codepoints above 0xffff - else - { - return CodepointWidth::Invalid; - } + return CodepointWidth::Invalid; } wchar_t Utf16ToUcs2(const std::wstring_view charData) diff --git a/src/types/unicode_width_overrides.xml b/src/types/unicode_width_overrides.xml new file mode 100644 index 000000000..142038869 --- /dev/null +++ b/src/types/unicode_width_overrides.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/tools/Generate-CodepointWidthsFromUCD.ps1 b/tools/Generate-CodepointWidthsFromUCD.ps1 index 788e53892..d1eb56ea6 100644 --- 
a/tools/Generate-CodepointWidthsFromUCD.ps1 +++ b/tools/Generate-CodepointWidthsFromUCD.ps1 @@ -133,6 +133,11 @@ Class UnicodeRange : System.IComparable { Return $false } + # Comments are different: do not merge + If ($this.Comment -ne $Other.Comment) { + Return $false + } + # Flags are different: do not merge If ($this.Flags -ne $Other.Flags) { Return $false @@ -261,6 +266,7 @@ If (-not $NoOverrides) { " // {0} (0x{0:X}) codepoints covered." -f $c If (-not $NoOverrides) { " // {0} (0x{0:X}) codepoints overridden." -f $overrideCount +" // Override path: {0}" -f $OverridePath } " static constexpr std::array s_wideAndAmbiguousTable{{" -f $ranges.Count ForEach($_ in $ranges) {