Fully regenerate CodepointWidthDetector from Unicode 13.0 (#8035)

This commit also adds an override UCD and migrates all of the overrides
from GetQuickCharWidth into it.

GetQuickCharWidth
-----------------

The removal of overrides from GQCW reduces the number of comparisons
required for looking up a single character's width from 41 (32
individual ranged comparisons from GQCW + 8+1 from the binary search in
CPWD) to 11 (2 from GQCW, 8+1 from CPWD).

GQCW also incorrectly marked 67 reserved codepoints as `Wide` when they
should have been `Narrow`.

The codepoints whose definitions have changed from `Wide` to `Narrow` are:

```
2E9A 2EF4 2EF5 2EF6 2EF7 2EF8 2EF9 2EFA 2EFB 2EFC 2EFD 2EFE 2EFF 2FD6
2FD7 2FD8 2FD9 2FDA 2FDB 2FDC 2FDD 2FDE 2FDF 2FE0 2FE1 2FE2 2FE3 2FE4
2FE5 2FE6 2FE7 2FE8 2FE9 2FEA 2FEB 2FEC 2FED 2FEE 2FEF 2FFC 2FFD 2FFE
2FFF 31E4 31E5 31E6 31E7 31E8 31E9 31EA 31EB 31EC 31ED 31EE 31EF 321F
A48D A48E A48F FE1A FE1B FE1C FE1D FE1E FE1F FE53 FE67
```

All of them are reserved, but those reserved regions are marked as narrow
in the UCD.

This change also offers us the chance to document exactly why we're
overriding a specific character range. Comments from the override
document will be copied to the generated CPWD table.

New in Unicode 13.0
------------------

Some widths have changed due to previously-reserved characters becoming
_used_ such as U+32FF SQUARE ERA NAME REIWA, the Tangut components
756-768, the entire Khitan Small Script character set, and the Tangut
Ideographs.

A number of the changes in this diff are due to better/worse comment
tracking and the removal of the Emoji/EPres comments. The script once
mistakenly applied comments to packed regions; it has been updated so
that it no longer does.

Validation
----------

I built a test application that compared codepoints 0-FFFF for GQCW
against their new registered widths.
This commit is contained in:
Dustin L. Howett 2020-10-27 10:36:28 -07:00 committed by GitHub
parent b603929214
commit 1df3182865
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 116 additions and 289 deletions

View File

@ -601,6 +601,7 @@ devops
Dext
df
DFactory
DFF
DFMT
dh
dhandler

View File

@ -19,39 +19,12 @@ namespace
return range.upperBound < searchTerm;
}
static constexpr std::array<UnicodeRange, 294> s_wideAndAmbiguousTable{
// generated from http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
// anything not present here is presumed to be Narrow.
//
// GH #900 - Supplemented with emoji codepoints from https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt
// Emojis in 0x2010 - 0x2B59 used to be marked as Ambiguous in GetQuickCharWidth() in order to
// force a font lookup, but since we default all Ambiguous width to Narrow, those emojis always
// came out looking squished/tiny. They've been moved into this table and marked as Wide.
//
// === UCD Definitions ===
// EA - EastAsianWidth
// Emoji - Emoji
// EPres - Emoji Presentation
// =======================
//
// This table has been partially regenerated from the Unicode Character Database as of 13.0, with
// the following rules:
// Codepoints whose EA is "W", "F" are Wide
// Codepoints whose EA is "A" are Ambiguous
// Codepoints where Emoji=Y and EPres=Y are Emoji, therefore Wide
// -
// Codepoints where Emoji=Y but EPres=*N* are only Emoji when followed
// by U+FE0F variation selector 16.
//
// There are a couple of codepoints that Microsoft specifically gave an emoji representation
// even if it's not specified as an emoji in the standard. I'll list the ones I'm aware of in this comment in case
// we decide to add them in the future:
// 0x261A-0x261C, 0x261E-0x261F
// 0x2661,
// 0x2662,
// 0x2664,
// 0x2666 0x2710,
// 0x270E 0x2765 0x1f000 - 0x1f02b except 0x1f004 0x1f594
// Generated by Generate-CodepointWidthsFromUCD.ps1 -Pack:True -Full:False -NoOverrides:False
// on 10/25/2020 7:32:04 AM (UTC) from Unicode 13.0.0.
// 321205 (0x4E6B5) codepoints covered.
// 240 (0xF0) codepoints overridden.
// Override path: .\src\types\unicode_width_overrides.xml
static constexpr std::array<UnicodeRange, 295> s_wideAndAmbiguousTable{
UnicodeRange{ 0xa1, 0xa1, CodepointWidth::Ambiguous },
UnicodeRange{ 0xa4, 0xa4, CodepointWidth::Ambiguous },
UnicodeRange{ 0xa7, 0xa8, CodepointWidth::Ambiguous },
@ -176,16 +149,14 @@ namespace
UnicodeRange{ 0x22a5, 0x22a5, CodepointWidth::Ambiguous },
UnicodeRange{ 0x22bf, 0x22bf, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2312, 0x2312, CodepointWidth::Ambiguous },
UnicodeRange{ 0x231a, 0x231b, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x231a, 0x231b, CodepointWidth::Wide },
UnicodeRange{ 0x2329, 0x232a, CodepointWidth::Wide },
UnicodeRange{ 0x23e9, 0x23ec, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x23f0, 0x23f0, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x23f3, 0x23f3, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x23e9, 0x23ec, CodepointWidth::Wide },
UnicodeRange{ 0x23f0, 0x23f0, CodepointWidth::Wide },
UnicodeRange{ 0x23f3, 0x23f3, CodepointWidth::Wide },
UnicodeRange{ 0x2460, 0x24e9, CodepointWidth::Ambiguous },
UnicodeRange{ 0x24eb, 0x254b, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2550, 0x2573, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2580, 0x258f, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2592, 0x2595, CodepointWidth::Ambiguous },
UnicodeRange{ 0x24eb, 0x24ff, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2500, 0x259f, CodepointWidth::Narrow }, // box-drawing and block elements require 1-cell alignment
UnicodeRange{ 0x25a0, 0x25a1, CodepointWidth::Ambiguous },
UnicodeRange{ 0x25a3, 0x25a9, CodepointWidth::Ambiguous },
UnicodeRange{ 0x25b2, 0x25b3, CodepointWidth::Ambiguous },
@ -197,61 +168,61 @@ namespace
UnicodeRange{ 0x25ce, 0x25d1, CodepointWidth::Ambiguous },
UnicodeRange{ 0x25e2, 0x25e5, CodepointWidth::Ambiguous },
UnicodeRange{ 0x25ef, 0x25ef, CodepointWidth::Ambiguous },
UnicodeRange{ 0x25fd, 0x25fe, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x25fd, 0x25fe, CodepointWidth::Wide },
UnicodeRange{ 0x2605, 0x2606, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2609, 0x2609, CodepointWidth::Ambiguous },
UnicodeRange{ 0x260e, 0x260f, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2614, 0x2615, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2614, 0x2615, CodepointWidth::Wide },
UnicodeRange{ 0x261c, 0x261c, CodepointWidth::Ambiguous },
UnicodeRange{ 0x261e, 0x261e, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2640, 0x2640, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2642, 0x2642, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2648, 0x2653, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2648, 0x2653, CodepointWidth::Wide },
UnicodeRange{ 0x2660, 0x2661, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2663, 0x2665, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2667, 0x266a, CodepointWidth::Ambiguous },
UnicodeRange{ 0x266c, 0x266d, CodepointWidth::Ambiguous },
UnicodeRange{ 0x266f, 0x266f, CodepointWidth::Ambiguous },
UnicodeRange{ 0x267f, 0x267f, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2693, 0x2693, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x267f, 0x267f, CodepointWidth::Wide },
UnicodeRange{ 0x2693, 0x2693, CodepointWidth::Wide },
UnicodeRange{ 0x269e, 0x269f, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26a1, 0x26a1, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26aa, 0x26ab, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26bd, 0x26be, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26a1, 0x26a1, CodepointWidth::Wide },
UnicodeRange{ 0x26aa, 0x26ab, CodepointWidth::Wide },
UnicodeRange{ 0x26bd, 0x26be, CodepointWidth::Wide },
UnicodeRange{ 0x26bf, 0x26bf, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26c4, 0x26c5, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26c4, 0x26c5, CodepointWidth::Wide },
UnicodeRange{ 0x26c6, 0x26cd, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26ce, 0x26ce, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26ce, 0x26ce, CodepointWidth::Wide },
UnicodeRange{ 0x26cf, 0x26d3, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26d4, 0x26d4, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26d4, 0x26d4, CodepointWidth::Wide },
UnicodeRange{ 0x26d5, 0x26e1, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26e3, 0x26e3, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26e8, 0x26e9, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26ea, 0x26ea, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26ea, 0x26ea, CodepointWidth::Wide },
UnicodeRange{ 0x26eb, 0x26f1, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26f2, 0x26f3, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26f2, 0x26f3, CodepointWidth::Wide },
UnicodeRange{ 0x26f4, 0x26f4, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26f5, 0x26f5, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26f5, 0x26f5, CodepointWidth::Wide },
UnicodeRange{ 0x26f6, 0x26f9, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26fa, 0x26fa, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26fa, 0x26fa, CodepointWidth::Wide },
UnicodeRange{ 0x26fb, 0x26fc, CodepointWidth::Ambiguous },
UnicodeRange{ 0x26fd, 0x26fd, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x26fd, 0x26fd, CodepointWidth::Wide },
UnicodeRange{ 0x26fe, 0x26ff, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2705, 0x2705, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x270a, 0x270b, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2728, 0x2728, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2705, 0x2705, CodepointWidth::Wide },
UnicodeRange{ 0x270a, 0x270b, CodepointWidth::Wide },
UnicodeRange{ 0x2728, 0x2728, CodepointWidth::Wide },
UnicodeRange{ 0x273d, 0x273d, CodepointWidth::Ambiguous },
UnicodeRange{ 0x274c, 0x274c, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x274e, 0x274e, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2753, 0x2755, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2757, 0x2757, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x274c, 0x274c, CodepointWidth::Wide },
UnicodeRange{ 0x274e, 0x274e, CodepointWidth::Wide },
UnicodeRange{ 0x2753, 0x2755, CodepointWidth::Wide },
UnicodeRange{ 0x2757, 0x2757, CodepointWidth::Wide },
UnicodeRange{ 0x2776, 0x277f, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2795, 0x2797, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x27b0, 0x27b0, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x27bf, 0x27bf, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2b1b, 0x2b1c, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2b50, 0x2b50, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2b55, 0x2b55, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x2795, 0x2797, CodepointWidth::Wide },
UnicodeRange{ 0x27b0, 0x27b0, CodepointWidth::Wide },
UnicodeRange{ 0x27bf, 0x27bf, CodepointWidth::Wide },
UnicodeRange{ 0x2b1b, 0x2b1c, CodepointWidth::Wide },
UnicodeRange{ 0x2b50, 0x2b50, CodepointWidth::Wide },
UnicodeRange{ 0x2b55, 0x2b55, CodepointWidth::Wide },
UnicodeRange{ 0x2b56, 0x2b59, CodepointWidth::Ambiguous },
UnicodeRange{ 0x2e80, 0x2e99, CodepointWidth::Wide },
UnicodeRange{ 0x2e9b, 0x2ef3, CodepointWidth::Wide },
@ -260,15 +231,14 @@ namespace
UnicodeRange{ 0x3000, 0x303e, CodepointWidth::Wide },
UnicodeRange{ 0x3041, 0x3096, CodepointWidth::Wide },
UnicodeRange{ 0x3099, 0x30ff, CodepointWidth::Wide },
UnicodeRange{ 0x3105, 0x312e, CodepointWidth::Wide },
UnicodeRange{ 0x3105, 0x312f, CodepointWidth::Wide },
UnicodeRange{ 0x3131, 0x318e, CodepointWidth::Wide },
UnicodeRange{ 0x3190, 0x31ba, CodepointWidth::Wide },
UnicodeRange{ 0x31c0, 0x31e3, CodepointWidth::Wide },
UnicodeRange{ 0x3190, 0x31e3, CodepointWidth::Wide },
UnicodeRange{ 0x31f0, 0x321e, CodepointWidth::Wide },
UnicodeRange{ 0x3220, 0x3247, CodepointWidth::Wide },
UnicodeRange{ 0x3248, 0x324f, CodepointWidth::Ambiguous },
UnicodeRange{ 0x3250, 0x32fe, CodepointWidth::Wide },
UnicodeRange{ 0x3300, 0x4dbf, CodepointWidth::Wide },
UnicodeRange{ 0x3250, 0x4dbf, CodepointWidth::Wide },
UnicodeRange{ 0x4dc0, 0x4dff, CodepointWidth::Narrow }, // hexagrams are historically narrow
UnicodeRange{ 0x4e00, 0xa48c, CodepointWidth::Wide },
UnicodeRange{ 0xa490, 0xa4c6, CodepointWidth::Wide },
UnicodeRange{ 0xa960, 0xa97c, CodepointWidth::Wide },
@ -277,75 +247,79 @@ namespace
UnicodeRange{ 0xf900, 0xfaff, CodepointWidth::Wide },
UnicodeRange{ 0xfe00, 0xfe0f, CodepointWidth::Ambiguous },
UnicodeRange{ 0xfe10, 0xfe19, CodepointWidth::Wide },
UnicodeRange{ 0xfe20, 0xfe2f, CodepointWidth::Narrow }, // narrow combining ligatures (split into left/right halves, which take 2 columns together)
UnicodeRange{ 0xfe30, 0xfe52, CodepointWidth::Wide },
UnicodeRange{ 0xfe54, 0xfe66, CodepointWidth::Wide },
UnicodeRange{ 0xfe68, 0xfe6b, CodepointWidth::Wide },
UnicodeRange{ 0xff01, 0xff60, CodepointWidth::Wide },
UnicodeRange{ 0xffe0, 0xffe6, CodepointWidth::Wide },
UnicodeRange{ 0xfffd, 0xfffd, CodepointWidth::Ambiguous },
UnicodeRange{ 0x16fe0, 0x16fe1, CodepointWidth::Wide },
UnicodeRange{ 0x17000, 0x187ec, CodepointWidth::Wide },
UnicodeRange{ 0x18800, 0x18af2, CodepointWidth::Wide },
UnicodeRange{ 0x16fe0, 0x16fe4, CodepointWidth::Wide },
UnicodeRange{ 0x16ff0, 0x16ff1, CodepointWidth::Wide },
UnicodeRange{ 0x17000, 0x187f7, CodepointWidth::Wide },
UnicodeRange{ 0x18800, 0x18cd5, CodepointWidth::Wide },
UnicodeRange{ 0x18d00, 0x18d08, CodepointWidth::Wide },
UnicodeRange{ 0x1b000, 0x1b11e, CodepointWidth::Wide },
UnicodeRange{ 0x1b150, 0x1b152, CodepointWidth::Wide },
UnicodeRange{ 0x1b164, 0x1b167, CodepointWidth::Wide },
UnicodeRange{ 0x1b170, 0x1b2fb, CodepointWidth::Wide },
UnicodeRange{ 0x1f004, 0x1f004, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f0cf, 0x1f0cf, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f004, 0x1f004, CodepointWidth::Wide },
UnicodeRange{ 0x1f0cf, 0x1f0cf, CodepointWidth::Wide },
UnicodeRange{ 0x1f100, 0x1f10a, CodepointWidth::Ambiguous },
UnicodeRange{ 0x1f110, 0x1f12d, CodepointWidth::Ambiguous },
UnicodeRange{ 0x1f130, 0x1f169, CodepointWidth::Ambiguous },
UnicodeRange{ 0x1f170, 0x1f18d, CodepointWidth::Ambiguous },
UnicodeRange{ 0x1f18e, 0x1f18e, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f18e, 0x1f18e, CodepointWidth::Wide },
UnicodeRange{ 0x1f18f, 0x1f190, CodepointWidth::Ambiguous },
UnicodeRange{ 0x1f191, 0x1f19a, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f191, 0x1f19a, CodepointWidth::Wide },
UnicodeRange{ 0x1f19b, 0x1f1ac, CodepointWidth::Ambiguous },
UnicodeRange{ 0x1f1e6, 0x1f1ff, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f200, 0x1f202, CodepointWidth::Wide },
UnicodeRange{ 0x1f1e6, 0x1f202, CodepointWidth::Wide },
UnicodeRange{ 0x1f210, 0x1f23b, CodepointWidth::Wide },
UnicodeRange{ 0x1f240, 0x1f248, CodepointWidth::Wide },
UnicodeRange{ 0x1f250, 0x1f251, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f250, 0x1f251, CodepointWidth::Wide },
UnicodeRange{ 0x1f260, 0x1f265, CodepointWidth::Wide },
UnicodeRange{ 0x1f300, 0x1f320, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f32d, 0x1f335, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f337, 0x1f37c, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f37e, 0x1f393, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f3a0, 0x1f3ca, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f3cf, 0x1f3d3, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f3e0, 0x1f3f0, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f3f4, 0x1f3f4, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f3f8, 0x1f43e, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f440, 0x1f440, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f442, 0x1f4fc, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f4ff, 0x1f53d, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f54b, 0x1f54e, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f550, 0x1f567, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f57a, 0x1f57a, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f595, 0x1f596, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f5a4, 0x1f5a4, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f5fb, 0x1f64f, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f680, 0x1f6c5, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f6cc, 0x1f6cc, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f6d0, 0x1f6d2, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f6d5, 0x1f6d7, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f6eb, 0x1f6ec, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f6f4, 0x1f6fc, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f7e0, 0x1f7eb, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f90c, 0x1f93a, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f93c, 0x1f945, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f947, 0x1f978, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f97a, 0x1f9cb, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f9cd, 0x1f9ff, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1fa70, 0x1fa74, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1fa78, 0x1fa7a, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1fa80, 0x1fa86, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1fa90, 0x1faa8, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1fab0, 0x1fab6, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1fac0, 0x1fac2, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1fad0, 0x1fad6, CodepointWidth::Wide }, // Emoji=Y EPres=Y
UnicodeRange{ 0x1f300, 0x1f320, CodepointWidth::Wide },
UnicodeRange{ 0x1f32d, 0x1f335, CodepointWidth::Wide },
UnicodeRange{ 0x1f337, 0x1f37c, CodepointWidth::Wide },
UnicodeRange{ 0x1f37e, 0x1f393, CodepointWidth::Wide },
UnicodeRange{ 0x1f3a0, 0x1f3ca, CodepointWidth::Wide },
UnicodeRange{ 0x1f3cf, 0x1f3d3, CodepointWidth::Wide },
UnicodeRange{ 0x1f3e0, 0x1f3f0, CodepointWidth::Wide },
UnicodeRange{ 0x1f3f4, 0x1f3f4, CodepointWidth::Wide },
UnicodeRange{ 0x1f3f8, 0x1f43e, CodepointWidth::Wide },
UnicodeRange{ 0x1f440, 0x1f440, CodepointWidth::Wide },
UnicodeRange{ 0x1f442, 0x1f4fc, CodepointWidth::Wide },
UnicodeRange{ 0x1f4ff, 0x1f53d, CodepointWidth::Wide },
UnicodeRange{ 0x1f54b, 0x1f54e, CodepointWidth::Wide },
UnicodeRange{ 0x1f550, 0x1f567, CodepointWidth::Wide },
UnicodeRange{ 0x1f57a, 0x1f57a, CodepointWidth::Wide },
UnicodeRange{ 0x1f595, 0x1f596, CodepointWidth::Wide },
UnicodeRange{ 0x1f5a4, 0x1f5a4, CodepointWidth::Wide },
UnicodeRange{ 0x1f5fb, 0x1f64f, CodepointWidth::Wide },
UnicodeRange{ 0x1f680, 0x1f6c5, CodepointWidth::Wide },
UnicodeRange{ 0x1f6cc, 0x1f6cc, CodepointWidth::Wide },
UnicodeRange{ 0x1f6d0, 0x1f6d2, CodepointWidth::Wide },
UnicodeRange{ 0x1f6d5, 0x1f6d7, CodepointWidth::Wide },
UnicodeRange{ 0x1f6eb, 0x1f6ec, CodepointWidth::Wide },
UnicodeRange{ 0x1f6f4, 0x1f6fc, CodepointWidth::Wide },
UnicodeRange{ 0x1f7e0, 0x1f7eb, CodepointWidth::Wide },
UnicodeRange{ 0x1f90c, 0x1f93a, CodepointWidth::Wide },
UnicodeRange{ 0x1f93c, 0x1f945, CodepointWidth::Wide },
UnicodeRange{ 0x1f947, 0x1f978, CodepointWidth::Wide },
UnicodeRange{ 0x1f97a, 0x1f9cb, CodepointWidth::Wide },
UnicodeRange{ 0x1f9cd, 0x1f9ff, CodepointWidth::Wide },
UnicodeRange{ 0x1fa70, 0x1fa74, CodepointWidth::Wide },
UnicodeRange{ 0x1fa78, 0x1fa7a, CodepointWidth::Wide },
UnicodeRange{ 0x1fa80, 0x1fa86, CodepointWidth::Wide },
UnicodeRange{ 0x1fa90, 0x1faa8, CodepointWidth::Wide },
UnicodeRange{ 0x1fab0, 0x1fab6, CodepointWidth::Wide },
UnicodeRange{ 0x1fac0, 0x1fac2, CodepointWidth::Wide },
UnicodeRange{ 0x1fad0, 0x1fad6, CodepointWidth::Wide },
UnicodeRange{ 0x20000, 0x2fffd, CodepointWidth::Wide },
UnicodeRange{ 0x30000, 0x3fffd, CodepointWidth::Wide },
UnicodeRange{ 0xe0100, 0xe01ef, CodepointWidth::Ambiguous },
UnicodeRange{ 0xf0000, 0xffffd, CodepointWidth::Ambiguous },
UnicodeRange{ 0x100000, 0x10fffd, CodepointWidth::Ambiguous }
UnicodeRange{ 0x100000, 0x10fffd, CodepointWidth::Ambiguous },
};
}

View File

@ -354,179 +354,16 @@ std::deque<std::unique_ptr<KeyEvent>> SynthesizeNumpadEvents(const wchar_t wch,
// May-01-2019 MiNiksa Forced lookup-via-renderer for retroactively recategorized emoji
// that used to be narrow but now might be wide. (approx x2194-x2b55, not inclusive)
// Also forced block characters segment (x2580-x259F) to narrow
// Oct-25-2020 DuHowett Replaced the entire table with a set of overrides that get built into
// CodepointWidthDetector (unicode_width_overrides.xml)
CodepointWidth GetQuickCharWidth(const wchar_t wch) noexcept
{
// 0x00-0x1F is ambiguous by font
if (0x20 <= wch && wch <= 0x7e)
{
/* ASCII */
return CodepointWidth::Narrow;
}
// 0x80 - 0x0451 varies from narrow to ambiguous by character and font (Unicode 9.0)
else if (0x0452 <= wch && wch <= 0x10FF)
{
// From Unicode 9.0, this range is narrow (assorted languages)
return CodepointWidth::Narrow;
}
else if (0x1100 <= wch && wch <= 0x115F)
{
// From Unicode 9.0, Hangul Choseong is wide
return CodepointWidth::Wide;
}
else if (0x1160 <= wch && wch <= 0x200F)
{
// From Unicode 9.0, this range is narrow (assorted languages)
return CodepointWidth::Narrow;
}
// 0x2500 - 0x257F is the box drawing character range -
// Technically, these are ambiguous width characters, but applications that
// use them generally assume that they're narrow to ensure proper alignment.
else if (0x2500 <= wch && wch <= 0x257F)
{
return CodepointWidth::Narrow;
}
// 0x2580 - 0x259F is the block element characters.
// Technically these are ambiguous width, but many many things assume they're narrow.
else if (0x2580 <= wch && wch <= 0x259F)
{
return CodepointWidth::Narrow;
}
else if (0x2B5A <= wch && wch <= 0x2E44)
{
// From Unicode 9.0, this range is narrow (assorted languages)
return CodepointWidth::Narrow;
}
else if (0x2E80 <= wch && wch <= 0x303e)
{
// From Unicode 9.0, this range is wide (assorted languages)
return CodepointWidth::Wide;
}
else if (0x3041 <= wch && wch <= 0x3094)
{
/* Hiragana */
return CodepointWidth::Wide;
}
else if (0x30a1 <= wch && wch <= 0x30f6)
{
/* Katakana */
return CodepointWidth::Wide;
}
else if (0x3105 <= wch && wch <= 0x312c)
{
/* Bopomofo */
return CodepointWidth::Wide;
}
else if (0x3131 <= wch && wch <= 0x318e)
{
/* Hangul Elements */
return CodepointWidth::Wide;
}
else if (0x3190 <= wch && wch <= 0x3247)
{
// From Unicode 9.0, this range is wide
return CodepointWidth::Wide;
}
else if (0x3251 <= wch && wch <= 0xA4C6)
{
// This exception range is narrow width hexagrams.
if (0x4DC0 <= wch && wch <= 0x4DFF)
{
return CodepointWidth::Narrow;
}
else
{
// From Unicode 9.0, this range is wide
// CJK Unified Ideograph and Yi and Reserved.
// Includes Han Ideographic range.
return CodepointWidth::Wide;
}
}
else if (0xA4D0 <= wch && wch <= 0xABF9)
{
// This exception range is wide Hangul Choseong
if (0xA960 <= wch && wch <= 0xA97C)
{
return CodepointWidth::Wide;
}
else
{
// From Unicode 9.0, this range is narrow (assorted languages)
return CodepointWidth::Narrow;
}
}
else if (0xac00 <= wch && wch <= 0xd7a3)
{
/* Korean Hangul Syllables */
return CodepointWidth::Wide;
}
else if (0xD7B0 <= wch && wch <= 0xD7FB)
{
// From Unicode 9.0, this range is narrow
// Hangul Jungseong and Hangul Jongseong
return CodepointWidth::Narrow;
}
// 0xD800-0xDFFF is reserved for UTF-16 surrogate pairs.
// 0xE000-0xF8FF is reserved for private use characters and is therefore always ambiguous.
else if (0xF900 <= wch && wch <= 0xFAFF)
{
// From Unicode 9.0, this range is wide
// CJK Compatibility Ideographs
// Includes Han Compatibility Ideographs
return CodepointWidth::Wide;
}
else if (0xFB00 <= wch && wch <= 0xFDFD)
{
// From Unicode 9.0, this range is narrow (assorted languages)
return CodepointWidth::Narrow;
}
else if (0xFE10 <= wch && wch <= 0xFE6B)
{
// This exception range has narrow combining ligatures
if (0xFE20 <= wch && wch <= 0xFE2F)
{
return CodepointWidth::Narrow;
}
else
{
// From Unicode 9.0, this range is wide
// Presentation forms
return CodepointWidth::Wide;
}
}
else if (0xFE70 <= wch && wch <= 0xFEFF)
{
// From Unicode 9.0, this range is narrow
return CodepointWidth::Narrow;
}
else if (0xff01 <= wch && wch <= 0xff5e)
{
/* Fullwidth ASCII variants */
return CodepointWidth::Wide;
}
else if (0xff61 <= wch && wch <= 0xff9f)
{
/* Halfwidth Katakana variants */
return CodepointWidth::Narrow;
}
else if ((0xffa0 <= wch && wch <= 0xffbe) ||
(0xffc2 <= wch && wch <= 0xffc7) ||
(0xffca <= wch && wch <= 0xffcf) ||
(0xffd2 <= wch && wch <= 0xffd7) ||
(0xffda <= wch && wch <= 0xffdc))
{
/* Halfwidth Hangul variants */
return CodepointWidth::Narrow;
}
else if (0xffe0 <= wch && wch <= 0xffe6)
{
/* Fullwidth symbol variants */
return CodepointWidth::Wide;
}
// Currently we do not support codepoints above 0xffff
else
{
return CodepointWidth::Invalid;
}
return CodepointWidth::Invalid;
}
wchar_t Utf16ToUcs2(const std::wstring_view charData)

View File

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<!--
  Width overrides layered on top of the Unicode Character Database when
  Generate-CodepointWidthsFromUCD.ps1 regenerates s_wideAndAmbiguousTable.

  Each <override> covers the inclusive hexadecimal codepoint range
  first-cp..last-cp. Its "comment" text is copied next to the matching
  entry in the generated table, so use it to record why the range is
  being overridden.

  The ea="H" ranges below appear in the generated table as
  CodepointWidth::Narrow.
-->
<ucd xmlns="http://www.unicode.org/ns/2003/ucd/1.0">
<repertoire>
<override first-cp="2500" last-cp="259F" ea="H" comment="box-drawing and block elements require 1-cell alignment" />
<override first-cp="4DC0" last-cp="4DFF" ea="H" comment="hexagrams are historically narrow" />
<override first-cp="FE20" last-cp="FE2F" ea="H" comment="narrow combining ligatures (split into left/right halves, which take 2 columns together)" />
</repertoire>
</ucd>

View File

@ -133,6 +133,11 @@ Class UnicodeRange : System.IComparable {
Return $false
}
# Comments are different: do not merge
If ($this.Comment -ne $Other.Comment) {
Return $false
}
# Flags are different: do not merge
If ($this.Flags -ne $Other.Flags) {
Return $false
@ -261,6 +266,7 @@ If (-not $NoOverrides) {
" // {0} (0x{0:X}) codepoints covered." -f $c
If (-not $NoOverrides) {
" // {0} (0x{0:X}) codepoints overridden." -f $overrideCount
" // Override path: {0}" -f $OverridePath
}
" static constexpr std::array<UnicodeRange, {0}> s_wideAndAmbiguousTable{{" -f $ranges.Count
ForEach($_ in $ranges) {