terminal/tools/Generate-CodepointWidthsFromUCD.ps1

281 lines
9.2 KiB
PowerShell
Raw Permalink Normal View History

tools: add a powershell script to generate CPWD from the UCD (#5946) This commit introduces Generate-CodepointWidthsFromUCD, a powershell (7+) script that will parse a UCD XML database in the UAX 42 format from https://www.unicode.org/Public/UCD/latest/ucdxml/ and generate CodepointWidthDetector's giant width array. By default, it will emit one UnicodeRange for every range of non-narrow glyphs with a different Width + Emoji + Emoji Presentation class; however, it can be run in "packing" and "full" mode. * Packing mode: ignore the width/emoji/pres class and combine adjacent runs that CPWD will treat the same. * This is for optimizing the number of individual ranges emitted into code. * Full mode: include narrow codepoints (helpful for visualization) It also supports overrides, provided in an XML document of the same format as the UCD itself. Entries in the overrides files are applied after the entire UCD is read and will replace any impacted ranges. The output (when packing) looks like this: ```c++ // Generated by Generate-CodepointWidthsFromUCD -Pack:True -Full:False // on 05/17/2020 02:47:55 (UTC) from Unicode 13.0.0. // 66182 (0x10286) codepoints covered. static constexpr std::array<UnicodeRange, 23> s_wideAndAmbiguousTable{ UnicodeRange{ 0xa1, 0xa1, CodepointWidth::Ambiguous }, UnicodeRange{ 0xa4, 0xa4, CodepointWidth::Ambiguous }, UnicodeRange{ 0xa7, 0xa8, CodepointWidth::Ambiguous }, . . . UnicodeRange{ 0x1f210, 0x1f23b, CodepointWidth::Wide }, UnicodeRange{ 0x1f37e, 0x1f393, CodepointWidth::Wide }, UnicodeRange{ 0x100000, 0x10fffd, CodepointWidth::Ambiguous }, }; ``` The output (when overriding) looks like this: ```c++ // Generated by Generate-CodepointWidthsFromUCD.ps1 -Pack:True -Full:False -NoOverrides:False // on 5/22/2020 11:17:39 PM (UTC) from Unicode 13.0.0. // 321205 (0x4E6B5) codepoints covered. // 240 (0xF0) codepoints overridden. 
static constexpr std::array<UnicodeRange, 23> s_wideAndAmbiguousTable{ UnicodeRange{ 0xa1, 0xa1, CodepointWidth::Ambiguous }, ... UnicodeRange{ 0xfe20, 0xfe2f, CodepointWidth::Narrow }, // narrow combining ligatures (split into left/right halves, which take 2 columns together) ... UnicodeRange{ 0x100000, 0x10fffd, CodepointWidth::Ambiguous }, }; ```
2020-06-03 09:16:14 +02:00
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#Requires -Version 7
# (we use the null coalescing operator)
################################################################################
# This script generates an array suitable for replacing the body of
# src/types/CodepointWidthDetector.cpp from a Unicode UCD XML document[1]
# compliant with UAX#42[2].
#
# This script supports a quasi-mandatory "overrides" file, overrides.xml.
# If you do not have overrides, supply the -NoOverrides parameter. This was
# developed for use with the CodepointWidthDetector, which has some override
# ranges.
#
# This script was developed against the flat "no han unification" UCD
# "ucd.nounihan.flat.xml".
# It does not support the grouped database format, even though that format is
# significantly smaller and would provide a performance win on the admittedly
# extremely rare occasion that we should need to regenerate our table.
#
# Invoke as ./Generate-xxx ucd.nounihan.flat.xml -Pack | Out-File -Encoding
# UTF-8 Temporary.cpp
#
# [1]: https://www.unicode.org/Public/UCD/latest/ucdxml/
# [2]: https://www.unicode.org/reports/tr42/
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSAvoidUsingPositionalParameters', '')]
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseProcessBlockForPipelineCommand', '')]
[CmdletBinding()]
Param(
    # "Parsed" set: an already-loaded UCD document (first positional argument or pipeline input).
    [Parameter(ParameterSetName = "Parsed", Position = 0, ValueFromPipeline = $true)]
    [System.Xml.XmlDocument]
    $InputObject,

    # "Parsed" set: an already-loaded overrides document (second positional argument or pipeline input).
    [Parameter(ParameterSetName = "Parsed", Position = 1, ValueFromPipeline = $true)]
    [System.Xml.XmlDocument]
    $OverrideObject,

    # "Unparsed" set: path of the UCD XML file to load.
    [Parameter(ParameterSetName = "Unparsed", Position = 0, ValueFromPipelineByPropertyName = $true)]
    [string]
    $Path = "ucd.nounihan.flat.xml",

    # "Unparsed" set: path of the overrides XML file to load.
    [Parameter(ParameterSetName = "Unparsed", Position = 1, ValueFromPipelineByPropertyName = $true)]
    [string]
    $OverridePath = "overrides.xml",

    # Pack tightly based on width
    [switch]
    $Pack,

    # Do not include overrides
    [switch]
    $NoOverrides,

    # Include Narrow codepoints
    [switch]
    $Full = $false
)
# Width classes a codepoint can be assigned in the generated table.
# The numeric values are spelled out; they match the implicit 0..3 ordering.
Enum CodepointWidth {
    Narrow    = 0
    Wide      = 1
    Ambiguous = 2
    Invalid   = 3
}
# UCD Functions {{{
# Returns the inclusive codepoint range described by a UCD repertoire entry.
#
# $entry: an XML element carrying either a "cp" attribute (single codepoint)
#         or "first-cp"/"last-cp" attributes (range), all hexadecimal strings
#         without a "0x" prefix.
#
# Emits two integers to the pipeline: the start and then the end of the range.
# An entry with neither attribute yields 0 and 0.
Function Get-UCDEntryRange($entry) {
    $s = $e = 0
    # Read from the $entry parameter. The previous body read $v, which only
    # resolved through dynamic scoping when the caller's loop variable happened
    # to be named $v.
    If ($null -ne $entry.cp) {
        # Individual Codepoint
        $s = $e = [int]("0x" + $entry.cp)
    } ElseIf ($null -ne $entry."first-cp") {
        # Range of Codepoints
        $s = [int]("0x" + $entry."first-cp")
        $e = [int]("0x" + $entry."last-cp")
    }
    $s
    $e
}
# Maps a UCD entry to the CodepointWidth it should be emitted with.
# Entries that are Emoji with Emoji Presentation are always Wide; everything
# else is classified by its East Asian Width ("ea") attribute. An unrecognized
# or missing "ea" value yields Invalid.
Function Get-UCDEntryWidth($entry) {
    $hasEmojiPresentation = ($entry.Emoji -eq "Y") -and ($entry.EPres -eq "Y")
    If ($hasEmojiPresentation) {
        Return [CodepointWidth]::Wide
    }

    $eastAsianWidth = $entry.ea
    If ($eastAsianWidth -in "N", "Na", "H") {
        # Neutral, Narrow and Halfwidth
        Return [CodepointWidth]::Narrow
    }
    If ($eastAsianWidth -in "W", "F") {
        # Wide and Fullwidth
        Return [CodepointWidth]::Wide
    }
    If ($eastAsianWidth -eq "A") {
        Return [CodepointWidth]::Ambiguous
    }

    Return [CodepointWidth]::Invalid
}
# Produces the key used to decide whether two adjacent entries may be merged.
# When packing, the key is just the computed width. Otherwise it is the East
# Asian Width (with "F" folded into "W") followed by the Emoji and EPres
# attribute values.
Function Get-UCDEntryFlags($entry) {
    If (-not $script:Pack) {
        $eaClass = $entry.ea
        If ($eaClass -eq "F") {
            $eaClass = "W"
        }
        Return "{0}{1}{2}" -f $eaClass, $entry.Emoji, $entry.EPres
    }

    # If we're "pack"ing entries, only the computed width matters for telling them apart
    Get-UCDEntryWidth $entry
}
# }}}
# An inclusive run of codepoints [Start, End] that share a width, a merge key
# (Flags) and an optional comment that is copied into the generated table.
Class UnicodeRange : System.IComparable {
    [int]$Start
    [int]$End
    [CodepointWidth]$Width
    [string]$Flags
    [string]$Comment

    # Creates an empty range; used by Clone().
    UnicodeRange() { }

    # Builds a range from a UCD (or overrides) repertoire entry.
    UnicodeRange([System.Xml.XmlElement]$ucdEntry) {
        $this.Start, $this.End = Get-UCDEntryRange $ucdEntry
        $this.Width = Get-UCDEntryWidth $ucdEntry
        $this.Flags = Get-UCDEntryFlags $ucdEntry
        If (-not $script:Pack -and $ucdEntry.Emoji -eq "Y" -and $ucdEntry.EPres -eq "Y") {
            $this.Comment = "Emoji=Y EPres=Y"
        }
        # An explicit comment attribute on the entry wins over the generated one
        If ($null -ne $ucdEntry.comment) {
            $this.Comment = $ucdEntry.comment
        }
    }

    # Returns an independent copy of this range.
    [UnicodeRange] Clone() {
        $copy = [UnicodeRange]::new()
        $copy.Start = $this.Start
        $copy.End = $this.End
        $copy.Width = $this.Width
        # Assigning $null to a [string] property stores "" instead, which would
        # make a comment-less range look like it has an (empty) comment.
        If ($null -ne $this.Flags) {
            $copy.Flags = $this.Flags
        }
        If ($null -ne $this.Comment) {
            $copy.Comment = $this.Comment
        }
        Return $copy
    }

    # Orders ranges by Start. Also accepts a bare [int] codepoint so that
    # List.BinarySearch can look up a codepoint directly.
    [int] CompareTo([object]$Other) {
        If ($Other -is [int]) {
            Return $this.Start - $Other
        }
        Return $this.Start - $Other.Start
    }

    # Tries to absorb $Other (expected to follow this range) into this range.
    # Returns $true when this range was extended, $false when $Other must be
    # kept as a separate range.
    [bool] Merge([UnicodeRange]$Other) {
        # A gap of one or more codepoints separates them: do not merge
        If (($Other.Start - $this.End) -gt 1) {
            Return $false
        }
        # Comments are different: do not merge
        If ($this.Comment -ne $Other.Comment) {
            Return $false
        }
        # Flags are different: do not merge
        If ($this.Flags -ne $Other.Flags) {
            Return $false
        }
        # Never let a merge shrink the range
        $this.End = [Math]::Max($this.End, $Other.End)
        Return $true
    }

    # Number of codepoints covered (both ends inclusive).
    [int] Length() {
        Return $this.End - $this.Start + 1
    }
}

# A list of UnicodeRange objects kept sorted by Start, with support for
# overwriting an arbitrary span with a new range.
Class UnicodeRangeList : System.Collections.Generic.List[Object] {
    UnicodeRangeList([int]$Capacity) : base($Capacity) { }

    # Index of the range whose Start equals $codepoint, or otherwise the index
    # of the first range whose Start is greater (Count when there is none).
    [int] hidden _FindInsertionPoint([int]$codepoint) {
        $l = $this.BinarySearch($codepoint)
        If ($l -lt 0) {
            # Return value <0: value was not found, return value is the bitwise
            # complement of the index of the first >= value
            Return -bNOT $l
        }
        Return $l
    }

    # Inserts $newRange, trimming or removing every existing range it overlaps.
    # Existing ranges that are only partially covered keep their uncovered
    # head and/or tail.
    [void] ReplaceUnicodeRange([UnicodeRange]$newRange) {
        $pieces = [System.Collections.Generic.List[Object]]::New(3)
        $i = $this._FindInsertionPoint($newRange.Start)
        $removeFrom = $i

        # Left overlap can only ever be one range: the one immediately before
        # the insertion point, whose Start is < ours.
        If ($i -gt 0 -and $this[$i - 1].End -ge $newRange.Start) {
            $left = $this[$i - 1]
            $removeFrom = $i - 1

            # The new range may sit strictly inside $left; keep what follows it
            # instead of silently dropping those codepoints.
            $tail = $null
            If ($left.End -gt $newRange.End) {
                $tail = $left.Clone()
                $tail.Start = $newRange.End + 1
            }

            $left.End = $newRange.Start - 1
            If ($left.End -ge $left.Start) {
                $pieces.Add($left)
            }
            $pieces.Add($newRange)
            If ($null -ne $tail) {
                $pieces.Add($tail)
            }
        } Else {
            $pieces.Add($newRange)
        }

        # Right overlap can span any number of ranges. $removeTo ends up one
        # past the last range whose Start lies inside the new range. Checking
        # against Count first keeps this safe for an empty list and for a new
        # range that starts after every existing one.
        $removeTo = $i
        While ($removeTo -lt $this.Count -and $this[$removeTo].Start -le $newRange.End) {
            $removeTo++
        }
        If ($removeTo -gt $i) {
            # Only the last damaged range can extend past the new range
            $last = $this[$removeTo - 1]
            If ($last.End -gt $newRange.End) {
                $last.Start = $newRange.End + 1
                $pieces.Add($last)
            }
        }

        $this.RemoveRange($removeFrom, $removeTo - $removeFrom)
        $this.InsertRange($removeFrom, $pieces)
    }
}
# Ingest UCD
If ($null -eq $InputObject) {
    # -Raw hands the [xml] cast a single string rather than one string object
    # per line, which is much cheaper for a large document.
    $InputObject = [xml](Get-Content $Path -Raw)
}
$UCDRepertoire = $InputObject.ucd.repertoire.ChildNodes | Sort-Object {
    # Sort by either cp or first-cp (for ranges)
    If ($null -ne $_.cp) {
        [int]("0x" + $_.cp)
    } ElseIf ($null -ne $_."first-cp") {
        [int]("0x" + $_."first-cp")
    }
}
If (-not $Full) {
    $UCDRepertoire = $UCDRepertoire | Where-Object {
        # Select everything Wide/Ambiguous/Full OR Emoji w/ Emoji Presentation
        ($_.ea -notin "N", "Na", "H") -or ($_.Emoji -eq "Y" -and $_.EPres -eq "Y")
    }
}
# Fold the sorted entries into ranges, extending the most recent range
# whenever Merge() accepts the next entry.
$ranges = [UnicodeRangeList]::New(1024)
$c = 0 # total number of codepoints seen
ForEach($v in $UCDRepertoire) {
    $range = [UnicodeRange]::new($v)
    $c += $range.Length()

    $previous = $ranges.Count -gt 0 ? $ranges[$ranges.Count - 1] : $null
    If ($null -eq $previous -or -not $previous.Merge($range)) {
        # Could not be merged into the last entry: start a new one
        $ranges.Add([object]$range)
    }
}
# Apply overrides on top of the ranges built from the UCD
If (-not $NoOverrides) {
    $OverrideObject ??= [xml](Get-Content $OverridePath)
    $OverrideRepertoire = $OverrideObject.ucd.repertoire.ChildNodes
    $overrideCount = 0 # total number of codepoints replaced by overrides
    ForEach($v in $OverrideRepertoire) {
        $range = [UnicodeRange]::new($v)
        $overrideCount += $range.Length()
        If ($null -eq $range.Comment) {
            # Every override is annotated in the output, even without a comment of its own
            $range.Comment = "overridden without comment"
        }
        $ranges.ReplaceUnicodeRange($range)
    }
}
# Emit Code
# Each string below is written to the output stream, so the generated C++
# table can be redirected to a file by the caller.
" // Generated by {0} -Pack:{1} -Full:{2} -NoOverrides:{3}" -f $MyInvocation.MyCommand.Name, $Pack, $Full, $NoOverrides
# [DateTime]::UtcNow is used instead of Get-Date -AsUTC, which does not exist
# on PowerShell 7.0 even though this script only requires version 7.
" // on {0} (UTC) from {1}." -f [DateTime]::UtcNow, $InputObject.ucd.description
" // {0} (0x{0:X}) codepoints covered." -f $c
If (-not $NoOverrides) {
    " // {0} (0x{0:X}) codepoints overridden." -f $overrideCount
    " // Override path: {0}" -f $OverridePath
}
" static constexpr std::array<UnicodeRange, {0}> s_wideAndAmbiguousTable{{" -f $ranges.Count
ForEach($emitted in $ranges) {
    $comment = ""
    If ($null -ne $emitted.Comment) {
        # Only ranges that carry a comment get a trailing // annotation
        $comment = " // {0}" -f $emitted.Comment
    }
    " UnicodeRange{{ 0x{0:x}, 0x{1:x}, CodepointWidth::{2} }},{3}" -f $emitted.Start, $emitted.End, $emitted.Width, $comment
}
" };"