diff --git a/.github/actions/spell-check/dictionary/apis.txt b/.github/actions/spell-check/dictionary/apis.txt
index 1bf739dcf..30bfef9ce 100644
--- a/.github/actions/spell-check/dictionary/apis.txt
+++ b/.github/actions/spell-check/dictionary/apis.txt
@@ -8,8 +8,9 @@ EXPCMDSTATE
 href
 IBox
 IBind
-ICustom
 IClass
+IComparable
+ICustom
 IExplorer
 IMap
 IObject
diff --git a/.github/actions/spell-check/expect/expect.txt b/.github/actions/spell-check/expect/expect.txt
index 8334d6e82..2550c0ca5 100644
--- a/.github/actions/spell-check/expect/expect.txt
+++ b/.github/actions/spell-check/expect/expect.txt
@@ -1511,11 +1511,13 @@ NOTHOUSANDS
 nothrow
 NOTICKS
 NOTIMPL
+notin
 NOTNULL
 NOTRACK
 NOTSUPPORTED
 notypeopt
 nouicompat
+nounihan
 NOUPDATE
 NOWAIT
 NOYIELD
@@ -2376,9 +2378,11 @@ typeparam
 TYUI
 uap
 uapadmin
+UAX
 ubuntu
 ucd
 UCD
+ucdxml
 uch
 UCHAR
 ucs
diff --git a/tools/Generate-CodepointWidthsFromUCD.ps1 b/tools/Generate-CodepointWidthsFromUCD.ps1
new file mode 100644
index 000000000..788e53892
--- /dev/null
+++ b/tools/Generate-CodepointWidthsFromUCD.ps1
@@ -0,0 +1,307 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+#Requires -Version 7
+# (we use the null coalescing operator)
+
+################################################################################
+# This script generates an array suitable for replacing the body of
+# src/types/CodepointWidthDetector.cpp from a Unicode UCD XML document[1]
+# compliant with UAX#42[2].
+#
+# This script supports a quasi-mandatory "overrides" file, overrides.xml.
+# If you do not have overrides, supply the -NoOverrides parameter. This was
+# developed for use with the CodepointWidthDetector, which has some override
+# ranges.
+#
+# This script was developed against the flat "no han unification" UCD
+# "ucd.nounihan.flat.xml".
+# It does not support the grouped database format, even though that format is
+# significantly smaller, which would provide a performance win on the admittedly
+# extremely rare occasion that we should need to regenerate our table.
+#
+# Invoke as ./Generate-xxx ucd.nounihan.flat.xml -Pack | Out-File -Encoding
+# UTF-8 Temporary.cpp
+#
+# [1]: https://www.unicode.org/Public/UCD/latest/ucdxml/
+# [2]: https://www.unicode.org/reports/tr42/
+
+[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSAvoidUsingPositionalParameters', '')]
+[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseProcessBlockForPipelineCommand', '')]
+[CmdletBinding()]
+Param(
+    [Parameter(Position=0, ValueFromPipeline=$true, ParameterSetName="Parsed")]
+    [System.Xml.XmlDocument]$InputObject,
+
+    [Parameter(Position=1, ValueFromPipeline=$true, ParameterSetName="Parsed")]
+    [System.Xml.XmlDocument]$OverrideObject,
+
+    [Parameter(Position=0, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
+    [string]$Path = "ucd.nounihan.flat.xml",
+
+    [Parameter(Position=1, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
+    [string]$OverridePath = "overrides.xml",
+
+    [switch]$Pack, # Pack tightly based on width
+    [switch]$NoOverrides, # Do not include overrides
+    [switch]$Full = $False # Include Narrow codepoints
+)
+
+# Width classes written into the generated table as CodepointWidth::<name>.
+Enum CodepointWidth {
+    Narrow;
+    Wide;
+    Ambiguous;
+    Invalid;
+}
+
+# UCD Functions {{{
+
+# Emits two ints: the first and last codepoint covered by a UCD entry.
+# An entry has either a "cp" attribute (single codepoint) or a
+# "first-cp"/"last-cp" pair (range); an entry with neither yields 0, 0.
+Function Get-UCDEntryRange($entry) {
+    $s = $e = 0
+    # Read the attributes from the $entry parameter. This used to read $v,
+    # which only worked because PowerShell's dynamic scoping exposed the
+    # callers' ForEach variable of the same name.
+    if ($null -ne $entry.cp) {
+        # Individual Codepoint
+        $s = $e = [int]("0x"+$entry.cp)
+    } ElseIf ($null -ne $entry."first-cp") {
+        # Range of Codepoints
+        $s = [int]("0x"+$entry."first-cp")
+        $e = [int]("0x"+$entry."last-cp")
+    }
+    $s
+    $e
+}
+
+# Maps a UCD entry to a CodepointWidth. Emoji with emoji presentation are
+# always Wide; otherwise the East Asian Width ("ea") attribute decides, and
+# an unrecognized value yields Invalid.
+Function Get-UCDEntryWidth($entry) {
+    If ($entry.Emoji -eq "Y" -and $entry.EPres -eq "Y") {
+        [CodepointWidth]::Wide
+        Return
+    }
+
+    Switch($entry.ea) {
+        "N" { [CodepointWidth]::Narrow; Return }
+        "Na" { [CodepointWidth]::Narrow; Return }
+        "H" { [CodepointWidth]::Narrow; Return }
+        "W" { [CodepointWidth]::Wide; Return }
+        "F" { [CodepointWidth]::Wide; Return }
+        "A" { [CodepointWidth]::Ambiguous; Return }
+    }
+    [CodepointWidth]::Invalid
+}
+
+# Produces the key used to decide whether two adjacent ranges may be merged.
+Function Get-UCDEntryFlags($entry) {
+    If ($script:Pack) {
+        # If we're "pack"ing entries, only the computed width matters for telling them apart
+        Get-UCDEntryWidth $entry
+        Return
+    }
+
+    # Treat Fullwidth ("F") as Wide ("W") so that the two merge with each other
+    $normalizedEAWidth = $entry.ea
+    $normalizedEAWidth = $normalizedEAWidth -eq "F" ? "W" : $normalizedEAWidth;
+    "{0}{1}{2}" -f $normalizedEAWidth, $entry.Emoji, $entry.EPres
+}
+# }}}
+
+# An inclusive codepoint range [Start, End] with its width and merge key.
+# Ordered by Start; CompareTo also accepts a bare [int] codepoint so that a
+# list of ranges can be binary-searched by codepoint.
+Class UnicodeRange : System.IComparable {
+    [int]$Start
+    [int]$End
+    [CodepointWidth]$Width
+    [string]$Flags
+    [string]$Comment
+
+    UnicodeRange([System.Xml.XmlElement]$ucdEntry) {
+        $this.Start, $this.End = Get-UCDEntryRange $ucdEntry
+        $this.Width = Get-UCDEntryWidth $ucdEntry
+        $this.Flags = Get-UCDEntryFlags $ucdEntry
+
+        If (-not $script:Pack -and $ucdEntry.Emoji -eq "Y" -and $ucdEntry.EPres -eq "Y") {
+            $this.Comment = "Emoji=Y EPres=Y"
+        }
+
+        # An explicit comment attribute on the entry wins over the generated one
+        If ($null -ne $ucdEntry.comment) {
+            $this.Comment = $ucdEntry.comment
+        }
+    }
+
+    [int] CompareTo([object]$Other) {
+        If ($Other -is [int]) {
+            Return $this.Start - $Other
+        }
+        Return $this.Start - $Other.Start
+    }
+
+    # Extends this range to cover $Other when $Other starts no more than one
+    # codepoint past our End and carries the same Flags. Returns whether the
+    # merge happened.
+    [bool] Merge([UnicodeRange]$Other) {
+        # If there's more than one codepoint between them, don't merge
+        If (($Other.Start - $this.End) -gt 1) {
+            Return $false
+        }
+
+        # Flags are different: do not merge
+        If ($this.Flags -ne $Other.Flags) {
+            Return $false
+        }
+
+        $this.End = $Other.End
+        Return $true
+    }
+
+    # Number of codepoints in the range (End is inclusive)
+    [int] Length() {
+        return $this.End - $this.Start + 1
+    }
+}
+
+# A list of UnicodeRange objects that is binary-searched by Start.
+Class UnicodeRangeList : System.Collections.Generic.List[Object] {
+    UnicodeRangeList([int]$Capacity) : base($Capacity) { }
+
+    [int] hidden _FindInsertionPoint([int]$codepoint) {
+        $l = $this.BinarySearch($codepoint)
+        If ($l -lt 0) {
+            # Return value <0: value was not found, return value is bitwise complement the index of the first >= value
+            Return -bNOT $l
+        }
+        Return $l
+    }
+
+    # Inserts $newRange, truncating or removing any existing ranges that it
+    # overlaps.
+    # NOTE(review): when the preceding range extends past $newRange.End, the
+    # part of it after $newRange.End is dropped rather than re-added - confirm
+    # that overrides never land strictly inside an existing range.
+    ReplaceUnicodeRange([UnicodeRange]$newRange) {
+        $subset = [System.Collections.Generic.List[Object]]::New(3)
+        $subset.Add($newRange)
+
+        $i = $this._FindInsertionPoint($newRange.Start)
+
+        # Left overlap can only ever be one (_FindInsertionPoint always returns the
+        # index immediately after the range whose Start is <= than ours).
+        $prev = $null
+        If($i -gt 0 -and $this[$i - 1].End -ge $newRange.Start) {
+            $prev = $i - 1
+        }
+
+        # Right overlap can be Infinite (because we didn't account for End)
+        # Find extent of right overlap. When the insertion point is past the
+        # last element there is nothing to the right, and indexing $this[$i]
+        # would be out of range, so leave $next as $null in that case.
+        $next = $null
+        If ($i -lt $this.Count) {
+            For($next = $i; ($next -lt $this.Count - 1) -and ($this[$next+1].Start -le $newRange.End); $next++) { }
+            If ($this[$next].Start -gt $newRange.End) {
+                # It turns out we didn't damage the following range; clear it
+                $next = $null
+            }
+        }
+
+        If ($null -ne $next) {
+            # Replace damaged elements after I with a truncated range
+            $last = $this[$next]
+            $this.RemoveRange($i, $next - $i + 1) # Remove damaged elements after I
+            $last.Start = $newRange.End + 1
+            If ($last.Start -le $last.End) {
+                $subset.Add($last)
+            }
+        }
+
+        If ($null -ne $prev) {
+            # Replace damaged elements before I with a truncated range
+            $first = $this[$prev]
+            $this.RemoveRange($prev, $i - $prev) # Remove damaged elements (b/c we may not need to re-add them!)
+            $first.End = $newRange.Start - 1
+            If ($first.End -ge $first.Start) {
+                $subset.Insert(0, $first)
+            }
+            $i = $prev # Update the insertion cursor
+        }
+
+        $this.InsertRange($i, $subset)
+    }
+}
+
+# Ingest UCD
+If ($null -eq $InputObject) {
+    $InputObject = [xml](Get-Content $Path)
+}
+
+$UCDRepertoire = $InputObject.ucd.repertoire.ChildNodes | Sort-Object {
+    # Sort by either cp or first-cp (for ranges)
+    if ($null -ne $_.cp) {
+        [int]("0x"+$_.cp)
+    } ElseIf ($null -ne $_."first-cp") {
+        [int]("0x"+$_."first-cp")
+    }
+}
+
+If (-not $Full) {
+    $UCDRepertoire = $UCDRepertoire | Where-Object {
+        # Select everything Wide/Ambiguous/Full OR Emoji w/ Emoji Presentation
+        ($_.ea -notin "N", "Na", "H") -or ($_.Emoji -eq "Y" -and $_.EPres -eq "Y")
+    }
+}
+
+$ranges = [UnicodeRangeList]::New(1024)
+
+$c = 0
+ForEach($v in $UCDRepertoire) {
+    $range = [UnicodeRange]::new($v)
+    $c += $range.Length()
+
+    If ($ranges.Count -gt 0 -and $ranges[$ranges.Count - 1].Merge($range)) {
+        # Merged into last entry
+        Continue
+    }
+    $ranges.Add([object]$range)
+}
+
+If (-not $NoOverrides) {
+    If ($null -eq $OverrideObject) {
+        $OverrideObject = [xml](Get-Content $OverridePath)
+    }
+
+    $OverrideRepertoire = $OverrideObject.ucd.repertoire.ChildNodes
+    $overrideCount = 0
+    ForEach($v in $OverrideRepertoire) {
+        $range = [UnicodeRange]::new($v)
+        $overrideCount += $range.Length()
+        $range.Comment = $range.Comment ?? "overridden without comment"
+        $ranges.ReplaceUnicodeRange($range)
+    }
+}
+
+# Emit Code
+" // Generated by {0} -Pack:{1} -Full:{2} -NoOverrides:{3}" -f $MyInvocation.MyCommand.Name, $Pack, $Full, $NoOverrides
+" // on {0} (UTC) from {1}." -f (Get-Date -AsUTC), $InputObject.ucd.description
+" // {0} (0x{0:X}) codepoints covered." -f $c
+If (-not $NoOverrides) {
+" // {0} (0x{0:X}) codepoints overridden." -f $overrideCount
+}
+# {0} is the element count of the array; {{ is a literal opening brace
+" static constexpr std::array<UnicodeRange, {0}> s_wideAndAmbiguousTable{{" -f $ranges.Count
+ForEach($_ in $ranges) {
+    $comment = ""
+    if ($null -ne $_.Comment) {
+        # We only vend comments when we aren't packing tightly
+        $comment = " // {0}" -f $_.Comment
+    }
+" UnicodeRange{{ 0x{0:x}, 0x{1:x}, CodepointWidth::{2} }},{3}" -f $_.Start, $_.End, $_.Width, $comment
+}
+" };"