# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

#Requires -Version 7
# (we use the null coalescing operator)

################################################################################
# This script generates an array suitable for replacing the body of
# src/types/CodepointWidthDetector.cpp from a Unicode UCD XML document[1]
# compliant with UAX#42[2].
#
# This script supports a quasi-mandatory "overrides" file, overrides.xml.
# If you do not have overrides, supply the -NoOverrides parameter. This was
# developed for use with the CodepointWidthDetector, which has some override
# ranges.
#
# This script was developed against the flat "no han unification" UCD
# "ucd.nounihan.flat.xml".
# It does not support the grouped database format, which is significantly
# smaller and would provide a performance win on the admittedly extremely rare
# occasion that we should need to regenerate our table.
#
# Invoke as ./Generate-xxx ucd.nounihan.flat.xml -Pack | Out-File -Encoding
# UTF-8 Temporary.cpp
#
# [1]: https://www.unicode.org/Public/UCD/latest/ucdxml/
# [2]: https://www.unicode.org/reports/tr42/

[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSAvoidUsingPositionalParameters', '')]
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseProcessBlockForPipelineCommand', '')]
[CmdletBinding()]
Param(
    [Parameter(Position=0, ValueFromPipeline=$true, ParameterSetName="Parsed")]
    [System.Xml.XmlDocument]$InputObject,

    [Parameter(Position=1, ValueFromPipeline=$true, ParameterSetName="Parsed")]
    [System.Xml.XmlDocument]$OverrideObject,

    [Parameter(Position=0, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
    [string]$Path = "ucd.nounihan.flat.xml",

    [Parameter(Position=1, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
    [string]$OverridePath = "overrides.xml",

    [switch]$Pack, # Pack tightly based on width
    [switch]$NoOverrides, # Do not include overrides
    [switch]$Full = $False # Include Narrow codepoints
)

# Width classes emitted into the generated table.
Enum CodepointWidth {
    Narrow;
    Wide;
    Ambiguous;
    Invalid;
}

# UCD Functions {{{

# Emits two integers, the first and last codepoint covered by a repertoire
# entry. An entry carries either a single "cp" attribute or a
# "first-cp"/"last-cp" pair, all hexadecimal without a prefix. An entry with
# neither yields 0, 0.
Function Get-UCDEntryRange($entry) {
    $s = $e = 0
    If ($null -ne $entry.cp) {
        # Individual Codepoint
        $s = $e = [int]("0x" + $entry.cp)
    } ElseIf ($null -ne $entry."first-cp") {
        # Range of Codepoints
        $s = [int]("0x" + $entry."first-cp")
        $e = [int]("0x" + $entry."last-cp")
    }
    $s
    $e
}

# Emits the [CodepointWidth] for a repertoire entry. Emoji with emoji
# presentation are always Wide; otherwise the "ea" (East Asian Width)
# attribute decides. Unrecognized "ea" values yield Invalid.
Function Get-UCDEntryWidth($entry) {
    If ($entry.Emoji -eq "Y" -and $entry.EPres -eq "Y") {
        [CodepointWidth]::Wide
        Return
    }

    Switch ($entry.ea) {
        "N" { [CodepointWidth]::Narrow; Return }
        "Na" { [CodepointWidth]::Narrow; Return }
        "H" { [CodepointWidth]::Narrow; Return }
        "W" { [CodepointWidth]::Wide; Return }
        "F" { [CodepointWidth]::Wide; Return }
        "A" { [CodepointWidth]::Ambiguous; Return }
    }

    [CodepointWidth]::Invalid
}

# Emits the key used to decide whether two adjacent entries may be merged.
Function Get-UCDEntryFlags($entry) {
    If ($script:Pack) {
        # If we're "pack"ing entries, only the computed width matters for telling them apart
        Get-UCDEntryWidth $entry
        Return
    }

    # Fullwidth and Wide produce the same width, so fold F into W.
    $normalizedEAWidth = $entry.ea
    $normalizedEAWidth = $normalizedEAWidth -eq "F" ? "W" : $normalizedEAWidth;
    "{0}{1}{2}" -f $normalizedEAWidth, $entry.Emoji, $entry.EPres
}
# }}}

# An inclusive codepoint range [Start, End] with its width, merge key and an
# optional comment. Compares by Start, against another range or a bare [int].
Class UnicodeRange : System.IComparable {
    [int]$Start
    [int]$End
    [CodepointWidth]$Width
    [string]$Flags
    [string]$Comment

    # Builds a range from a UCD (or overrides) repertoire element.
    UnicodeRange([System.Xml.XmlElement]$ucdEntry) {
        $this.Start, $this.End = Get-UCDEntryRange $ucdEntry
        $this.Width = Get-UCDEntryWidth $ucdEntry
        $this.Flags = Get-UCDEntryFlags $ucdEntry

        If (-not $script:Pack -and $ucdEntry.Emoji -eq "Y" -and $ucdEntry.EPres -eq "Y") {
            $this.Comment = "Emoji=Y EPres=Y"
        }

        # An explicit comment attribute wins over the generated one.
        If ($null -ne $ucdEntry.comment) {
            $this.Comment = $ucdEntry.comment
        }
    }

    # Parameterless constructor; only used by Clone().
    UnicodeRange() {
    }

    # Returns a field-by-field copy of this range.
    [UnicodeRange] Clone() {
        $copy = [UnicodeRange]::new()
        $copy.Start = $this.Start
        $copy.End = $this.End
        $copy.Width = $this.Width
        # Assigning $null to a [string] property stores "", which would defeat
        # the "$null -ne Comment" checks elsewhere; only copy real values.
        If ($null -ne $this.Flags) {
            $copy.Flags = $this.Flags
        }
        If ($null -ne $this.Comment) {
            $copy.Comment = $this.Comment
        }
        Return $copy
    }

    # Orders by Start. $Other may be a range or a bare codepoint.
    [int] CompareTo([object]$Other) {
        If ($Other -is [int]) {
            Return $this.Start - $Other
        }
        Return $this.Start - $Other.Start
    }

    # Extends this range to cover $Other when they are adjacent and share the
    # same comment and flags. Returns $true when the merge happened.
    [bool] Merge([UnicodeRange]$Other) {
        # If there's more than one codepoint between them, don't merge
        If (($Other.Start - $this.End) -gt 1) {
            Return $false
        }

        # Comments are different: do not merge
        If ($this.Comment -ne $Other.Comment) {
            Return $false
        }

        # Flags are different: do not merge
        If ($this.Flags -ne $Other.Flags) {
            Return $false
        }

        $this.End = $Other.End
        Return $true
    }

    # Number of codepoints covered (the range is inclusive).
    [int] Length() {
        return $this.End - $this.Start + 1
    }
}

# A list of UnicodeRange kept sorted by Start so it can be binary-searched.
Class UnicodeRangeList : System.Collections.Generic.List[Object] {
    UnicodeRangeList([int]$Capacity) : base($Capacity) {
    }

    # Returns the index of the range whose Start equals $codepoint, or else the
    # index of the first range whose Start is greater (Count if there is none).
    [int] hidden _FindInsertionPoint([int]$codepoint) {
        $l = $this.BinarySearch($codepoint)
        If ($l -lt 0) {
            # Return value <0: value was not found, return value is bitwise complement the index of the first >= value
            Return -bNOT $l
        }
        Return $l
    }

    # Inserts $newRange, trimming or removing whatever existing ranges it
    # overlaps. Assumes the list is sorted and its ranges do not overlap each
    # other.
    ReplaceUnicodeRange([UnicodeRange]$newRange) {
        $subset = [System.Collections.Generic.List[Object]]::New(3)
        $subset.Add($newRange)

        $i = $this._FindInsertionPoint($newRange.Start)

        # Left overlap can only ever be one (_FindInsertionPoint always returns the
        # index immediately after the range whose Start is <= than ours).
        $prev = $null
        If ($i -gt 0 -and $this[$i - 1].End -ge $newRange.Start) {
            $prev = $i - 1
        }

        # Right overlap can be Infinite (because we didn't account for End)
        # Find extent of right overlap
        For ($next = $i; ($next -lt $this.Count - 1) -and ($this[$next + 1].Start -le $newRange.End); $next++) { }
        # $next can equal Count when the new range sorts after every existing
        # range (or the list is empty); there is nothing to the right then.
        If ($next -ge $this.Count -or $this[$next].Start -gt $newRange.End) {
            # It turns out we didn't damage the following range; clear it
            $next = $null
        }

        If ($null -ne $next) {
            # Replace damaged elements after I with a truncated range
            $last = $this[$next]
            $this.RemoveRange($i, $next - $i + 1) # Remove damaged elements after I
            $last.Start = $newRange.End + 1
            If ($last.Start -le $last.End) {
                $subset.Add($last)
            }
        }

        If ($null -ne $prev) {
            # Replace damaged elements before I with a truncated range
            $first = $this[$prev]
            $this.RemoveRange($prev, $i - $prev) # Remove damaged elements (b/c we may not need to re-add them!)

            # The new range sits strictly inside the previous one: keep the part
            # of the previous range that lies to the right of the new range.
            If ($first.End -gt $newRange.End) {
                $tail = $first.Clone()
                $tail.Start = $newRange.End + 1
                $subset.Add($tail)
            }

            $first.End = $newRange.Start - 1
            If ($first.End -ge $first.Start) {
                $subset.Insert(0, $first)
            }
            $i = $prev # Update the insertion cursor
        }

        $this.InsertRange($i, $subset)
    }
}

# Ingest UCD
If ($null -eq $InputObject) {
    $InputObject = [xml](Get-Content $Path)
}

$UCDRepertoire = $InputObject.ucd.repertoire.ChildNodes | Sort-Object {
    # Sort by either cp or first-cp (for ranges)
    if ($null -ne $_.cp) {
        [int]("0x" + $_.cp)
    } ElseIf ($null -ne $_."first-cp") {
        [int]("0x" + $_."first-cp")
    }
}

If (-not $Full) {
    $UCDRepertoire = $UCDRepertoire | Where-Object {
        # Select everything Wide/Ambiguous/Full OR Emoji w/ Emoji Presentation
        ($_.ea -notin "N", "Na", "H") -or ($_.Emoji -eq "Y" -and $_.EPres -eq "Y")
    }
}

# Build the range list, folding each entry into its predecessor when possible.
$ranges = [UnicodeRangeList]::New(1024)

$c = 0
ForEach ($v in $UCDRepertoire) {
    $range = [UnicodeRange]::new($v)
    $c += $range.Length()

    If ($ranges.Count -gt 0 -and $ranges[$ranges.Count - 1].Merge($range)) {
        # Merged into last entry
        Continue
    }
    $ranges.Add([object]$range)
}

If (-not $NoOverrides) {
    If ($null -eq $OverrideObject) {
        $OverrideObject = [xml](Get-Content $OverridePath)
    }

    $OverrideRepertoire = $OverrideObject.ucd.repertoire.ChildNodes
    $overrideCount = 0
    ForEach ($v in $OverrideRepertoire) {
        $range = [UnicodeRange]::new($v)
        $overrideCount += $range.Length()
        $range.Comment = $range.Comment ?? "overridden without comment"
        $ranges.ReplaceUnicodeRange($range)
    }
}

# Emit Code
" // Generated by {0} -Pack:{1} -Full:{2} -NoOverrides:{3}" -f $MyInvocation.MyCommand.Name, $Pack, $Full, $NoOverrides
" // on {0} (UTC) from {1}." -f (Get-Date -AsUTC), $InputObject.ucd.description
" // {0} (0x{0:X}) codepoints covered." -f $c
If (-not $NoOverrides) {
    " // {0} (0x{0:X}) codepoints overridden." -f $overrideCount
    " // Override path: {0}" -f $OverridePath
}
# NOTE(review): $ranges.Count is passed but the format string has no {0}
# placeholder, so the emitted declaration leaves the array's element type and
# size to C++ class template argument deduction - confirm this is intended.
" static constexpr std::array s_wideAndAmbiguousTable{{" -f $ranges.Count
ForEach ($emitted in $ranges) {
    $comment = ""
    if ($null -ne $emitted.Comment) {
        # We only vend comments when we aren't packing tightly
        $comment = " // {0}" -f $emitted.Comment
    }
    " UnicodeRange{{ 0x{0:x}, 0x{1:x}, CodepointWidth::{2} }},{3}" -f $emitted.Start, $emitted.End, $emitted.Width, $comment
}
" };"