terminal/tools/Generate-CodepointWidthsFro...

281 lines
9.2 KiB
PowerShell

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#Requires -Version 7
# (we use the null coalescing operator)
################################################################################
# This script generates the an array suitable for replacing the body of
# src/types/CodepointWidthDetector.cpp from a Unicode UCD XML document[1]
# compliant with UAX#42[2].
#
# This script supports a quasi-mandatory "overrides" file, overrides.xml.
# If you do not have overrides, supply the -NoOverrides parameter. This was
# developed for use with the CodepointWidthDetector, which has some override
# ranges.
#
# This script was developed against the flat "no han unification" UCD
# "ucd.nounihan.flat.xml".
# It does not support the grouped database format.
# significantly smaller, which would provide a performance win on the admittedly
# extremely rare occasion that we should need to regenerate our table.
#
# Invoke as ./Generate-xxx ucd.nounihan.flat.xml -Pack | Out-File -Encoding
# UTF-8 Temporary.cpp
#
# [1]: https://www.unicode.org/Public/UCD/latest/ucdxml/
# [2]: https://www.unicode.org/reports/tr42/
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSAvoidUsingPositionalParameters', '')]
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseProcessBlockForPipelineCommand', '')]
[CmdletBinding()]
Param(
[Parameter(Position=0, ValueFromPipeline=$true, ParameterSetName="Parsed")]
[System.Xml.XmlDocument]$InputObject,
[Parameter(Position=1, ValueFromPipeline=$true, ParameterSetName="Parsed")]
[System.Xml.XmlDocument]$OverrideObject,
[Parameter(Position=0, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
[string]$Path = "ucd.nounihan.flat.xml",
[Parameter(Position=1, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
[string]$OverridePath = "overrides.xml",
[switch]$Pack, # Pack tightly based on width
[switch]$NoOverrides, # Do not include overrides
[switch]$Full = $False # Include Narrow codepoints
)
Enum CodepointWidth {
Narrow;
Wide;
Ambiguous;
Invalid;
}
# UCD Functions {{{
Function Get-UCDEntryRange($entry) {
$s = $e = 0
if ($null -ne $v.cp) {
# Individual Codepoint
$s = $e = [int]("0x"+$v.cp)
} ElseIf ($null -ne $v."first-cp") {
# Range of Codepoints
$s = [int]("0x"+$v."first-cp")
$e = [int]("0x"+$v."last-cp")
}
$s
$e
}
Function Get-UCDEntryWidth($entry) {
If ($entry.Emoji -eq "Y" -and $entry.EPres -eq "Y") {
[CodepointWidth]::Wide
Return
}
Switch($entry.ea) {
"N" { [CodepointWidth]::Narrow; Return }
"Na" { [CodepointWidth]::Narrow; Return }
"H" { [CodepointWidth]::Narrow; Return }
"W" { [CodepointWidth]::Wide; Return }
"F" { [CodepointWidth]::Wide; Return }
"A" { [CodepointWidth]::Ambiguous; Return }
}
[CodepointWidth]::Invalid
}
Function Get-UCDEntryFlags($entry) {
If ($script:Pack) {
# If we're "pack"ing entries, only the computed width matters for telling them apart
Get-UCDEntryWidth $entry
Return
}
$normalizedEAWidth = $entry.ea
$normalizedEAWidth = $normalizedEAWidth -eq "F" ? "W" : $normalizedEAWidth;
"{0}{1}{2}" -f $normalizedEAWidth, $entry.Emoji, $entry.EPres
}
# }}}
Class UnicodeRange : System.IComparable {
[int]$Start
[int]$End
[CodepointWidth]$Width
[string]$Flags
[string]$Comment
UnicodeRange([System.Xml.XmlElement]$ucdEntry) {
$this.Start, $this.End = Get-UCDEntryRange $ucdEntry
$this.Width = Get-UCDEntryWidth $ucdEntry
$this.Flags = Get-UCDEntryFlags $ucdEntry
If (-not $script:Pack -and $ucdEntry.Emoji -eq "Y" -and $ucdEntry.EPres -eq "Y") {
$this.Comment = "Emoji=Y EPres=Y"
}
If ($null -ne $ucdEntry.comment) {
$this.Comment = $ucdEntry.comment
}
}
[int] CompareTo([object]$Other) {
If ($Other -is [int]) {
Return $this.Start - $Other
}
Return $this.Start - $Other.Start
}
[bool] Merge([UnicodeRange]$Other) {
# If there's more than one codepoint between them, don't merge
If (($Other.Start - $this.End) -gt 1) {
Return $false
}
# Comments are different: do not merge
If ($this.Comment -ne $Other.Comment) {
Return $false
}
# Flags are different: do not merge
If ($this.Flags -ne $Other.Flags) {
Return $false
}
$this.End = $Other.End
Return $true
}
[int] Length() {
return $this.End - $this.Start + 1
}
}
Class UnicodeRangeList : System.Collections.Generic.List[Object] {
UnicodeRangeList([int]$Capacity) : base($Capacity) { }
[int] hidden _FindInsertionPoint([int]$codepoint) {
$l = $this.BinarySearch($codepoint)
If ($l -lt 0) {
# Return value <0: value was not found, return value is bitwise complement the index of the first >= value
Return -bNOT $l
}
Return $l
}
ReplaceUnicodeRange([UnicodeRange]$newRange) {
$subset = [System.Collections.Generic.List[Object]]::New(3)
$subset.Add($newRange)
$i = $this._FindInsertionPoint($newRange.Start)
# Left overlap can only ever be one (_FindInsertionPoint always returns the
# index immediately after the range whose Start is <= than ours).
$prev = $null
If($i -gt 0 -and $this[$i - 1].End -ge $newRange.Start) {
$prev = $i - 1
}
# Right overlap can be Infinite (because we didn't account for End)
# Find extent of right overlap
For($next = $i; ($next -lt $this.Count - 1) -and ($this[$next+1].Start -le $newRange.End); $next++) { }
If ($this[$next].Start -gt $newRange.End) {
# It turns out we didn't damage the following range; clear it
$next = $null
}
If ($null -ne $next) {
# Replace damaged elements after I with a truncated range
$last = $this[$next]
$this.RemoveRange($i, $next - $i + 1) # Remove damaged elements after I
$last.Start = $newRange.End + 1
If ($last.Start -le $last.End) {
$subset.Add($last)
}
}
If ($null -ne $prev) {
# Replace damaged elements before I with a truncated range
$first = $this[$prev]
$this.RemoveRange($prev, $i - $prev) # Remove damaged elements (b/c we may not need to re-add them!)
$first.End = $newRange.Start - 1
If ($first.End -ge $first.Start) {
$subset.Insert(0, $first)
}
$i = $prev # Update the insertion cursor
}
$this.InsertRange($i, $subset)
}
}
# Ingest UCD
If ($null -eq $InputObject) {
$InputObject = [xml](Get-Content $Path)
}
$UCDRepertoire = $InputObject.ucd.repertoire.ChildNodes | Sort-Object {
# Sort by either cp or first-cp (for ranges)
if ($null -ne $_.cp) {
[int]("0x"+$_.cp)
} ElseIf ($null -ne $_."first-cp") {
[int]("0x"+$_."first-cp")
}
}
If (-not $Full) {
$UCDRepertoire = $UCDRepertoire | Where-Object {
# Select everything Wide/Ambiguous/Full OR Emoji w/ Emoji Presentation
($_.ea -notin "N", "Na", "H") -or ($_.Emoji -eq "Y" -and $_.EPres -eq "Y")
}
}
$ranges = [UnicodeRangeList]::New(1024)
$c = 0
ForEach($v in $UCDRepertoire) {
$range = [UnicodeRange]::new($v)
$c += $range.Length()
If ($ranges.Count -gt 0 -and $ranges[$ranges.Count - 1].Merge($range)) {
# Merged into last entry
Continue
}
$ranges.Add([object]$range)
}
If (-not $NoOverrides) {
If ($null -eq $OverrideObject) {
$OverrideObject = [xml](Get-Content $OverridePath)
}
$OverrideRepertoire = $OverrideObject.ucd.repertoire.ChildNodes
$overrideCount = 0
ForEach($v in $OverrideRepertoire) {
$range = [UnicodeRange]::new($v)
$overrideCount += $range.Length()
$range.Comment = $range.Comment ?? "overridden without comment"
$ranges.ReplaceUnicodeRange($range)
}
}
# Emit Code
" // Generated by {0} -Pack:{1} -Full:{2} -NoOverrides:{3}" -f $MyInvocation.MyCommand.Name, $Pack, $Full, $NoOverrides
" // on {0} (UTC) from {1}." -f (Get-Date -AsUTC), $InputObject.ucd.description
" // {0} (0x{0:X}) codepoints covered." -f $c
If (-not $NoOverrides) {
" // {0} (0x{0:X}) codepoints overridden." -f $overrideCount
" // Override path: {0}" -f $OverridePath
}
" static constexpr std::array<UnicodeRange, {0}> s_wideAndAmbiguousTable{{" -f $ranges.Count
ForEach($_ in $ranges) {
$comment = ""
if ($null -ne $_.Comment) {
# We only vend comments when we aren't packing tightly
$comment = " // {0}" -f $_.Comment
}
" UnicodeRange{{ 0x{0:x}, 0x{1:x}, CodepointWidth::{2} }},{3}" -f $_.Start, $_.End, $_.Width, $comment
}
" };"