Skip to content

Commit 34afb53

Browse files
committed
Update Measure-String to rely on GetTextElementEnumerator.
This still needs tests to verify it works, but it works better than it did.
1 parent ee46661 commit 34afb53

File tree

1 file changed

+44
-38
lines changed

1 file changed

+44
-38
lines changed

source/public/Measure-String.ps1

Lines changed: 44 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ filter Measure-String {
1515
Measures the length of a string with support for escape sequences and wide characters
1616
.DESCRIPTION
1717
By default, ignores ANSI escape sequences when measuring the length
18-
Optionally can treat ambiguous characters as wide
19-
Optionally can treat emoji characters wide
18+
Optionally can count ambiguous characters as wide
19+
Optionally can count emoji characters as single characters (defaults to double because that's how they are typically displayed in terminals)
2020
.LINK
2121
Measure-String
2222
#>
@@ -25,76 +25,82 @@ filter Measure-String {
2525
[Parameter(ValueFromPipeline)]
2626
[string]$string,
2727

28+
# Set AmbiguousAsWide to count ambiguous width characters as 2 characters wide.
29+
# By default, they are counted as a single character.
2830
[switch]$AmbiguousAsWide,
2931

30-
[switch]$EmojiAsWide,
32+
# Set EmojiAsSingle to count emoji as a single character.
33+
# By default, we count emoji as wide (2 characters) because that's how they are typically displayed in terminals.
34+
[switch]$EmojiAsSingle,
3135

32-
[switch]$countAnsiEscapeCodes
36+
# Set CountAnsiEscapeCodes to include ANSI escape codes in the width count. By default, they are ignored.
37+
[switch]$CountAnsiEscapeCodes
3338
)
3439
if ($string.length -eq 0) {
3540
return 0;
3641
}
3742

38-
if (!$countAnsiEscapeCodes) {
43+
if (!$CountAnsiEscapeCodes) {
3944
$string = $string -replace $AnsiRegex
4045
}
4146

4247
if ($string.length -eq 0) {
4348
return 0;
4449
}
4550

46-
# PowerShell 5 (.NET 4) isn't UAX 29 compliant
47-
$width = 0;
48-
foreach ($character in $string.GetEnumerator()) {
49-
$codePoint = [int]$character;
50-
51-
# Ignore control characters
52-
if ($codePoint -le 0x1F -or ($codePoint -ge 0x7F -and $codePoint -le 0x9F)) {
53-
continue
54-
}
51+
# PowerShell 5 (.NET 4) isn't UAX 29 compliant, so this would not work correctly for grapheme clusters.
52+
# That is, in .NET Framework, GetTextElementEnumerator will split up "👩‍💻" into [👩, ZWJ, 💻] and count it as 3 instead of 1
53+
#requires -Version 7.5
5554

56-
# Ignore zero-width characters
57-
if (($codePoint -ge 0x200B -and $codePoint -le 0x200F) -or # Zero-width space, non-joiner, joiner, left-to-right mark, right-to-left mark
58-
$codePoint -eq 0xFEFF ) {
59-
# Zero-width no-break space
55+
$width = 0;
56+
foreach ($element in [System.Globalization.StringInfo]::GetTextElementEnumerator($string)) {
57+
$codepoint = [char]::ConvertToUtf32($element, 0)
58+
# If the whole element is ignorable, skip it.
59+
if ($element -match "^[\p{IsCombiningDiacriticalMarks}\p{IsCombiningMarksforSymbols}\p{IsVariationSelectors}\p{M}]+$") {
60+
Write-Debug "Ignoring ignorable element: U+$('{0:x4}' -f $codepoint)"
6061
continue
6162
}
6263

63-
# Ignore combining characters
64-
if (($codePoint -ge 0x300 -and $codePoint -le 0x36F) -or # Combining diacritical marks
65-
($codePoint -ge 0x1AB0 -and $codePoint -le 0x1AFF) -or # Combining diacritical marks extended
66-
($codePoint -ge 0x1DC0 -and $codePoint -le 0x1DFF) -or # Combining diacritical marks supplement
67-
($codePoint -ge 0x20D0 -and $codePoint -le 0x20FF) -or # Combining diacritical marks for symbols
68-
($codePoint -ge 0xFE20 -and $codePoint -le 0xFE2F)) {
69-
# Combining half marks
64+
if ($element -match $EmojiRegex) {
65+
if ($EmojiAsSingle) {
66+
Write-Debug "Treating emoji as single width: U+$('{0:x4}' -f $codepoint)"
67+
$width += 1 # Treat emojis as single width
68+
} else {
69+
Write-Debug "Treating emoji as double width: U+$('{0:x4}' -f $codepoint)"
70+
$width += 2 # Treat emojis as double width
71+
}
7072
continue
7173
}
7274

73-
# Ignore surrogate pairs
74-
if ($codePoint -ge 0xD800 -and $codePoint -le 0xDFFF) {
75+
# If it starts with something that takes up no space, trim that to find the real codepoint to measure.
76+
if (!($element = $element -replace "^[\p{IsCombiningDiacriticalMarks}\p{IsCombiningMarksforSymbols}\p{IsVariationSelectors}\p{Cf}\p{M}]+")) {
77+
Write-Debug "Element started with ignorable characters: U+$('{0:x4}' -f $codepoint)"
7578
continue
7679
}
80+
# $codepoint = [char]::IsSurrogatePair($element, 0) ? [char]::ConvertToUtf32($element, 0) : [char]$element[0]
81+
$codepoint = [char]::ConvertToUtf32($element, 0)
7782

78-
# Ignore variation selectors
79-
if ($codePoint -ge 0xFE00 -and $codePoint -le 0xFE0F) {
83+
# Ignore control characters [\u0000-\u001F\u007F-\u009F]
84+
if ($codepoint -le 0x1F -or ($codepoint -ge 0x7F -and $codepoint -le 0x9F)) {
85+
Write-Debug "Ignoring control character: U+$('{0:x4}' -f $codepoint)"
8086
continue
8187
}
8288

83-
# This covers some of the above cases, but we still keep them for performance reasons.
84-
if ([Char]::GetUnicodeCategory($character) -in 'NonSpacingMark', 'SpacingCombiningMark', 'EnclosingMark', 'Format') {
89+
# Ignore zero-width characters [\u200B-\u200F\uFEFF]
90+
if (($codepoint -ge 0x200B -and $codepoint -le 0x200F) -or # Zero-width space, non-joiner, joiner, left-to-right mark, right-to-left mark
91+
$codepoint -eq 0xFEFF ) {
92+
Write-Debug "Ignoring zero-width character: U+$('{0:x4}' -f $codepoint)"
8593
continue
8694
}
8795

88-
if ($character -match $EmojiRegex) {
89-
if ($EmojiAsWide) {
90-
$width += 2 # Treat emojis as double width
91-
} else {
92-
$width += 1 # Treat emojis as single width
93-
}
96+
# Ignore variation selectors [\uFE00-\uFE0F]
97+
if ($codepoint -ge 0xFE00 -and $codepoint -le 0xFE0F) {
98+
Write-Debug "Ignoring variation selector character: U+$('{0:x4}' -f $codepoint)"
9499
continue
95100
}
96101

97-
$width += Measure-EastAsianWidth $codePoint -AmbiguousAsWide:$AmbiguousAsWide
102+
$width += Measure-EastAsianWidth $codepoint -AmbiguousAsWide:$AmbiguousAsWide
103+
Write-Debug "After U+$('{0:x4}' -f $codepoint), width: $($width)"
98104
}
99105

100106
return $width;

0 commit comments

Comments
 (0)