@@ -15,8 +15,8 @@ filter Measure-String {
1515 Measures the length of a string with support for escape sequences and wide characters
1616 . DESCRIPTION
1717 By default, ignores ANSI escape sequences when measuring the length
18- Optionally can treat ambiguous characters as wide
19- Optionally can treat emoji characters wide
18+ Optionally can count ambiguous characters as wide
19+ Optionally can count emoji characters as single width (by default they are counted as double width, because that's how they are typically displayed in terminals)
2020 . LINK
2121 Measure-String
2222 #>
@@ -25,76 +25,82 @@ filter Measure-String {
2525 [Parameter (ValueFromPipeline )]
2626 [string ]$string ,
2727
28+ # Set AmbiguousAsWide to count ambiguous width characters as 2 characters wide.
29+ # By default, they are counted as a single character.
2830 [switch ]$AmbiguousAsWide ,
2931
30- [switch ]$EmojiAsWide ,
32+ # Set EmojiAsSingle to count emoji as a single character.
33+ # By default, we count emoji as wide (2 characters) because that's how they are typically displayed in terminals.
34+ [switch ]$EmojiAsSingle ,
3135
32- [switch ]$countAnsiEscapeCodes
36+ # Set CountAnsiEscapeCodes to include ANSI escape codes in the width count. By default, they are ignored.
37+ [switch ]$CountAnsiEscapeCodes
3338 )
3439 if ($string.length -eq 0 ) {
3540 return 0 ;
3641 }
3742
38- if (! $countAnsiEscapeCodes ) {
43+ if (! $CountAnsiEscapeCodes ) {
3944 $string = $string -replace $AnsiRegex
4045 }
4146
4247 if ($string.length -eq 0 ) {
4348 return 0 ;
4449 }
4550
46- # PowerShell 5 (.NET 4) isn't UAX 29 compliant
47- $width = 0 ;
48- foreach ($character in $string.GetEnumerator ()) {
49- $codePoint = [int ]$character ;
50-
51- # Ignore control characters
52- if ($codePoint -le 0x1F -or ($codePoint -ge 0x7F -and $codePoint -le 0x9F )) {
53- continue
54- }
51+ # PowerShell 5 (.NET 4) isn't UAX 29 compliant, so this would not work correctly for grapheme clusters.
52+ # That is, in .NET Framework, GetTextElementEnumerator will split up "👩💻" into [👩, ZWJ, 💻] and count it as 3 instead of 1
53+ # requires -Version 7.5
5554
56- # Ignore zero-width characters
57- if (($codePoint -ge 0x200B -and $codePoint -le 0x200F ) -or # Zero-width space, non-joiner, joiner, left-to-right mark, right-to-left mark
58- $codePoint -eq 0xFEFF ) {
59- # Zero-width no-break space# Zero-width no-break space
55+ $width = 0 ;
56+ foreach ($element in [System.Globalization.StringInfo ]::GetTextElementEnumerator($string )) {
57+ $codepoint = [char ]::ConvertToUtf32($element , 0 )
58+ # If the whole element is ignorable, skip it.
59+ if ($element -match " ^[\p{IsCombiningDiacriticalMarks}\p{IsCombiningMarksforSymbols}\p{IsVariationSelectors}\p{M}]+$" ) {
60+ Write-Debug " Ignoring ignorable element: U+$ ( ' {0:x4}' -f $codepoint ) "
6061 continue
6162 }
6263
63- # Ignore combining characters
64- if (($codePoint -ge 0x300 -and $codePoint -le 0x36F ) -or # Combining diacritical marks
65- ($codePoint -ge 0x1AB0 -and $codePoint -le 0x1AFF ) -or # Combining diacritical marks extended
66- ($codePoint -ge 0x1DC0 -and $codePoint -le 0x1DFF ) -or # Combining diacritical marks supplement
67- ($codePoint -ge 0x20D0 -and $codePoint -le 0x20FF ) -or # Combining diacritical marks for symbols
68- ($codePoint -ge 0xFE20 -and $codePoint -le 0xFE2F )) {
69- # Combining half marks
64+ if ($element -match $EmojiRegex ) {
65+ if ($EmojiAsSingle ) {
66+ Write-Debug " Treating emoji as single width: U+$ ( ' {0:x4}' -f $codepoint ) "
67+ $width += 1 # Treat emojis as single width
68+ } else {
69+ Write-Debug " Treating emoji as double width: U+$ ( ' {0:x4}' -f $codepoint ) "
70+ $width += 2 # Treat emojis as double width
71+ }
7072 continue
7173 }
7274
73- # Ignore surrogate pairs
74- if ($codePoint -ge 0xD800 -and $codePoint -le 0xDFFF ) {
75+ # If it starts with something that takes up no space, trim that to find the real codepoint to measure.
76+ if (! ($element = $element -replace " ^[\p{IsCombiningDiacriticalMarks}\p{IsCombiningMarksforSymbols}\p{IsVariationSelectors}\p{Cf}\p{M}]+" )) {
77+ Write-Debug " Element started with ignorable characters: U+$ ( ' {0:x4}' -f $codepoint ) "
7578 continue
7679 }
80+ # $codepoint = [char]::IsSurrogatePair($element, 0) ? [char]::ConvertToUtf32($element, 0) : [char]$element[0]
81+ $codepoint = [char ]::ConvertToUtf32($element , 0 )
7782
78- # Ignore variation selectors
79- if ($codePoint -ge 0xFE00 -and $codePoint -le 0xFE0F ) {
83+ # Ignore control characters [\u0000-\u001F\u007F-\u009F]
84+ if ($codepoint -le 0x1F -or ($codepoint -ge 0x7F -and $codepoint -le 0x9F )) {
85+ Write-Debug " Ignoring control character: U+$ ( ' {0:x4}' -f $codepoint ) "
8086 continue
8187 }
8288
83- # This covers some of the above cases, but we still keep them for performance reasons.
84- if ([Char ]::GetUnicodeCategory($character ) -in ' NonSpacingMark' , ' SpacingCombiningMark' , ' EnclosingMark' , ' Format' ) {
89+ # Ignore zero-width characters [\u200B-\u200F\uFEFF]
90+ if (($codepoint -ge 0x200B -and $codepoint -le 0x200F ) -or # Zero-width space, non-joiner, joiner, left-to-right mark, right-to-left mark
91+ $codepoint -eq 0xFEFF ) {
92+ Write-Debug " Ignoring zero-width character: U+$ ( ' {0:x4}' -f $codepoint ) "
8593 continue
8694 }
8795
88- if ($character -match $EmojiRegex ) {
89- if ($EmojiAsWide ) {
90- $width += 2 # Treat emojis as double width
91- } else {
92- $width += 1 # Treat emojis as single width
93- }
96+ # Ignore variation selectors [\uFE00-\uFE0F]
97+ if ($codepoint -ge 0xFE00 -and $codepoint -le 0xFE0F ) {
98+ Write-Debug " Ignoring variation selector character: U+$ ( ' {0:x4}' -f $codepoint ) "
9499 continue
95100 }
96101
97- $width += Measure-EastAsianWidth $codePoint - AmbiguousAsWide:$AmbiguousAsWide
102+ $width += Measure-EastAsianWidth $codepoint - AmbiguousAsWide:$AmbiguousAsWide
103+ Write-Debug " After U+$ ( ' {0:x4}' -f $codepoint ) , width: $ ( $width ) "
98104 }
99105
100106 return $width ;
0 commit comments