@@ -31,14 +31,51 @@ impl<'a> fmt::Display for PrintableString<'a> {
3131 fn fmt ( & self , f : & mut fmt:: Formatter ) -> Result < ( ) , fmt:: Error > {
3232 use core:: fmt:: Write ;
3333 for c in self . 0 . chars ( ) {
34- let c = if c. is_control ( ) { core:: char:: REPLACEMENT_CHARACTER } else { c } ;
34+ let c = if c. is_control ( ) || is_format_char ( c) {
35+ core:: char:: REPLACEMENT_CHARACTER
36+ } else {
37+ c
38+ } ;
3539 f. write_char ( c) ?;
3640 }
3741
3842 Ok ( ( ) )
3943 }
4044}
4145
// Returns `true` when `c` belongs to Unicode general category `Cf` (Format).
//
// `char::is_control` only recognises category `Cc`, so it misses the bidirectional
// override / isolate controls (e.g. U+202E RLO) and the zero-width characters that power
// the "Trojan Source" attack family (CVE-2021-42574), in which an attacker-supplied string
// is displayed to a human reader as something other than its byte content. Callers
// sanitising untrusted input should strip these in addition to the `Cc` characters.
fn is_format_char(c: char) -> bool {
    // Inclusive `(first, last)` codepoint ranges of category `Cf`, in ascending order.
    // Single codepoints are written as a range whose two ends are equal.
    const FORMAT_RANGES: &[(u32, u32)] = &[
        (0x00AD, 0x00AD),
        (0x0600, 0x0605),
        (0x061C, 0x061C),
        (0x06DD, 0x06DD),
        (0x070F, 0x070F),
        (0x0890, 0x0891),
        (0x08E2, 0x08E2),
        (0x180E, 0x180E),
        (0x200B, 0x200F),
        (0x202A, 0x202E),
        (0x2060, 0x2064),
        // U+2065 is deliberately skipped: it is not part of the `Cf` set.
        (0x2066, 0x206F),
        (0xFEFF, 0xFEFF),
        (0xFFF9, 0xFFFB),
        (0x110BD, 0x110BD),
        (0x110CD, 0x110CD),
        (0x13430, 0x1343F),
        (0x1BCA0, 0x1BCA3),
        (0x1D173, 0x1D17A),
        (0xE0001, 0xE0001),
        (0xE0020, 0xE007F),
    ];

    let codepoint = u32::from(c);
    FORMAT_RANGES
        .iter()
        .any(|&(first, last)| first <= codepoint && codepoint <= last)
}
78+
4279#[ cfg( test) ]
4380mod tests {
4481 use super :: PrintableString ;
@@ -50,4 +87,24 @@ mod tests {
5087 "I \u{1F496} LDK!\u{FFFD} \u{26A1} " ,
5188 ) ;
5289 }
90+
#[test]
fn sanitizes_unicode_bidi_override_characters() {
    // Bidirectional overrides such as U+202E RIGHT-TO-LEFT OVERRIDE have Unicode
    // general category `Cf` (Format) rather than `Cc` (Control). They are the
    // building block of "Trojan Source" / bidi-spoofing attacks, in which an
    // attacker-supplied string (e.g. a node alias gossiped from a peer) is shown
    // to a human reader as something other than its byte content. Since
    // `PrintableString` is the sanitiser applied to such untrusted strings, none
    // of these characters may survive formatting.
    let spoofed = PrintableString("safe\u{202E} cipsxe.exe");
    let rendered = format!("{}", spoofed);
    assert!(
        rendered.chars().all(|ch| ch != '\u{202E}'),
        "PrintableString left a U+202E RLO override in its output: {:?}",
        rendered
    );

    // Boundary check for the Egyptian Hieroglyph Format Controls block: U+1343F
    // is the last `Cf` codepoint there, whereas U+13440 has general category
    // `Mn` and therefore must pass through untouched.
    let boundary = PrintableString("x\u{1343F} y\u{13440} z");
    let expected = "x\u{FFFD} y\u{13440} z";
    assert_eq!(format!("{}", boundary), expected);
}
53110}
0 commit comments