@@ -95,13 +95,19 @@ impl std::ops::BitOrAssign<Mask128> for Mask128 {
9595 }
9696}
9797
98+ /// Returns the vector-domain escape mask (`Mask128`) for `v` without extracting it to a bitmask: lanes holding a byte in 0x00..=0x1f, `\`, or `"` are flagged.
99+ /// This allows combining multiple masks with SIMD OR before a single bitmask extraction.
98100#[ inline( always) ]
99- fn escaped_mask ( v : Simd128u ) -> NeonBits {
100- let x1f = Simd128u :: splat ( 0x1f ) ; // 0x00 ~ 0x20
101+ fn escaped_mask_vec ( v : Simd128u ) -> Mask128 {
102+ let x1f = Simd128u :: splat ( 0x1f ) ; // 0x00 ~ 0x1f
101103 let blash = Simd128u :: splat ( b'\\' ) ;
102104 let quote = Simd128u :: splat ( b'"' ) ;
103- let v = v. le ( & x1f) | v. eq ( & blash) | v. eq ( & quote) ;
104- v. bitmask ( )
105+ v. le ( & x1f) | v. eq ( & blash) | v. eq ( & quote)
106+ }
107+
108+ #[ inline( always) ]
109+ fn escaped_mask ( v : Simd128u ) -> NeonBits {
110+ escaped_mask_vec ( v) . bitmask ( )
105111}
106112
107113#[ target_feature( enable = "neon" ) ]
@@ -124,14 +130,14 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
124130 let v3 = Simd128u :: loadu ( sptr. add ( LANES * 2 ) ) ;
125131 let v4 = Simd128u :: loadu ( sptr. add ( LANES * 3 ) ) ;
126132
127- // Check all 4 masks
128- let mask1 = escaped_mask ( v1) ;
129- let mask2 = escaped_mask ( v2) ;
130- let mask3 = escaped_mask ( v3) ;
131- let mask4 = escaped_mask ( v4) ;
133+ // Compute escape masks in the vector domain (all four are independent, so they can pipeline)
134+ let m1 = escaped_mask_vec ( v1) ;
135+ let m2 = escaped_mask_vec ( v2) ;
136+ let m3 = escaped_mask_vec ( v3) ;
137+ let m4 = escaped_mask_vec ( v4) ;
132138
133- // Fast path: if all vectors are clean, write the entire chunk
134- if mask1 . all_zero ( ) && mask2 . all_zero ( ) && mask3 . all_zero ( ) && mask4 . all_zero ( ) {
139+ // Fast path: OR the four masks and extract a single bitmask instead of four; all-zero means no byte in the chunk needs escaping
140+ if ( m1 | m2 | m3 | m4 ) . bitmask ( ) . all_zero ( ) {
135141 v1. storeu ( dptr) ;
136142 v2. storeu ( dptr. add ( LANES ) ) ;
137143 v3. storeu ( dptr. add ( LANES * 2 ) ) ;
@@ -140,8 +146,8 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
140146 dptr = dptr. add ( CHUNK ) ;
141147 sptr = sptr. add ( CHUNK ) ;
142148 } else {
143- // Slow path: handle escape character
144- // Process v1
149+ // Slow path: extract individual bitmasks lazily
150+ let mask1 = m1 . bitmask ( ) ;
145151 v1. storeu ( dptr) ;
146152 if !mask1. all_zero ( ) {
147153 let cn = mask1. first_offset ( ) ;
@@ -155,7 +161,7 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
155161 dptr = dptr. add ( LANES ) ;
156162 sptr = sptr. add ( LANES ) ;
157163
158- // Process v2
164+ let mask2 = m2 . bitmask ( ) ;
159165 v2. storeu ( dptr) ;
160166 if !mask2. all_zero ( ) {
161167 let cn = mask2. first_offset ( ) ;
@@ -169,7 +175,7 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
169175 dptr = dptr. add ( LANES ) ;
170176 sptr = sptr. add ( LANES ) ;
171177
172- // Process v3
178+ let mask3 = m3 . bitmask ( ) ;
173179 v3. storeu ( dptr) ;
174180 if !mask3. all_zero ( ) {
175181 let cn = mask3. first_offset ( ) ;
@@ -183,7 +189,7 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
183189 dptr = dptr. add ( LANES ) ;
184190 sptr = sptr. add ( LANES ) ;
185191
186- // Process v4
192+ let mask4 = m4 . bitmask ( ) ;
187193 v4. storeu ( dptr) ;
188194 if !mask4. all_zero ( ) {
189195 let cn = mask4. first_offset ( ) ;
0 commit comments