Skip to content

Commit 19b0957

Browse files
authored
perf(neon): combine 4 escape masks in SIMD domain before GPR extraction (#61)
1 parent 7339b1b commit 19b0957

1 file changed

Lines changed: 22 additions & 16 deletions

File tree

src/simd/neon.rs

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -95,13 +95,19 @@ impl std::ops::BitOrAssign<Mask128> for Mask128 {
9595
}
9696
}
9797

98+
/// Returns the vector-domain escape mask (`Mask128`) without extracting it to a bitmask.
99+
/// This allows combining multiple masks with SIMD OR before a single bitmask extraction.
98100
#[inline(always)]
99-
fn escaped_mask(v: Simd128u) -> NeonBits {
100-
let x1f = Simd128u::splat(0x1f); // 0x00 ~ 0x20
101+
fn escaped_mask_vec(v: Simd128u) -> Mask128 {
102+
let x1f = Simd128u::splat(0x1f); // 0x00 ~ 0x1f
101103
let blash = Simd128u::splat(b'\\');
102104
let quote = Simd128u::splat(b'"');
103-
let v = v.le(&x1f) | v.eq(&blash) | v.eq(&quote);
104-
v.bitmask()
105+
v.le(&x1f) | v.eq(&blash) | v.eq(&quote)
106+
}
107+
108+
#[inline(always)]
109+
fn escaped_mask(v: Simd128u) -> NeonBits {
110+
escaped_mask_vec(v).bitmask()
105111
}
106112

107113
#[target_feature(enable = "neon")]
@@ -124,14 +130,14 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
124130
let v3 = Simd128u::loadu(sptr.add(LANES * 2));
125131
let v4 = Simd128u::loadu(sptr.add(LANES * 3));
126132

127-
// Check all 4 masks
128-
let mask1 = escaped_mask(v1);
129-
let mask2 = escaped_mask(v2);
130-
let mask3 = escaped_mask(v3);
131-
let mask4 = escaped_mask(v4);
133+
// Compute escape masks in the vector domain (all four are independent, so they can pipeline)
134+
let m1 = escaped_mask_vec(v1);
135+
let m2 = escaped_mask_vec(v2);
136+
let m3 = escaped_mask_vec(v3);
137+
let m4 = escaped_mask_vec(v4);
132138

133-
// Fast path: if all vectors are clean, write the entire chunk
134-
if mask1.all_zero() && mask2.all_zero() && mask3.all_zero() && mask4.all_zero() {
139+
// Combined check: single bitmask extraction instead of four
140+
if (m1 | m2 | m3 | m4).bitmask().all_zero() {
135141
v1.storeu(dptr);
136142
v2.storeu(dptr.add(LANES));
137143
v3.storeu(dptr.add(LANES * 2));
@@ -140,8 +146,8 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
140146
dptr = dptr.add(CHUNK);
141147
sptr = sptr.add(CHUNK);
142148
} else {
143-
// Slow path: handle escape character
144-
// Process v1
149+
// Slow path: extract individual bitmasks lazily
150+
let mask1 = m1.bitmask();
145151
v1.storeu(dptr);
146152
if !mask1.all_zero() {
147153
let cn = mask1.first_offset();
@@ -155,7 +161,7 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
155161
dptr = dptr.add(LANES);
156162
sptr = sptr.add(LANES);
157163

158-
// Process v2
164+
let mask2 = m2.bitmask();
159165
v2.storeu(dptr);
160166
if !mask2.all_zero() {
161167
let cn = mask2.first_offset();
@@ -169,7 +175,7 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
169175
dptr = dptr.add(LANES);
170176
sptr = sptr.add(LANES);
171177

172-
// Process v3
178+
let mask3 = m3.bitmask();
173179
v3.storeu(dptr);
174180
if !mask3.all_zero() {
175181
let cn = mask3.first_offset();
@@ -183,7 +189,7 @@ pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
183189
dptr = dptr.add(LANES);
184190
sptr = sptr.add(LANES);
185191

186-
// Process v4
192+
let mask4 = m4.bitmask();
187193
v4.storeu(dptr);
188194
if !mask4.all_zero() {
189195
let cn = mask4.first_offset();

0 commit comments

Comments
 (0)