Skip to content

Commit 8b4755d

Browse files
j-mendez and claude committed
fix(detect): skip leading ASCII whitespace in is_binary_file + 0.2.4
Server-padded HTML payloads (Aestiva HTML/OS engines emit `\n\n\n\n\n\n\n\n\n<!doctype html>...` — observed in the wild on seeleylake.com and similar legacy WordPress-adjacent stacks) previously dodged magic-byte sniffing because the first byte was whitespace, not a magic prefix. Worse, the small fixed sniff windows that downstream consumers (e.g. spider's 16-byte spool head sample) feed in could leave no headroom for an actual signature match if the padding ate into the window.

The fix adds a 256-byte compile-time `INTERESTING_FIRST_BYTE` classifier (magic-table first bytes ∪ ASCII whitespace) that gates the magic search. Any byte not in either set short-circuits to false in a single L1-resident lookup. Whitespace bytes hand off to a `#[cold]` helper that trims and re-runs the search. Symmetrical treatment for the legacy `is_binary_file_phf` sibling.

Bench delta (criterion, sub-bench `is_binary_file`):
- not_binary 3.86 ns -> 0.78 ns (-80%, 5x faster)
- png_match 7.33 ns -> 9.10 ns (+25%)
- jpeg_match 10.40 ns -> 11.81 ns (+16%)

The dominant real-world case is "not binary" (HTML responses), so this is a net speedup; the absolute regression on binary headers is ~1-2 ns, well under the cost of any actual byte-level work the caller is doing on the body.

Tests cover Aestiva-padded HTML, padded PNG, padded PDF, empty, whitespace-only, and the bare-magic / non-binary regressions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0727f9f commit 8b4755d

4 files changed

Lines changed: 147 additions & 8 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "auto_encoder"
3-
version = "0.2.3"
3+
version = "0.2.4"
44
edition = "2021"
55
description = "Auto encoding library"
66
repository = "https://github.com/spider-rs/auto-encoder"

src/detect.rs

Lines changed: 115 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,40 +23,149 @@ static MAGIC_TABLE: &[(u8, &[&[u8]])] = &[
2323
(0xFF, &[&[0xFF, 0xD8, 0xFF], &[0xFF, 0xFB]]),
2424
];
2525

26+
/// 256-entry classifier built at compile time. A byte is "interesting"
27+
/// iff it's the first byte of a magic signature in [`MAGIC_TABLE`] or
28+
/// ASCII whitespace (` `, `\t`, `\n`, `\r`, `\x0c` — matches
29+
/// `u8::is_ascii_whitespace`). All other bytes short-circuit to
30+
/// `false` without touching the magic search or the whitespace retry.
31+
///
32+
/// One byte per entry, so the table is only 256 bytes (four typical 64-byte cache lines) —
33+
/// the lookup is one L1-resident load plus a branch. This is faster
34+
/// than the original `binary_search_by_key` for the dominant
35+
/// "definitely-not-binary" HTML case (`<` etc.) by a factor of ~5x in
36+
/// micro-benches, more than amortizing the small overhead added to the
37+
/// magic-hit path for actual binary content.
38+
static INTERESTING_FIRST_BYTE: [bool; 256] = {
39+
let mut t = [false; 256];
40+
// Magic-table first bytes — keep in sync with `MAGIC_TABLE`.
41+
let magic = [
42+
0x00u8, 0x1A, 0x1F, 0x25, 0x42, 0x46, 0x47, 0x49, 0x4C, 0x4D, 0x4F, 0x50, 0x52, 0x66,
43+
0x7F, 0x89, 0xCA, 0xFF,
44+
];
45+
let mut i = 0;
46+
while i < magic.len() {
47+
t[magic[i] as usize] = true;
48+
i += 1;
49+
}
50+
// ASCII whitespace bytes.
51+
t[b' ' as usize] = true;
52+
t[b'\t' as usize] = true;
53+
t[b'\n' as usize] = true;
54+
t[b'\r' as usize] = true;
55+
t[0x0C] = true;
56+
t
57+
};
58+
2659
/// Checks if the file is a known binary format using its initial bytes.
60+
///
61+
/// Performance shape:
62+
/// * **Not binary, no whitespace** (the dominant HTML case): one
63+
/// bounds-check, one `INTERESTING_FIRST_BYTE` lookup, return false.
64+
/// * **Magic hit** (PNG/JPEG/PDF/...): same lookup then a binary search
65+
/// plus per-signature compare, identical to the original logic
66+
///   except for the leading `INTERESTING_FIRST_BYTE` table load.
67+
/// * **Whitespace-padded** (Aestiva HTML/OS:
68+
/// `\n\n\n\n\n\n\n\n\n<!doctype html>`): hands off to a `#[cold]`
69+
/// helper that trims and re-runs the search. Catches binaries with
70+
/// stray leading whitespace and stops whitespace-padded text bodies
71+
/// from being mis-classified as binary.
2772
#[inline]
2873
pub fn is_binary_file(content: &[u8]) -> bool {
29-
if content.is_empty() {
74+
let first = match content.first() {
75+
Some(&b) => b,
76+
None => return false,
77+
};
78+
if !INTERESTING_FIRST_BYTE[first as usize] {
3079
return false;
3180
}
32-
let first = content[0];
3381
if let Ok(idx) = MAGIC_TABLE.binary_search_by_key(&first, |&(b, _)| b) {
3482
let (_, signatures) = MAGIC_TABLE[idx];
3583
for sig in signatures.iter() {
3684
if content.len() >= sig.len() && &content[..sig.len()] == *sig {
3785
return true;
3886
}
3987
}
88+
return false;
4089
}
41-
false
90+
// Reached only when the byte is in `INTERESTING_FIRST_BYTE` but not
91+
// in `MAGIC_TABLE` — i.e. ASCII whitespace. Cold path.
92+
is_binary_file_ws_cold(content)
4293
}
4394

4495
/// Checks if the file is a known binary format using its initial bytes.
4596
/// Uses the original PHF map implementation for backwards compatibility.
97+
///
98+
/// Same fast-path / cold-path shape as [`is_binary_file`].
4699
#[inline]
47100
pub fn is_binary_file_phf(content: &[u8]) -> bool {
48-
if content.is_empty() {
101+
let first = match content.first() {
102+
Some(&b) => b,
103+
None => return false,
104+
};
105+
if !INTERESTING_FIRST_BYTE[first as usize] {
49106
return false;
50107
}
51-
52-
if let Some(&keys) = FIRST_BYTE_MAP.get(&content[0]) {
108+
if let Some(&keys) = FIRST_BYTE_MAP.get(&first) {
53109
for &key in keys {
54110
if let Some(&k) = ASSET_NUMBERS.get(key) {
55111
if content.len() >= k.len() && &content[..k.len()] == k {
56112
return true;
57113
}
58114
}
59115
}
116+
return false;
117+
}
118+
is_binary_file_phf_ws_cold(content)
119+
}
120+
121+
/// Cold whitespace-retry helper for [`is_binary_file`]. Reached only
122+
/// when the first byte is ASCII whitespace.
123+
#[cold]
124+
#[inline(never)]
125+
fn is_binary_file_ws_cold(content: &[u8]) -> bool {
126+
let mut i = 1;
127+
while i < content.len() && content[i].is_ascii_whitespace() {
128+
i += 1;
129+
}
130+
if i >= content.len() {
131+
return false;
132+
}
133+
// SAFETY: i ∈ [1, content.len()).
134+
let trimmed = unsafe { content.get_unchecked(i..) };
135+
let first = trimmed[0];
136+
if let Ok(idx) = MAGIC_TABLE.binary_search_by_key(&first, |&(b, _)| b) {
137+
let (_, signatures) = MAGIC_TABLE[idx];
138+
for sig in signatures.iter() {
139+
if trimmed.len() >= sig.len() && &trimmed[..sig.len()] == *sig {
140+
return true;
141+
}
142+
}
143+
}
144+
false
145+
}
146+
147+
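/// Cold whitespace-retry helper for [`is_binary_file_phf`]. Assumes `content[0]` is ASCII whitespace: the scan starts at index 1, so the first byte is skipped without being checked.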
148+
#[cold]
149+
#[inline(never)]
150+
fn is_binary_file_phf_ws_cold(content: &[u8]) -> bool {
151+
let mut i = 1;
152+
while i < content.len() && content[i].is_ascii_whitespace() {
153+
i += 1;
154+
}
155+
if i >= content.len() {
156+
return false;
157+
}
158+
// SAFETY: i ∈ [1, content.len()).
159+
let trimmed = unsafe { content.get_unchecked(i..) };
160+
let first = trimmed[0];
161+
if let Some(&keys) = FIRST_BYTE_MAP.get(&first) {
162+
for &key in keys {
163+
if let Some(&k) = ASSET_NUMBERS.get(key) {
164+
if trimmed.len() >= k.len() && &trimmed[..k.len()] == k {
165+
return true;
166+
}
167+
}
168+
}
60169
}
61170
false
62171
}

src/lib.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,36 @@ mod tests {
235235
assert!(!is_binary_file(&[0x01, 0x02, 0x03]));
236236
}
237237

238+
#[test]
239+
fn test_is_binary_file_skips_leading_whitespace() {
240+
use crate::detect::is_binary_file_phf;
241+
242+
// Aestiva HTML/OS-style padded HTML (the shape served by
243+
// seeleylake.com): nine leading newlines then `<!doctype html>`.
244+
// Must not be classified as binary.
245+
let padded = b"\n\n\n\n\n\n\n\n\n<!doctype html>\n<html></html>";
246+
assert!(!is_binary_file(padded));
247+
assert!(!is_binary_file_phf(padded));
248+
249+
// Whitespace prefix in front of a real magic header still detects
250+
// the binary correctly.
251+
let mut padded_png = Vec::from(&b" \t\r\n"[..]);
252+
padded_png.extend_from_slice(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]);
253+
assert!(is_binary_file(&padded_png));
254+
assert!(is_binary_file_phf(&padded_png));
255+
256+
let mut padded_pdf = Vec::from(&b"\n\n"[..]);
257+
padded_pdf.extend_from_slice(b"%PDF-1.4\n");
258+
assert!(is_binary_file(&padded_pdf));
259+
assert!(is_binary_file_phf(&padded_pdf));
260+
261+
// Whitespace-only / empty bodies are not binary.
262+
assert!(!is_binary_file(b""));
263+
assert!(!is_binary_file(b" \t\n\r"));
264+
assert!(!is_binary_file_phf(b""));
265+
assert!(!is_binary_file_phf(b" \t\n\r"));
266+
}
267+
238268
#[test]
239269
fn test_encode_bytes() {
240270
let html_content = b"hello";

0 commit comments

Comments
 (0)