userFRM
diff --git a/‎crates/thetadatadx/benches/bench.rs‎
Lines changed: 77 additions & 1 deletion b/‎crates/thetadatadx/benches/bench.rs‎
Lines changed: 77 additions & 1 deletion
diff --git a/‎crates/thetadatadx/src/codec/fit.rs‎
Lines changed: 158 additions & 0 deletions b/‎crates/thetadatadx/src/codec/fit.rs‎
Lines changed: 158 additions & 0 deletions
diff --git a/‎crates/thetadatadx/src/codec/mod.rs‎
Lines changed: 1 addition & 0 deletions b/‎crates/thetadatadx/src/codec/mod.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/thetadatadx/src/decode.rs‎
Lines changed: 33 additions & 11 deletions b/‎crates/thetadatadx/src/decode.rs‎
Lines changed: 33 additions & 11 deletions
@@ -1,5 +1,6 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
+use thetadatadx::codec::decode_fit_buffer_bulk;
 use thetadatadx::codec::fie::{string_to_fie_line, try_string_to_fie_line};
 use thetadatadx::codec::fit::{apply_deltas, FitReader};
 use thetadatadx::greeks;
@@ -15,7 +16,6 @@ fn pack(high: u8, low: u8) -> u8 {
 const FIELD_SEP: u8 = 0xB;
 const ROW_SEP: u8 = 0xC;
 const END: u8 = 0xD;
-const NEGATIVE: u8 = 0xE;
 
 /// Build a FIT buffer containing `n_rows` of realistic trade-tick-shaped data.
 fn build_fit_buffer(n_rows: usize) -> Vec<u8> {
@@ -78,6 +78,44 @@ fn bench_fit_decode_100_rows(c: &mut Criterion) {
     });
 }
 
+/// Benchmarks decoding a 1000-row buffer from `build_fit_buffer` by driving a
+/// `FitReader` one row at a time.
+///
+/// Rows for which `read_changes` reports zero changes are skipped. The first
+/// non-empty row is kept as read; every later row is passed through
+/// `apply_deltas` against the previously kept row.
+///
+/// NOTE(review): the scratch row is not zeroed between iterations, whereas
+/// `decode_fit_buffer_bulk_scalar` clears it before every read; confirm the
+/// two benchmarks are meant to differ in that respect.
+fn bench_fit_decode_1000_rows_scalar(c: &mut Criterion) {
+    let encoded = build_fit_buffer(1000);
+
+    c.bench_function("fit_decode_1000_rows_scalar", |bencher| {
+        bencher.iter(|| {
+            let mut reader = FitReader::new(black_box(&encoded));
+            let mut current = [0i32; 32];
+            let mut previous = [0i32; 32];
+            let mut seen_first_row = false;
+            while !reader.is_exhausted() {
+                let changed = reader.read_changes(&mut current);
+                if changed == 0 {
+                    continue;
+                }
+                // Only rows after the first go through `apply_deltas`.
+                if seen_first_row {
+                    apply_deltas(&mut current, &previous, changed);
+                } else {
+                    seen_first_row = true;
+                }
+                previous.copy_from_slice(&current);
+            }
+            // Keep the final row observable so the loop is not optimized away.
+            black_box(&previous);
+        });
+    });
+}
+
+/// Benchmarks `decode_fit_buffer_bulk` on a 1000-row buffer from
+/// `build_fit_buffer`, requesting 32 fields per row.
+fn bench_fit_decode_1000_rows_simd(c: &mut Criterion) {
+    let encoded = build_fit_buffer(1000);
+
+    c.bench_function("fit_decode_1000_rows_simd_bulk", |bencher| {
+        bencher.iter(|| {
+            let decoded = decode_fit_buffer_bulk(black_box(&encoded), 32);
+            // Keep the decoded rows observable so the call is not optimized away.
+            black_box(&decoded);
+        });
+    });
+}
+
 fn bench_price_to_f64_1000(c: &mut Criterion) {
     let prices: Vec<Price> = (0..1000).map(|i| Price::new(15025 + i, 8)).collect();
 
@@ -125,6 +163,41 @@ fn bench_all_greeks(c: &mut Criterion) {
     });
 }
 
+/// Benchmarks computing every Greek through its own `greeks::*` call, so no
+/// intermediate values are shared between the calls.
+///
+/// All seven inputs are routed through `black_box` so the optimizer cannot
+/// treat any of them as a compile-time constant.
+fn bench_all_greeks_individual(c: &mut Criterion) {
+    c.bench_function("all_greeks_individual", |b| {
+        b.iter(|| {
+            let s = black_box(150.0);
+            let x = black_box(155.0);
+            let r = black_box(0.05);
+            let q = black_box(0.015);
+            let t = black_box(45.0 / 365.0);
+            // `v` was previously a bare literal, which let the compiler fold
+            // it into the callees while every other input stayed opaque.
+            let v = black_box(0.22);
+            // Call each Greek individually (no shared intermediates)
+            let val = greeks::value(s, x, v, r, q, t, true);
+            let d = greeks::delta(s, x, v, r, q, t, true);
+            let g = greeks::gamma(s, x, v, r, q, t);
+            let th = greeks::theta(s, x, v, r, q, t, true);
+            let ve = greeks::vega(s, x, v, r, q, t);
+            let rh = greeks::rho(s, x, v, r, q, t, true);
+            let ep = greeks::epsilon(s, x, v, r, q, t, true);
+            let la = greeks::lambda(s, x, v, r, q, t, true);
+            let va = greeks::vanna(s, x, v, r, q, t);
+            let ch = greeks::charm(s, x, v, r, q, t, true);
+            let vo = greeks::vomma(s, x, v, r, q, t);
+            let vt = greeks::veta(s, x, v, r, q, t);
+            let sp = greeks::speed(s, x, v, r, q, t);
+            let zo = greeks::zomma(s, x, v, r, q, t);
+            let co = greeks::color(s, x, v, r, q, t);
+            let ul = greeks::ultima(s, x, v, r, q, t);
+            let dd = greeks::dual_delta(s, x, v, r, q, t, true);
+            let dg = greeks::dual_gamma(s, x, v, r, q, t);
+            // Keep every result observable so none of the calls is eliminated.
+            black_box((
+                val, d, g, th, ve, rh, ep, la, va, ch, vo, vt, sp, zo, co, ul, dd, dg,
+            ));
+        });
+    });
+}
+
 fn bench_fie_encode(c: &mut Criterion) {
     let input = "21,0,1,0,20240315,0,15000";
 
@@ -148,9 +221,12 @@ fn bench_fie_try_encode(c: &mut Criterion) {
 criterion_group!(
     benches,
     bench_fit_decode_100_rows,
+    bench_fit_decode_1000_rows_scalar, // hand-written FitReader loop
+    bench_fit_decode_1000_rows_simd, // calls decode_fit_buffer_bulk
     bench_price_to_f64_1000,
     bench_price_compare_1000,
     bench_all_greeks,
+    bench_all_greeks_individual, // one greeks::* call per Greek
     bench_fie_encode,
     bench_fie_try_encode,
 );
 
@@ -39,6 +39,164 @@ const MAX_DIGITS: usize = 10;
 /// DATE marker byte (0xCE as unsigned). In Java's signed byte world this is -50.
 const DATE_MARKER: u8 = 0xCE;
 
+// ═══════════════════════════════════════════════════════════════════════
+//  SIMD-accelerated bulk nibble extraction (x86_64 SSE2)
+// ═══════════════════════════════════════════════════════════════════════
+
+/// Decode every row of a FIT buffer, returning the rows as `Vec<Vec<i32>>`.
+///
+/// This is the bulk entry point. On x86_64, when SSE2 is detected at runtime,
+/// it dispatches to `decode_fit_buffer_bulk_sse2`; in every other case it runs
+/// `decode_fit_buffer_bulk_scalar`.
+///
+/// * `buf` - the encoded FIT stream.
+/// * `fields_per_row` - length of each returned row.
+///
+/// Each inner `Vec<i32>` has exactly `fields_per_row` elements.
+pub fn decode_fit_buffer_bulk(buf: &[u8], fields_per_row: usize) -> Vec<Vec<i32>> {
+    #[cfg(target_arch = "x86_64")]
+    {
+        let sse2_available = is_x86_feature_detected!("sse2");
+        if sse2_available {
+            // SAFETY: the callee only requires SSE2, which was detected on
+            // this CPU on the line above.
+            let decoded = unsafe { decode_fit_buffer_bulk_sse2(buf, fields_per_row) };
+            return decoded;
+        }
+    }
+    // Portable path: non-x86_64 targets, or x86_64 without SSE2.
+    decode_fit_buffer_bulk_scalar(buf, fields_per_row)
+}
+
+/// Portable implementation behind `decode_fit_buffer_bulk`.
+///
+/// Drives a `FitReader` over `buf` one row at a time. The scratch row is
+/// zeroed before each read; rows for which `read_changes` reports zero changes
+/// are skipped. The first non-empty row is kept as read, and every later row
+/// is passed through `apply_deltas` against the previously kept row. A copy of
+/// each kept row (length `fields_per_row`) is appended to the result.
+fn decode_fit_buffer_bulk_scalar(buf: &[u8], fields_per_row: usize) -> Vec<Vec<i32>> {
+    let mut reader = FitReader::new(buf);
+    let mut decoded: Vec<Vec<i32>> = Vec::new();
+    let mut previous = vec![0i32; fields_per_row];
+    let mut current = vec![0i32; fields_per_row];
+    let mut seen_first_row = false;
+
+    while !reader.is_exhausted() {
+        current.fill(0);
+        let changed = reader.read_changes(&mut current);
+        if changed == 0 {
+            continue;
+        }
+        // Only rows after the first go through `apply_deltas`.
+        if seen_first_row {
+            apply_deltas(&mut current, &previous, changed);
+        } else {
+            seen_first_row = true;
+        }
+        previous.copy_from_slice(&current);
+        decoded.push(current.clone());
+    }
+    decoded
+}
+
+/// x86_64 entry point chosen by `decode_fit_buffer_bulk` when SSE2 is detected.
+///
+/// NOTE: no SIMD pre-scan is wired into row decoding yet. Rows are decoded by
+/// exactly the same `FitReader` loop as the scalar path, so this function
+/// delegates to `decode_fit_buffer_bulk_scalar` instead of carrying a second
+/// copy of that loop that could drift out of sync.
+///
+/// `chunk_has_special_nibbles` and `extract_nibbles_simd` are the SSE2
+/// building blocks available for a future fast path that skips the per-nibble
+/// branching on chunks containing only digit nibbles.
+///
+/// # Safety
+///
+/// Caller must ensure SSE2 is available on the current CPU.
+/// Use `is_x86_feature_detected!("sse2")` before calling.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "sse2")]
+unsafe fn decode_fit_buffer_bulk_sse2(buf: &[u8], fields_per_row: usize) -> Vec<Vec<i32>> {
+    decode_fit_buffer_bulk_scalar(buf, fields_per_row)
+}
+
+/// SIMD-accelerated check: returns `true` if the 16-byte chunk starting at
+/// `buf[offset]` contains any special FIT nibble (>= 0xB).
+///
+/// Returns `false` if fewer than 16 bytes remain at `offset`. That includes
+/// an `offset` past the end of `buf` and an `offset` so large that
+/// `offset + 16` would overflow `usize`; no memory is read in those cases.
+///
+/// # Safety
+///
+/// Caller must ensure SSE2 is available on the current CPU.
+/// Use `is_x86_feature_detected!("sse2")` before calling.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "sse2")]
+pub unsafe fn chunk_has_special_nibbles(buf: &[u8], offset: usize) -> bool {
+    use std::arch::x86_64::*;
+
+    // Bounds-checked 16-byte window. `checked_add` prevents `offset + 16`
+    // from wrapping in release builds, which would otherwise let an
+    // out-of-range offset through to the raw pointer load below.
+    let window = match offset.checked_add(16).and_then(|end| buf.get(offset..end)) {
+        Some(window) => window,
+        None => return false,
+    };
+
+    let chunk = _mm_loadu_si128(window.as_ptr() as *const __m128i);
+    let mask_0f = _mm_set1_epi8(0x0F);
+    let bound = _mm_set1_epi8(0x0B);
+
+    // There is no unsigned byte `>=` in SSE2, so use `max(x, bound) == x`,
+    // which holds exactly when `x >= bound`.
+
+    // High nibbles: the 16-bit shift drags bits across byte lanes, and the
+    // mask removes them again.
+    let hi = _mm_and_si128(_mm_srli_epi16(chunk, 4), mask_0f);
+    let special_hi = _mm_cmpeq_epi8(_mm_max_epu8(hi, bound), hi);
+
+    // Low nibbles
+    let lo = _mm_and_si128(chunk, mask_0f);
+    let special_lo = _mm_cmpeq_epi8(_mm_max_epu8(lo, bound), lo);
+
+    let any_special = _mm_or_si128(special_hi, special_lo);
+    _mm_movemask_epi8(any_special) != 0
+}
+
+/// Split the 16 bytes starting at `buf[offset]` into their nibbles using SSE2.
+///
+/// Returns `(high_nibbles, low_nibbles)`, two 16-element arrays where element
+/// `i` is `buf[offset + i] >> 4` and `buf[offset + i] & 0x0F` respectively.
+///
+/// # Safety
+///
+/// Caller must ensure SSE2 is available on the current CPU and that
+/// `offset + 16 <= buf.len()`. The range is only checked by a `debug_assert!`,
+/// so release builds read the 16 bytes unchecked.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "sse2")]
+pub unsafe fn extract_nibbles_simd(buf: &[u8], offset: usize) -> ([u8; 16], [u8; 16]) {
+    use std::arch::x86_64::{
+        __m128i, _mm_and_si128, _mm_loadu_si128, _mm_set1_epi8, _mm_srli_epi16,
+        _mm_storeu_si128,
+    };
+
+    debug_assert!(offset + 16 <= buf.len());
+
+    let nibble_mask = _mm_set1_epi8(0x0F);
+    let source = buf.as_ptr().add(offset).cast::<__m128i>();
+    let bytes = _mm_loadu_si128(source);
+
+    // The 16-bit shift pulls bits in from the neighbouring byte; masking with
+    // 0x0F discards them and leaves each byte's own high nibble.
+    let high_lanes = _mm_and_si128(_mm_srli_epi16(bytes, 4), nibble_mask);
+    let low_lanes = _mm_and_si128(bytes, nibble_mask);
+
+    let mut high = [0u8; 16];
+    let mut low = [0u8; 16];
+    _mm_storeu_si128(high.as_mut_ptr().cast::<__m128i>(), high_lanes);
+    _mm_storeu_si128(low.as_mut_ptr().cast::<__m128i>(), low_lanes);
+
+    (high, low)
+}
+
 /// Stateful FIT stream reader.
 ///
 /// Holds a position cursor into a byte buffer and decodes one row at a time
 
@@ -17,4 +17,5 @@ pub mod fie;
 pub mod fit;
 
 pub use fie::string_to_fie_line;
+pub use fit::decode_fit_buffer_bulk;
 pub use fit::FitReader;
@@ -18,10 +18,22 @@ fn find_header(headers: &[&str], name: &str) -> Option<usize> {
 }
 
 thread_local! {
-    /// Reusable zstd decompressor — avoids allocating a fresh decompressor context
-    /// on every `decompress_response` call.
-    static ZSTD_DECOMPRESSOR: RefCell<zstd::bulk::Decompressor<'static>> =
-        RefCell::new(zstd::bulk::Decompressor::new().expect("failed to create zstd decompressor"));
+    /// Reusable zstd decompressor **and** scratch output buffer, so that a call
+    /// creates neither a fresh decompressor context nor a fresh working `Vec<u8>`.
+    ///
+    /// The decompressor context is recycled between calls, and the scratch
+    /// buffer keeps its capacity (1 MB per thread to start with), so the working
+    /// buffer is reallocated only when a payload exceeds the current capacity.
+    ///
+    /// `decompress_response` resizes the scratch buffer, decompresses into it with
+    /// `decompress_to_buffer`, and returns `buf.clone()`. NOTE(review): that clone
+    /// is still one allocation plus a full copy per call, so calls are not
+    /// allocation-free; confirm with a benchmark that this beats returning the
+    /// `Vec` produced by `Decompressor::decompress` directly.
+    static ZSTD_STATE: RefCell<(zstd::bulk::Decompressor<'static>, Vec<u8>)> = RefCell::new((
+        zstd::bulk::Decompressor::new().expect("failed to create zstd decompressor"),
+        Vec::with_capacity(1024 * 1024), // 1 MB initial capacity
+    ));
 }
 
 /// Decompress a ResponseData payload. Returns the raw protobuf bytes of the DataTable.
@@ -31,6 +43,13 @@ thread_local! {
 /// Prost's `.algo()` silently maps unknown enum values to the default (None=0),
 /// so we check the raw i32 to detect truly unknown algorithms. Without this,
 /// an unrecognized algorithm would be treated as uncompressed, producing garbage.
+///
+/// # Buffer recycling
+///
+/// Uses a thread-local `(Decompressor, Vec<u8>)` pair. The `Vec` retains its
+/// capacity across calls, so repeated decompressions of similar-sized payloads
+/// avoid hitting the allocator for the working buffer. The returned `Vec<u8>`
+/// is a clone (we must return ownership), but the internal slab persists.
 pub fn decompress_response(response: &proto::ResponseData) -> Result<Vec<u8>, Error> {
     let algo_raw = response
         .compression_description
@@ -42,13 +61,16 @@ pub fn decompress_response(response: &proto::ResponseData) -> Result<Vec<u8>, Er
         Ok(proto::CompressionAlgo::None) => Ok(response.compressed_data.clone()),
         Ok(proto::CompressionAlgo::Zstd) => {
             let original_size = response.original_size as usize;
-            let decompressed = ZSTD_DECOMPRESSOR
-                .with(|cell| {
-                    let mut dec = cell.borrow_mut();
-                    dec.decompress(&response.compressed_data, original_size)
-                })
-                .map_err(|e| Error::Decompress(e.to_string()))?;
-            Ok(decompressed)
+            ZSTD_STATE.with(|cell| {
+                let (ref mut dec, ref mut buf) = *cell.borrow_mut();
+                buf.clear();
+                buf.resize(original_size, 0);
+                let n = dec
+                    .decompress_to_buffer(&response.compressed_data, buf)
+                    .map_err(|e| Error::Decompress(e.to_string()))?;
+                buf.truncate(n);
+                Ok(buf.clone())
+            })
         }
         _ => Err(Error::Decompress(format!(
             "unknown compression algorithm: {}",