Security review fixes (addresses #3)

jensens · claude · jensens · commit e12216af679a · 2026-02-21T22:06:53.000+01:00
- CODEC-C1: Validate non-negative length in LONG4 and BINSTRING opcodes
- CODEC-C2: Cap memo size at 100,000 entries to prevent OOM via LONG_BINPUT
- CODEC-H1: Add recursion depth limit (1,000) to encoder and PyObject converter
- CODEC-H2: Pre-scan dict keys to avoid quadratic re-processing of mixed-key dicts
- CODEC-M1: Limit LONG opcode text representation to 10,000 characters
- CODEC-M2: Reject odd-length item lists in BTree bucket format_flat_data()
- CODEC-M3: Cap BINUNICODE8/BINBYTES8 length at 256 MB before allocation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
@@ -153,9 +153,10 @@ fields, persistent refs) where the codec is **1.1-1.8x faster** decode and
 8. **Pre-allocated decoder vectors** — stack, memo, and metastack start with
    `Vec::with_capacity` instead of empty, reducing reallocations during parsing.
 
-9. **Single-pass Dict decode** — removed the O(n) `all_string_keys` pre-scan.
-   Optimistically builds string-key PyDict in one pass; falls back to `@d`
-   format only if a non-string key is encountered (extremely rare in ZODB).
+9. **Pre-scan Dict decode** — checks `all_string_keys` with a cheap enum
+   discriminant scan before processing values. Builds string-key PyDict if
+   all keys are strings (>99% of ZODB dicts); otherwise uses `@d` format.
+   Avoids quadratic re-processing when mixed-key dicts are encountered.
 
 10. **Set/frozenset move** — REDUCE handler for `builtins.set`/`frozenset`
     moves the list items by value instead of cloning the entire Vec.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,18 @@
 # Changelog
 
+## 1.2.2
+
+Security review fixes (addresses #3):
+
+- **CODEC-C1:** Validate non-negative length in LONG4 and BINSTRING opcodes.
+- **CODEC-C2:** Cap memo size at 100,000 entries to prevent OOM via LONG_BINPUT.
+- **CODEC-H1:** Add recursion depth limit (1,000) to encoder and PyObject converter.
+- **CODEC-H2:** Pre-scan dict keys to avoid quadratic re-processing of mixed-key dicts.
+- **CODEC-M1:** Limit LONG opcode text representation to 10,000 characters.
+- **CODEC-M2:** Reject odd-length item lists in BTree bucket `format_flat_data()`.
+- **CODEC-M3:** Cap BINUNICODE8/BINBYTES8 length at 256 MB before allocation.
+
+
 ## 1.2.1 (2026-02-17)
 
 - Fix shared reference data loss: update memo after BUILD [#2]
diff --git a/src/btrees.rs b/src/btrees.rs
@@ -182,6 +182,11 @@ fn format_flat_data(
 
     if info.is_map {
         // Map type: alternating key-value pairs → @kv: [[k, v], ...]
+        if items.len() % 2 != 0 {
+            return Err(CodecError::InvalidData(
+                "BTree bucket has odd number of items for key-value pairs".to_string(),
+            ));
+        }
         let mut pairs = Vec::new();
         let mut i = 0;
         while i + 1 < items.len() {
@@ -241,6 +246,11 @@ fn bucket_state_to_json(
             let mut result_map = Map::new();
 
             if info.is_map {
+                if flat_data.len() % 2 != 0 {
+                    return Err(CodecError::InvalidData(
+                        "BTree bucket has odd number of items for key-value pairs".to_string(),
+                    ));
+                }
                 let mut pairs = Vec::new();
                 let mut i = 0;
                 while i + 1 < flat_data.len() {
@@ -751,4 +761,28 @@ mod tests {
         let json = btree_state_to_json(&info, &state, &pickle_value_to_json).unwrap();
         assert_eq!(json, json!({"@kv": []}));
     }
+
+    #[test]
+    fn test_format_flat_data_odd_items_error() {
+        let info = BTreeClassInfo {
+            kind: BTreeNodeKind::Bucket,
+            is_map: true,
+        };
+        // 3 items — odd number for key-value pairs
+        let items = vec![
+            PickleValue::Int(1),
+            PickleValue::String("one".to_string()),
+            PickleValue::Int(2),
+        ];
+        let to_json = |v: &PickleValue| -> Result<serde_json::Value, CodecError> {
+            match v {
+                PickleValue::Int(i) => Ok(serde_json::json!(*i)),
+                PickleValue::String(s) => Ok(serde_json::json!(s)),
+                _ => Err(CodecError::InvalidData("unexpected".to_string())),
+            }
+        };
+        let result = format_flat_data(&info, &items, &to_json);
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains("odd number"));
+    }
 }
diff --git a/src/decode.rs b/src/decode.rs
@@ -3,6 +3,9 @@ use crate::opcodes::*;
 use crate::types::PickleValue;
 use num_bigint::BigInt;
 
+const MAX_MEMO_SIZE: usize = 100_000;
+const MAX_BINARY_SIZE: u64 = 256 * 1024 * 1024; // 256 MB
+
 /// Decode pickle bytes into a PickleValue AST.
 ///
 /// This implements a subset of the pickle virtual machine sufficient
@@ -99,6 +102,9 @@ impl<'a> Decoder<'a> {
                     let line = self.read_line()?;
                     let s = std::str::from_utf8(line).map_err(|_| CodecError::InvalidUtf8)?;
                     let s = s.trim().trim_end_matches('L');
+                    if s.len() > 10_000 {
+                        return Err(CodecError::InvalidData("LONG value too large".to_string()));
+                    }
                     let val: BigInt = s
                         .parse()
                         .map_err(|e| CodecError::InvalidData(format!("LONG parse: {e}")))?;
@@ -120,7 +126,11 @@ impl<'a> Decoder<'a> {
                     }
                 }
                 LONG4 => {
-                    let n = self.read_i32()? as usize;
+                    let n = self.read_i32()?;
+                    if n < 0 {
+                        return Err(CodecError::InvalidData("negative length in LONG4".to_string()));
+                    }
+                    let n = n as usize;
                     let bytes = self.read_bytes(n)?;
                     let val = BigInt::from_signed_bytes_le(bytes);
                     if let Ok(v) = i64::try_from(&val) {
@@ -148,7 +158,11 @@ impl<'a> Decoder<'a> {
 
                 // -- Strings (Python 2 str / bytes) --
                 BINSTRING => {
-                    let n = self.read_i32()? as usize;
+                    let n = self.read_i32()?;
+                    if n < 0 {
+                        return Err(CodecError::InvalidData("negative length in BINSTRING".to_string()));
+                    }
+                    let n = n as usize;
                     let bytes = self.read_bytes(n)?.to_vec();
                     self.push(PickleValue::Bytes(bytes));
                 }
@@ -193,7 +207,11 @@ impl<'a> Decoder<'a> {
                     self.push(PickleValue::String(s.to_string()));
                 }
                 BINUNICODE8 => {
-                    let n = self.read_u64()? as usize;
+                    let n = self.read_u64()?;
+                    if n > MAX_BINARY_SIZE {
+                        return Err(CodecError::InvalidData("BINUNICODE8 data too large".to_string()));
+                    }
+                    let n = n as usize;
                     let bytes = self.read_bytes(n)?;
                     let s =
                         std::str::from_utf8(bytes).map_err(|_| CodecError::InvalidUtf8)?;
@@ -212,7 +230,11 @@ impl<'a> Decoder<'a> {
                     self.push(PickleValue::Bytes(bytes));
                 }
                 BINBYTES8 => {
-                    let n = self.read_u64()? as usize;
+                    let n = self.read_u64()?;
+                    if n > MAX_BINARY_SIZE {
+                        return Err(CodecError::InvalidData("BINBYTES8 data too large".to_string()));
+                    }
+                    let n = n as usize;
                     let bytes = self.read_bytes(n)?.to_vec();
                     self.push(PickleValue::Bytes(bytes));
                 }
@@ -555,17 +577,17 @@ impl<'a> Decoder<'a> {
                 BINPUT => {
                     let idx = self.read_u8()? as usize;
                     let val = self.peek_value()?.clone();
-                    self.memo_put(idx, val);
+                    self.memo_put(idx, val)?;
                 }
                 LONG_BINPUT => {
                     let idx = self.read_u32()? as usize;
                     let val = self.peek_value()?.clone();
-                    self.memo_put(idx, val);
+                    self.memo_put(idx, val)?;
                 }
                 MEMOIZE => {
                     let val = self.peek_value()?.clone();
                     let idx = self.memo.len();
-                    self.memo_put(idx, val);
+                    self.memo_put(idx, val)?;
                 }
                 BINGET => {
                     let idx = self.read_u8()? as usize;
@@ -585,7 +607,7 @@ impl<'a> Decoder<'a> {
                         .parse()
                         .map_err(|e| CodecError::InvalidData(format!("PUT index: {e}")))?;
                     let val = self.peek_value()?.clone();
-                    self.memo_put(idx, val);
+                    self.memo_put(idx, val)?;
                 }
                 GET => {
                     let line = self.read_line()?;
@@ -705,11 +727,15 @@ impl<'a> Decoder<'a> {
 
     // -- Memo operations --
 
-    fn memo_put(&mut self, idx: usize, val: PickleValue) {
+    fn memo_put(&mut self, idx: usize, val: PickleValue) -> Result<(), CodecError> {
+        if idx >= MAX_MEMO_SIZE {
+            return Err(CodecError::InvalidData(format!("memo index {idx} exceeds maximum {MAX_MEMO_SIZE}")));
+        }
         if idx >= self.memo.len() {
             self.memo.resize(idx + 1, PickleValue::None);
         }
         self.memo[idx] = val;
+        Ok(())
     }
 
     fn memo_get(&self, idx: usize) -> Result<PickleValue, CodecError> {
@@ -881,4 +907,61 @@ mod tests {
             panic!("expected Tuple, got {:?}", result);
         }
     }
+
+    #[test]
+    fn test_long4_negative_length() {
+        // PROTO 2, LONG4 with length=-1 (0xFFFFFFFF as i32)
+        let data = b"\x80\x02\x8b\xff\xff\xff\xff";
+        let err = decode_pickle(data).unwrap_err();
+        assert!(err.to_string().contains("negative length"));
+    }
+
+    #[test]
+    fn test_binstring_negative_length() {
+        // PROTO 2, BINSTRING with length=-1
+        let data = b"\x80\x02T\xff\xff\xff\xff";
+        let err = decode_pickle(data).unwrap_err();
+        assert!(err.to_string().contains("negative length"));
+    }
+
+    #[test]
+    fn test_memo_index_too_large() {
+        // PROTO 2, NONE, LONG_BINPUT with index=4_000_000_000
+        let idx_bytes = 4_000_000_000u32.to_le_bytes();
+        let mut data = vec![0x80, 0x02, b'N', b'r'];
+        data.extend_from_slice(&idx_bytes);
+        let err = decode_pickle(&data).unwrap_err();
+        assert!(err.to_string().contains("memo index"));
+    }
+
+    #[test]
+    fn test_long_value_too_large() {
+        // PROTO 2, LONG with huge text representation
+        let mut data = vec![0x80, 0x02, b'L'];
+        data.extend_from_slice(&vec![b'9'; 20_000]);
+        data.push(b'\n');
+        data.push(b'.');
+        let err = decode_pickle(&data).unwrap_err();
+        assert!(err.to_string().contains("too large"));
+    }
+
+    #[test]
+    fn test_binunicode8_too_large() {
+        // PROTO 4, BINUNICODE8 with huge length
+        let mut data = vec![0x80, 0x04];
+        data.push(0x8d); // BINUNICODE8
+        data.extend_from_slice(&(1u64 << 40).to_le_bytes()); // 1 TB
+        let err = decode_pickle(&data).unwrap_err();
+        assert!(err.to_string().contains("too large"));
+    }
+
+    #[test]
+    fn test_binbytes8_too_large() {
+        // PROTO 4, BINBYTES8 with huge length
+        let mut data = vec![0x80, 0x04];
+        data.push(0x8e); // BINBYTES8
+        data.extend_from_slice(&(1u64 << 40).to_le_bytes()); // 1 TB
+        let err = decode_pickle(&data).unwrap_err();
+        assert!(err.to_string().contains("too large"));
+    }
 }
diff --git a/src/encode.rs b/src/encode.rs
diff --git a/src/pyconv.rs b/src/pyconv.rs