Skip to content

Commit e12216a

Browse files
jensensclaude
andcommitted
Security review fixes (addresses #3)
- CODEC-C1: Validate non-negative length in LONG4 and BINSTRING opcodes - CODEC-C2: Cap memo size at 100,000 entries to prevent OOM via LONG_BINPUT - CODEC-H1: Add recursion depth limit (1,000) to encoder and PyObject converter - CODEC-H2: Pre-scan dict keys to avoid quadratic re-processing of mixed-key dicts - CODEC-M1: Limit LONG opcode text representation to 10,000 characters - CODEC-M2: Reject odd-length item lists in BTree bucket format_flat_data() - CODEC-M3: Cap BINUNICODE8/BINBYTES8 length at 256 MB before allocation Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c3de46a commit e12216a

6 files changed

Lines changed: 273 additions & 102 deletions

File tree

BENCHMARKS.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,10 @@ fields, persistent refs) where the codec is **1.1-1.8x faster** decode and
153153
8. **Pre-allocated decoder vectors** — stack, memo, and metastack start with
154154
`Vec::with_capacity` instead of empty, reducing reallocations during parsing.
155155

156-
9. **Single-pass Dict decode** — removed the O(n) `all_string_keys` pre-scan.
157-
Optimistically builds string-key PyDict in one pass; falls back to `@d`
158-
format only if a non-string key is encountered (extremely rare in ZODB).
156+
9. **Pre-scan Dict decode** — checks `all_string_keys` with a cheap enum
157+
discriminant scan before processing values. Builds string-key PyDict if
158+
all keys are strings (>99% of ZODB dicts); otherwise uses `@d` format.
159+
Avoids quadratic re-processing when mixed-key dicts are encountered.
159160

160161
10. **Set/frozenset move** — REDUCE handler for `builtins.set`/`frozenset`
161162
moves the list items by value instead of cloning the entire Vec.

CHANGES.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
# Changelog
22

3+
## 1.2.2
4+
5+
Security review fixes (addresses #3):
6+
7+
- **CODEC-C1:** Validate non-negative length in LONG4 and BINSTRING opcodes.
8+
- **CODEC-C2:** Cap memo size at 100,000 entries to prevent OOM via LONG_BINPUT.
9+
- **CODEC-H1:** Add recursion depth limit (1,000) to encoder and PyObject converter.
10+
- **CODEC-H2:** Pre-scan dict keys to avoid quadratic re-processing of mixed-key dicts.
11+
- **CODEC-M1:** Limit LONG opcode text representation to 10,000 characters.
12+
- **CODEC-M2:** Reject odd-length item lists in BTree bucket `format_flat_data()`.
13+
- **CODEC-M3:** Cap BINUNICODE8/BINBYTES8 length at 256 MB before allocation.
14+
15+
316
## 1.2.1 (2026-02-17)
417

518
- Fix shared reference data loss: update memo after BUILD [#2]

src/btrees.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,11 @@ fn format_flat_data(
182182

183183
if info.is_map {
184184
// Map type: alternating key-value pairs → @kv: [[k, v], ...]
185+
if items.len() % 2 != 0 {
186+
return Err(CodecError::InvalidData(
187+
"BTree bucket has odd number of items for key-value pairs".to_string(),
188+
));
189+
}
185190
let mut pairs = Vec::new();
186191
let mut i = 0;
187192
while i + 1 < items.len() {
@@ -241,6 +246,11 @@ fn bucket_state_to_json(
241246
let mut result_map = Map::new();
242247

243248
if info.is_map {
249+
if flat_data.len() % 2 != 0 {
250+
return Err(CodecError::InvalidData(
251+
"BTree bucket has odd number of items for key-value pairs".to_string(),
252+
));
253+
}
244254
let mut pairs = Vec::new();
245255
let mut i = 0;
246256
while i + 1 < flat_data.len() {
@@ -751,4 +761,28 @@ mod tests {
751761
let json = btree_state_to_json(&info, &state, &pickle_value_to_json).unwrap();
752762
assert_eq!(json, json!({"@kv": []}));
753763
}
764+
765+
#[test]
766+
fn test_format_flat_data_odd_items_error() {
767+
let info = BTreeClassInfo {
768+
kind: BTreeNodeKind::Bucket,
769+
is_map: true,
770+
};
771+
// 3 items — odd number for key-value pairs
772+
let items = vec![
773+
PickleValue::Int(1),
774+
PickleValue::String("one".to_string()),
775+
PickleValue::Int(2),
776+
];
777+
let to_json = |v: &PickleValue| -> Result<serde_json::Value, CodecError> {
778+
match v {
779+
PickleValue::Int(i) => Ok(serde_json::json!(*i)),
780+
PickleValue::String(s) => Ok(serde_json::json!(s)),
781+
_ => Err(CodecError::InvalidData("unexpected".to_string())),
782+
}
783+
};
784+
let result = format_flat_data(&info, &items, &to_json);
785+
assert!(result.is_err());
786+
assert!(result.unwrap_err().to_string().contains("odd number"));
787+
}
754788
}

src/decode.rs

Lines changed: 92 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ use crate::opcodes::*;
33
use crate::types::PickleValue;
44
use num_bigint::BigInt;
55

6+
const MAX_MEMO_SIZE: usize = 100_000;
7+
const MAX_BINARY_SIZE: u64 = 256 * 1024 * 1024; // 256 MB
8+
69
/// Decode pickle bytes into a PickleValue AST.
710
///
811
/// This implements a subset of the pickle virtual machine sufficient
@@ -99,6 +102,9 @@ impl<'a> Decoder<'a> {
99102
let line = self.read_line()?;
100103
let s = std::str::from_utf8(line).map_err(|_| CodecError::InvalidUtf8)?;
101104
let s = s.trim().trim_end_matches('L');
105+
if s.len() > 10_000 {
106+
return Err(CodecError::InvalidData("LONG value too large".to_string()));
107+
}
102108
let val: BigInt = s
103109
.parse()
104110
.map_err(|e| CodecError::InvalidData(format!("LONG parse: {e}")))?;
@@ -120,7 +126,11 @@ impl<'a> Decoder<'a> {
120126
}
121127
}
122128
LONG4 => {
123-
let n = self.read_i32()? as usize;
129+
let n = self.read_i32()?;
130+
if n < 0 {
131+
return Err(CodecError::InvalidData("negative length in LONG4".to_string()));
132+
}
133+
let n = n as usize;
124134
let bytes = self.read_bytes(n)?;
125135
let val = BigInt::from_signed_bytes_le(bytes);
126136
if let Ok(v) = i64::try_from(&val) {
@@ -148,7 +158,11 @@ impl<'a> Decoder<'a> {
148158

149159
// -- Strings (Python 2 str / bytes) --
150160
BINSTRING => {
151-
let n = self.read_i32()? as usize;
161+
let n = self.read_i32()?;
162+
if n < 0 {
163+
return Err(CodecError::InvalidData("negative length in BINSTRING".to_string()));
164+
}
165+
let n = n as usize;
152166
let bytes = self.read_bytes(n)?.to_vec();
153167
self.push(PickleValue::Bytes(bytes));
154168
}
@@ -193,7 +207,11 @@ impl<'a> Decoder<'a> {
193207
self.push(PickleValue::String(s.to_string()));
194208
}
195209
BINUNICODE8 => {
196-
let n = self.read_u64()? as usize;
210+
let n = self.read_u64()?;
211+
if n > MAX_BINARY_SIZE {
212+
return Err(CodecError::InvalidData("BINUNICODE8 data too large".to_string()));
213+
}
214+
let n = n as usize;
197215
let bytes = self.read_bytes(n)?;
198216
let s =
199217
std::str::from_utf8(bytes).map_err(|_| CodecError::InvalidUtf8)?;
@@ -212,7 +230,11 @@ impl<'a> Decoder<'a> {
212230
self.push(PickleValue::Bytes(bytes));
213231
}
214232
BINBYTES8 => {
215-
let n = self.read_u64()? as usize;
233+
let n = self.read_u64()?;
234+
if n > MAX_BINARY_SIZE {
235+
return Err(CodecError::InvalidData("BINBYTES8 data too large".to_string()));
236+
}
237+
let n = n as usize;
216238
let bytes = self.read_bytes(n)?.to_vec();
217239
self.push(PickleValue::Bytes(bytes));
218240
}
@@ -555,17 +577,17 @@ impl<'a> Decoder<'a> {
555577
BINPUT => {
556578
let idx = self.read_u8()? as usize;
557579
let val = self.peek_value()?.clone();
558-
self.memo_put(idx, val);
580+
self.memo_put(idx, val)?;
559581
}
560582
LONG_BINPUT => {
561583
let idx = self.read_u32()? as usize;
562584
let val = self.peek_value()?.clone();
563-
self.memo_put(idx, val);
585+
self.memo_put(idx, val)?;
564586
}
565587
MEMOIZE => {
566588
let val = self.peek_value()?.clone();
567589
let idx = self.memo.len();
568-
self.memo_put(idx, val);
590+
self.memo_put(idx, val)?;
569591
}
570592
BINGET => {
571593
let idx = self.read_u8()? as usize;
@@ -585,7 +607,7 @@ impl<'a> Decoder<'a> {
585607
.parse()
586608
.map_err(|e| CodecError::InvalidData(format!("PUT index: {e}")))?;
587609
let val = self.peek_value()?.clone();
588-
self.memo_put(idx, val);
610+
self.memo_put(idx, val)?;
589611
}
590612
GET => {
591613
let line = self.read_line()?;
@@ -705,11 +727,15 @@ impl<'a> Decoder<'a> {
705727

706728
// -- Memo operations --
707729

708-
fn memo_put(&mut self, idx: usize, val: PickleValue) {
730+
fn memo_put(&mut self, idx: usize, val: PickleValue) -> Result<(), CodecError> {
731+
if idx >= MAX_MEMO_SIZE {
732+
return Err(CodecError::InvalidData(format!("memo index {idx} exceeds maximum {MAX_MEMO_SIZE}")));
733+
}
709734
if idx >= self.memo.len() {
710735
self.memo.resize(idx + 1, PickleValue::None);
711736
}
712737
self.memo[idx] = val;
738+
Ok(())
713739
}
714740

715741
fn memo_get(&self, idx: usize) -> Result<PickleValue, CodecError> {
@@ -881,4 +907,61 @@ mod tests {
881907
panic!("expected Tuple, got {:?}", result);
882908
}
883909
}
910+
911+
#[test]
912+
fn test_long4_negative_length() {
913+
// PROTO 2, LONG4 with length=-1 (0xFFFFFFFF as i32)
914+
let data = b"\x80\x02\x8b\xff\xff\xff\xff";
915+
let err = decode_pickle(data).unwrap_err();
916+
assert!(err.to_string().contains("negative length"));
917+
}
918+
919+
#[test]
920+
fn test_binstring_negative_length() {
921+
// PROTO 2, BINSTRING with length=-1
922+
let data = b"\x80\x02T\xff\xff\xff\xff";
923+
let err = decode_pickle(data).unwrap_err();
924+
assert!(err.to_string().contains("negative length"));
925+
}
926+
927+
#[test]
928+
fn test_memo_index_too_large() {
929+
// PROTO 2, NONE, LONG_BINPUT with index=4_000_000_000
930+
let idx_bytes = 4_000_000_000u32.to_le_bytes();
931+
let mut data = vec![0x80, 0x02, b'N', b'r'];
932+
data.extend_from_slice(&idx_bytes);
933+
let err = decode_pickle(&data).unwrap_err();
934+
assert!(err.to_string().contains("memo index"));
935+
}
936+
937+
#[test]
938+
fn test_long_value_too_large() {
939+
// PROTO 2, LONG with huge text representation
940+
let mut data = vec![0x80, 0x02, b'L'];
941+
data.extend_from_slice(&vec![b'9'; 20_000]);
942+
data.push(b'\n');
943+
data.push(b'.');
944+
let err = decode_pickle(&data).unwrap_err();
945+
assert!(err.to_string().contains("too large"));
946+
}
947+
948+
#[test]
949+
fn test_binunicode8_too_large() {
950+
// PROTO 4, BINUNICODE8 with huge length
951+
let mut data = vec![0x80, 0x04];
952+
data.push(0x8d); // BINUNICODE8
953+
data.extend_from_slice(&(1u64 << 40).to_le_bytes()); // 1 TB
954+
let err = decode_pickle(&data).unwrap_err();
955+
assert!(err.to_string().contains("too large"));
956+
}
957+
958+
#[test]
959+
fn test_binbytes8_too_large() {
960+
// PROTO 4, BINBYTES8 with huge length
961+
let mut data = vec![0x80, 0x04];
962+
data.push(0x8e); // BINBYTES8
963+
data.extend_from_slice(&(1u64 << 40).to_le_bytes()); // 1 TB
964+
let err = decode_pickle(&data).unwrap_err();
965+
assert!(err.to_string().contains("too large"));
966+
}
884967
}

0 commit comments

Comments
 (0)