Skip to content

Commit 819b21b

Browse files
jensensclaude
andcommitted
Fix pickle memo stale snapshot for mutable containers (fixes #18)
BINPUT clones the stack top into the memo, capturing the empty state BEFORE SETITEMS/APPENDS/ADDITEMS populate the container. Later BINGET returned the stale empty copy. Fix by tracking memo-to-stack bindings and syncing memo entries after in-place mutations. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 34526b5 commit 819b21b

2 files changed

Lines changed: 192 additions & 11 deletions

File tree

src/decode.rs

Lines changed: 152 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ struct Decoder<'a> {
3535
memo: Vec<PickleValue>,
3636
/// Metastack for MARK-based operations (saves/restores stack at MARK)
3737
metastack: Vec<Vec<PickleValue>>,
38+
/// Tracks which memo indices are bound to each stack slot (parallel to stack).
39+
/// When BINPUT stores from stack top, the memo index is recorded here.
40+
/// After in-place mutations (SETITEMS, APPENDS, etc.), stale memo entries
41+
/// are updated via sync_memo_top().
42+
stack_memo: Vec<Vec<usize>>,
43+
/// Saved stack_memo during MARK (parallel to metastack).
44+
meta_stack_memo: Vec<Vec<Vec<usize>>>,
3845
}
3946

4047
impl<'a> Decoder<'a> {
@@ -45,6 +52,8 @@ impl<'a> Decoder<'a> {
4552
stack: Vec::with_capacity(16),
4653
memo: Vec::with_capacity(16),
4754
metastack: Vec::with_capacity(4),
55+
stack_memo: Vec::with_capacity(16),
56+
meta_stack_memo: Vec::with_capacity(4),
4857
}
4958
}
5059

@@ -244,6 +253,8 @@ impl<'a> Decoder<'a> {
244253
// Save current stack, start a new one
245254
let old_stack = std::mem::take(&mut self.stack);
246255
self.metastack.push(old_stack);
256+
let old_sm = std::mem::take(&mut self.stack_memo);
257+
self.meta_stack_memo.push(old_sm);
247258
// Don't push Mark itself; everything above the mark
248259
// is captured by the current stack being empty
249260
}
@@ -299,6 +310,7 @@ impl<'a> Decoder<'a> {
299310
));
300311
}
301312
}
313+
self.sync_memo_top()?;
302314
}
303315
APPENDS => {
304316
let items = self.pop_mark()?;
@@ -323,6 +335,7 @@ impl<'a> Decoder<'a> {
323335
));
324336
}
325337
}
338+
self.sync_memo_top()?;
326339
}
327340

328341
// -- Dict --
@@ -356,6 +369,7 @@ impl<'a> Decoder<'a> {
356369
));
357370
}
358371
}
372+
self.sync_memo_top()?;
359373
}
360374
SETITEMS => {
361375
let items = self.pop_mark()?;
@@ -381,6 +395,7 @@ impl<'a> Decoder<'a> {
381395
));
382396
}
383397
}
398+
self.sync_memo_top()?;
384399
}
385400

386401
// -- Set/FrozenSet (protocol 4) --
@@ -395,6 +410,7 @@ impl<'a> Decoder<'a> {
395410
"ADDITEMS on non-set".to_string(),
396411
));
397412
}
413+
self.sync_memo_top()?;
398414
}
399415
FROZENSET => {
400416
let items = self.pop_mark()?;
@@ -493,9 +509,9 @@ impl<'a> Decoder<'a> {
493509
}
494510
BUILD => {
495511
let state = self.pop_value()?;
496-
let obj = self.pop_value()?;
497-
// Save pre-BUILD value so we can update stale memo entries.
498-
let old_obj = obj.clone();
512+
// Pop object and its memo bindings (so we can transfer them)
513+
let obj_bindings = self.stack_memo.pop().unwrap_or_default();
514+
let obj = self.stack.pop().ok_or(CodecError::StackUnderflow)?;
499515
match obj {
500516
PickleValue::Global { module, name } => {
501517
self.push(PickleValue::Instance(Box::new(InstanceData {
@@ -588,15 +604,12 @@ impl<'a> Decoder<'a> {
588604
})));
589605
}
590606
}
591-
// Update memo: replace stale pre-BUILD entries with the
592-
// new post-BUILD value so BINGET returns the correct form.
593-
// Move new_val on first match (typically only one entry).
594-
let new_val = self.peek_value()?.clone();
595-
for entry in self.memo.iter_mut() {
596-
if *entry == old_obj {
597-
*entry = new_val.clone();
598-
}
607+
// Transfer memo bindings from the old object to the new
608+
// stack top and update memo entries.
609+
if let Some(top_bindings) = self.stack_memo.last_mut() {
610+
top_bindings.extend(obj_bindings);
599611
}
612+
self.sync_memo_top()?;
600613
}
601614
NEWOBJ => {
602615
let args = self.pop_value()?;
@@ -645,16 +658,19 @@ impl<'a> Decoder<'a> {
645658
let idx = self.read_u8()? as usize;
646659
let val = self.peek_value()?.clone();
647660
self.memo_put(idx, val)?;
661+
self.record_memo_binding(idx);
648662
}
649663
LONG_BINPUT => {
650664
let idx = self.read_u32()? as usize;
651665
let val = self.peek_value()?.clone();
652666
self.memo_put(idx, val)?;
667+
self.record_memo_binding(idx);
653668
}
654669
MEMOIZE => {
655670
let val = self.peek_value()?.clone();
656671
let idx = self.memo.len();
657672
self.memo_put(idx, val)?;
673+
self.record_memo_binding(idx);
658674
}
659675
BINGET => {
660676
let idx = self.read_u8()? as usize;
@@ -675,6 +691,7 @@ impl<'a> Decoder<'a> {
675691
.map_err(|e| CodecError::InvalidData(format!("PUT index: {e}")))?;
676692
let val = self.peek_value()?.clone();
677693
self.memo_put(idx, val)?;
694+
self.record_memo_binding(idx);
678695
}
679696
GET => {
680697
let line = self.read_line()?;
@@ -761,10 +778,12 @@ impl<'a> Decoder<'a> {
761778
#[inline]
762779
fn push(&mut self, val: PickleValue) {
763780
self.stack.push(val);
781+
self.stack_memo.push(Vec::new());
764782
}
765783

766784
#[inline]
767785
fn pop_value(&mut self) -> Result<PickleValue, CodecError> {
786+
self.stack_memo.pop();
768787
self.stack.pop().ok_or(CodecError::StackUnderflow)
769788
}
770789

@@ -784,10 +803,16 @@ impl<'a> Decoder<'a> {
784803
// This is a pointer swap — no element-by-element drain needed.
785804
let items = std::mem::take(&mut self.stack);
786805

806+
// Also discard the memo bindings for popped items
807+
self.stack_memo.clear();
808+
787809
// Restore the previous stack from metastack
788810
if let Some(old_stack) = self.metastack.pop() {
789811
self.stack = old_stack;
790812
}
813+
if let Some(old_sm) = self.meta_stack_memo.pop() {
814+
self.stack_memo = old_sm;
815+
}
791816

792817
Ok(items)
793818
}
@@ -811,6 +836,31 @@ impl<'a> Decoder<'a> {
811836
.cloned()
812837
.ok_or_else(|| CodecError::InvalidData(format!("memo index {idx} not found")))
813838
}
839+
840+
/// Record that the current stack top was stored in memo at `idx`.
841+
#[inline]
842+
fn record_memo_binding(&mut self, idx: usize) {
843+
if let Some(bindings) = self.stack_memo.last_mut() {
844+
bindings.push(idx);
845+
}
846+
}
847+
848+
/// After an in-place mutation of the stack top (SETITEMS, APPENDS, etc.),
849+
/// update any memo entries that were cloned from the pre-mutation state.
850+
fn sync_memo_top(&mut self) -> Result<(), CodecError> {
851+
if let Some(bindings) = self.stack_memo.last() {
852+
if !bindings.is_empty() {
853+
let new_val = self.stack.last()
854+
.ok_or(CodecError::StackUnderflow)?.clone();
855+
for &idx in bindings {
856+
if idx < self.memo.len() {
857+
self.memo[idx] = new_val.clone();
858+
}
859+
}
860+
}
861+
}
862+
Ok(())
863+
}
814864
}
815865

816866
/// Convert a flat list [k1, v1, k2, v2, ...] into pairs [(k1, v1), (k2, v2), ...].
@@ -977,6 +1027,97 @@ mod tests {
9771027
}
9781028
}
9791029

1030+
#[test]
1031+
fn test_memo_updated_after_setitems() {
1032+
// Reproduces issue #18: EMPTY_DICT + BINPUT + SETITEMS leaves stale
1033+
// empty dict in memo. When the same dict is referenced twice (once in
1034+
// a list, once as a dict value), the second reference via BINGET returns
1035+
// the empty snapshot.
1036+
//
1037+
// Pickle VM sequence:
1038+
// PROTO 3
1039+
// EMPTY_DICT -> outer dict
1040+
// MARK
1041+
// BINUNICODE "a"
1042+
// EMPTY_DICT -> inner dict (shared)
1043+
// BINPUT 0 -> memo[0] = empty clone
1044+
// MARK
1045+
// BINUNICODE "x"
1046+
// BININT1 1
1047+
// SETITEMS -> inner dict = {"x": 1}
1048+
// BINUNICODE "b"
1049+
// BINGET 0 -> should be {"x": 1} (was {} before fix)
1050+
// SETITEMS -> outer = {"a": {"x": 1}, "b": {"x": 1}}
1051+
// STOP
1052+
let data: &[u8] = &[
1053+
0x80, 0x03, // PROTO 3
1054+
b'}', // EMPTY_DICT (outer)
1055+
b'(', // MARK
1056+
b'X', 0x01, 0x00, 0x00, 0x00, b'a', // BINUNICODE "a"
1057+
b'}', // EMPTY_DICT (inner)
1058+
b'q', 0x00, // BINPUT 0
1059+
b'(', // MARK
1060+
b'X', 0x01, 0x00, 0x00, 0x00, b'x', // BINUNICODE "x"
1061+
b'K', 0x01, // BININT1 1
1062+
b'u', // SETITEMS (fills inner)
1063+
b'X', 0x01, 0x00, 0x00, 0x00, b'b', // BINUNICODE "b"
1064+
b'h', 0x00, // BINGET 0
1065+
b'u', // SETITEMS (fills outer)
1066+
b'.', // STOP
1067+
];
1068+
let result = decode_pickle(data).unwrap();
1069+
1070+
let inner = PickleValue::Dict(vec![(
1071+
PickleValue::String("x".to_string()),
1072+
PickleValue::Int(1),
1073+
)]);
1074+
let expected = PickleValue::Dict(vec![
1075+
(PickleValue::String("a".to_string()), inner.clone()),
1076+
(PickleValue::String("b".to_string()), inner),
1077+
]);
1078+
assert_eq!(
1079+
result, expected,
1080+
"BINGET after SETITEMS should return populated dict, not empty"
1081+
);
1082+
}
1083+
1084+
#[test]
1085+
fn test_memo_updated_after_appends() {
1086+
// Same pattern but for lists: EMPTY_LIST + BINPUT + APPENDS.
1087+
//
1088+
// Pickle VM sequence:
1089+
// PROTO 3
1090+
// EMPTY_LIST -> inner list (shared)
1091+
// BINPUT 0 -> memo[0] = empty clone
1092+
// MARK
1093+
// BININT1 1
1094+
// BININT1 2
1095+
// APPENDS -> inner list = [1, 2]
1096+
// BINGET 0 -> should be [1, 2]
1097+
// TUPLE2 -> ([1,2], [1,2])
1098+
// STOP
1099+
let data: &[u8] = &[
1100+
0x80, 0x03, // PROTO 3
1101+
b']', // EMPTY_LIST
1102+
b'q', 0x00, // BINPUT 0
1103+
b'(', // MARK
1104+
b'K', 0x01, // BININT1 1
1105+
b'K', 0x02, // BININT1 2
1106+
b'e', // APPENDS
1107+
b'h', 0x00, // BINGET 0
1108+
0x86, // TUPLE2
1109+
b'.', // STOP
1110+
];
1111+
let result = decode_pickle(data).unwrap();
1112+
1113+
let inner = PickleValue::List(vec![PickleValue::Int(1), PickleValue::Int(2)]);
1114+
let expected = PickleValue::Tuple(vec![inner.clone(), inner]);
1115+
assert_eq!(
1116+
result, expected,
1117+
"BINGET after APPENDS should return populated list, not empty"
1118+
);
1119+
}
1120+
9801121
#[test]
9811122
fn test_long4_negative_length() {
9821123
// PROTO 2, LONG4 with length=-1 (0xFFFFFFFF as i32)

tests/test_basic_types.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,46 @@ def test_nested(self):
165165
assert pickle.loads(restored_pickle) == val
166166

167167

168+
class TestSharedReferences:
169+
"""Test that pickle memo sharing (same object in multiple places) roundtrips.
170+
171+
Regression tests for https://github.com/bluedynamics/zodb-json-codec/issues/18
172+
"""
173+
174+
def test_dict_values_shared_with_list(self):
175+
"""Dict values that are also list elements must preserve content."""
176+
sizes = [
177+
{"height": 75, "label": "Square", "width": 75},
178+
{"height": 683, "label": "Large", "width": 1024},
179+
]
180+
# sizes_dict values are the SAME objects as in sizes list
181+
sizes_dict = {item["label"]: item for item in sizes}
182+
val = {"sizes": sizes, "sizes_dict": sizes_dict}
183+
184+
data = pickle.dumps(val, protocol=3)
185+
json_str = zodb_json_codec.pickle_to_json(data)
186+
parsed = json.loads(json_str)
187+
# Verify sizes_dict values are NOT empty
188+
assert parsed["sizes_dict"]["Square"] == sizes[0]
189+
assert parsed["sizes_dict"]["Large"] == sizes[1]
190+
191+
# Full roundtrip
192+
restored_pickle = zodb_json_codec.json_to_pickle(json_str)
193+
result = pickle.loads(restored_pickle)
194+
assert result == val
195+
196+
def test_list_shared_reference(self):
197+
"""Same list appearing twice via memo must preserve content."""
198+
inner = [1, 2, 3]
199+
val = {"a": inner, "b": inner} # same object
200+
201+
data = pickle.dumps(val, protocol=3)
202+
json_str = zodb_json_codec.pickle_to_json(data)
203+
parsed = json.loads(json_str)
204+
assert parsed["a"] == [1, 2, 3]
205+
assert parsed["b"] == [1, 2, 3]
206+
207+
168208
class TestTuple:
169209
def test_empty(self):
170210
data = pickle.dumps((), protocol=3)

0 commit comments

Comments
 (0)