Skip to content

Commit a2a139e

Browse files
committed
Fix #13882: Decrement vocab.length when memory_zone clears transient lexemes
1 parent c1e7cb2 commit a2a139e

2 files changed

Lines changed: 73 additions & 1 deletion

File tree

spacy/tests/vocab_vectors/test_memory_zone.py

Lines changed: 66 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from spacy.vocab import Vocab
2+
import pytest
23

34

45
def test_memory_zone_no_insertion():
@@ -34,3 +35,68 @@ def test_memory_zone_redundant_insertion():
3435
_ = vocab["dog"]
3536
assert "dog" in vocab
3637
assert "horse" not in vocab
38+
39+
40+
@pytest.mark.issue(13882)
def test_memory_zone_vocab_length_decremented():
    """Test that vocab.length is correctly decremented when memory_zone
    clears transient lexemes.

    Bug: The length counter was incremented when adding lexemes but never
    decremented when memory_zone cleared transient lexemes, causing
    len(vocab) to grow continuously even though lexemes were properly removed.
    """
    vocab = Vocab()

    # Add some permanent lexemes. The lookup result is bound to `_`, matching
    # the other tests in this file, so the statement is not a bare expression.
    _ = vocab["hello"]
    _ = vocab["world"]
    initial_len = len(vocab)
    assert initial_len == 2

    # Add transient lexemes inside memory_zone
    with vocab.memory_zone():
        _ = vocab["transient1"]
        _ = vocab["transient2"]
        _ = vocab["transient3"]
        inside_len = len(vocab)
        assert inside_len == 5  # 2 permanent + 3 transient

    # After exiting memory_zone, length should return to initial
    after_zone_len = len(vocab)
    assert after_zone_len == initial_len, (
        f"vocab.length should be {initial_len} after memory_zone, "
        f"but got {after_zone_len}"
    )

    # Cross-check the counter against an actual iteration over the vocab:
    # only the permanent lexemes should still be yielded.
    actual_count = sum(1 for _ in vocab)
    assert actual_count == initial_len
    assert after_zone_len == actual_count
76+
77+
78+
@pytest.mark.issue(13882)
def test_memory_zone_multiple_cycles():
    """Test that vocab.length is correctly maintained across multiple
    memory_zone cycles."""
    vocab = Vocab()
    _ = vocab["permanent"]
    base_len = len(vocab)
    assert base_len == 1

    # Multiple memory_zone cycles
    for i in range(3):
        with vocab.memory_zone():
            for j in range(5):
                # Names are unique per cycle, so each lookup inserts a new
                # transient lexeme rather than hitting an existing one.
                _ = vocab[f"temp_{i}_{j}"]

        # Length should return to base after each cycle. Read it once so the
        # asserted value and the reported value are the same measurement.
        cycle_len = len(vocab)
        assert cycle_len == base_len, (
            f"After cycle {i+1}, vocab.length should be {base_len}, "
            f"but got {cycle_len}"
        )

    # Final verification: the counter agrees with a real iteration count.
    final_len = len(vocab)
    actual_count = sum(1 for _ in vocab)
    assert final_len == actual_count == base_len

spacy/vocab.pyx

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -249,9 +249,15 @@ cdef class Vocab:
249249

250250
def _clear_transient_orths(self):
    """Remove transient lexemes from the index (generally at the end of the memory zone).

    Every orth recorded in `self._transient_orths` that still has an entry in
    `self._by_orth` is cleared from the map, and `self.length` is reduced by
    the number of entries that were actually cleared.
    """
    cdef hash_t orth
    cdef int removed = 0

    for orth in self._transient_orths:
        # Only count orths that are present in the map, so that `length`
        # is not decremented for an entry that was never (or is no longer) there.
        if self._by_orth.get(orth) is NULL:
            continue
        map_clear(self._by_orth.c_map, orth)
        removed += 1

    self._transient_orths.clear()
    self.length -= removed
255261

256262
def __contains__(self, key):
257263
"""Check whether the string or int key has an entry in the vocabulary.

0 commit comments

Comments
 (0)