Skip to content

Commit 1900ea2

Browse files
authored
Merge pull request #3177 from perspective-dev/str-expr-mem-leak
Fix `expressions` memory leak
2 parents 51ffdde + 922ba91 commit 1900ea2

5 files changed

Lines changed: 96 additions & 14 deletions

File tree

rust/perspective-python/perspective/tests/table/test_leaks.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,54 @@ def test_table_delete_with_view(self, sentinel):
5252
tbl.delete()
5353
mem2 = process.memory_info().rss
5454
assert (mem2 - mem) < 2000000
55+
56+
57+
class TestExpressionVocab(object):
58+
# Regression tests for the `t_expression_vocab` leak fixed in PR #3175.
59+
# Pre-fix, updates against a view with a string-producing expression
60+
# interned each result into a new 4KB vocab page once the prior page
61+
# filled, and never deduplicated against older pages. RSS grew without
62+
# bound. The fix adds cross-page deduplication; these tests assert the
63+
# steady state is flat.
64+
65+
def test_string_expression_update_no_leak(self):
66+
long_literal = "X" * 256
67+
tbl = Table({"x": "integer", "c": "string"}, index="x")
68+
view = tbl.view(expressions={"e": 'concat("c", \'' + long_literal + "')"})
69+
70+
for _ in range(100):
71+
tbl.update([{"x": 1, "c": "value"}])
72+
73+
process = psutil.Process(os.getpid())
74+
mem = process.memory_info().rss
75+
for _ in range(5000):
76+
tbl.update([{"x": 1, "c": "value"}])
77+
mem2 = process.memory_info().rss
78+
79+
view.delete()
80+
tbl.delete()
81+
82+
assert (mem2 - mem) < 2000000
83+
84+
def test_string_expression_update_bounded_vocab_no_leak(self):
85+
# Cycles through a small fixed set of distinct values. Exercises the
86+
# cross-page `string_exists` lookup the fix relies on: once the
87+
# vocabulary is fully populated, subsequent interns must resolve to
88+
# existing pointers rather than allocating new ones.
89+
values = ["alpha", "bravo", "charlie", "delta", "echo"]
90+
tbl = Table({"x": "integer", "c": "string"}, index="x")
91+
view = tbl.view(expressions={"e": 'upper("c")'})
92+
93+
for i in range(100):
94+
tbl.update([{"x": 1, "c": values[i % len(values)]}])
95+
96+
process = psutil.Process(os.getpid())
97+
mem = process.memory_info().rss
98+
for i in range(5000):
99+
tbl.update([{"x": 1, "c": values[i % len(values)]}])
100+
mem2 = process.memory_info().rss
101+
102+
view.delete()
103+
tbl.delete()
104+
105+
assert (mem2 - mem) < 2000000

rust/perspective-server/cpp/perspective/src/cpp/expression_vocab.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
namespace perspective {
1616

1717
t_expression_vocab::t_expression_vocab() {
18-
// Allocate 4096 bytes per page
19-
m_max_vocab_size = 64 * 64;
18+
// Pre-reserve each page to its full size so that `intern()` never
19+
// triggers a realloc inside `t_vocab`; previously-returned `const char*`
20+
// pointers must remain valid for the lifetime of the vocab.
21+
m_max_vocab_size = 16 * 64 * 64;
2022

2123
// Always start with one vocab
2224
allocate_new_vocab();
@@ -25,14 +27,22 @@ t_expression_vocab::t_expression_vocab() {
2527
const char*
2628
t_expression_vocab::intern(const char* str) {
2729
std::size_t bytelength = strlen(str);
30+
std::size_t hash = boost::hash_range(str, str + bytelength);
31+
t_uindex existing_idx;
32+
33+
for (auto& current_vocab : m_vocabs) {
34+
if (current_vocab.string_exists(str, hash, existing_idx)) {
35+
return current_vocab.unintern_c(existing_idx);
36+
}
37+
}
2838

2939
if (m_current_vocab_size + bytelength + 1 > m_max_vocab_size) {
3040
allocate_new_vocab();
3141
}
3242

43+
t_vocab& current_vocab = m_vocabs.front();
3344
m_current_vocab_size += bytelength + 1;
34-
t_vocab& current_vocab = m_vocabs[0];
35-
t_uindex interned_idx = current_vocab.get_interned(str);
45+
t_uindex interned_idx = current_vocab.get_interned(str, bytelength, hash);
3646
return current_vocab.unintern_c(interned_idx);
3747
}
3848

@@ -63,7 +73,7 @@ void
6373
t_expression_vocab::allocate_new_vocab() {
6474
t_vocab vocab;
6575
vocab.init(false);
66-
vocab.reserve(m_max_vocab_size, 64);
76+
vocab.reserve(m_max_vocab_size, m_max_vocab_size / 64);
6777
m_vocabs.insert(m_vocabs.begin(), std::move(vocab));
6878
m_current_vocab_size = 0;
6979
}

rust/perspective-server/cpp/perspective/src/cpp/sparse_tree.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -877,12 +877,6 @@ t_stree::update_shape_from_static(const t_dtree_ctx& ctx) {
877877
if (iter == m_nodes->get<by_pidx_hash>().end()) {
878878
// create node and enqueue
879879
sptidx = genidx();
880-
t_uindex aggsize = m_aggregates->size();
881-
if (sptidx == aggsize) {
882-
double scale = 1.3;
883-
t_uindex new_size = scale * aggsize;
884-
m_aggregates->extend(new_size);
885-
}
886880

887881
t_uindex dst_ridx = gen_aggidx();
888882

@@ -1105,7 +1099,7 @@ t_stree::gen_aggidx() {
11051099
t_uindex rval = m_cur_aggidx;
11061100
++m_cur_aggidx;
11071101
if (rval >= cur_cap) {
1108-
double nrows = ceil(.3 * double(rval));
1102+
double nrows = ceil(1.3 * double(rval));
11091103
m_aggregates->extend(static_cast<t_uindex>(nrows));
11101104
}
11111105

rust/perspective-server/cpp/perspective/src/cpp/vocab.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,18 +70,40 @@ t_vocab::string_exists(const char* c, t_uindex& interned) const {
7070
return true;
7171
}
7272

73+
bool
74+
t_vocab::string_exists(
75+
const char* c, std::size_t hash, t_uindex& interned
76+
) const {
77+
auto iter = m_map.find(c, hash);
78+
79+
if (iter == m_map.end()) {
80+
return false;
81+
}
82+
83+
interned = iter->second;
84+
return true;
85+
}
86+
7387
t_uindex
7488
t_vocab::get_interned(const char* s) {
89+
std::size_t bytelength = strlen(s);
90+
return get_interned(s, bytelength, boost::hash_range(s, s + bytelength));
91+
}
92+
93+
t_uindex
94+
t_vocab::get_interned(
95+
const char* s, std::size_t bytelength, std::size_t hash
96+
) {
7597
#ifdef PSP_COLUMN_VERIFY
7698
PSP_VERBOSE_ASSERT(s != 0, "Null string");
7799
#endif
78100

79-
t_sidxmap::iterator iter = m_map.find(s);
101+
t_sidxmap::iterator iter = m_map.find(s, hash);
80102

81103
t_uindex idx;
82104
t_uindex bidx;
83105
t_uindex eidx;
84-
t_uindex len = strlen(s) + 1;
106+
t_uindex len = bytelength + 1;
85107

86108
if (iter == m_map.end()) {
87109
idx = genidx();

rust/perspective-server/cpp/perspective/src/include/perspective/vocab.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,15 @@ class PERSPECTIVE_EXPORT t_vocab {
5757

5858
t_uindex get_interned(const std::string& s);
5959
t_uindex get_interned(const char* s);
60+
t_uindex
61+
get_interned(const char* s, std::size_t bytelength, std::size_t hash);
6062
void copy_vocabulary(const t_vocab& other);
6163
const char* unintern_c(t_uindex idx) const;
6264

6365
bool string_exists(const char* c, t_uindex& interned) const;
66+
bool string_exists(
67+
const char* c, std::size_t hash, t_uindex& interned
68+
) const;
6469

6570
void reserve(size_t total_string_size, size_t string_count);
6671

0 commit comments

Comments
 (0)