Skip to content

Commit 0653fa0

Browse files
authored
[mypyc] Various librt.strings fixes (#21314)
Includes these fixes: * Fix integer overflows * Don't mutate in append if codepoint is out of range * Micro-optimize copying during growth to copy less data * Fix memory leaks I used coding agent assist to find and fix issues.
1 parent 42f66fd commit 0653fa0

2 files changed

Lines changed: 112 additions & 21 deletions

File tree

mypyc/lib-rt/strings/librt_strings.c

Lines changed: 66 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,24 +24,34 @@ static PyTypeObject BytesWriterType;
2424

2525
static bool
2626
_grow_buffer(BytesWriterObject *data, Py_ssize_t n) {
27+
if (unlikely(n > PY_SSIZE_T_MAX - data->len)) {
28+
PyErr_NoMemory();
29+
return false;
30+
}
2731
Py_ssize_t target = data->len + n;
2832
Py_ssize_t size = data->capacity;
2933
do {
34+
if (unlikely(size > PY_SSIZE_T_MAX / 2)) {
35+
PyErr_NoMemory();
36+
return false;
37+
}
3038
size *= 2;
3139
} while (target >= size);
40+
char *new_buf;
3241
if (data->buf == data->data) {
3342
// Move from embedded buffer to heap-allocated buffer
34-
data->buf = PyMem_Malloc(size);
35-
if (data->buf != NULL) {
36-
memcpy(data->buf, data->data, WRITER_EMBEDDED_BUF_LEN);
43+
new_buf = PyMem_Malloc(size);
44+
if (new_buf != NULL) {
45+
memcpy(new_buf, data->data, data->len);
3746
}
3847
} else {
39-
data->buf = PyMem_Realloc(data->buf, size);
48+
new_buf = PyMem_Realloc(data->buf, size);
4049
}
41-
if (unlikely(data->buf == NULL)) {
50+
if (unlikely(new_buf == NULL)) {
4251
PyErr_NoMemory();
4352
return false;
4453
}
54+
data->buf = new_buf;
4555
data->capacity = size;
4656
return true;
4757
}
@@ -71,9 +81,8 @@ BytesWriter_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7181
}
7282

7383
BytesWriterObject *self = (BytesWriterObject *)type->tp_alloc(type, 0);
74-
if (self != NULL) {
84+
if (self != NULL)
7585
BytesWriter_init_internal(self);
76-
}
7786
return (PyObject *)self;
7887
}
7988

@@ -99,6 +108,9 @@ BytesWriter_init(BytesWriterObject *self, PyObject *args, PyObject *kwds)
99108
return -1;
100109
}
101110

111+
// Soft reset: free any heap buffer so re-initialization doesn't leak.
112+
if (self->buf != self->data && self->buf != NULL)
113+
PyMem_Free(self->buf);
102114
BytesWriter_init_internal(self);
103115
return 0;
104116
}
@@ -396,10 +408,23 @@ grow_string_buffer_helper(StringWriterObject *self, Py_ssize_t target_capacity,
396408
char old_kind = self->kind;
397409
Py_ssize_t new_capacity = self->capacity;
398410

411+
// Limit so that (new_capacity * 2) * new_kind stays within Py_ssize_t.
412+
// new_kind is always 1, 2, or 4, so use a shift instead of division.
413+
int shift = (new_kind == 4) ? 3 : (new_kind == 2 ? 2 : 1);
414+
Py_ssize_t cap_limit = PY_SSIZE_T_MAX >> shift;
399415
while (target_capacity >= new_capacity) {
416+
if (unlikely(new_capacity > cap_limit)) {
417+
PyErr_NoMemory();
418+
return false;
419+
}
400420
new_capacity *= 2;
401421
}
402422

423+
if (unlikely(new_capacity > cap_limit * 2)) {
424+
PyErr_NoMemory();
425+
return false;
426+
}
427+
403428
Py_ssize_t size_bytes = new_capacity * new_kind;
404429
char *new_buf;
405430
bool from_embedded = (self->buf == self->data);
@@ -433,6 +458,10 @@ grow_string_buffer_helper(StringWriterObject *self, Py_ssize_t target_capacity,
433458
}
434459

435460
static bool grow_string_buffer(StringWriterObject *data, Py_ssize_t n) {
461+
if (unlikely(n > PY_SSIZE_T_MAX - data->len)) {
462+
PyErr_NoMemory();
463+
return false;
464+
}
436465
return grow_string_buffer_helper(data, data->len + n, data->kind);
437466
}
438467

@@ -464,9 +493,8 @@ StringWriter_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
464493
}
465494

466495
StringWriterObject *self = (StringWriterObject *)type->tp_alloc(type, 0);
467-
if (self != NULL) {
496+
if (self != NULL)
468497
StringWriter_init_internal(self);
469-
}
470498
return (PyObject *)self;
471499
}
472500

@@ -492,6 +520,9 @@ StringWriter_init(StringWriterObject *self, PyObject *args, PyObject *kwds)
492520
return -1;
493521
}
494522

523+
// Soft reset: free any heap buffer so re-initialization doesn't leak.
524+
if (self->buf != self->data && self->buf != NULL)
525+
PyMem_Free(self->buf);
495526
StringWriter_init_internal(self);
496527
return 0;
497528
}
@@ -707,6 +738,13 @@ static void convert_string_data_in_place(char *buf, Py_ssize_t len,
707738
}
708739

709740
static char convert_string_buffer_kind(StringWriterObject *self, char old_kind, char new_kind) {
741+
// new_kind is always 2 or 4, so the max representable len is PY_SSIZE_T_MAX >> {1,2}.
742+
// Use a shift instead of a division for cheap overflow check.
743+
Py_ssize_t max_len = PY_SSIZE_T_MAX >> (new_kind == 4 ? 2 : 1);
744+
if (unlikely(self->len > max_len)) {
745+
PyErr_NoMemory();
746+
return CPY_NONE_ERROR;
747+
}
710748
// Current buffer size in bytes
711749
Py_ssize_t current_buf_size = (self->buf == self->data) ? WRITER_EMBEDDED_BUF_LEN : (self->capacity * old_kind);
712750
// Needed buffer size in bytes for new kind
@@ -748,7 +786,7 @@ static char string_writer_switch_kind(StringWriterObject *self, int32_t value) {
748786
static char string_append_slow_path(StringWriterObject *self, int32_t value) {
749787
if (self->kind == 2) {
750788
if ((uint32_t)value <= 0xffff) {
751-
// Kind stays the same
789+
// Fast path - kind 2 stays the same
752790
if (!ensure_string_writer_size(self, 1))
753791
return CPY_NONE_ERROR;
754792
// Copy 2-byte character to buffer
@@ -757,28 +795,35 @@ static char string_append_slow_path(StringWriterObject *self, int32_t value) {
757795
self->len++;
758796
return CPY_NONE;
759797
}
798+
// Always validate code point range before promotion (but after fast path).
799+
if (unlikely((uint32_t)value > 0x10FFFF))
800+
goto fail_range;
760801
if (string_writer_switch_kind(self, value) == CPY_NONE_ERROR)
761802
return CPY_NONE_ERROR;
762803
return string_append_slow_path(self, value);
763804
} else if (self->kind == 1) {
764805
// Check precondition -- this must only be used on slow path
765806
assert((uint32_t)value > 0xff);
807+
if (unlikely((uint32_t)value > 0x10FFFF))
808+
goto fail_range;
766809
if (string_writer_switch_kind(self, value) == CPY_NONE_ERROR)
767810
return CPY_NONE_ERROR;
768811
return string_append_slow_path(self, value);
769812
}
770813
assert(self->kind == 4);
771-
if ((uint32_t)value <= 0x10FFFF) {
772-
if (!ensure_string_writer_size(self, 1))
773-
return CPY_NONE_ERROR;
774-
// Copy 4-byte character to buffer
775-
uint32_t val32 = (uint32_t)value;
776-
memcpy(self->buf + self->len * 4, &val32, 4);
777-
self->len++;
778-
return CPY_NONE;
779-
}
780-
// Code point is out of valid Unicode range
781-
PyErr_Format(PyExc_ValueError, "code point %d is outside valid Unicode range (0-1114111)", value);
814+
if (unlikely((uint32_t)value > 0x10FFFF))
815+
goto fail_range;
816+
if (!ensure_string_writer_size(self, 1))
817+
return CPY_NONE_ERROR;
818+
// Copy 4-byte character to buffer
819+
uint32_t val32 = (uint32_t)value;
820+
memcpy(self->buf + self->len * 4, &val32, 4);
821+
self->len++;
822+
return CPY_NONE;
823+
824+
fail_range:
825+
PyErr_Format(PyExc_ValueError,
826+
"code point %d is outside valid Unicode range (0-1114111)", value);
782827
return CPY_NONE_ERROR;
783828
}
784829

mypyc/test-data/run-librt-strings.test

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,6 +1394,52 @@ def test_string_writer_wrapper_functions() -> None:
13941394
with assertRaises(IndexError):
13951395
s[-7]
13961396

1397+
def test_explicit_init_resets_via_any() -> None:
1398+
# Calling __init__ explicitly on an already-initialized writer resets it.
1399+
# Any previously-allocated heap buffer must be freed to avoid leaks.
1400+
bw: Any = BytesWriter()
1401+
bw.write(b"hello")
1402+
bw.__init__()
1403+
assert bw.getvalue() == b""
1404+
bw.write(b"world")
1405+
assert bw.getvalue() == b"world"
1406+
1407+
# Also trigger a heap-allocated buffer before the explicit __init__ call,
1408+
# to exercise the path that would otherwise leak memory.
1409+
bw2: Any = BytesWriter()
1410+
bw2.write(b"x" * 10000)
1411+
bw2.__init__()
1412+
assert bw2.getvalue() == b""
1413+
bw2.write(b"short")
1414+
assert bw2.getvalue() == b"short"
1415+
1416+
sw: Any = StringWriter()
1417+
sw.write("hello")
1418+
sw.__init__()
1419+
assert sw.getvalue() == ""
1420+
sw.write("world")
1421+
assert sw.getvalue() == "world"
1422+
1423+
sw2: Any = StringWriter()
1424+
sw2.write("y" * 10000)
1425+
sw2.__init__()
1426+
assert sw2.getvalue() == ""
1427+
sw2.write("short")
1428+
assert sw2.getvalue() == "short"
1429+
1430+
def test_new_without_init_is_usable() -> None:
1431+
# BytesWriter.__new__(BytesWriter) must return a fully initialized object
1432+
# that does not require an explicit __init__ call to be usable.
1433+
bw: Any = BytesWriter.__new__(BytesWriter)
1434+
assert bw.getvalue() == b""
1435+
bw.write(b"hello")
1436+
assert bw.getvalue() == b"hello"
1437+
1438+
sw: Any = StringWriter.__new__(StringWriter)
1439+
assert sw.getvalue() == ""
1440+
sw.write("hello")
1441+
assert sw.getvalue() == "hello"
1442+
13971443
[case testStringsFeaturesNotAvailableInNonExperimentalBuild_librt]
13981444
# This also ensures librt.strings can be built without experimental features
13991445
import librt.strings

0 commit comments

Comments
 (0)