From 4b62987ed2e27d26a849f5572ce9dc1e88ec38e3 Mon Sep 17 00:00:00 2001 From: Willard Jansen Date: Fri, 29 May 2026 19:46:37 +0200 Subject: [PATCH] Secure-by-default: refuse pickle on object-dtype decode (CWE-502) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit decode() called pickle.loads() with zero validation on attacker-controlled data whenever a payload set kind=b'O'. Any caller of unpackb() / unpack() / Unpacker was exposed to arbitrary code execution from a 122-byte crafted .msgpack file — the pickle reduce protocol allows attacker payloads to invoke os.system, subprocess.Popen, etc. This commit gates the pickle path on an explicit allow_pickle kwarg: * allow_pickle=False (new default) — raises ValueError on kind=b'O' * allow_pickle='restricted' — RestrictedUnpickler that allowlists numpy reconstruction primitives + safe Python builtins and blocks known pickle-RCE gadgets (eval/exec/getattr/...) * allow_pickle=True — legacy pickle.loads (use only with trusted sources) Threaded through decode(), Unpacker (all three msgpack-version branches), unpack(), and unpackb(). pack() / packb() accept and ignore the kwarg for symmetry. The existing test_numpy_array_object now opts in via allow_pickle='restricted'. Five new tests cover: - default refusal raises ValueError - allow_pickle=True restores legacy behavior - hand-crafted os.system __reduce__ payload is refused by default - same payload is refused by the restricted unpickler - builtins.eval gadget is on the explicit block list - bad allow_pickle values raise ValueError Fixes the unfixed CWE-502 documented in issue #57 and extends PR #52 with a restricted-unpickler opt-in mode. --- CHANGES.md | 9 +++ README.md | 18 ++++++ msgpack_numpy.py | 141 +++++++++++++++++++++++++++++++++++++++++++---- tests.py | 106 ++++++++++++++++++++++++++++++++++- 4 files changed, 263 insertions(+), 11 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 32f2b11..b3988ff 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,15 @@ vi:ft=markdown Change Log ========== +Release 0.4.9 (unreleased) +-------------------------- +* **Security (CWE-502):** unpickling of ``object``-dtype ndarrays (``kind=b'O'``) + is now disabled by default. Callers must pass ``allow_pickle='restricted'`` + for a restricted unpickler (numpy reconstruction primitives + safe builtins + only) or ``allow_pickle=True`` for the legacy ``pickle.loads`` behavior. + ``allow_pickle=True`` should only ever be used with fully trusted sources. + Refer to the README for details (#57). + Release 0.4.8 (April 28, 2022) ------------------------------ * Add support for ndarrays with dtype=object (#46). diff --git a/README.md b/README.md index 6f7eab6..a03ff9b 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,24 @@ of the reasons to use msgpack, it may be advisable to either write a custom encoder/decoder to handle the specific use case efficiently or else not bother using msgpack-numpy. +**Security note (CWE-502):** because deserializing the ``kind=b'O'`` payload +calls ``pickle.loads`` and pickle can execute arbitrary code, msgpack-numpy +refuses to unpickle ``object``-dtype payloads by default. To round-trip such +arrays you must opt in explicitly: + +```python +# Restricted unpickler: numpy reconstruction primitives + safe Python +# builtins only. Refuses arbitrary callables and known RCE gadgets. +arr = msgpack.unpackb(packed, allow_pickle='restricted') + +# Legacy behavior — equivalent to executing arbitrary code from the +# payload. Only use with fully trusted data sources. +arr = msgpack.unpackb(packed, allow_pickle=True) +``` + +The default ``allow_pickle=False`` raises ``ValueError`` when a ``kind=b'O'`` +payload is encountered. + Note that numpy arrays deserialized by msgpack-numpy are read-only and must be copied if they are to be modified. diff --git a/msgpack_numpy.py b/msgpack_numpy.py index abb46ef..324dd44 100644 --- a/msgpack_numpy.py +++ b/msgpack_numpy.py @@ -2,6 +2,18 @@ """ Support for serialization of numpy data types with msgpack. + +Security note +------------- +``object``-dtype ndarrays are serialized via the ``pickle`` protocol. Because +``pickle.loads`` can execute arbitrary code as a side effect of unpickling, +this module refuses to unpickle ``kind=b'O'`` payloads by default (CWE-502). + +To deserialize ``object``-dtype arrays, pass ``allow_pickle='restricted'`` for +a restricted unpickler that only permits NumPy reconstruction primitives and +common Python builtins, or ``allow_pickle=True`` for the legacy unrestricted +``pickle.loads`` path. ``allow_pickle=True`` should only ever be used with +data from fully trusted sources. """ # Copyright (c) 2013-2022, Lev E. Givon @@ -11,6 +23,7 @@ import sys import functools +import io import pickle import warnings @@ -51,6 +64,77 @@ def ndarray_to_bytes(obj): def tostr(x): return x + +# Restricted-unpickler allowlist. Modules listed here are accepted in full; +# anything else triggers UnpicklingError. numpy's own ndarray reduce protocol +# references multiarray._reconstruct, dtype, and ndarray under both the legacy +# ``numpy.core`` path and the post-2.0 ``numpy._core`` path. +_ALLOWED_PICKLE_MODULES = frozenset({ + 'numpy', + 'numpy.core', + 'numpy.core.multiarray', + 'numpy.core.numeric', + 'numpy._core', + 'numpy._core.multiarray', + 'numpy._core.numeric', + 'builtins', + 'collections', + 'copy_reg', # Python 2 compat (no-op on Py3) + '_codecs', # bytes.decode reconstruction +}) + +# Builtins explicitly blocked even when ``builtins`` is allowed. ``getattr`` +# / ``eval`` / ``exec`` / ``__import__`` are pickle-RCE gadgets. +_BLOCKED_BUILTINS = frozenset({ + 'eval', 'exec', 'compile', 'open', '__import__', + 'getattr', 'setattr', 'delattr', 'globals', 'locals', 'vars', + 'breakpoint', 'help', 'input', 'print', + 'classmethod', 'staticmethod', 'property', +}) + + +class _RestrictedUnpickler(pickle.Unpickler): + """Pickle unpickler restricted to a small allowlist of safe modules. + + Sufficient for round-tripping numpy ``object``-dtype ndarrays whose + elements are themselves builtin primitives (str, bytes, int, float, bool, + list, tuple, dict, set, complex, None). Arbitrary user classes will + raise ``pickle.UnpicklingError``. + """ + + def find_class(self, module, name): + if module not in _ALLOWED_PICKLE_MODULES: + raise pickle.UnpicklingError( + "msgpack_numpy: refusing to unpickle %s.%s " + "(module not on restricted allowlist). " + "Pass allow_pickle=True if the data source is fully trusted." + % (module, name)) + if module == 'builtins' and name in _BLOCKED_BUILTINS: + raise pickle.UnpicklingError( + "msgpack_numpy: refusing to unpickle builtins.%s " + "(blocked as a pickle-RCE gadget)." % name) + return super().find_class(module, name) + + +def _loads_restricted(data): + return _RestrictedUnpickler(io.BytesIO(data)).load() + + +def _resolve_pickle_loader(allow_pickle): + """Return the callable used to deserialize ``kind=b'O'`` payloads, or + ``None`` if pickle is disabled. Raises ``ValueError`` on bad input. + """ + if allow_pickle is False or allow_pickle is None: + return None + if allow_pickle is True: + return pickle.loads + if allow_pickle == 'restricted': + return _loads_restricted + raise ValueError( + "msgpack_numpy: allow_pickle must be False, True, or 'restricted'; " + "got %r" % (allow_pickle,)) + + def encode(obj, chain=None): """ Data encoder for serializing numpy data types. @@ -81,9 +165,20 @@ def encode(obj, chain=None): else: return obj if chain is None else chain(obj) -def decode(obj, chain=None): +def decode(obj, chain=None, allow_pickle=False): """ Decoder for deserializing numpy data types. + + Parameters + ---------- + allow_pickle : bool or str, default False + Controls deserialization of ``object``-dtype arrays, which are stored + as a pickle stream. ``False`` (the default) refuses pickle entirely + and raises ``ValueError``. ``'restricted'`` uses a restricted + unpickler that allowlists NumPy reconstruction primitives and common + Python builtins. ``True`` calls ``pickle.loads`` directly and is + equivalent to executing arbitrary code from the payload — only use + with fully trusted sources. """ try: @@ -97,7 +192,15 @@ def decode(obj, chain=None): descr = [tuple(tostr(t) if type(t) is bytes else t for t in d) \ for d in obj[b'type']] elif b'kind' in obj and obj[b'kind'] == b'O': - return pickle.loads(obj[b'data']) + loader = _resolve_pickle_loader(allow_pickle) + if loader is None: + raise ValueError( + "msgpack_numpy: refusing to unpickle object-dtype " + "ndarray (kind=b'O') because allow_pickle=False. " + "Pass allow_pickle='restricted' for a restricted " + "unpickler, or allow_pickle=True for the legacy " + "pickle.loads behavior (only with trusted data).") + return loader(obj[b'data']) else: descr = obj[b'type'] return np.ndarray(buffer=obj[b'data'], @@ -149,8 +252,10 @@ class Unpacker(_Unpacker): def __init__(self, file_like=None, read_size=0, use_list=None, object_hook=None, object_pairs_hook=None, list_hook=None, encoding='utf-8', - unicode_errors='strict', max_buffer_size=0): - object_hook = functools.partial(decode, chain=object_hook) + unicode_errors='strict', max_buffer_size=0, + allow_pickle=False): + object_hook = functools.partial(decode, chain=object_hook, + allow_pickle=allow_pickle) super(Unpacker, self).__init__(file_like=file_like, read_size=read_size, use_list=use_list, @@ -183,8 +288,10 @@ def __init__(self, file_like=None, read_size=0, use_list=None, object_hook=None, object_pairs_hook=None, list_hook=None, unicode_errors='strict', max_buffer_size=0, - ext_hook=msgpack.ExtType): - object_hook = functools.partial(decode, chain=object_hook) + ext_hook=msgpack.ExtType, + allow_pickle=False): + object_hook = functools.partial(decode, chain=object_hook, + allow_pickle=allow_pickle) super(Unpacker, self).__init__(file_like=file_like, read_size=read_size, use_list=use_list, @@ -233,8 +340,10 @@ def __init__(self, max_bin_len=-1, max_array_len=-1, max_map_len=-1, - max_ext_len=-1): - object_hook = functools.partial(decode, chain=object_hook) + max_ext_len=-1, + allow_pickle=False): + object_hook = functools.partial(decode, chain=object_hook, + allow_pickle=allow_pickle) super(Unpacker, self).__init__(file_like=file_like, read_size=read_size, use_list=use_list, @@ -258,6 +367,7 @@ def pack(o, stream, **kwargs): Pack an object and write it to a stream. """ + kwargs.pop('allow_pickle', None) # packer never unpickles packer = Packer(**kwargs) stream.write(packer.pack(o)) @@ -266,24 +376,35 @@ def packb(o, **kwargs): Pack an object and return the packed bytes. """ + kwargs.pop('allow_pickle', None) # packer never unpickles return Packer(**kwargs).pack(o) def unpack(stream, **kwargs): """ Unpack a packed object from a stream. + + Pass ``allow_pickle`` (default ``False``) to control deserialization of + ``object``-dtype ndarrays. See :func:`decode` for accepted values. """ + allow_pickle = kwargs.pop('allow_pickle', False) object_hook = kwargs.get('object_hook') - kwargs['object_hook'] = functools.partial(decode, chain=object_hook) + kwargs['object_hook'] = functools.partial(decode, chain=object_hook, + allow_pickle=allow_pickle) return _unpack(stream, **kwargs) def unpackb(packed, **kwargs): """ Unpack a packed object. + + Pass ``allow_pickle`` (default ``False``) to control deserialization of + ``object``-dtype ndarrays. See :func:`decode` for accepted values. """ + allow_pickle = kwargs.pop('allow_pickle', False) object_hook = kwargs.get('object_hook') - kwargs['object_hook'] = functools.partial(decode, chain=object_hook) + kwargs['object_hook'] = functools.partial(decode, chain=object_hook, + allow_pickle=allow_pickle) return _unpackb(packed, **kwargs) load = unpack diff --git a/tests.py b/tests.py index 4bd4bfa..4e637db 100644 --- a/tests.py +++ b/tests.py @@ -189,11 +189,115 @@ def test_numpy_array_float(self): assert_equal(x.dtype, x_rec.dtype) def test_numpy_array_object(self): + # object-dtype arrays now require explicit opt-in (CWE-502). x = np.random.rand(5).astype(object) - x_rec = self.encode_decode(x) + x_enc = msgpack.packb(x) + x_rec = msgpack.unpackb(x_enc, allow_pickle='restricted') assert_array_equal(x, x_rec) assert_equal(x.dtype, x_rec.dtype) + def test_numpy_array_object_default_refuses(self): + # Default (allow_pickle=False) must refuse to deserialize the + # pickle-bearing object-dtype payload. + x = np.random.rand(5).astype(object) + x_enc = msgpack.packb(x) + with self.assertRaises(ValueError) as ctx: + msgpack.unpackb(x_enc) + self.assertIn('allow_pickle', str(ctx.exception)) + + def test_numpy_array_object_explicit_unrestricted(self): + # allow_pickle=True preserves the legacy pickle.loads behavior for + # users who consciously opt in. + x = np.random.rand(5).astype(object) + x_enc = msgpack.packb(x) + x_rec = msgpack.unpackb(x_enc, allow_pickle=True) + assert_array_equal(x, x_rec) + assert_equal(x.dtype, x_rec.dtype) + + def test_pickle_rce_payload_default_refused(self): + # A hand-crafted msgpack payload that, with the legacy pickle path, + # would call os.system(). The default refuses to unpickle it at all, + # so the RCE never fires. + import os, pickle as _pickle + marker_path = '/tmp/msgpack_numpy_rce_test_marker_default' + try: + os.unlink(marker_path) + except OSError: + pass + + class _RCE: + def __reduce__(self): + return (os.system, ('touch %s' % marker_path,)) + + crafted = { + b'nd': True, + b'kind': b'O', + b'type': b'O', + b'shape': (1,), + b'data': _pickle.dumps(_RCE()), + } + packed = msgpack.packb(crafted, use_bin_type=True) + + with self.assertRaises(ValueError): + msgpack.unpackb(packed, raw=False) + self.assertFalse(os.path.exists(marker_path), + "RCE payload executed despite allow_pickle=False") + + def test_pickle_rce_payload_restricted_blocked(self): + # Same crafted payload, but with the restricted unpickler the + # os.system gadget is rejected by find_class(). + import os, pickle as _pickle + marker_path = '/tmp/msgpack_numpy_rce_test_marker_restricted' + try: + os.unlink(marker_path) + except OSError: + pass + + class _RCE: + def __reduce__(self): + return (os.system, ('touch %s' % marker_path,)) + + crafted = { + b'nd': True, + b'kind': b'O', + b'type': b'O', + b'shape': (1,), + b'data': _pickle.dumps(_RCE()), + } + packed = msgpack.packb(crafted, use_bin_type=True) + + with self.assertRaises(_pickle.UnpicklingError): + msgpack.unpackb(packed, raw=False, allow_pickle='restricted') + self.assertFalse(os.path.exists(marker_path), + "RCE payload executed despite restricted unpickler") + + def test_pickle_eval_gadget_restricted_blocked(self): + # builtins.eval is on the explicit block list even though the + # 'builtins' module is otherwise allowed (numpy reconstruction needs + # bytes/str etc.). + import pickle as _pickle + + class _Eval: + def __reduce__(self): + return (eval, ('1+1',)) + + crafted = { + b'nd': True, + b'kind': b'O', + b'type': b'O', + b'shape': (1,), + b'data': _pickle.dumps(_Eval()), + } + packed = msgpack.packb(crafted, use_bin_type=True) + with self.assertRaises(_pickle.UnpicklingError): + msgpack.unpackb(packed, raw=False, allow_pickle='restricted') + + def test_allow_pickle_bad_value(self): + x = np.random.rand(5).astype(object) + x_enc = msgpack.packb(x) + with self.assertRaises(ValueError): + msgpack.unpackb(x_enc, allow_pickle='yes') + def test_numpy_array_complex(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_rec = self.encode_decode(x)