Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ vi:ft=markdown
Change Log
==========

Release 0.4.9 (unreleased)
--------------------------
* **Security (CWE-502):** unpickling of ``object``-dtype ndarrays (``kind=b'O'``)
is now disabled by default. Callers must pass ``allow_pickle='restricted'``
for a restricted unpickler (numpy reconstruction primitives + safe builtins
only) or ``allow_pickle=True`` for the legacy ``pickle.loads`` behavior.
``allow_pickle=True`` should only ever be used with fully trusted sources.
Refer to the README for details (#57).

Release 0.4.8 (April 28, 2022)
------------------------------
* Add support for ndarrays with dtype=object (#46).
Expand Down
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,24 @@ of the reasons to use msgpack, it may be advisable to either write a custom
encoder/decoder to handle the specific use case efficiently or else not bother
using msgpack-numpy.

**Security note (CWE-502):** because deserializing the ``kind=b'O'`` payload
calls ``pickle.loads`` and pickle can execute arbitrary code, msgpack-numpy
refuses to unpickle ``object``-dtype payloads by default. To round-trip such
arrays you must opt in explicitly:

```python
# Restricted unpickler: numpy reconstruction primitives + safe Python
# builtins only. Refuses arbitrary callables and known RCE gadgets.
arr = msgpack.unpackb(packed, allow_pickle='restricted')

# Legacy behavior — equivalent to executing arbitrary code from the
# payload. Only use with fully trusted data sources.
arr = msgpack.unpackb(packed, allow_pickle=True)
```

The default ``allow_pickle=False`` raises ``ValueError`` when a ``kind=b'O'``
payload is encountered.

Note that numpy arrays deserialized by msgpack-numpy are read-only and must be copied
if they are to be modified.

Expand Down
141 changes: 131 additions & 10 deletions msgpack_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@

"""
Support for serialization of numpy data types with msgpack.

Security note
-------------
``object``-dtype ndarrays are serialized via the ``pickle`` protocol. Because
``pickle.loads`` can execute arbitrary code as a side effect of unpickling,
this module refuses to unpickle ``kind=b'O'`` payloads by default (CWE-502).

To deserialize ``object``-dtype arrays, pass ``allow_pickle='restricted'`` for
a restricted unpickler that only permits NumPy reconstruction primitives and
common Python builtins, or ``allow_pickle=True`` for the legacy unrestricted
``pickle.loads`` path. ``allow_pickle=True`` should only ever be used with
data from fully trusted sources.
"""

# Copyright (c) 2013-2022, Lev E. Givon
Expand All @@ -11,6 +23,7 @@

import sys
import functools
import io
import pickle
import warnings

Expand Down Expand Up @@ -51,6 +64,77 @@ def ndarray_to_bytes(obj):
def tostr(x):
return x


# Restricted-unpickler allowlist. Modules listed here are accepted in full;
# anything else triggers UnpicklingError. numpy's own ndarray reduce protocol
# references multiarray._reconstruct, dtype, and ndarray under both the legacy
# ``numpy.core`` path and the post-2.0 ``numpy._core`` path.
_ALLOWED_PICKLE_MODULES = frozenset({
'numpy',
'numpy.core',
'numpy.core.multiarray',
'numpy.core.numeric',
'numpy._core',
'numpy._core.multiarray',
'numpy._core.numeric',
'builtins',
'collections',
'copy_reg', # Python 2 compat (no-op on Py3)
'_codecs', # bytes.decode reconstruction
})

# Builtins explicitly blocked even when ``builtins`` is allowed. ``getattr``
# / ``eval`` / ``exec`` / ``__import__`` are pickle-RCE gadgets.
_BLOCKED_BUILTINS = frozenset({
'eval', 'exec', 'compile', 'open', '__import__',
'getattr', 'setattr', 'delattr', 'globals', 'locals', 'vars',
'breakpoint', 'help', 'input', 'print',
'classmethod', 'staticmethod', 'property',
})


class _RestrictedUnpickler(pickle.Unpickler):
"""Pickle unpickler restricted to a small allowlist of safe modules.

Sufficient for round-tripping numpy ``object``-dtype ndarrays whose
elements are themselves builtin primitives (str, bytes, int, float, bool,
list, tuple, dict, set, complex, None). Arbitrary user classes will
raise ``pickle.UnpicklingError``.
"""

def find_class(self, module, name):
if module not in _ALLOWED_PICKLE_MODULES:
raise pickle.UnpicklingError(
"msgpack_numpy: refusing to unpickle %s.%s "
"(module not on restricted allowlist). "
"Pass allow_pickle=True if the data source is fully trusted."
% (module, name))
if module == 'builtins' and name in _BLOCKED_BUILTINS:
raise pickle.UnpicklingError(
"msgpack_numpy: refusing to unpickle builtins.%s "
"(blocked as a pickle-RCE gadget)." % name)
return super().find_class(module, name)


def _loads_restricted(data):
return _RestrictedUnpickler(io.BytesIO(data)).load()


def _resolve_pickle_loader(allow_pickle):
"""Return the callable used to deserialize ``kind=b'O'`` payloads, or
``None`` if pickle is disabled. Raises ``ValueError`` on bad input.
"""
if allow_pickle is False or allow_pickle is None:
return None
if allow_pickle is True:
return pickle.loads
if allow_pickle == 'restricted':
return _loads_restricted
raise ValueError(
"msgpack_numpy: allow_pickle must be False, True, or 'restricted'; "
"got %r" % (allow_pickle,))


def encode(obj, chain=None):
"""
Data encoder for serializing numpy data types.
Expand Down Expand Up @@ -81,9 +165,20 @@ def encode(obj, chain=None):
else:
return obj if chain is None else chain(obj)

def decode(obj, chain=None):
def decode(obj, chain=None, allow_pickle=False):
"""
Decoder for deserializing numpy data types.

Parameters
----------
allow_pickle : bool or str, default False
Controls deserialization of ``object``-dtype arrays, which are stored
as a pickle stream. ``False`` (the default) refuses pickle entirely
and raises ``ValueError``. ``'restricted'`` uses a restricted
unpickler that allowlists NumPy reconstruction primitives and common
Python builtins. ``True`` calls ``pickle.loads`` directly and is
equivalent to executing arbitrary code from the payload — only use
with fully trusted sources.
"""

try:
Expand All @@ -97,7 +192,15 @@ def decode(obj, chain=None):
descr = [tuple(tostr(t) if type(t) is bytes else t for t in d) \
for d in obj[b'type']]
elif b'kind' in obj and obj[b'kind'] == b'O':
return pickle.loads(obj[b'data'])
loader = _resolve_pickle_loader(allow_pickle)
if loader is None:
raise ValueError(
"msgpack_numpy: refusing to unpickle object-dtype "
"ndarray (kind=b'O') because allow_pickle=False. "
"Pass allow_pickle='restricted' for a restricted "
"unpickler, or allow_pickle=True for the legacy "
"pickle.loads behavior (only with trusted data).")
return loader(obj[b'data'])
else:
descr = obj[b'type']
return np.ndarray(buffer=obj[b'data'],
Expand Down Expand Up @@ -149,8 +252,10 @@ class Unpacker(_Unpacker):
def __init__(self, file_like=None, read_size=0, use_list=None,
object_hook=None,
object_pairs_hook=None, list_hook=None, encoding='utf-8',
unicode_errors='strict', max_buffer_size=0):
object_hook = functools.partial(decode, chain=object_hook)
unicode_errors='strict', max_buffer_size=0,
allow_pickle=False):
object_hook = functools.partial(decode, chain=object_hook,
allow_pickle=allow_pickle)
super(Unpacker, self).__init__(file_like=file_like,
read_size=read_size,
use_list=use_list,
Expand Down Expand Up @@ -183,8 +288,10 @@ def __init__(self, file_like=None, read_size=0, use_list=None,
object_hook=None,
object_pairs_hook=None, list_hook=None,
unicode_errors='strict', max_buffer_size=0,
ext_hook=msgpack.ExtType):
object_hook = functools.partial(decode, chain=object_hook)
ext_hook=msgpack.ExtType,
allow_pickle=False):
object_hook = functools.partial(decode, chain=object_hook,
allow_pickle=allow_pickle)
super(Unpacker, self).__init__(file_like=file_like,
read_size=read_size,
use_list=use_list,
Expand Down Expand Up @@ -233,8 +340,10 @@ def __init__(self,
max_bin_len=-1,
max_array_len=-1,
max_map_len=-1,
max_ext_len=-1):
object_hook = functools.partial(decode, chain=object_hook)
max_ext_len=-1,
allow_pickle=False):
object_hook = functools.partial(decode, chain=object_hook,
allow_pickle=allow_pickle)
super(Unpacker, self).__init__(file_like=file_like,
read_size=read_size,
use_list=use_list,
Expand All @@ -258,6 +367,7 @@ def pack(o, stream, **kwargs):
Pack an object and write it to a stream.
"""

kwargs.pop('allow_pickle', None) # packer never unpickles
packer = Packer(**kwargs)
stream.write(packer.pack(o))

Expand All @@ -266,24 +376,35 @@ def packb(o, **kwargs):
Pack an object and return the packed bytes.
"""

kwargs.pop('allow_pickle', None) # packer never unpickles
return Packer(**kwargs).pack(o)

def unpack(stream, **kwargs):
"""
Unpack a packed object from a stream.

Pass ``allow_pickle`` (default ``False``) to control deserialization of
``object``-dtype ndarrays. See :func:`decode` for accepted values.
"""

allow_pickle = kwargs.pop('allow_pickle', False)
object_hook = kwargs.get('object_hook')
kwargs['object_hook'] = functools.partial(decode, chain=object_hook)
kwargs['object_hook'] = functools.partial(decode, chain=object_hook,
allow_pickle=allow_pickle)
return _unpack(stream, **kwargs)

def unpackb(packed, **kwargs):
"""
Unpack a packed object.

Pass ``allow_pickle`` (default ``False``) to control deserialization of
``object``-dtype ndarrays. See :func:`decode` for accepted values.
"""

allow_pickle = kwargs.pop('allow_pickle', False)
object_hook = kwargs.get('object_hook')
kwargs['object_hook'] = functools.partial(decode, chain=object_hook)
kwargs['object_hook'] = functools.partial(decode, chain=object_hook,
allow_pickle=allow_pickle)
return _unpackb(packed, **kwargs)

load = unpack
Expand Down
106 changes: 105 additions & 1 deletion tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,115 @@ def test_numpy_array_float(self):
assert_equal(x.dtype, x_rec.dtype)

def test_numpy_array_object(self):
# object-dtype arrays now require explicit opt-in (CWE-502).
x = np.random.rand(5).astype(object)
x_rec = self.encode_decode(x)
x_enc = msgpack.packb(x)
x_rec = msgpack.unpackb(x_enc, allow_pickle='restricted')
assert_array_equal(x, x_rec)
assert_equal(x.dtype, x_rec.dtype)

def test_numpy_array_object_default_refuses(self):
# Default (allow_pickle=False) must refuse to deserialize the
# pickle-bearing object-dtype payload.
x = np.random.rand(5).astype(object)
x_enc = msgpack.packb(x)
with self.assertRaises(ValueError) as ctx:
msgpack.unpackb(x_enc)
self.assertIn('allow_pickle', str(ctx.exception))

def test_numpy_array_object_explicit_unrestricted(self):
# allow_pickle=True preserves the legacy pickle.loads behavior for
# users who consciously opt in.
x = np.random.rand(5).astype(object)
x_enc = msgpack.packb(x)
x_rec = msgpack.unpackb(x_enc, allow_pickle=True)
assert_array_equal(x, x_rec)
assert_equal(x.dtype, x_rec.dtype)

def test_pickle_rce_payload_default_refused(self):
# A hand-crafted msgpack payload that, with the legacy pickle path,
# would call os.system(). The default refuses to unpickle it at all,
# so the RCE never fires.
import os, pickle as _pickle
marker_path = '/tmp/msgpack_numpy_rce_test_marker_default'
try:
os.unlink(marker_path)
except OSError:
pass

class _RCE:
def __reduce__(self):
return (os.system, ('touch %s' % marker_path,))

crafted = {
b'nd': True,
b'kind': b'O',
b'type': b'O',
b'shape': (1,),
b'data': _pickle.dumps(_RCE()),
}
packed = msgpack.packb(crafted, use_bin_type=True)

with self.assertRaises(ValueError):
msgpack.unpackb(packed, raw=False)
self.assertFalse(os.path.exists(marker_path),
"RCE payload executed despite allow_pickle=False")

def test_pickle_rce_payload_restricted_blocked(self):
# Same crafted payload, but with the restricted unpickler the
# os.system gadget is rejected by find_class().
import os, pickle as _pickle
marker_path = '/tmp/msgpack_numpy_rce_test_marker_restricted'
try:
os.unlink(marker_path)
except OSError:
pass

class _RCE:
def __reduce__(self):
return (os.system, ('touch %s' % marker_path,))

crafted = {
b'nd': True,
b'kind': b'O',
b'type': b'O',
b'shape': (1,),
b'data': _pickle.dumps(_RCE()),
}
packed = msgpack.packb(crafted, use_bin_type=True)

with self.assertRaises(_pickle.UnpicklingError):
msgpack.unpackb(packed, raw=False, allow_pickle='restricted')
self.assertFalse(os.path.exists(marker_path),
"RCE payload executed despite restricted unpickler")

def test_pickle_eval_gadget_restricted_blocked(self):
# builtins.eval is on the explicit block list even though the
# 'builtins' module is otherwise allowed (numpy reconstruction needs
# bytes/str etc.).
import pickle as _pickle

class _Eval:
def __reduce__(self):
return (eval, ('1+1',))

crafted = {
b'nd': True,
b'kind': b'O',
b'type': b'O',
b'shape': (1,),
b'data': _pickle.dumps(_Eval()),
}
packed = msgpack.packb(crafted, use_bin_type=True)
with self.assertRaises(_pickle.UnpicklingError):
msgpack.unpackb(packed, raw=False, allow_pickle='restricted')

def test_allow_pickle_bad_value(self):
x = np.random.rand(5).astype(object)
x_enc = msgpack.packb(x)
with self.assertRaises(ValueError):
msgpack.unpackb(x_enc, allow_pickle='yes')

def test_numpy_array_complex(self):
x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)
x_rec = self.encode_decode(x)
Expand Down