Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions src/marshmallow/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import abc
import base64
import collections
import copy
import datetime as dt
Expand Down Expand Up @@ -48,6 +49,7 @@
"AwareDateTime",
"Bool",
"Boolean",
"Bytes",
"Constant",
"Date",
"DateTime",
Expand Down Expand Up @@ -878,6 +880,95 @@ def _deserialize(self, value, attr, data, **kwargs) -> str:
raise self.make_error("invalid_utf8") from error


class Bytes(Field[bytes]):
Comment thread
rrad5409 marked this conversation as resolved.
"""
A field for deserializing strings into byte arrays.

:param encoding: Specifies the string encoding used when encoding/decoding to/from strings.
:param errors: Error behaviour when converting to/from a :class:`str`, inherited from its constructor.
:param serialize: Specifies the return type when serializing.
`base64` and `str` use the value of `encoding` for the string.
:param kwargs: The same keyword arguments that :class:`Field` receives.

.. versionadded:: 4.3.0
"""
Comment thread
rrad5409 marked this conversation as resolved.

#: Default error messages.
default_error_messages = {
"not_bytes": "Not a bytes-like object.",
"unicode": "Invalid unicode string.",
}

def __init__(
    self,
    encoding: str = "utf-8",
    errors: str = "strict",
    serialize: typing.Literal["int", "str", "bytes", "base64"] = "base64",
    **kwargs: Unpack[_BaseFieldKwargs],
):
    """Store the text codec settings and the serialization mode.

    :param encoding: Codec name used when converting between str and bytes.
    :param errors: Codec error-handling scheme (for example ``"strict"``).
    :param serialize: Output representation selected when serializing.
    :param kwargs: Forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)
    # NOTE(review): the attribute name ``serialize`` looks like it collides
    # with the public ``Field.serialize`` method and would shadow it on the
    # instance -- confirm against the base class and rename if so.
    self.encoding, self.errors, self.serialize = encoding, errors, serialize

def _deserialize(
self,
value: typing.Any,
attr: str | None,
data: typing.Mapping[str, typing.Any] | None,
**kwargs: typing.Any,
) -> bytes:
try:
match value:
case str() as s:
return bytes(
s,
encoding=self.encoding,
errors=self.errors,
)
case int() as i:
return i.to_bytes(
length=max(1, (7 + i.bit_length()) // 8),
byteorder="big",
signed=i < 0,
)
Comment on lines +929 to +934
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Marshmallow often leans on the builtin constructor for type coercion to keep the implementation simple, then excludes types that might be confusing.

if isinstance(value, (bool, int)):
    raise ...
try:
    return bytes(value)
except TypeError:
    ...

Big int to bytes is probably out of scope.

Copy link
Copy Markdown
Author

@rrad5409 rrad5409 Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that YAML doesn't natively support writing a bytes value using hex/octal/binary notation, only base64-encoded strings. So, without `int` support in the field, I'm not able to write `val: 0xDEADBEEF`, have that deserialised to an int, and then have it converted to bytes in the field. (Sorry, I made an error in my sample earlier: the header should have been an `int`, not `bytes`.)

Especially when writing binary-related values (like registers, addresses, bitmasks etc), writing it as 0xABAA or 0b11001010 is far more convenient to write and more understandable.

The same reasoning applies to str - being forced to encode a string using base64 is pure inconvenience, when I could just write something: "Hello World".

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i also lean towards omitting this. i plan to finish up and release field-level pre_load soon, which should simplify your use case here.

  def int_to_bytes(value):
      """Return the shortest big-endian encoding of an int; pass anything else through.

      Non-negative ints are encoded unsigned, negative ints as two's
      complement. Values that are not ``int`` instances are returned
      unchanged so the function can be used as a pre-processing hook.
      """
      if not isinstance(value, int):
          return value
      if value < 0:
          # Two's complement needs one bit more than the magnitude, except
          # for exact powers of two (-128 fits in one byte, -129 does not).
          # Measuring ``value + 1`` accounts for both cases; the previous
          # ``(7 + bit_length) // 8`` was a byte short for e.g. -129 and
          # made ``to_bytes`` raise OverflowError.
          length = (8 + (value + 1).bit_length()) // 8
      else:
          # Zero has bit_length 0 but still occupies one byte.
          length = max(1, (7 + value.bit_length()) // 8)
      return value.to_bytes(length=length, byteorder="big", signed=value < 0)

# Illustrative schema: shows how the planned field-level ``pre_load`` hook
# would be used to convert ints before the field deserializes them.
class MySchema(Schema):
    # ``pre_load`` is the not-yet-released hook mentioned in this comment.
    foo = fields.Bytes(pre_load=int_to_bytes)

so to build off of @deckar01's suggestion, i think the final implementation should look something like:

  def _deserialize(self, value, attr, data, **kwargs) -> bytes:
      """Coerce ``value`` to ``bytes``, rejecting ints and bools.

      Strings are encoded as UTF-8; every other input goes through the
      ``bytes`` constructor. Any failure is reported through
      ``self.make_error("invalid")``.
      """
      # ``bytes(5)`` would silently build five zero bytes, so integers
      # (and bools, which are ints) are refused up front.
      if isinstance(value, (bool, int)):
          raise self.make_error("invalid")
      if isinstance(value, str):
          try:
              return value.encode("utf-8")
          except UnicodeEncodeError as error:
              raise self.make_error("invalid") from error
      try:
          return bytes(value)
      except (TypeError, ValueError) as error:
          # TypeError: not bytes-like / not an iterable of ints.
          # ValueError: an iterable containing an int outside range(256),
          # e.g. ``[300]``; previously this escaped as a raw exception.
          raise self.make_error("invalid") from error

case _:
return bytes(value)
except TypeError as e:
raise self.make_error("not_bytes") from e
except UnicodeError as e:
raise self.make_error("unicode") from e

def _serialize(
self,
value: bytes,
attr: str | None,
obj: typing.Any,
**kwargs: typing.Any,
) -> str | int | bytes:
try:
match self.serialize:
case "str":
return str(
value,
encoding=self.encoding,
errors=self.errors,
)
case "base64":
return base64.standard_b64encode(value)
case "int":
return int.from_bytes(
value,
byteorder="big",
)
case "bytes":
return value
case _:
typing.assert_never(self.serialize)
except UnicodeError as e:
raise self.make_error("unicode") from e
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

serialization should not perform validation, since deserialized values are assumed to be valid. this was an explicit design choice.



class UUID(Field[uuid.UUID]):
"""A UUID field."""

Expand Down
17 changes: 17 additions & 0 deletions tests/test_deserialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,23 @@ def test_string_field_deserialization(self):
with pytest.raises(ValidationError):
field.deserialize({})

def test_bytes_field_deserialization(self):
    """Bytes accepts bytes-likes, str, ints and int iterables; rejects the rest."""
    field = fields.Bytes()
    assert field.deserialize(b"foo") == b"foo"
    assert field.deserialize(bytearray(b"foo")) == b"foo"
    assert field.deserialize("foo") == b"foo"
    assert field.deserialize(0xDEAD) == b"\xde\xad"
    assert field.deserialize([0xBE, 0xEF]) == b"\xbe\xef"
    assert field.deserialize((0xB, 0xA, 0xB, 0xE)) == b"\x0b\x0a\x0b\x0e"

    # The expected text must match the field's "not_bytes" message exactly
    # (capitalised, with a trailing period).
    with pytest.raises(ValidationError) as excinfo:
        field.deserialize({"hi": 222})
    assert excinfo.value.args[0] == "Not a bytes-like object."

    with pytest.raises(ValidationError) as excinfo:
        field.deserialize(["12345"])
    assert excinfo.value.args[0] == "Not a bytes-like object."

def test_boolean_field_deserialization(self):
field = fields.Boolean()
assert field.deserialize(True) is True
Expand Down