Skip to content

Commit 8794c41

Browse files
benjeffery authored and jeromekelleher committed
Add json+binary codec
1 parent 0e37e51 commit 8794c41

3 files changed

Lines changed: 196 additions & 0 deletions

File tree

python/CHANGELOG.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ Maintenance release.
107107
also around 10% faster.
108108
(:user:`benjeffery`, :pr:`3313`, :pr:`3317`, :issue:`1896`)
109109

110+
- Add ``json+binary`` metadata codec that allows storing binary data
111+
alongside JSON metadata. (:user:`benjeffery`, :pr:`3306`)
112+
110113
**Bugfixes**
111114

112115
- In some tables with mutations out-of-order ``TableCollection.sort`` did not re-order

python/tests/test_metadata.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,133 @@ def test_zero_length(self):
627627
assert ms.decode_row(b"") == {}
628628

629629

630+
class TestJSONBinaryCodec:
    """
    Tests for the ``json+binary`` metadata codec: encoding requirements,
    round-tripping, header checks on decode, defaults and validation bypass.
    """

    @staticmethod
    def _plain_schema():
        # Schema with no properties declared; only the codec is selected.
        return tskit.MetadataSchema({"codec": "json+binary"})

    def test_encode_requires_binary(self):
        """A row without the ``_binary`` key is rejected at encode time."""
        schema = self._plain_schema()
        expected_error = pytest.raises(
            exceptions.MetadataEncodingError,
            match="requires top-level '_binary' bytes-like value",
        )
        with expected_error:
            schema.validate_and_encode_row({})

    def test_zero_length_blob(self):
        """An empty blob round-trips as an empty memoryview and nothing else."""
        schema = self._plain_schema()
        row = schema.decode_row(schema.validate_and_encode_row({"_binary": b""}))
        blob_view = row["_binary"]
        assert isinstance(blob_view, memoryview)
        assert len(blob_view) == 0
        # The JSON portion was empty, so no other keys may appear
        assert set(row) == {"_binary"}

    def test_round_trip_with_blob_and_json(self):
        """JSON fields and the binary blob both survive encode/decode."""
        schema = self._plain_schema()
        payload = b"\x00\x01\x02hello"
        encoded = schema.validate_and_encode_row(
            {"label": "alpha", "count": 7, "_binary": payload}
        )
        decoded = schema.decode_row(encoded)
        blob_view = decoded["_binary"]
        assert isinstance(blob_view, memoryview)
        assert blob_view.tobytes() == payload
        assert decoded["label"] == "alpha"
        assert decoded["count"] == 7

    def test_decode_without_magic_errors(self):
        """Plain JSON bytes are not acceptable input for this codec."""
        schema = self._plain_schema()
        with pytest.raises(ValueError, match="missing magic header"):
            schema.decode_row(b"{}")

    def test_simple_default(self):
        """Top-level defaults are filled in on decode unless overridden."""
        schema = tskit.MetadataSchema(
            {
                "codec": "json+binary",
                "type": "object",
                "properties": {"number": {"type": "number", "default": 5}},
            }
        )
        # With json+binary, _binary must be supplied even for empty metadata
        defaulted = schema.decode_row(schema.validate_and_encode_row({"_binary": b""}))
        assert defaulted == {"number": 5, "_binary": memoryview(b"")}

        explicit = schema.decode_row(
            schema.validate_and_encode_row({"_binary": b"", "number": 42})
        )
        assert explicit == {"number": 42, "_binary": memoryview(b"")}

    def test_nested_default_error(self):
        """Defaults below the top level are rejected when building the schema."""
        nested_properties = {
            "nested_obj_no_default": {
                "type": "object",
                "properties": {},
            },
            "nested_obj": {
                "type": "object",
                "properties": {},
                "default": {"foo": "bar"},
            },
        }
        schema_dict = {
            "codec": "json+binary",
            "type": "object",
            "properties": {
                "obj": {
                    "type": "object",
                    "properties": nested_properties,
                }
            },
        }
        with pytest.raises(
            tskit.MetadataSchemaValidationError,
            match="Defaults can only be specified at the top level for JSON codec",
        ):
            tskit.MetadataSchema(schema_dict)

    def test_bad_type_error(self):
        """A JSON field that cannot be serialised produces an encoding error."""
        schema = self._plain_schema()
        # The codec checks for the _binary key first, so the row must contain
        # it alongside the field that cannot be JSON encoded.
        unencodable_row = {"_binary": b"", "bad_field": tskit.TableCollection(1)}
        with pytest.raises(
            exceptions.MetadataEncodingError,
            match="Could not encode metadata of type TableCollection",
        ):
            schema.validate_and_encode_row(unencodable_row)

    def test_skip_validation(self):
        """A schema with no properties bypasses row validation entirely."""
        schema = self._plain_schema()
        assert schema._bypass_validation
        with patch.object(schema, "_validate_row", return_value=True) as validate_mock:
            schema.validate_and_encode_row({"_binary": b""})
            validate_mock.assert_not_called()

    def test_dont_skip_validation(self):
        """Declaring properties forces each row through validation once."""
        schema = tskit.MetadataSchema(
            {"codec": "json+binary", "properties": {"foo": {}}}
        )
        assert not schema._bypass_validation
        with patch.object(schema, "_validate_row", return_value=True) as validate_mock:
            schema.validate_and_encode_row({"_binary": b""})
            validate_mock.assert_called_once()

    def test_binary_requires_buffer_protocol(self):
        """A ``_binary`` value that is not bytes-like is rejected."""
        schema = self._plain_schema()
        with pytest.raises(
            exceptions.MetadataEncodingError,
            match="_binary must be bytes-like \\(buffer protocol\\)",
        ):
            schema.validate_and_encode_row({"_binary": "not bytes"})

    def test_decode_version_mismatch(self):
        """A header carrying an unknown version number fails to decode."""
        schema = self._plain_schema()
        codec = metadata.JSONBinaryCodec
        json_part = b"{}"
        future_header = codec._HDR.pack(
            codec.MAGIC,
            codec.VERSION + 1,
            len(json_part),
            0,
        )
        with pytest.raises(
            ValueError,
            match="Unsupported json\\+binary version",
        ):
            schema.decode_row(future_header + json_part)
756+
630757
class TestStructCodec:
631758
def encode_decode(self, method_name, sub_schema, obj, buffer):
632759
assert (

python/tskit/metadata.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,72 @@ def decode(self, data: bytes) -> bytes:
195195
return data
196196

197197

198+
class JSONBinaryCodec(JSONCodec):
    """
    A JSON codec that packs a single binary blob alongside the canonical JSON
    bytes. The encoded form is a fixed-size header (magic, version, JSON
    length, blob length), followed by the JSON payload, followed by the raw
    blob bytes.

    On encode, callers MUST supply a top-level "_binary" bytes-like value,
    even if zero length. On decode, the returned object will include a
    "_binary" key whose value is a memoryview over the decoded bytes.

    This class adds no validation logic of its own, so the reserved "_binary"
    key is not special-cased: a schema that sets ``additionalProperties`` to
    False must declare "_binary" or rows providing it will fail validation.
    """

    MAGIC = b"JBLB"
    VERSION = 1
    # Little-endian, unpadded: 4-byte magic, 1-byte version, then two
    # unsigned 64-bit lengths (json_len, blob_len).
    _HDR = struct.Struct("<4sBQQ")

    def encode(self, obj: Any) -> bytes:
        """
        Encode ``obj`` as header + canonical JSON + blob.

        :param obj: A dict containing a top-level "_binary" bytes-like value;
            all other keys are encoded as JSON.
        :return: The packed bytes.
        :raises exceptions.MetadataEncodingError: If ``obj`` is not a dict,
            has no "_binary" key, "_binary" does not support the buffer
            protocol, or the remaining keys cannot be JSON encoded.
        """
        # Require a top-level _binary bytes-like entry; zero-length allowed
        if not isinstance(obj, dict) or "_binary" not in obj:
            raise exceptions.MetadataEncodingError(
                "json+binary requires top-level '_binary' bytes-like value"
            )
        try:
            blob_bytes = memoryview(obj["_binary"]).tobytes()
        except TypeError as e:
            raise exceptions.MetadataEncodingError(
                "_binary must be bytes-like (buffer protocol)"
            ) from e

        # The blob travels separately, so it is excluded from the JSON part.
        json_fields = {k: v for k, v in obj.items() if k != "_binary"}
        try:
            json_bytes = tskit.canonical_json(json_fields).encode()
        except TypeError as e:
            # The offending type name is taken as the fourth word of the
            # TypeError text ("Object of type X is not JSON serializable").
            # Chain the original error so its traceback is not lost.
            raise exceptions.MetadataEncodingError(
                f"Could not encode metadata of type {str(e).split()[3]}"
            ) from e

        header = self._HDR.pack(
            self.MAGIC, self.VERSION, len(json_bytes), len(blob_bytes)
        )
        return header + json_bytes + blob_bytes

    def decode(self, encoded: bytes) -> Any:
        """
        Decode bytes produced by :meth:`encode`.

        :param encoded: The packed header + JSON + blob bytes.
        :return: The decoded JSON object with an added "_binary" key holding
            a memoryview of the blob bytes.
        :raises ValueError: If the magic header is missing, the version is
            not supported, or the payload is shorter than the lengths
            declared in its header.
        """
        header_size = self._HDR.size
        if len(encoded) < header_size or encoded[:4] != self.MAGIC:
            raise ValueError("Invalid json+binary payload: missing magic header")

        _, version, json_len, blob_len = self._HDR.unpack_from(encoded)
        if version != self.VERSION:
            raise ValueError("Unsupported json+binary version")

        blob_start = header_size + json_len
        blob_end = blob_start + blob_len
        # Slicing past the end would silently hand back a shortened blob (or
        # broken JSON), so reject truncated payloads explicitly.
        if blob_end > len(encoded):
            raise ValueError(
                "Invalid json+binary payload: declared lengths exceed buffer size"
            )

        result = super().decode(encoded[header_size:blob_start])
        result["_binary"] = memoryview(encoded[blob_start:blob_end])
        return result
259+
260+
261+
# Register the codec under the "json+binary" name, which is the value a
# metadata schema gives for its "codec" entry to select this class.
register_metadata_codec(JSONBinaryCodec, "json+binary")
262+
263+
198264
def binary_format_validator(validator, types, instance, schema):
199265
# We're hooking into jsonschemas validation code here, which works by creating
200266
# generators of exceptions, hence the yielding

0 commit comments

Comments
 (0)