python padding

petrelharp · petrelharp · commit 38baaedef9be · 2026-03-15T08:52:43.000-07:00
diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py
@@ -671,6 +671,51 @@ def test_round_trip_with_struct_and_json(self):
         out = ms.decode_row(encoded)
         assert out == row
 
+    def test_blob_bytes_aligned(self):
+        # Test that the portion of the encoded metadata up until the struct
+        # is 8-byte aligned; we do that in the pedantic way of figuring out
+        # how much memory is being used per int in the struct part and
+        # subtracting that off. Eight consecutive label lengths are used so
+        # that every padding length from 0 to 7 bytes is exercised; a single
+        # fixed row could happen to need no padding at all.
+        def schema_with_blobs(k):
+            schema = {
+                "codec": "json+struct",
+                "json": {
+                    "type": "object",
+                    "properties": {
+                        "label": {"type": "string"},
+                        "count": {"type": "number"},
+                    },
+                    "required": ["label"],
+                },
+                "struct": {
+                    "type": "object",
+                    "properties": {
+                        f"b{j}": {"type": "integer", "binaryFormat": "i"}
+                        for j in range(k)
+                    },
+                },
+            }
+            return tskit.MetadataSchema(schema)
+
+        k_list = (0, 1, 2, 3)
+        schemas = [schema_with_blobs(k) for k in k_list]
+        for label_length in range(1, 9):
+            base = {"label": "a" * label_length, "count": 7}
+            rows = [{**base, **{f"b{j}": j for j in range(k)}} for k in k_list]
+            encoded = [
+                ms.validate_and_encode_row(row) for ms, row in zip(schemas, rows)
+            ]
+            dbytes = len(encoded[2]) - len(encoded[1])
+            assert len(encoded[3]) - len(encoded[2]) == dbytes
+            for k, en in zip(k_list, encoded):
+                assert (len(en) - k * dbytes) % 8 == 0
+            for ms, en, row in zip(schemas, encoded, rows):
+                # Round trip must succeed whatever the padding length is.
+                decoded = ms.decode_row(en)
+                assert decoded == row
+
     def test_json_defaults_applied(self):
         schema = {
             "codec": "json+struct",
diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py
@@ -294,20 +294,25 @@ def encode(self, obj: Any) -> bytes:
         header = self._HDR.pack(
             self.MAGIC, self.VERSION, len(json_bytes), len(blob_bytes)
         )
-        return header + json_bytes + blob_bytes
+        padding_bytes = bytes((-(len(header) + len(json_bytes))) % 8)
+        return header + json_bytes + padding_bytes + blob_bytes
 
     def decode(self, encoded: bytes) -> Any:
         if len(encoded) >= self._HDR.size and encoded[:4] == self.MAGIC:
             _, version, jlen, blen = self._HDR.unpack_from(encoded)
             if version != self.VERSION:
                 raise ValueError("Unsupported json+struct version")
             start = self._HDR.size
-            if jlen > len(encoded) - start or blen > len(encoded) - start - jlen:
+            padding_length = (-(start + jlen)) % 8
+            # encode() writes the zero padding between the JSON and the struct
+            # blob, so the blob starts after it; skip the padding, don't read it.
+            blob_start = start + jlen + padding_length
+            if jlen > len(encoded) - start or blen > len(encoded) - blob_start:
                 raise ValueError(
                     "Invalid json+struct payload: declared lengths exceed buffer size"
                 )
             json_bytes = encoded[start : start + jlen]
-            blob_bytes = encoded[start + jlen : start + jlen + blen]
+            blob_bytes = encoded[blob_start : blob_start + blen]
             json_data = self.json_codec.decode(json_bytes)
             struct_data = self.struct_codec.decode(blob_bytes)
             overlap = set(json_data).intersection(struct_data)