Skip to content

Commit 765fd9b

Browse files
fix bytea serialization to preserve binary data during record/replay
1 parent d22ed0a commit 765fd9b

4 files changed

Lines changed: 45 additions & 32 deletions

File tree

drift/instrumentation/psycopg/e2e-tests/src/app.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -559,19 +559,11 @@ def test_sql_composed():
559559
except Exception as e:
560560
return jsonify({"error": str(e)}), 500
561561

562-
563-
564-
565-
# ===== BUG HUNTING TEST ENDPOINTS =====
566-
# These endpoints expose confirmed bugs in the psycopg instrumentation
567-
# Endpoints that passed tests have been removed
568-
569562
@app.route("/test/binary-uuid")
570563
def test_binary_uuid():
571564
"""Test binary UUID data type.
572565
573-
BUG HYPOTHESIS: UUID types may not serialize/deserialize correctly
574-
during RECORD/REPLAY because they are binary.
566+
Tests whether the instrumentation correctly handles binary UUID data types.
575567
"""
576568
try:
577569
import uuid
@@ -602,13 +594,11 @@ def test_binary_uuid():
602594
except Exception as e:
603595
return jsonify({"error": str(e)}), 500
604596

605-
606597
@app.route("/test/binary-bytea")
607598
def test_binary_bytea():
608599
"""Test binary bytea data type.
609600
610-
BUG HYPOTHESIS: Binary data (bytea) may not serialize/deserialize
611-
correctly during RECORD/REPLAY.
601+
Tests whether the instrumentation correctly handles binary bytea data types.
612602
"""
613603
try:
614604
with psycopg.connect(get_conn_string()) as conn, conn.cursor() as cur:

drift/instrumentation/psycopg/e2e-tests/src/test_requests.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -96,14 +96,8 @@ def make_request(method, endpoint, **kwargs):
9696

9797
make_request("GET", "/test/sql-composed")
9898

99-
# ===== BUG HUNTING TEST ENDPOINTS =====
100-
# These tests expose confirmed bugs in the psycopg instrumentation
101-
# See BUG_TRACKING.md for detailed information about each bug
102-
print("\n--- Bug Hunting Tests (REPLAY mode bugs - pass RECORD but fail REPLAY) ---\n")
103-
104-
# Bug 8: UUID parameter serialization issue during REPLAY
10599
make_request("GET", "/test/binary-uuid")
106-
# Bug 9: bytea data deserialization returns string instead of bytes
100+
107101
make_request("GET", "/test/binary-bytea")
108102

109103
print("\nAll requests completed successfully")

drift/instrumentation/utils/psycopg_utils.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,35 @@
22

33
from __future__ import annotations
44

5+
import base64
56
import datetime as dt
67
from typing import Any
78

89

910
def deserialize_db_value(val: Any) -> Any:
10-
"""Convert ISO datetime strings back to datetime objects for consistent serialization.
11+
"""Convert serialized values back to their original Python types.
1112
12-
During recording, datetime objects from the database are serialized to ISO format strings.
13-
During replay, we need to convert them back to datetime objects so that Flask/Django
14-
serializes them the same way (e.g., RFC 2822 vs ISO 8601 format).
13+
During recording, database values are serialized for JSON storage:
14+
- datetime objects -> ISO format strings
15+
- bytes/memoryview that are not valid UTF-8 -> {"__bytes__": "<base64_encoded_data>"} (bytes that decode as UTF-8 are stored as plain strings)
1516
16-
Only parses strings that contain a time component (T or space separator with :) to avoid
17-
incorrectly converting date-only strings or text that happens to look like dates.
17+
During replay, we need to convert them back to their original types so that
18+
application code (Flask/Django) handles them the same way.
1819
1920
Args:
2021
val: A value from the mocked database rows. Can be a string, list, dict, or any other type.
2122
2223
Returns:
23-
The value with ISO datetime strings converted back to datetime objects.
24+
The value with serialized types converted back to their original Python types.
2425
"""
25-
if isinstance(val, str):
26+
if isinstance(val, dict):
27+
# Check for bytes tagged structure
28+
if "__bytes__" in val and len(val) == 1:
29+
# Decode base64 back to bytes
30+
return base64.b64decode(val["__bytes__"])
31+
# Recursively deserialize dict values
32+
return {k: deserialize_db_value(v) for k, v in val.items()}
33+
elif isinstance(val, str):
2634
# Only parse strings that look like full datetime (must have time component)
2735
# This avoids converting date-only strings like "2024-01-15" or text columns
2836
# that happen to match date patterns
@@ -35,6 +43,4 @@ def deserialize_db_value(val: Any) -> Any:
3543
pass
3644
elif isinstance(val, list):
3745
return [deserialize_db_value(v) for v in val]
38-
elif isinstance(val, dict):
39-
return {k: deserialize_db_value(v) for k, v in val.items()}
4046
return val

drift/instrumentation/utils/serialization.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,33 @@
22

33
from __future__ import annotations
44

5+
import base64
56
import datetime
67
import uuid
78
from typing import Any
89

910

11+
def _serialize_bytes(val: bytes) -> Any:
12+
"""Serialize bytes to a JSON-compatible format.
13+
14+
Attempts UTF-8 decode first for text data (like COPY output).
15+
Falls back to base64 encoding with tagged structure for binary data
16+
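that contains invalid UTF-8 sequences (like bytea columns). Bytes that happen to be valid UTF-8 are returned as a plain str, so their bytes type is not preserved.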
17+
18+
Args:
19+
val: The bytes value to serialize.
20+
21+
Returns:
22+
Either a string (if valid UTF-8) or a dict {"__bytes__": "base64_data"}.
23+
"""
24+
try:
25+
# Try UTF-8 decode first - works for text data like COPY output
26+
return val.decode("utf-8")
27+
except UnicodeDecodeError:
28+
# Fall back to base64 for binary data with invalid UTF-8 sequences
29+
return {"__bytes__": base64.b64encode(val).decode("ascii")}
30+
31+
1032
def serialize_value(val: Any) -> Any:
1133
"""Convert non-JSON-serializable values to JSON-compatible types.
1234
@@ -22,10 +44,11 @@ def serialize_value(val: Any) -> Any:
2244
return val.isoformat()
2345
elif isinstance(val, uuid.UUID):
2446
return str(val)
25-
elif isinstance(val, bytes):
26-
return val.decode("utf-8", errors="replace")
2747
elif isinstance(val, memoryview):
28-
return bytes(val).decode("utf-8", errors="replace")
48+
# Convert memoryview to bytes first, then serialize
49+
return _serialize_bytes(bytes(val))
50+
elif isinstance(val, bytes):
51+
return _serialize_bytes(val)
2952
elif isinstance(val, (list, tuple)):
3053
return [serialize_value(v) for v in val]
3154
elif isinstance(val, dict):

0 commit comments

Comments
 (0)