Skip to content

Commit 853f169

Browse files
committed
Hashed blob names
1 parent 513a49a commit 853f169

3 files changed

Lines changed: 160 additions & 12 deletions

File tree

CHANGELOG.md

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,15 @@ ADDED
2020
(tail terminal instances indefinitely until stopped via `delete_job`).
2121
Exported blobs are self-describing: each blob carries an explicit
2222
`schema_version`, the orchestration's `OrchestrationState` metadata, and
23-
the full ordered event list. Each exported blob also carries
24-
`{"instance_id": <id>}` as destination-side metadata (the Azure writer
25-
persists this as Azure Blob metadata) so consumers can scan a container
26-
without parsing each blob body. The export workflow retries each instance up
23+
the full ordered event list. Blob names are a lowercase-hex SHA-256 of
24+
``{last_updated_at}|{instance_id}`` with the format extension appended
25+
(matches the .NET `ExportInstanceHistoryActivity` naming scheme), so
26+
re-exporting an instance after a later terminal update lands at a new
27+
blob path rather than overwriting the previous one, and instance IDs
28+
that differ only by `/` no longer collide. Each exported blob also
29+
carries `{"instance_id": <id>}` as destination-side metadata (the Azure
30+
writer persists this as Azure Blob metadata) so consumers can scan a
31+
container without parsing each blob body. The export workflow retries each instance up
2732
to 3 times with exponential backoff (15s/30s/60s), retries failed batches
2833
up to 3 times, caps in-flight exports via `max_parallel_exports`
2934
(default 32), continues-as-new every 5 page cycles to bound orchestrator

durabletask/extensions/history_export/activities.py

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,10 @@
2424

2525
from __future__ import annotations
2626

27+
import hashlib
2728
from collections.abc import Mapping
2829
from dataclasses import dataclass
29-
from datetime import datetime
30+
from datetime import datetime, timezone
3031
from typing import Any, cast
3132

3233
from durabletask import client as client_module
@@ -222,7 +223,21 @@ def export_instance_history(
222223
fmt=fmt,
223224
metadata=metadata,
224225
)
225-
blob_name = _blob_name_for(instance_id=instance_id, prefix=prefix, fmt=fmt)
226+
# Blob name is a SHA-256 hash of the instance's terminal
227+
# timestamp + instance ID (matches the .NET
228+
# ``ExportInstanceHistoryActivity`` scheme). This means:
229+
# • Two exports of the *same* completion produce the same
230+
# blob name (idempotent under retry when ``overwrite=True``).
231+
# • An instance re-exported after a later completion lands
232+
# at a new path rather than overwriting the previous one.
233+
# • Instance IDs that differ only by ``/`` no longer collide
234+
# under the old ``.replace("/", "_")`` transform.
235+
blob_name = _blob_name_for(
236+
instance_id=instance_id,
237+
last_updated_at=state.last_updated_at,
238+
prefix=prefix,
239+
fmt=fmt,
240+
)
226241
ctx.writer.write(
227242
instance_id=instance_id,
228243
container=container,
@@ -249,12 +264,56 @@ def export_instance_history(
249264
# Helpers
250265
# ----------------------------------------------------------------------
251266

252-
def _blob_name_for(*, instance_id: str, prefix: str | None, fmt: ExportFormat) -> str:
267+
def _blob_name_for(
268+
*,
269+
instance_id: str,
270+
last_updated_at: datetime,
271+
prefix: str | None,
272+
fmt: ExportFormat,
273+
) -> str:
274+
"""Return the destination blob name for one exported instance.
275+
276+
Matches the .NET ``ExportInstanceHistoryActivity.GenerateBlobFileName``
277+
scheme: lowercase-hex SHA-256 of
278+
``f"{last_updated_at:O}|{instance_id}"`` with the format-appropriate
279+
extension appended, optionally namespaced under the configured
280+
destination prefix. Hash byte-equivalence with .NET output
281+
requires matching the .NET ``DateTimeOffset.ToString("O")`` format
282+
exactly (see :func:`_dotnet_o_format`).
283+
"""
284+
timestamp_str = _dotnet_o_format(last_updated_at)
285+
hash_input = f"{timestamp_str}|{instance_id}"
286+
digest = hashlib.sha256(hash_input.encode("utf-8")).hexdigest()
253287
ext = file_extension_for(fmt)
254-
safe_id = instance_id.replace("/", "_")
288+
blob_name = f"{digest}{ext}"
255289
if prefix:
256-
return f"{prefix.rstrip('/')}/{safe_id}{ext}"
257-
return f"{safe_id}{ext}"
290+
return f"{prefix.rstrip('/')}/{blob_name}"
291+
return blob_name
292+
293+
294+
def _dotnet_o_format(dt: datetime) -> str:
295+
"""Format *dt* to match .NET ``DateTimeOffset.ToString("O")``.
296+
297+
.NET's round-trip format is ``yyyy-MM-ddTHH:mm:ss.fffffffK`` for
298+
``DateTimeOffset``, where ``K`` resolves to ``+HH:MM`` / ``-HH:MM``
299+
and fractional seconds always render with seven digits (100-ns
300+
ticks resolution). Python :class:`datetime.datetime` only carries
301+
microsecond precision (six digits), so the seventh digit is always
302+
a trailing zero. Naive datetimes are assumed UTC.
303+
"""
304+
if dt.tzinfo is None:
305+
dt = dt.replace(tzinfo=timezone.utc)
306+
base = dt.strftime("%Y-%m-%dT%H:%M:%S")
307+
fractional = f"{dt.microsecond:06d}0"
308+
offset = dt.utcoffset()
309+
if offset is None:
310+
offset_str = "+00:00"
311+
else:
312+
total_minutes = int(offset.total_seconds() // 60)
313+
sign = "+" if total_minutes >= 0 else "-"
314+
total_minutes = abs(total_minutes)
315+
offset_str = f"{sign}{total_minutes // 60:02d}:{total_minutes % 60:02d}"
316+
return f"{base}.{fractional}{offset_str}"
258317

259318

260319
def register(worker_instance: worker_module.TaskHubGrpcWorker) -> None:

tests/durabletask/extensions/history_export/test_activities.py

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,5 +318,89 @@ class _State:
318318
assert result["success"] is False
319319
assert "no longer terminal" in result["error"]
320320
assert "RUNNING" in result["error"]
321-
assert writer.calls == []
322-
assert stub_client.history_calls == 0
321+
322+
323+
# ---------------------------------------------------------------------
324+
# Unit tests for the N-8 blob-naming scheme
325+
# ---------------------------------------------------------------------
326+
327+
328+
def test_blob_name_matches_dotnet_hash_scheme():
329+
"""N-8: blob name is lowercase-hex sha256 of '{:O}|{instance_id}'.
330+
331+
Pins the exact .NET-aligned scheme so any future drift (timestamp
332+
format, hash function, casing) breaks loudly. The expected hash
333+
is computed by hand from the same inputs the activity would use.
334+
"""
335+
import hashlib
336+
337+
from durabletask.extensions.history_export.activities import (
338+
_blob_name_for,
339+
_dotnet_o_format,
340+
)
341+
342+
last_updated = datetime(2026, 6, 4, 17, 9, 9, 420990, tzinfo=timezone.utc)
343+
instance_id = "inst-1"
344+
fmt = ExportFormat(kind=ExportFormatKind.JSON)
345+
346+
# The seven-digit fractional-seconds format is what .NET emits for
347+
# the same instant.
348+
assert _dotnet_o_format(last_updated) == "2026-06-04T17:09:09.4209900+00:00"
349+
350+
expected_hash = hashlib.sha256(
351+
f"{_dotnet_o_format(last_updated)}|{instance_id}".encode("utf-8")
352+
).hexdigest()
353+
354+
assert _blob_name_for(
355+
instance_id=instance_id,
356+
last_updated_at=last_updated,
357+
prefix=None,
358+
fmt=fmt,
359+
) == f"{expected_hash}.json"
360+
361+
assert _blob_name_for(
362+
instance_id=instance_id,
363+
last_updated_at=last_updated,
364+
prefix="exports/run-1/",
365+
fmt=ExportFormat(kind=ExportFormatKind.JSONL_GZIP),
366+
) == f"exports/run-1/{expected_hash}.jsonl.gz"
367+
368+
369+
def test_blob_name_isolates_instance_ids_that_differ_only_by_slash():
370+
"""N-8: instance IDs containing '/' no longer collide.
371+
372+
The old scheme used ``instance_id.replace(\"/\", \"_\")`` which
373+
collapsed ``v1/x`` and ``v1_x`` to the same blob name. Hashing
374+
isolates them.
375+
"""
376+
from durabletask.extensions.history_export.activities import _blob_name_for
377+
378+
last_updated = datetime(2026, 6, 4, 17, 9, 9, 420990, tzinfo=timezone.utc)
379+
fmt = ExportFormat(kind=ExportFormatKind.JSON)
380+
381+
name_a = _blob_name_for(
382+
instance_id="v1/x", last_updated_at=last_updated, prefix=None, fmt=fmt,
383+
)
384+
name_b = _blob_name_for(
385+
instance_id="v1_x", last_updated_at=last_updated, prefix=None, fmt=fmt,
386+
)
387+
assert name_a != name_b
388+
389+
390+
def test_blob_name_changes_when_instance_terminal_timestamp_changes():
391+
"""N-8: re-export at a different terminal time lands at a new blob."""
392+
from durabletask.extensions.history_export.activities import _blob_name_for
393+
394+
fmt = ExportFormat(kind=ExportFormatKind.JSON)
395+
instance_id = "inst-x"
396+
397+
earlier = datetime(2026, 6, 4, 17, 0, 0, 0, tzinfo=timezone.utc)
398+
later = datetime(2026, 6, 4, 18, 0, 0, 0, tzinfo=timezone.utc)
399+
400+
name_earlier = _blob_name_for(
401+
instance_id=instance_id, last_updated_at=earlier, prefix=None, fmt=fmt,
402+
)
403+
name_later = _blob_name_for(
404+
instance_id=instance_id, last_updated_at=later, prefix=None, fmt=fmt,
405+
)
406+
assert name_earlier != name_later

0 commit comments

Comments
 (0)