Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 64 additions & 13 deletions packages/data/src/cache/functions/hashing/blob-to-hash.ts
Original file line number Diff line number Diff line change
@@ -1,26 +1,77 @@
// © 2026 Adobe. MIT License. See /LICENSE for details.
import { createSHA256 } from "hash-wasm";

export async function blobToHash(blob: Blob): Promise<string> {
const hasher = await createSHA256();
hasher.init();
// Performance assumptions
// -----------------------
// Blobs hashed here are frequently NOT fully in memory (disk- or
// stream-backed), so each `reader.read()` is genuine I/O latency, not a
// microtask hop over resident bytes. This makes the path I/O-bound, and the
// design optimizes for overlapping those waits across concurrent calls:
//
// - A single WASM hasher instance is reused for every call (instantiation
// is not free, and we don't want one per call).
// - Concurrency is achieved via hash-wasm's resumable save()/load() rather
// than a pool of instances: each call keeps its own `state` and only
// touches the shared instance in synchronous critical sections, so calls
// interleave at `read()` without serializing or corrupting each other.
//
// The trade-off is a save()/load() pair per chunk (cheap for SHA-256). If
// blobs were instead known to be fully in memory, this would be compute-bound,
// the I/O overlap would buy nothing, and buffering then hashing synchronously
// (one init→update→digest block, no await) would suffice.
//
// Note on the "single global hasher" decision: even compute-bound, extra
// instances would NOT help on this thread. WASM has no threads of its own and
// hasher.update() is synchronous, so on one JS thread only one hash advances
// at a time regardless of how many instances exist — a pool buys nothing here.
// Servicing multiple CPU-bound hashes truly in parallel requires Web Workers,
// each with its OWN instance on its OWN thread. That is the only thing a second
// instance is ever good for, and it lives at the worker boundary, not here. So
// within this thread, one global hasher is strictly correct and loses nothing.
import { type IHasher, createSHA256 } from "hash-wasm";

// Encode MIME type as UTF-16LE
const tCodes = new Uint16Array(blob.type.length);
for (let i = 0; i < blob.type.length; i++) {
tCodes[i] = blob.type.charCodeAt(i);
let hasherPromise: Promise<IHasher> | undefined;

export async function blobToHash(blob: Blob): Promise<string> {
if (hasherPromise === undefined) {
hasherPromise = createSHA256();
// Allow a later call to retry if instantiation failed, rather than
// poisoning the module with a permanently-rejected promise.
hasherPromise.catch(() => {
hasherPromise = undefined;
});
}
hasher.update(new Uint8Array(tCodes.buffer));
const hasher = await hasherPromise;

// One shared WASM instance serves all concurrent calls. The instance is
// touched only in synchronous init→…→save / load→update→save sequences,
// never held across an `await`, so each call carries its own `state` and
// their `reader.read()` I/O waits overlap freely without corrupting one
// another. See hash-wasm save()/load() resumable hashing.
hasher.init();
hasher.update(mimeTypeBytes(blob.type));
let state = hasher.save();

const reader = blob.stream().getReader();
let done = false;
while (!done) {
const result = await reader.read();
done = result.done === true;
if (!done && result.value != null) {
hasher.update(result.value);
const chunk = await reader.read();
done = chunk.done === true;
if (!done && chunk.value != null) {
hasher.load(state);
hasher.update(chunk.value);
state = hasher.save();
Comment on lines +60 to +62

@KodyJKing KodyJKing Jun 23, 2026

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The save and load are the only possible performance concession here.

The await reader.read() means concurrent hash calls can interleave and corrupt each other's state since the hasher is shared now.

}
}

hasher.load(state);
return hasher.digest("hex");
}

/**
 * Encodes a MIME type string as UTF-16LE bytes so it can be fed to the hasher.
 *
 * Each UTF-16 code unit of `type` becomes two bytes, low byte first. The byte
 * order is written explicitly rather than through a `Uint16Array` view, whose
 * layout follows the host's endianness; that keeps the resulting hash identical
 * on every platform.
 *
 * @param type - The MIME type string (may be empty).
 * @returns A new `Uint8Array` of length `type.length * 2`.
 */
function mimeTypeBytes(type: string): Uint8Array {
  const encoded = new Uint8Array(type.length * 2);
  const view = new DataView(encoded.buffer);
  for (let i = 0; i < type.length; i++) {
    // Third argument `true` forces little-endian regardless of the host.
    view.setUint16(i * 2, type.charCodeAt(i), true);
  }
  return encoded;
}
71 changes: 71 additions & 0 deletions packages/data/src/cache/functions/hashing/hashing.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,43 @@ import { blobToHash } from "./blob-to-hash.js";
import { jsonToHash } from "./json-to-hash.js";
import { describe, expect, it } from "vitest";

/** UTF-8 encodes `s` into a fresh `Uint8Array`. */
function bytes(s: string): Uint8Array {
  const encoder = new TextEncoder();
  return encoder.encode(s);
}

// Resolves on a zero-delay timer, so microtasks queued before the timer fires
// get to run first. The tests await this to let each in-flight blobToHash call
// advance to its next `await reader.read()` before releasing more gates.
function flush(): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, 0);
  });
}

// Builds a fake Blob whose stream hands out `chunks` one per `read()`, but each
// read stays pending until the test explicitly releases it. That gives the test
// full control over how reads from concurrent calls interleave, which a real
// in-memory Blob (whose reads resolve on their own schedule) cannot offer.
//
// Returns the fake `blob` plus `releaseNext`, which resolves the oldest pending
// read and reports whether there was one to resolve.
function gatedBlob(type: string, chunks: Uint8Array[]) {
  type ReadResult = ReadableStreamReadResult<Uint8Array>;

  // Resolvers for reads that have been requested but not yet released (FIFO).
  const pending: Array<() => void> = [];
  let cursor = 0;

  // Evaluated at release time, not at read() time, so chunks are handed out in
  // the order gates are released.
  const nextResult = (): ReadResult => {
    if (cursor < chunks.length) {
      const value = chunks[cursor];
      cursor += 1;
      return { done: false, value };
    }
    return { done: true, value: undefined };
  };

  const reader = {
    read: (): Promise<ReadResult> =>
      new Promise<ReadResult>((settle) => {
        pending.push(() => settle(nextResult()));
      }),
  };

  const releaseNext = (): boolean => {
    const gate = pending.shift();
    if (gate === undefined) {
      return false;
    }
    gate();
    return true;
  };

  // Case 1 cast: the object provides only the Blob members blobToHash reads,
  // namely `type` and `stream().getReader().read()`.
  const blob = { type, stream: () => ({ getReader: () => reader }) } as unknown as Blob;

  return { blob, releaseNext };
}

describe("test hashing", () => {
describe("blobToHash", () => {
it("should avoid collisions based on content and type", async () => {
Expand Down Expand Up @@ -108,6 +145,40 @@ describe("test hashing", () => {
expect(hash).toMatch(/^[a-f0-9]{64}$/);
}
});

it("interleaved concurrent reads match serial hashes", async () => {

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has been verified to fail when save/load are omitted, so this is correctly testing interleaved reads.

const inputs = [
{ type: "text/plain", chunks: ["alpha-", "one-", "end"] },
{ type: "application/octet-stream", chunks: ["BETA-", "two-", "END"] },
{ type: "", chunks: ["g", "amma", "!!!"] },
];

// Oracle: hash each input serially. SHA-256 is over the byte stream, so a
// real Blob of the concatenated chunks yields the same digest the gated
// blob must produce regardless of chunk boundaries.
const oracle: string[] = [];
for (const { type, chunks } of inputs) {
oracle.push(await blobToHash(new Blob(chunks, { type })));
}

// Concurrent: start every call, then drive the gates round-robin so the
// calls interleave between chunks — the exact pattern that corrupts a
// naively shared hasher.
const gated = inputs.map(({ type, chunks }) => gatedBlob(type, chunks.map(bytes)));
const results = gated.map((g) => blobToHash(g.blob));

let progressed = true;
while (progressed) {
await flush();
progressed = false;
for (const g of gated) {
if (g.releaseNext()) progressed = true;
}
}
await flush();

expect(await Promise.all(results)).toEqual(oracle);
});
});

describe("jsonToHash", () => {
Expand Down