Skip to content

Commit b41c26e

Browse files
feat(cache): validate cached model files by size + ETag before reuse
Each completed download now writes a sibling metadata record ({ etag, size, savedAt }) next to the cached blob. On a later load getModelFile validates the cached file before trusting it: - integrity: if the recorded size doesn't match the cached blob's byte length (truncated / partially-written / corrupt cache), re-download cleanly instead of feeding ORT a broken model and failing with a cryptic error. - freshness: a short, timeout-guarded HEAD compares the current upstream ETag against the one recorded at download time; a clear mismatch means the file genuinely changed (e.g. a moving 'main' revision moved) and is re-downloaded. The decision is deliberately conservative and defaults to reusing the cache: the HEAD is skipped when offline or when there's no recorded etag, and any failure (blocked HF, timeout, missing etag on either side) keeps the cache. So offline / firewalled startup stays instant and pinned-revision models never re-download. Legacy caches without metadata keep working unchanged. The pure decision (decideCacheAction) is factored out and unit-tested across the integrity, freshness, and precedence paths. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 1c42e87 commit b41c26e

2 files changed

Lines changed: 172 additions & 2 deletions

File tree

app/src/hub.js

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,15 @@ const FLUSH_INTERVAL = 8 * 1024 * 1024;
3636
const MAX_RETRIES = 6;
3737
const PARTIAL_PREFIX = 'partial-';
3838
const SEGMENT_INFIX = '-seg-';
39+
// Sibling record storing validation metadata ({ etag, size, savedAt }) for a
40+
// completed download, keyed META_PREFIX + cacheKey. Lets a later load confirm
41+
// the cached blob is intact (size) and unchanged upstream (etag) before reusing
42+
// it, instead of blindly trusting whatever bytes are in the cache.
43+
const META_PREFIX = 'meta-';
44+
// How long to wait on the freshness HEAD before falling back to the cache. Kept
45+
// short so a slow/blocked HuggingFace never stalls startup for a user who
46+
// already has the model cached.
47+
const REVALIDATE_TIMEOUT_MS = 4000;
3948
// If no chunk arrives for this long, abort the fetch and retry. Without it
4049
// a silently half-open connection (proxy idle-out, dropped TCP) hangs the
4150
// reader forever instead of triggering the existing retry/backoff logic.
@@ -48,6 +57,65 @@ function makeCacheKey(repoId, revision, subfolder, filename) {
4857
return `hf-${repoId}-${revision}-${subfolder}-${filename}`;
4958
}
5059

60+
/**
61+
* Decide whether a cached model file can be reused as-is or must be
62+
* re-downloaded. Deliberately conservative: it only returns 'redownload' on
63+
* positive evidence the cached bytes are wrong, so a flaky network, a blocked
64+
* HuggingFace, or a download predating the metadata feature never triggers a
65+
* needless multi-GB re-download. Everything else reuses the cache.
66+
*
67+
* Re-download is returned when:
68+
* - integrity: a recorded size exists and the cached blob's byte length does
69+
* not match it (truncated / partially-written / corrupt cache), or
70+
* - freshness: a successful HEAD returned an ETag that differs from the one
71+
* recorded at download time (upstream file genuinely changed).
72+
*
73+
* @param {Object} args
74+
* @param {number} args.cachedSize Byte length of the cached blob.
75+
* @param {?{etag?: string, size?: number}} args.meta Recorded metadata, or null.
76+
* @param {?{ok: boolean, etag: ?string}} args.head HEAD revalidation result, or
77+
* null when revalidation was skipped (offline) or failed.
78+
* @returns {'use'|'redownload'}
79+
*/
80+
export function decideCacheAction({ cachedSize, meta, head }) {
81+
// Integrity: we know how big the file should be and the cache disagrees.
82+
if (meta && typeof meta.size === 'number' && meta.size > 0 && cachedSize !== meta.size) {
83+
return 'redownload';
84+
}
85+
// Freshness: only act on a clear, two-sided ETag mismatch. A missing ETag on
86+
// either side (no recorded etag, HEAD failed/omitted it) means "can't tell" —
87+
// and we err toward keeping the cache.
88+
if (head && head.ok && head.etag && meta && meta.etag && head.etag !== meta.etag) {
89+
return 'redownload';
90+
}
91+
return 'use';
92+
}
93+
94+
/**
95+
* Best-effort HEAD request to read the current ETag for a URL, used to detect
96+
* whether an upstream file changed since it was cached. Never throws: any
97+
* network error, non-OK status, or timeout resolves to null so the caller
98+
* falls back to using the cache. Skipped entirely when the browser reports it
99+
* is offline.
100+
*
101+
* @param {string} url
102+
* @returns {Promise<{ok: boolean, etag: ?string}|null>}
103+
*/
104+
async function headRevalidate(url) {
105+
if (typeof navigator !== 'undefined' && navigator.onLine === false) return null;
106+
const ac = new AbortController();
107+
const timer = setTimeout(() => ac.abort(new Error('revalidate timeout')), REVALIDATE_TIMEOUT_MS);
108+
try {
109+
const resp = await fetch(url, { method: 'HEAD', signal: ac.signal });
110+
if (!resp.ok) return { ok: false, etag: null };
111+
return { ok: true, etag: resp.headers.get('etag') || resp.headers.get('last-modified') || null };
112+
} catch (_) {
113+
return null;
114+
} finally {
115+
clearTimeout(timer);
116+
}
117+
}
118+
51119
// Filenames accepted from HF's API. Restrict to a flat, safe alphabet so
52120
// a poisoned or attacker-controlled response cannot smuggle path
53121
// traversal ('..'), query/fragment delimiters ('?', '#'), URL-encoding
@@ -365,6 +433,15 @@ async function _streamAndCache(url, cacheKey, filename, progress, logTag, maxRet
365433
if (typeof indexedDB !== 'undefined') {
366434
try {
367435
await saveFileToDb(cacheKey, blob);
436+
// Record validation metadata next to the blob so a later load can verify
437+
// integrity (size) and freshness (etag) before reusing it. Best-effort:
438+
// a failure here just means the next load skips validation and trusts the
439+
// cache, which matches the pre-metadata behaviour.
440+
try {
441+
await saveFileToDb(META_PREFIX + cacheKey, { etag, size: blob.size, savedAt: Date.now() });
442+
} catch (e) {
443+
console.warn(`${logTag} Failed to write cache metadata for ${filename}:`, e);
444+
}
368445
console.log(`${logTag} Cached ${filename} in IndexedDB`);
369446
} catch (e) {
370447
console.warn(`${logTag} Failed to cache in IndexedDB:`, e);
@@ -405,8 +482,17 @@ export async function getModelFile(repoId, filename, options = {}) {
405482
try {
406483
const cachedBlob = await getFileFromDb(cacheKey);
407484
if (cachedBlob) {
408-
console.log(`[Hub] Using cached ${filename} from IndexedDB`);
409-
return URL.createObjectURL(cachedBlob);
485+
let meta = null;
486+
try { meta = await getFileFromDb(META_PREFIX + cacheKey); } catch (_) {}
487+
// Skip the freshness HEAD when there's no recorded etag to compare
488+
// against (nothing to learn) or no metadata at all (legacy cache):
489+
// size-only validation still runs, and we avoid a pointless round-trip.
490+
const head = meta?.etag ? await headRevalidate(url) : null;
491+
if (decideCacheAction({ cachedSize: cachedBlob.size, meta, head }) === 'use') {
492+
console.log(`[Hub] Using cached ${filename} from IndexedDB`);
493+
return URL.createObjectURL(cachedBlob);
494+
}
495+
console.warn(`[Hub] Cached ${filename} failed validation (stale or corrupt); re-downloading`);
410496
}
411497
} catch (e) {
412498
console.warn('[Hub] IndexedDB cache check failed:', e);
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
// Tier-1 unit test for decideCacheAction (app/src/hub.js): the pure decision
2+
// that gates whether a cached model file is reused or re-downloaded. The rule
3+
// is deliberately conservative (default to reusing the cache) so a flaky
4+
// network or a blocked HuggingFace never forces a needless multi-GB download.
5+
// Built with Claude Code.
6+
7+
import { test, describe } from 'node:test';
8+
import assert from 'node:assert/strict';
9+
import { decideCacheAction } from '../../app/src/hub.js';
10+
11+
describe('decideCacheAction integrity (size)', () => {
12+
test('size matches recorded size -> use', () => {
13+
assert.equal(decideCacheAction({ cachedSize: 100, meta: { size: 100 }, head: null }), 'use');
14+
});
15+
16+
test('size differs from recorded size -> redownload (truncated/corrupt)', () => {
17+
assert.equal(decideCacheAction({ cachedSize: 90, meta: { size: 100 }, head: null }), 'redownload');
18+
});
19+
20+
test('no recorded size -> integrity check skipped -> use', () => {
21+
assert.equal(decideCacheAction({ cachedSize: 90, meta: {}, head: null }), 'use');
22+
});
23+
24+
test('recorded size of 0 is ignored -> use', () => {
25+
assert.equal(decideCacheAction({ cachedSize: 90, meta: { size: 0 }, head: null }), 'use');
26+
});
27+
28+
test('no metadata at all (legacy cache) -> use', () => {
29+
assert.equal(decideCacheAction({ cachedSize: 90, meta: null, head: null }), 'use');
30+
});
31+
});
32+
33+
describe('decideCacheAction freshness (etag)', () => {
34+
test('etags differ on a successful HEAD -> redownload', () => {
35+
assert.equal(
36+
decideCacheAction({ cachedSize: 100, meta: { size: 100, etag: 'aaa' }, head: { ok: true, etag: 'bbb' } }),
37+
'redownload'
38+
);
39+
});
40+
41+
test('etags match -> use', () => {
42+
assert.equal(
43+
decideCacheAction({ cachedSize: 100, meta: { size: 100, etag: 'aaa' }, head: { ok: true, etag: 'aaa' } }),
44+
'use'
45+
);
46+
});
47+
48+
test('HEAD failed (ok=false) -> use (trust cache)', () => {
49+
assert.equal(
50+
decideCacheAction({ cachedSize: 100, meta: { size: 100, etag: 'aaa' }, head: { ok: false, etag: null } }),
51+
'use'
52+
);
53+
});
54+
55+
test('HEAD skipped (null, e.g. offline) -> use', () => {
56+
assert.equal(
57+
decideCacheAction({ cachedSize: 100, meta: { size: 100, etag: 'aaa' }, head: null }),
58+
'use'
59+
);
60+
});
61+
62+
test('no recorded etag -> cannot compare -> use', () => {
63+
assert.equal(
64+
decideCacheAction({ cachedSize: 100, meta: { size: 100 }, head: { ok: true, etag: 'bbb' } }),
65+
'use'
66+
);
67+
});
68+
69+
test('HEAD returned no etag -> cannot compare -> use', () => {
70+
assert.equal(
71+
decideCacheAction({ cachedSize: 100, meta: { size: 100, etag: 'aaa' }, head: { ok: true, etag: null } }),
72+
'use'
73+
);
74+
});
75+
});
76+
77+
describe('decideCacheAction precedence', () => {
78+
test('size mismatch forces redownload even when etags match', () => {
79+
assert.equal(
80+
decideCacheAction({ cachedSize: 90, meta: { size: 100, etag: 'aaa' }, head: { ok: true, etag: 'aaa' } }),
81+
'redownload'
82+
);
83+
});
84+
});

0 commit comments

Comments
 (0)