fengmk2
diff --git a/‎Cargo.toml‎
Lines changed: 2 additions & 2 deletions b/‎Cargo.toml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎__test__/core/core-exports.test.ts‎
Lines changed: 65 additions & 0 deletions b/‎__test__/core/core-exports.test.ts‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎__test__/models/chat-session.test.ts‎
Lines changed: 82 additions & 2 deletions b/‎__test__/models/chat-session.test.ts‎
Lines changed: 82 additions & 2 deletions
diff --git a/‎__test__/models/model-loader-gemma4.test.ts‎
Lines changed: 163 additions & 1 deletion b/‎__test__/models/model-loader-gemma4.test.ts‎
Lines changed: 163 additions & 1 deletion
@@ -16,14 +16,14 @@ libc = "0.2"
 mimalloc-safe = { version = "0.1" }
 parquet = { version = "58", features = ["arrow"] }
 rand = "0.10"
-serde_json = "1"
+serde_json = { version = "1", features = ["preserve_order"] }
 serde = { version = "1", features = ["derive"] }
 thiserror = "2"
 tokio = { version = "1.48", features = ["rt-multi-thread", "macros", "sync"] }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3.19", default-features = false, features = ["std", "fmt", "json"] }
 tokenizers = "0.22"
-minijinja = { version = "2.5", features = ["json"] }
+minijinja = { version = "2.5", features = ["json", "preserve_order"] }
 uuid = { version = "1.11", features = ["v4"] }
 regex = "1.11"
 sqlx = { version = "0.8", features = ["runtime-tokio", "sqlite"] }
 
@@ -0,0 +1,65 @@
+/**
+ * Export-gate assertions for `@mlx-node/core`.
+ *
+ * Round-5 Codex follow-up: the Metal cache-pool drain function
+ * (`clear_cache` in `crates/mlx-core/src/cache_limit.rs`) is annotated
+ * `#[napi(namespace = "__internal__")]` on purpose — the drain is a
+ * process-wide `mlx_synchronize` routed through the default stream,
+ * which does NOT wait on the custom generation streams that per-model
+ * threads run on. Calling it while a decode is in flight risks racing
+ * live Metal command buffers.
+ *
+ * The namespace prefix is a deliberate speed-bump: the ONLY safe
+ * caller today is `@mlx-node/server`'s idle sweeper (which only
+ * triggers after the in-flight counter returns to zero). Every other
+ * caller has to opt in by acknowledging the `__internal__` prefix and
+ * reading the `@internal` caveat.
+ *
+ * These tests guard that gate:
+ *
+ *   1. The root module must NOT expose `clearCache` directly — a
+ *      regression where someone drops the `namespace = "__internal__"`
+ *      attribute would silently surface the footgun on the public API.
+ *   2. The namespace'd form must BE present and callable — a broken
+ *      native build (e.g. a `#[napi]` attribute error that drops the
+ *      export) would leave `@mlx-node/server`'s sweeper with no drain
+ *      path.
+ *   3. `memoryStats` stays exposed on the root as a cheap smoke test
+ *      that the broader binding is loading at all — if the native
+ *      addon is mis-linked both this AND the `__internal__` probe
+ *      fail, which localizes the problem.
+ */
+
+import { describe, expect, it } from 'vite-plus/test';
+
+// Intentionally read through `require` as well as ESM `import` so the
+// test catches both consumption patterns. The `vite.config.ts` alias
+// points `@mlx-node/core` at `packages/core/index.cjs`, so both
+// resolutions land on the same module instance in the test runtime.
+// eslint-disable-next-line @typescript-eslint/no-require-imports
+const coreRequire: Record<string, unknown> = require('@mlx-node/core');
+
+describe('@mlx-node/core public export surface', () => {
+  it('does NOT expose `clearCache` on the root namespace', () => {
+    // A regression here means someone dropped the
+    // `#[napi(namespace = "__internal__")]` attribute from
+    // `clear_cache` in `crates/mlx-core/src/cache_limit.rs`. That
+    // would make the process-wide Metal drain callable from user
+    // code without the `__internal__.` speed-bump — exactly the
+    // footgun the gate was designed to prevent.
+    expect(coreRequire.clearCache).toBeUndefined();
+  });
+
+  it('exposes `__internal__.clearCache` as a callable function', () => {
+    const internal = coreRequire.__internal__ as { clearCache?: unknown } | undefined;
+    expect(internal).toBeDefined();
+    expect(typeof internal?.clearCache).toBe('function');
+  });
+
+  it('exposes `memoryStats` on the root namespace as a callable function', () => {
+    // Acts as a "did the binding load at all" smoke test alongside
+    // the `__internal__` check above — a broken build drops BOTH,
+    // which is easier to diagnose than one failing in isolation.
+    expect(typeof coreRequire.memoryStats).toBe('function');
+  });
+});
@@ -20,10 +20,11 @@
  * export is added in T5.
  */
 import type { ChatConfig, ChatMessage, ChatResult } from '@mlx-node/core';
+import { ChatSession, type SessionCapableModel } from '@mlx-node/lm';
+import type { ChatStreamEvent, ChatStreamFinal } from '@mlx-node/lm';
 import { describe, expect, it, vi } from 'vite-plus/test';
 
-import { ChatSession, type SessionCapableModel } from '../../packages/lm/src/chat-session.js';
-import type { ChatStreamEvent, ChatStreamFinal } from '../../packages/lm/src/stream.js';
+import { resetPreservingNativeCacheForWarmReuse } from '../../packages/server/src/chat-session-warm-reuse.js';
 
 /** Build a minimal `ChatResult` sufficient for the session layer. */
 function makeChatResult(text: string): ChatResult {
@@ -76,6 +77,7 @@ function finalChunk(text: string, finishReason: string = 'stop'): ChatStreamFina
     promptTokens: 1,
     reasoningTokens: 0,
     rawText: text,
+    cachedTokens: 0,
   } satisfies ChatStreamFinal;
 }
 
@@ -530,6 +532,84 @@ describe('ChatSession', () => {
       expect(messages).toEqual([{ role: 'user', content: 'fresh' }]);
     });
 
+    it('public reset() always full-wipes — the keepNativeCache option is not accepted (Round 5 Fix #1)', async () => {
+      // Public contract: `ChatSession.reset()` is always a full wipe.
+      // Round 4 accidentally exposed a `{ keepNativeCache: true }`
+      // option that downstream consumers could call in a context
+      // where the shared native model still held an unrelated
+      // request's cache, reintroducing the cross-request cache-
+      // affinity leak that Round 3 closed. Round 5 removed the
+      // option from the public surface — the preserved-cache path
+      // is now behind the helper
+      // `resetPreservingNativeCacheForWarmReuse(session)`, which
+      // lives inside `@mlx-node/server` and is called exclusively by
+      // `SessionRegistry`-gated server endpoints. (Round 6 Fix #1
+      // refactored this from a class method into a module-level
+      // function; Round 7 Fix #2 relocated the module into the
+      // server package itself so there is no `@mlx-node/lm` export
+      // surface the helper could leak through.)
+      const { model, resetCaches } = makeMockModel();
+      const session = new ChatSession(model);
+
+      await session.send('one');
+      expect(session.turns).toBe(1);
+
+      // No-argument form always wipes.
+      await session.reset();
+      expect(resetCaches).toHaveBeenCalledTimes(1);
+      expect(session.turns).toBe(0);
+    });
+
+    it('resetPreservingNativeCacheForWarmReuse() wipes JS state only — no resetCaches call (Round 5 Fix #1 / Round 6 Fix #1 internal helper)', async () => {
+      // The module-level helper that replaced Round 4's
+      // `reset({ keepNativeCache: true })` public option, then Round 5's
+      // `_resetPreservingNativeCacheForWarmReuse()` class method. Used
+      // only by the server-side SessionRegistry warm-replay path on a
+      // tier-1 / tier-2 HIT, where the registry authoritatively vouches
+      // for the native cache belonging to this chain. Verify:
+      // (1) no resetCaches call, (2) turns/history zeroed so
+      // primeHistory() will accept the session.
+      const { model, resetCaches } = makeMockModel();
+      const session = new ChatSession(model);
+
+      await session.send('one');
+      await session.send('two');
+      expect(session.turns).toBe(2);
+
+      await resetPreservingNativeCacheForWarmReuse(session);
+      expect(resetCaches).not.toHaveBeenCalled();
+      expect(session.turns).toBe(0);
+      expect(session.hasImages).toBe(false);
+    });
+
+    it('resetPreservingNativeCacheForWarmReuse is NOT exported from @mlx-node/lm public surface (Round 6 Fix #1 / Round 7 Fix #2)', async () => {
+      // Structural guard: the helper lives inside `@mlx-node/server`
+      // (server-private module) and MUST NOT appear in either the
+      // `@mlx-node/lm` main export surface or the `@mlx-node/server`
+      // main export surface. Round 6 Fix #1 enforced the lm-public
+      // absence; Round 7 Fix #2 deleted the `@mlx-node/lm/internal`
+      // subpath export entirely and relocated the helper into the
+      // server package so the only reachable call site is
+      // `endpoints/responses.ts`, which already holds the
+      // `SessionRegistry` HIT gate. Downstream consumers doing a plain
+      // `import { ... } from '@mlx-node/lm'` or `from '@mlx-node/server'`
+      // must therefore not be able to discover or invoke the helper.
+      const lmPublicModule = (await import('../../packages/lm/src/index.js')) as Record<string, unknown>;
+      expect('resetPreservingNativeCacheForWarmReuse' in lmPublicModule).toBe(false);
+
+      const serverPublicModule = (await import('../../packages/server/src/index.js')) as Record<string, unknown>;
+      expect('resetPreservingNativeCacheForWarmReuse' in serverPublicModule).toBe(false);
+
+      // Sanity-check: the helper is still reachable from the
+      // server-private module the endpoint code imports. Tests can
+      // reach it via the relative path; downstream consumers cannot.
+      const serverPrivateModule = (await import('../../packages/server/src/chat-session-warm-reuse.js')) as Record<
+        string,
+        unknown
+      >;
+      expect(typeof serverPrivateModule.resetPreservingNativeCacheForWarmReuse).toBe('function');
+    });
+
     it('rejects reset() while a send() is in flight', async () => {
       let resolveFirst: (r: ChatResult) => void = () => {
         /* overwritten below */
 
@@ -1,6 +1,7 @@
+import type { Gemma4Config } from '@mlx-node/core';
 import { Gemma4Model as Gemma4ModelNative } from '@mlx-node/core';
 import { Gemma4Model } from '@mlx-node/lm';
-import { describe, it, expect } from 'vite-plus/test';
+import { describe, expect, it } from 'vite-plus/test';
 
 /**
  * Regression guard for the bug where `packages/lm/src/models/model-loader.ts`
@@ -56,3 +57,164 @@ describe('Gemma4Model re-export from @mlx-node/lm', () => {
     expect(fn.constructor.name).toBe('AsyncGeneratorFunction');
   });
 });
+
+/**
+ * A minimal `Gemma4Config` that satisfies the NAPI-derived struct so
+ * `new Gemma4ModelNative(cfg)` accepts it. Values are NOT meaningful —
+ * the stub constructor never materializes weights or a tokenizer, so
+ * only the shape matters. Kept inside the test file rather than a
+ * shared helper because the fields are documented fully by
+ * `packages/core/index.d.cts` and any drift is caught by the existing
+ * typecheck pass on the test suite.
+ */
+function stubConfig(overrides: Partial<Gemma4Config> = {}): Gemma4Config {
+  return {
+    vocabSize: 256,
+    hiddenSize: 8,
+    numHiddenLayers: 1,
+    numAttentionHeads: 1,
+    numKeyValueHeads: 1,
+    headDim: 8,
+    intermediateSize: 16,
+    rmsNormEps: 1e-6,
+    tieWordEmbeddings: false,
+    maxPositionEmbeddings: 128,
+    slidingWindow: 64,
+    layerTypes: ['full_attention'],
+    ropeTheta: 1_000_000,
+    ropeLocalBaseFreq: 10_000,
+    partialRotaryFactor: 0.25,
+    attentionKEqV: false,
+    perLayerInputEmbeds: false,
+    padTokenId: 0,
+    eosTokenIds: [1],
+    bosTokenId: 2,
+    attentionBias: false,
+    useDoubleWideMlp: false,
+    enableMoeBlock: false,
+    ...overrides,
+  };
+}
+
+/**
+ * Round-5 Finding B regression coverage.
+ *
+ * `new Gemma4Model(config)` was a runnable entry point before the
+ * cache-limit coordinator work landed. It is now a deliberate
+ * config-only stub (matches `VLModel::new(config)` /
+ * `QianfanOCRModel::new(config)`) because a no-op `new(config)` would
+ * have registered an empty coordinator delta and broken the
+ * deterministic-weight-bytes baseline.
+ *
+ * The Rust side uses an exact error message (see
+ * `crates/mlx-core/src/models/gemma4/model.rs`:
+ * `"Model not initialized. Call Gemma4Model.load() first."`) so this
+ * test asserts on a fragment (`"not initialized"`) to keep the probe
+ * robust against punctuation tweaks without letting the wrong error
+ * pass.
+ *
+ * We exercise the NATIVE class directly, not the `@mlx-node/lm`
+ * wrapper, because the wrapper adds async-generator streaming that
+ * masks the native rejection behind the generator protocol — the
+ * native surface is the one the handler dispatches on.
+ */
+describe('Gemma4Model(config) stub (round-5 Finding B)', () => {
+  it('returns an object with isInitialized=false and a numeric modelId', () => {
+    const stub = new Gemma4ModelNative(stubConfig());
+    expect(stub.isInitialized).toBe(false);
+    expect(typeof stub.modelId()).toBe('number');
+  });
+
+  it('rejects chatSessionStart with a "not initialized" error', async () => {
+    const stub = new Gemma4ModelNative(stubConfig());
+    await expect(stub.chatSessionStart([{ role: 'user', content: 'hi' }])).rejects.toThrow(/not initialized/i);
+  });
+
+  it('rejects chatSessionContinue with a "not initialized" error', async () => {
+    const stub = new Gemma4ModelNative(stubConfig());
+    await expect(stub.chatSessionContinue('hi', null, null)).rejects.toThrow(/not initialized/i);
+  });
+
+  it('rejects chatSessionContinueTool with a "not initialized" error', async () => {
+    const stub = new Gemma4ModelNative(stubConfig());
+    await expect(stub.chatSessionContinueTool('tool_123', '{"ok":true}')).rejects.toThrow(/not initialized/i);
+  });
+
+  it('rejects chatStreamSessionStart with a "not initialized" error', async () => {
+    const stub = new Gemma4ModelNative(stubConfig());
+    // Streaming methods still resolve on the NAPI boundary — they
+    // hand back a `ChatStreamHandle` — but the precondition check
+    // runs BEFORE the callback is ever invoked, so the promise must
+    // reject synchronously with the not-initialized message.
+    await expect(stub.chatStreamSessionStart([{ role: 'user', content: 'hi' }], null, () => {})).rejects.toThrow(
+      /not initialized/i,
+    );
+  });
+
+  it('rejects chatStreamSessionContinue with a "not initialized" error', async () => {
+    const stub = new Gemma4ModelNative(stubConfig());
+    await expect(stub.chatStreamSessionContinue('hi', null, null, () => {})).rejects.toThrow(/not initialized/i);
+  });
+
+  it('rejects chatStreamSessionContinueTool with a "not initialized" error', async () => {
+    const stub = new Gemma4ModelNative(stubConfig());
+    await expect(stub.chatStreamSessionContinueTool('tool_123', '{"ok":true}', null, () => {})).rejects.toThrow(
+      /not initialized/i,
+    );
+  });
+
+  it('resetCaches is a silent no-op on the stub', () => {
+    const stub = new Gemma4ModelNative(stubConfig());
+    // Matches the documented contract on the Rust impl: uninitialized
+    // stub returns Ok(()) so `ChatSession.reset()` is idempotent
+    // across stub + loaded instances.
+    expect(() => stub.resetCaches()).not.toThrow();
+  });
+});
+
+/**
+ * Round-5 Finding B also asks for a positive-path assertion that
+ * `Gemma4Model.load(validPath)` returns a runnable model. Real weights
+ * are not available in CI — they are multi-gigabyte HuggingFace
+ * downloads — so we assert the SHAPE of the class instead: `load` is a
+ * static async function, the stub produced by `new(config)` is an
+ * instance of the same class a `load()` call would return, and the
+ * runnable surface (`chatSessionStart` et al) is present on the
+ * prototype so a loaded instance would dispatch correctly. A
+ * full-weight end-to-end load is covered by the integration runs in
+ * `examples/` when real weights are available locally.
+ */
+describe('Gemma4Model.load() shape (round-5 Finding B)', () => {
+  it('exposes load as a static promise-returning function on the class', () => {
+    expect(typeof Gemma4ModelNative.load).toBe('function');
+    // NAPI-RS emits a plain function whose body dispatches to a
+    // native tokio task and returns a thenable — it is NOT a native
+    // JS `async function` (constructor name would be
+    // `AsyncFunction`), so we verify the return shape instead. A
+    // refactor that accidentally made `load()` sync would return an
+    // instance of `Gemma4ModelNative`, not a thenable, and this
+    // probe would catch it. We pass an intentionally-invalid path so
+    // no real disk I/O happens — the returned promise rejects, and
+    // we only care about the `then` shape on the returned value.
+    const ret = Gemma4ModelNative.load('/dev/null/__does_not_exist__');
+    expect(ret).toBeDefined();
+    expect(typeof (ret as { then?: unknown }).then).toBe('function');
+    // Swallow the eventual rejection so vitest does not flag an
+    // unhandled rejection on shutdown.
+    ret.then(
+      () => undefined,
+      () => undefined,
+    );
+  });
+
+  it('exposes the full session surface on the prototype', () => {
+    const proto = Gemma4ModelNative.prototype as unknown as Record<string, unknown>;
+    expect(typeof proto.chatSessionStart).toBe('function');
+    expect(typeof proto.chatSessionContinue).toBe('function');
+    expect(typeof proto.chatSessionContinueTool).toBe('function');
+    expect(typeof proto.chatStreamSessionStart).toBe('function');
+    expect(typeof proto.chatStreamSessionContinue).toBe('function');
+    expect(typeof proto.chatStreamSessionContinueTool).toBe('function');
+    expect(typeof proto.resetCaches).toBe('function');
+  });
+});