Agent-Pattern-Labs
diff --git a/‎.changeset/ray-prompt-cache.md‎
Lines changed: 4 additions & 0 deletions b/‎.changeset/ray-prompt-cache.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎bun.lock‎
Lines changed: 12 additions & 0 deletions b/‎bun.lock‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎packages/models/package.json‎
Lines changed: 1 addition & 0 deletions b/‎packages/models/package.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎packages/models/src/providers/llama-cpp.ts‎
Lines changed: 27 additions & 84 deletions b/‎packages/models/src/providers/llama-cpp.ts‎
Lines changed: 27 additions & 84 deletions
diff --git a/‎packages/models/tsconfig.json‎
Lines changed: 1 addition & 1 deletion b/‎packages/models/tsconfig.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎packages/prompt-cache/package.json‎
Lines changed: 24 additions & 0 deletions b/‎packages/prompt-cache/package.json‎
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,4 @@
+---
+---
+
+Repo-only: add an internal prompt scaffold cache package and use it for llama.cpp template prompt reuse.
@@ -17,6 +17,7 @@
     "src"
   ],
   "dependencies": {
+    "@ray/prompt-cache": "workspace:*",
     "@ray/prompts": "workspace:*",
     "@razroo/ray-core": "workspace:*"
   }
 
@@ -14,11 +14,15 @@ import {
   type SchedulerSlotSnapshot,
   type WarmupInferenceRequest,
 } from "@razroo/ray-core";
+import { resolvePromptTemplateRequest } from "@ray/prompts";
 import {
-  renderPromptTemplate,
-  requirePromptTemplate,
-  resolvePromptTemplateRequest,
-} from "@ray/prompts";
+  PromptScaffoldCache,
+  buildPromptScaffoldCacheKey,
+  createPromptScaffold,
+  renderPromptFromScaffold,
+  renderPromptScaffoldTemplate,
+  type PromptScaffold,
+} from "@ray/prompt-cache";
 import {
   BACKEND_RESPONSE_BODY_LIMIT_BYTES,
   MAX_ADAPTER_TIMEOUT_MS,
@@ -47,6 +51,7 @@ const MAX_SLOT_SNAPSHOTS = 64;
 const MAX_FAMILY_PREFERRED_SLOT_KEYS = 512;
 const MAX_SLOT_FAMILY_ASSIGNMENTS = 64;
 const MAX_PROMPT_SCAFFOLD_CACHE_ENTRIES = 4_096;
+const PROMPT_SCAFFOLD_CACHE_TTL_MS = 86_400_000;
 const MAX_LLAMA_CPP_DIAGNOSTIC_NUMBER = 1_000_000_000;
 const MAX_LAUNCH_PROFILE_PATH_CHARS = 4_096;
 const MAX_LAUNCH_PROFILE_HOST_CHARS = 256;
@@ -188,11 +193,6 @@ interface LlamaCppSlotResponse {
   };
 }
 
-interface PromptScaffold {
-  segments: string[];
-  variableOrder: string[];
-}
-
 interface PreparedPromptState {
   prompt: string;
 }
@@ -726,7 +726,7 @@ export class LlamaCppProvider implements ModelProvider {
   } as const;
   private readonly preparationCache = new Map<string, ProviderRequestPreparation>();
   private readonly promptTokenCache = new Map<string, number>();
-  private readonly promptScaffolds = new Map<string, PromptScaffold>();
+  private readonly promptScaffolds: PromptScaffoldCache;
   private readonly familyPreferredSlots = new Map<string, number>();
   private readonly slotFamilyAssignments = new Map<number, string>();
   private readonly maxPreparationCacheEntries = 256;
@@ -743,6 +743,10 @@ export class LlamaCppProvider implements ModelProvider {
     this.modelId = model.id;
     this.adapter = snapshotLlamaCppAdapter(adapter, model.maxOutputTokens);
     this.maxPromptScaffoldEntries = this.adapter.promptScaffoldCacheEntries ?? 128;
+    this.promptScaffolds = new PromptScaffoldCache({
+      maxEntries: this.maxPromptScaffoldEntries,
+      ttlMs: PROMPT_SCAFFOLD_CACHE_TTL_MS,
+    });
   }
 
   async warm(): Promise<void> {
@@ -1516,8 +1520,8 @@ export class LlamaCppProvider implements ModelProvider {
     responseFormatType: "text" | "json_object",
     signal?: AbortSignal,
   ): Promise<PromptScaffold> {
-    const cacheKey = hashValue({
-      model: this.adapter.modelRef,
+    const cacheKey = buildPromptScaffoldCacheKey({
+      modelRef: this.adapter.modelRef,
       templateId,
       responseFormatType,
     });
@@ -1526,14 +1530,8 @@ export class LlamaCppProvider implements ModelProvider {
       return cached;
     }
 
-    const template = requirePromptTemplate(templateId);
-    const sentinelVariables = Object.fromEntries(
-      template.variables.map((variable: string, index: number) => [
-        variable,
-        `__RAY_PROMPT_VAR_${index}__`,
-      ]),
-    );
-    const rendered = renderPromptTemplate(template.id, sentinelVariables);
+    const scaffoldTemplate = renderPromptScaffoldTemplate(templateId);
+    const rendered = scaffoldTemplate.rendered;
     const prompt = await this.applyTemplate(
       {
         input: rendered.input,
@@ -1551,38 +1549,14 @@ export class LlamaCppProvider implements ModelProvider {
       },
       signal,
     );
-    const segments: string[] = [];
-    let cursor = 0;
-
-    for (const variable of template.variables) {
-      const sentinel = sentinelVariables[variable];
-      if (!sentinel) {
-        throw new RayError(`Prompt scaffold marker "${variable}" is missing`, {
-          code: "provider_invalid_response",
-          status: 500,
-        });
-      }
-      const position = prompt.indexOf(sentinel, cursor);
-
-      if (position === -1) {
-        throw new RayError(
-          `Prompt scaffold marker "${variable}" was not found in rendered prompt`,
-          {
-            code: "provider_invalid_response",
-            status: 500,
-          },
-        );
-      }
-
-      segments.push(prompt.slice(cursor, position));
-      cursor = position + sentinel.length;
-    }
-
-    segments.push(prompt.slice(cursor));
-    const scaffold: PromptScaffold = {
-      segments,
-      variableOrder: [...template.variables],
-    };
+    const scaffold = createPromptScaffold({
+      prompt,
+      variableOrder: scaffoldTemplate.variableOrder,
+      sentinelVariables: scaffoldTemplate.sentinelVariables,
+      templateId: rendered.id,
+      templateVersion: rendered.version,
+      family: rendered.family,
+    });
     this.setPromptScaffold(cacheKey, scaffold);
     return scaffold;
   }
@@ -1591,26 +1565,7 @@ export class LlamaCppProvider implements ModelProvider {
     scaffold: PromptScaffold,
     templateVariables: Record<string, string>,
   ): string {
-    let prompt = scaffold.segments[0] ?? "";
-
-    for (let index = 0; index < scaffold.variableOrder.length; index += 1) {
-      const variableName = scaffold.variableOrder[index];
-      if (!variableName) {
-        continue;
-      }
-      const value = templateVariables[variableName];
-      if (value === undefined) {
-        throw new RayError(`Missing template variable "${variableName}" for prompt scaffold`, {
-          code: "invalid_request",
-          status: 400,
-        });
-      }
-
-      prompt += value;
-      prompt += scaffold.segments[index + 1] ?? "";
-    }
-
-    return prompt;
+    return renderPromptFromScaffold(scaffold, templateVariables);
   }
 
   private async getSlotSnapshots(signal?: AbortSignal): Promise<SchedulerSlotSnapshot[]> {
@@ -1809,19 +1764,7 @@ export class LlamaCppProvider implements ModelProvider {
   }
 
   private setPromptScaffold(key: string, scaffold: PromptScaffold): void {
-    if (this.promptScaffolds.has(key)) {
-      this.promptScaffolds.delete(key);
-    }
-
     this.promptScaffolds.set(key, scaffold);
-
-    while (this.promptScaffolds.size > this.maxPromptScaffoldEntries) {
-      const oldestKey = this.promptScaffolds.keys().next().value;
-      if (!oldestKey) {
-        break;
-      }
-      this.promptScaffolds.delete(oldestKey);
-    }
   }
 
   private async fetchHealthPayload(): Promise<{
 
@@ -5,5 +5,5 @@
     "outDir": "dist"
   },
   "include": ["src/**/*.ts"],
-  "references": [{ "path": "../core" }, { "path": "../prompts" }]
+  "references": [{ "path": "../core" }, { "path": "../prompt-cache" }, { "path": "../prompts" }]
 }
@@ -0,0 +1,24 @@
+{
+  "name": "@ray/prompt-cache",
+  "private": true,
+  "version": "0.1.0",
+  "type": "module",
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "development": "./src/index.ts",
+      "default": "./dist/index.js"
+    }
+  },
+  "files": [
+    "dist",
+    "src"
+  ],
+  "dependencies": {
+    "@ray/cache": "workspace:*",
+    "@ray/prompts": "workspace:*",
+    "@razroo/ray-core": "workspace:*"
+  }
+}
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +---
 +---
++
 +Repo-only: add an internal prompt scaffold cache package and use it for llama.cpp template prompt reuse.
Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@`
`17`	`17`	`"src"`
`18`	`18`	`],`
`19`	`19`	`"dependencies": {`
	`20`	`+ "@ray/prompt-cache": "workspace:*",`
`20`	`21`	`"@ray/prompts": "workspace:*",`
`21`	`22`	`"@razroo/ray-core": "workspace:*"`
`22`	`23`	`}`
Original file line number	Diff line number	Diff line change
`@@ -5,5 +5,5 @@`
`5`	`5`	`"outDir": "dist"`
`6`	`6`	`},`
`7`	`7`	`"include": ["src/**/*.ts"],`
`8`		`- "references": [{ "path": "../core" }, { "path": "../prompts" }]`
	`8`	`+ "references": [{ "path": "../core" }, { "path": "../prompt-cache" }, { "path": "../prompts" }]`
`9`	`9`	`}`