Skip to content

Commit d25b324

Browse files
Add prompt scaffold cache package
1 parent 9df44ab commit d25b324

11 files changed

Lines changed: 547 additions & 85 deletions

File tree

.changeset/ray-prompt-cache.md

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
---
2+
---
3+
4+
Repo-only: add an internal prompt scaffold cache package and use it for llama.cpp template prompt reuse.

bun.lock

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/models/package.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
"src"
1818
],
1919
"dependencies": {
20+
"@ray/prompt-cache": "workspace:*",
2021
"@ray/prompts": "workspace:*",
2122
"@razroo/ray-core": "workspace:*"
2223
}

packages/models/src/providers/llama-cpp.ts

Lines changed: 27 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -14,11 +14,15 @@ import {
1414
type SchedulerSlotSnapshot,
1515
type WarmupInferenceRequest,
1616
} from "@razroo/ray-core";
17+
import { resolvePromptTemplateRequest } from "@ray/prompts";
1718
import {
18-
renderPromptTemplate,
19-
requirePromptTemplate,
20-
resolvePromptTemplateRequest,
21-
} from "@ray/prompts";
19+
PromptScaffoldCache,
20+
buildPromptScaffoldCacheKey,
21+
createPromptScaffold,
22+
renderPromptFromScaffold,
23+
renderPromptScaffoldTemplate,
24+
type PromptScaffold,
25+
} from "@ray/prompt-cache";
2226
import {
2327
BACKEND_RESPONSE_BODY_LIMIT_BYTES,
2428
MAX_ADAPTER_TIMEOUT_MS,
@@ -47,6 +51,7 @@ const MAX_SLOT_SNAPSHOTS = 64;
4751
const MAX_FAMILY_PREFERRED_SLOT_KEYS = 512;
4852
const MAX_SLOT_FAMILY_ASSIGNMENTS = 64;
4953
const MAX_PROMPT_SCAFFOLD_CACHE_ENTRIES = 4_096;
54+
const PROMPT_SCAFFOLD_CACHE_TTL_MS = 86_400_000;
5055
const MAX_LLAMA_CPP_DIAGNOSTIC_NUMBER = 1_000_000_000;
5156
const MAX_LAUNCH_PROFILE_PATH_CHARS = 4_096;
5257
const MAX_LAUNCH_PROFILE_HOST_CHARS = 256;
@@ -188,11 +193,6 @@ interface LlamaCppSlotResponse {
188193
};
189194
}
190195

191-
interface PromptScaffold {
192-
segments: string[];
193-
variableOrder: string[];
194-
}
195-
196196
interface PreparedPromptState {
197197
prompt: string;
198198
}
@@ -726,7 +726,7 @@ export class LlamaCppProvider implements ModelProvider {
726726
} as const;
727727
private readonly preparationCache = new Map<string, ProviderRequestPreparation>();
728728
private readonly promptTokenCache = new Map<string, number>();
729-
private readonly promptScaffolds = new Map<string, PromptScaffold>();
729+
private readonly promptScaffolds: PromptScaffoldCache;
730730
private readonly familyPreferredSlots = new Map<string, number>();
731731
private readonly slotFamilyAssignments = new Map<number, string>();
732732
private readonly maxPreparationCacheEntries = 256;
@@ -743,6 +743,10 @@ export class LlamaCppProvider implements ModelProvider {
743743
this.modelId = model.id;
744744
this.adapter = snapshotLlamaCppAdapter(adapter, model.maxOutputTokens);
745745
this.maxPromptScaffoldEntries = this.adapter.promptScaffoldCacheEntries ?? 128;
746+
this.promptScaffolds = new PromptScaffoldCache({
747+
maxEntries: this.maxPromptScaffoldEntries,
748+
ttlMs: PROMPT_SCAFFOLD_CACHE_TTL_MS,
749+
});
746750
}
747751

748752
async warm(): Promise<void> {
@@ -1516,8 +1520,8 @@ export class LlamaCppProvider implements ModelProvider {
15161520
responseFormatType: "text" | "json_object",
15171521
signal?: AbortSignal,
15181522
): Promise<PromptScaffold> {
1519-
const cacheKey = hashValue({
1520-
model: this.adapter.modelRef,
1523+
const cacheKey = buildPromptScaffoldCacheKey({
1524+
modelRef: this.adapter.modelRef,
15211525
templateId,
15221526
responseFormatType,
15231527
});
@@ -1526,14 +1530,8 @@ export class LlamaCppProvider implements ModelProvider {
15261530
return cached;
15271531
}
15281532

1529-
const template = requirePromptTemplate(templateId);
1530-
const sentinelVariables = Object.fromEntries(
1531-
template.variables.map((variable: string, index: number) => [
1532-
variable,
1533-
`__RAY_PROMPT_VAR_${index}__`,
1534-
]),
1535-
);
1536-
const rendered = renderPromptTemplate(template.id, sentinelVariables);
1533+
const scaffoldTemplate = renderPromptScaffoldTemplate(templateId);
1534+
const rendered = scaffoldTemplate.rendered;
15371535
const prompt = await this.applyTemplate(
15381536
{
15391537
input: rendered.input,
@@ -1551,38 +1549,14 @@ export class LlamaCppProvider implements ModelProvider {
15511549
},
15521550
signal,
15531551
);
1554-
const segments: string[] = [];
1555-
let cursor = 0;
1556-
1557-
for (const variable of template.variables) {
1558-
const sentinel = sentinelVariables[variable];
1559-
if (!sentinel) {
1560-
throw new RayError(`Prompt scaffold marker "${variable}" is missing`, {
1561-
code: "provider_invalid_response",
1562-
status: 500,
1563-
});
1564-
}
1565-
const position = prompt.indexOf(sentinel, cursor);
1566-
1567-
if (position === -1) {
1568-
throw new RayError(
1569-
`Prompt scaffold marker "${variable}" was not found in rendered prompt`,
1570-
{
1571-
code: "provider_invalid_response",
1572-
status: 500,
1573-
},
1574-
);
1575-
}
1576-
1577-
segments.push(prompt.slice(cursor, position));
1578-
cursor = position + sentinel.length;
1579-
}
1580-
1581-
segments.push(prompt.slice(cursor));
1582-
const scaffold: PromptScaffold = {
1583-
segments,
1584-
variableOrder: [...template.variables],
1585-
};
1552+
const scaffold = createPromptScaffold({
1553+
prompt,
1554+
variableOrder: scaffoldTemplate.variableOrder,
1555+
sentinelVariables: scaffoldTemplate.sentinelVariables,
1556+
templateId: rendered.id,
1557+
templateVersion: rendered.version,
1558+
family: rendered.family,
1559+
});
15861560
this.setPromptScaffold(cacheKey, scaffold);
15871561
return scaffold;
15881562
}
@@ -1591,26 +1565,7 @@ export class LlamaCppProvider implements ModelProvider {
15911565
scaffold: PromptScaffold,
15921566
templateVariables: Record<string, string>,
15931567
): string {
1594-
let prompt = scaffold.segments[0] ?? "";
1595-
1596-
for (let index = 0; index < scaffold.variableOrder.length; index += 1) {
1597-
const variableName = scaffold.variableOrder[index];
1598-
if (!variableName) {
1599-
continue;
1600-
}
1601-
const value = templateVariables[variableName];
1602-
if (value === undefined) {
1603-
throw new RayError(`Missing template variable "${variableName}" for prompt scaffold`, {
1604-
code: "invalid_request",
1605-
status: 400,
1606-
});
1607-
}
1608-
1609-
prompt += value;
1610-
prompt += scaffold.segments[index + 1] ?? "";
1611-
}
1612-
1613-
return prompt;
1568+
return renderPromptFromScaffold(scaffold, templateVariables);
16141569
}
16151570

16161571
private async getSlotSnapshots(signal?: AbortSignal): Promise<SchedulerSlotSnapshot[]> {
@@ -1809,19 +1764,7 @@ export class LlamaCppProvider implements ModelProvider {
18091764
}
18101765

18111766
private setPromptScaffold(key: string, scaffold: PromptScaffold): void {
1812-
if (this.promptScaffolds.has(key)) {
1813-
this.promptScaffolds.delete(key);
1814-
}
1815-
18161767
this.promptScaffolds.set(key, scaffold);
1817-
1818-
while (this.promptScaffolds.size > this.maxPromptScaffoldEntries) {
1819-
const oldestKey = this.promptScaffolds.keys().next().value;
1820-
if (!oldestKey) {
1821-
break;
1822-
}
1823-
this.promptScaffolds.delete(oldestKey);
1824-
}
18251768
}
18261769

18271770
private async fetchHealthPayload(): Promise<{

packages/models/tsconfig.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,5 +5,5 @@
55
"outDir": "dist"
66
},
77
"include": ["src/**/*.ts"],
8-
"references": [{ "path": "../core" }, { "path": "../prompts" }]
8+
"references": [{ "path": "../core" }, { "path": "../prompt-cache" }, { "path": "../prompts" }]
99
}

packages/prompt-cache/package.json

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,24 @@
1+
{
2+
"name": "@ray/prompt-cache",
3+
"private": true,
4+
"version": "0.1.0",
5+
"type": "module",
6+
"main": "./dist/index.js",
7+
"types": "./dist/index.d.ts",
8+
"exports": {
9+
".": {
10+
"types": "./dist/index.d.ts",
11+
"development": "./src/index.ts",
12+
"default": "./dist/index.js"
13+
}
14+
},
15+
"files": [
16+
"dist",
17+
"src"
18+
],
19+
"dependencies": {
20+
"@ray/cache": "workspace:*",
21+
"@ray/prompts": "workspace:*",
22+
"@razroo/ray-core": "workspace:*"
23+
}
24+
}

0 commit comments

Comments
 (0)