Commit 7f1712c

(fix): enforce embedding model token limit to prevent overflow (openclaw#13455)
* fix: enforce embedding model token limit to prevent 8192 overflow
  - Replace EMBEDDING_APPROX_CHARS_PER_TOKEN=1 with UTF-8 byte length estimation (a safe upper bound for tokenizer output)
  - Add EMBEDDING_MODEL_MAX_TOKENS=8192 hard cap
  - Add splitChunkToTokenLimit(), which binary-searches for the largest safe split point, with surrogate-pair handling
  - Add enforceChunkTokenLimit() wrapper called in indexFile() after chunkMarkdown(), before any embedding API call
  - Fixes: session files with large JSONL entries could produce chunks exceeding text-embedding-3-small's 8192-token limit

  Tests: 2 new colocated tests in manager.embedding-token-limit.test.ts
  - Verifies oversized ASCII chunks are split to <= 8192 bytes each
  - Verifies multibyte (emoji) content batching respects byte limits

* fix: make embedding token limit provider-aware
  - Add optional maxInputTokens to the EmbeddingProvider interface
  - Each provider (openai, gemini, voyage) reports its own limit
  - Known-limits map as fallback: openai 8192, gemini 2048, voyage 32K
  - Resolution: provider field > known map > default 8192
  - Backward compatible: local/llama uses the fallback

* fix: enforce embedding input size limits (openclaw#13455) (thanks @rodrigouroz)

---------

Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
1 parent c95b378 commit 7f1712c
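
The key invariant behind the fix, sketched here for context (this snippet is not part of the commit): a tokenizer must consume at least one byte per token, so a string's UTF-8 byte length is an upper bound on its token count, and capping inputs at max_tokens *bytes* can never overshoot the model's token limit.

```ts
// Hypothetical check, assuming a Node.js runtime (Buffer is built in).
const MAX_TOKENS = 8192;

function fitsTokenLimit(text: string): boolean {
  // token_count <= utf8_byte_length, so this test is conservative:
  // if it returns true, the real token count is definitely within the limit.
  return Buffer.byteLength(text, "utf8") <= MAX_TOKENS;
}

console.log(fitsTokenLimit("x".repeat(8000)));  // true  — 8000 bytes, at most 8000 tokens
console.log(fitsTokenLimit("😀".repeat(3000))); // false — 12000 bytes, so this input gets split first
```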

9 files changed

Lines changed: 277 additions & 11 deletions

src/memory/embedding-chunk-limits.ts

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+import type { EmbeddingProvider } from "./embeddings.js";
+import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js";
+import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js";
+import { hashText, type MemoryChunk } from "./internal.js";
+
+export function enforceEmbeddingMaxInputTokens(
+  provider: EmbeddingProvider,
+  chunks: MemoryChunk[],
+): MemoryChunk[] {
+  const maxInputTokens = resolveEmbeddingMaxInputTokens(provider);
+  const out: MemoryChunk[] = [];
+
+  for (const chunk of chunks) {
+    if (estimateUtf8Bytes(chunk.text) <= maxInputTokens) {
+      out.push(chunk);
+      continue;
+    }
+
+    for (const text of splitTextToUtf8ByteLimit(chunk.text, maxInputTokens)) {
+      out.push({
+        startLine: chunk.startLine,
+        endLine: chunk.endLine,
+        text,
+        hash: hashText(text),
+      });
+    }
+  }
+
+  return out;
+}
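
A usage sketch of the wrapper above (hypothetical values; MemoryChunk is assumed here to carry only the four fields the wrapper constructs, and the mock provider leaves maxInputTokens unset so the resolver's fallback applies):

```ts
import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
import type { EmbeddingProvider } from "./embeddings.js";
import { hashText, type MemoryChunk } from "./internal.js";

// Mock provider without maxInputTokens; resolveEmbeddingMaxInputTokens()
// falls back to its known-limits map / 8192 default.
const provider: EmbeddingProvider = {
  id: "openai",
  model: "text-embedding-3-small",
  embedQuery: async () => [],
  embedBatch: async (texts) => texts.map(() => []),
};

const text = "x".repeat(9500); // 9500 UTF-8 bytes, over the 8192 cap
const chunks: MemoryChunk[] = [{ startLine: 1, endLine: 1, text, hash: hashText(text) }];

// The oversized chunk comes back as two chunks, each <= 8192 bytes, with the
// original line range preserved and a fresh hash per piece.
const safe = enforceEmbeddingMaxInputTokens(provider, chunks);
console.log(safe.map((c) => Buffer.byteLength(c.text, "utf8"))); // [8192, 1308]
```
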
src/memory/embedding-input-limits.ts

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+// Helpers for enforcing embedding model input size limits.
+//
+// We use UTF-8 byte length as a conservative upper bound for tokenizer output.
+// Tokenizers operate over bytes; a token must contain at least one byte, so
+// token_count <= utf8_byte_length.
+
+export function estimateUtf8Bytes(text: string): number {
+  if (!text) {
+    return 0;
+  }
+  return Buffer.byteLength(text, "utf8");
+}
+
+export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] {
+  if (maxUtf8Bytes <= 0) {
+    return [text];
+  }
+  if (estimateUtf8Bytes(text) <= maxUtf8Bytes) {
+    return [text];
+  }
+
+  const parts: string[] = [];
+  let cursor = 0;
+  while (cursor < text.length) {
+    // The number of UTF-16 code units is always <= the number of UTF-8 bytes.
+    // This makes `cursor + maxUtf8Bytes` a safe upper bound on the next split point.
+    let low = cursor + 1;
+    let high = Math.min(text.length, cursor + maxUtf8Bytes);
+    let best = cursor;
+
+    while (low <= high) {
+      const mid = Math.floor((low + high) / 2);
+      const bytes = estimateUtf8Bytes(text.slice(cursor, mid));
+      if (bytes <= maxUtf8Bytes) {
+        best = mid;
+        low = mid + 1;
+      } else {
+        high = mid - 1;
+      }
+    }
+
+    if (best <= cursor) {
+      best = Math.min(text.length, cursor + 1);
+    }
+
+    // Avoid splitting inside a surrogate pair.
+    if (
+      best < text.length &&
+      best > cursor &&
+      text.charCodeAt(best - 1) >= 0xd800 &&
+      text.charCodeAt(best - 1) <= 0xdbff &&
+      text.charCodeAt(best) >= 0xdc00 &&
+      text.charCodeAt(best) <= 0xdfff
+    ) {
+      best -= 1;
+    }
+
+    const part = text.slice(cursor, best);
+    if (!part) {
+      break;
+    }
+    parts.push(part);
+    cursor = best;
+  }
+
+  return parts;
+}
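
A quick sketch of the splitter's multibyte behavior (values chosen for illustration): each "😀" is one code point, two UTF-16 code units, and four UTF-8 bytes, so a 10-byte budget fits at most two emoji per part, and the surrogate-pair guard keeps a single emoji from being cut in half.

```ts
import { splitTextToUtf8ByteLimit } from "./embedding-input-limits.js";

const parts = splitTextToUtf8ByteLimit("😀".repeat(5), 10);
console.log(parts);                                          // ["😀😀", "😀😀", "😀"]
console.log(parts.map((p) => Buffer.byteLength(p, "utf8"))); // [8, 8, 4]
```
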
src/memory/embedding-model-limits.ts

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+import type { EmbeddingProvider } from "./embeddings.js";
+
+const DEFAULT_EMBEDDING_MAX_INPUT_TOKENS = 8192;
+
+const KNOWN_EMBEDDING_MAX_INPUT_TOKENS: Record<string, number> = {
+  "openai:text-embedding-3-small": 8192,
+  "openai:text-embedding-3-large": 8192,
+  "openai:text-embedding-ada-002": 8191,
+  "gemini:text-embedding-004": 2048,
+  "voyage:voyage-3": 32000,
+  "voyage:voyage-3-lite": 16000,
+  "voyage:voyage-code-3": 32000,
+};
+
+export function resolveEmbeddingMaxInputTokens(provider: EmbeddingProvider): number {
+  if (typeof provider.maxInputTokens === "number") {
+    return provider.maxInputTokens;
+  }
+
+  // Provider/model mapping is best-effort; different providers use different
+  // limits and we prefer to be conservative when we don't know.
+  const key = `${provider.id}:${provider.model}`.toLowerCase();
+  const known = KNOWN_EMBEDDING_MAX_INPUT_TOKENS[key];
+  if (typeof known === "number") {
+    return known;
+  }
+
+  // Provider-specific conservative fallbacks. This prevents us from accidentally
+  // using the OpenAI default for providers with much smaller limits.
+  if (provider.id.toLowerCase() === "gemini") {
+    return 2048;
+  }
+
+  return DEFAULT_EMBEDDING_MAX_INPUT_TOKENS;
+}
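
A sketch of the resolution order (stub providers for illustration; the "local" id and "llama-embed" model are made up to show the fallback path):

```ts
import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js";
import type { EmbeddingProvider } from "./embeddings.js";

// Embedding functions are irrelevant to resolution; stub them out.
const stub = {
  embedQuery: async () => [] as number[],
  embedBatch: async (texts: string[]) => texts.map(() => [] as number[]),
};

// 1) An explicit maxInputTokens on the provider wins outright.
const voyage: EmbeddingProvider = { id: "voyage", model: "voyage-3", maxInputTokens: 32000, ...stub };

// 2) Otherwise the known-limits map is consulted by "id:model".
const ada: EmbeddingProvider = { id: "openai", model: "text-embedding-ada-002", ...stub };

// 3) Otherwise a provider-level fallback (gemini -> 2048) or the 8192 default.
const local: EmbeddingProvider = { id: "local", model: "llama-embed", ...stub };

console.log(resolveEmbeddingMaxInputTokens(voyage)); // 32000
console.log(resolveEmbeddingMaxInputTokens(ada));    // 8191
console.log(resolveEmbeddingMaxInputTokens(local));  // 8192
```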

src/memory/embeddings-gemini.ts

Lines changed: 4 additions & 0 deletions
@@ -12,6 +12,9 @@ export type GeminiEmbeddingClient = {
 
 const DEFAULT_GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
 export const DEFAULT_GEMINI_EMBEDDING_MODEL = "gemini-embedding-001";
+const GEMINI_MAX_INPUT_TOKENS: Record<string, number> = {
+  "text-embedding-004": 2048,
+};
 const debugEmbeddings = isTruthyEnvValue(process.env.OPENCLAW_DEBUG_MEMORY_EMBEDDINGS);
 const log = createSubsystemLogger("memory/embeddings");
 
@@ -117,6 +120,7 @@ export async function createGeminiEmbeddingProvider(
     provider: {
       id: "gemini",
       model: client.model,
+      maxInputTokens: GEMINI_MAX_INPUT_TOKENS[client.model],
       embedQuery,
       embedBatch,
     },

src/memory/embeddings-openai.ts

Lines changed: 6 additions & 0 deletions
@@ -9,6 +9,11 @@ export type OpenAiEmbeddingClient = {
 
 export const DEFAULT_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small";
 const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
+const OPENAI_MAX_INPUT_TOKENS: Record<string, number> = {
+  "text-embedding-3-small": 8192,
+  "text-embedding-3-large": 8192,
+  "text-embedding-ada-002": 8191,
+};
 
 export function normalizeOpenAiModel(model: string): string {
   const trimmed = model.trim();
@@ -51,6 +56,7 @@ export async function createOpenAiEmbeddingProvider(
     provider: {
       id: "openai",
       model: client.model,
+      maxInputTokens: OPENAI_MAX_INPUT_TOKENS[client.model],
       embedQuery: async (text) => {
         const [vec] = await embed([text]);
         return vec ?? [];

src/memory/embeddings-voyage.ts

Lines changed: 6 additions & 0 deletions
@@ -9,6 +9,11 @@ export type VoyageEmbeddingClient = {
 
 export const DEFAULT_VOYAGE_EMBEDDING_MODEL = "voyage-4-large";
 const DEFAULT_VOYAGE_BASE_URL = "https://api.voyageai.com/v1";
+const VOYAGE_MAX_INPUT_TOKENS: Record<string, number> = {
+  "voyage-3": 32000,
+  "voyage-3-lite": 16000,
+  "voyage-code-3": 32000,
+};
 
 export function normalizeVoyageModel(model: string): string {
   const trimmed = model.trim();
@@ -59,6 +64,7 @@ export async function createVoyageEmbeddingProvider(
     provider: {
       id: "voyage",
       model: client.model,
+      maxInputTokens: VOYAGE_MAX_INPUT_TOKENS[client.model],
       embedQuery: async (text) => {
         const [vec] = await embed([text], "query");
         return vec ?? [];

src/memory/embeddings.ts

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ export type { VoyageEmbeddingClient } from "./embeddings-voyage.js";
 export type EmbeddingProvider = {
   id: string;
   model: string;
+  maxInputTokens?: number;
   embedQuery: (text: string) => Promise<number[]>;
   embedBatch: (texts: string[]) => Promise<number[][]>;
 };
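
Because maxInputTokens is optional, existing providers keep working unchanged. A hypothetical local provider (names and the 2048 limit below are illustrative only) can set the field when its model's limit is known, or omit it and inherit the resolver's fallback:

```ts
import type { EmbeddingProvider } from "./embeddings.js";

// Hypothetical local provider; embedLocally is a placeholder for whatever
// actually produces the vectors.
const localProvider: EmbeddingProvider = {
  id: "local",
  model: "my-llama-embedder",
  maxInputTokens: 2048, // omit to fall back to the known-limits map / 8192 default
  embedQuery: async (text) => (await embedLocally([text]))[0] ?? [],
  embedBatch: async (texts) => embedLocally(texts),
};

declare function embedLocally(texts: string[]): Promise<number[][]>;
```
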
src/memory/manager.embedding-token-limit.test.ts

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { getMemorySearchManager, type MemoryIndexManager } from "./index.js";
+
+const embedBatch = vi.fn(async (texts: string[]) => texts.map(() => [0, 1, 0]));
+const embedQuery = vi.fn(async () => [0, 1, 0]);
+
+vi.mock("./embeddings.js", () => ({
+  createEmbeddingProvider: async () => ({
+    requestedProvider: "openai",
+    provider: {
+      id: "mock",
+      model: "mock-embed",
+      maxInputTokens: 8192,
+      embedQuery,
+      embedBatch,
+    },
+  }),
+}));
+
+describe("memory embedding token limits", () => {
+  let workspaceDir: string;
+  let indexPath: string;
+  let manager: MemoryIndexManager | null = null;
+
+  beforeEach(async () => {
+    embedBatch.mockReset();
+    embedQuery.mockReset();
+    embedBatch.mockImplementation(async (texts: string[]) => texts.map(() => [0, 1, 0]));
+    embedQuery.mockImplementation(async () => [0, 1, 0]);
+    workspaceDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-mem-token-"));
+    indexPath = path.join(workspaceDir, "index.sqlite");
+    await fs.mkdir(path.join(workspaceDir, "memory"));
+  });
+
+  afterEach(async () => {
+    if (manager) {
+      await manager.close();
+      manager = null;
+    }
+    await fs.rm(workspaceDir, { recursive: true, force: true });
+  });
+
+  it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => {
+    const content = "x".repeat(9500);
+    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-09.md"), content);
+
+    const cfg = {
+      agents: {
+        defaults: {
+          workspace: workspaceDir,
+          memorySearch: {
+            provider: "openai",
+            model: "mock-embed",
+            store: { path: indexPath },
+            chunking: { tokens: 10_000, overlap: 0 },
+            sync: { watch: false, onSessionStart: false, onSearch: false },
+            query: { minScore: 0 },
+          },
+        },
+        list: [{ id: "main", default: true }],
+      },
+    };
+
+    const result = await getMemorySearchManager({ cfg, agentId: "main" });
+    expect(result.manager).not.toBeNull();
+    if (!result.manager) {
+      throw new Error("manager missing");
+    }
+    manager = result.manager;
+    await manager.sync({ force: true });
+
+    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
+    expect(inputs.length).toBeGreaterThan(1);
+    expect(
+      Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))),
+    ).toBeLessThanOrEqual(8192);
+  });
+
+  it("uses UTF-8 byte estimates when batching multibyte chunks", async () => {
+    const line = "😀".repeat(1800);
+    const content = `${line}\n${line}\n${line}`;
+    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-10.md"), content);
+
+    const cfg = {
+      agents: {
+        defaults: {
+          workspace: workspaceDir,
+          memorySearch: {
+            provider: "openai",
+            model: "mock-embed",
+            store: { path: indexPath },
+            chunking: { tokens: 1000, overlap: 0 },
+            sync: { watch: false, onSessionStart: false, onSearch: false },
+            query: { minScore: 0 },
+          },
+        },
+        list: [{ id: "main", default: true }],
+      },
+    };
+
+    const result = await getMemorySearchManager({ cfg, agentId: "main" });
+    expect(result.manager).not.toBeNull();
+    if (!result.manager) {
+      throw new Error("manager missing");
+    }
+    manager = result.manager;
+    await manager.sync({ force: true });
+
+    const batchSizes = embedBatch.mock.calls.map(
+      (call) => (call[0] as string[] | undefined)?.length ?? 0,
+    );
+    expect(batchSizes.length).toBe(3);
+    expect(batchSizes.every((size) => size === 1)).toBe(true);
+    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
+    expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true);
+  });
+});

src/memory/manager.ts

Lines changed: 8 additions & 11 deletions
@@ -27,6 +27,8 @@ import {
   runOpenAiEmbeddingBatches,
 } from "./batch-openai.js";
 import { type VoyageBatchRequest, runVoyageEmbeddingBatches } from "./batch-voyage.js";
+import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
+import { estimateUtf8Bytes } from "./embedding-input-limits.js";
 import { DEFAULT_GEMINI_EMBEDDING_MODEL } from "./embeddings-gemini.js";
 import { DEFAULT_OPENAI_EMBEDDING_MODEL } from "./embeddings-openai.js";
 import { DEFAULT_VOYAGE_EMBEDDING_MODEL } from "./embeddings-voyage.js";
@@ -87,7 +89,6 @@ const FTS_TABLE = "chunks_fts";
 const EMBEDDING_CACHE_TABLE = "embedding_cache";
 const SESSION_DIRTY_DEBOUNCE_MS = 5000;
 const EMBEDDING_BATCH_MAX_TOKENS = 8000;
-const EMBEDDING_APPROX_CHARS_PER_TOKEN = 1;
 const EMBEDDING_INDEX_CONCURRENCY = 4;
 const EMBEDDING_RETRY_MAX_ATTEMPTS = 3;
 const EMBEDDING_RETRY_BASE_DELAY_MS = 500;
@@ -1543,20 +1544,13 @@ export class MemoryIndexManager implements MemorySearchManager {
       .run(META_KEY, value);
   }
 
-  private estimateEmbeddingTokens(text: string): number {
-    if (!text) {
-      return 0;
-    }
-    return Math.ceil(text.length / EMBEDDING_APPROX_CHARS_PER_TOKEN);
-  }
-
   private buildEmbeddingBatches(chunks: MemoryChunk[]): MemoryChunk[][] {
     const batches: MemoryChunk[][] = [];
     let current: MemoryChunk[] = [];
     let currentTokens = 0;
 
     for (const chunk of chunks) {
-      const estimate = this.estimateEmbeddingTokens(chunk.text);
+      const estimate = estimateUtf8Bytes(chunk.text);
       const wouldExceed =
         current.length > 0 && currentTokens + estimate > EMBEDDING_BATCH_MAX_TOKENS;
       if (wouldExceed) {
@@ -2206,8 +2200,11 @@ export class MemoryIndexManager implements MemorySearchManager {
     options: { source: MemorySource; content?: string },
   ) {
     const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
-    const chunks = chunkMarkdown(content, this.settings.chunking).filter(
-      (chunk) => chunk.text.trim().length > 0,
+    const chunks = enforceEmbeddingMaxInputTokens(
+      this.provider,
+      chunkMarkdown(content, this.settings.chunking).filter(
+        (chunk) => chunk.text.trim().length > 0,
+      ),
     );
     if (options.source === "sessions" && "lineMap" in entry) {
       remapChunkLines(chunks, entry.lineMap);
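
The batching rule itself is unchanged; only the size estimate moved from character count to UTF-8 bytes. A simplified standalone sketch of that rule (not the private class method) under the 8000-byte batch budget:

```ts
import { estimateUtf8Bytes } from "./embedding-input-limits.js";

const EMBEDDING_BATCH_MAX_TOKENS = 8000;

// Greedy batching: start a new batch whenever adding the next text would push
// the running UTF-8 byte estimate past the budget.
function buildBatches(texts: string[]): string[][] {
  const batches: string[][] = [];
  let current: string[] = [];
  let currentBytes = 0;

  for (const text of texts) {
    const estimate = estimateUtf8Bytes(text);
    if (current.length > 0 && currentBytes + estimate > EMBEDDING_BATCH_MAX_TOKENS) {
      batches.push(current);
      current = [];
      currentBytes = 0;
    }
    current.push(text);
    currentBytes += estimate;
  }
  if (current.length > 0) {
    batches.push(current);
  }
  return batches;
}

// Three ~7200-byte emoji chunks no longer share a batch the way a character
// count would have allowed: each one gets its own embedBatch() call.
console.log(buildBatches(["😀".repeat(1800), "😀".repeat(1800), "😀".repeat(1800)]).length); // 3
```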
