Skip to content

Commit 8f61bfd

Browse files
authored
Add file attachment context handling (#141)
- Support text file attachments alongside images
- Persist, validate, and forward file context through server and UI
- Relax attachment lookup to handle non-image files
1 parent 36f6f99 commit 8f61bfd

17 files changed

Lines changed: 998 additions & 331 deletions

apps/server/src/attachmentStore.ts

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
import { randomUUID } from "node:crypto";
2-
import { existsSync } from "node:fs";
2+
import { existsSync, readdirSync } from "node:fs";
33

44
import type { ChatAttachment } from "@okcode/contracts";
55

66
import {
77
normalizeAttachmentRelativePath,
88
resolveAttachmentRelativePath,
99
} from "./attachmentPaths.ts";
10-
import { inferImageExtension, SAFE_IMAGE_FILE_EXTENSIONS } from "./imageMime.ts";
11-
12-
const ATTACHMENT_FILENAME_EXTENSIONS = [...SAFE_IMAGE_FILE_EXTENSIONS, ".bin"];
10+
import { inferAttachmentExtension } from "./imageMime.ts";
1311
const ATTACHMENT_ID_THREAD_SEGMENT_MAX_CHARS = 80;
1412
const ATTACHMENT_ID_THREAD_SEGMENT_PATTERN = "[a-z0-9_]+(?:-[a-z0-9_]+)*";
1513
const ATTACHMENT_ID_UUID_PATTERN = "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}";
@@ -55,8 +53,9 @@ export function parseThreadSegmentFromAttachmentId(attachmentId: string): string
5553

5654
export function attachmentRelativePath(attachment: ChatAttachment): string {
5755
switch (attachment.type) {
58-
case "image": {
59-
const extension = inferImageExtension({
56+
case "image":
57+
case "file": {
58+
const extension = inferAttachmentExtension({
6059
mimeType: attachment.mimeType,
6160
fileName: attachment.name,
6261
});
@@ -83,10 +82,20 @@ export function resolveAttachmentPathById(input: {
8382
if (!normalizedId || normalizedId.includes("/") || normalizedId.includes(".")) {
8483
return null;
8584
}
86-
for (const extension of ATTACHMENT_FILENAME_EXTENSIONS) {
85+
let entries: string[];
86+
try {
87+
entries = readdirSync(input.attachmentsDir);
88+
} catch {
89+
return null;
90+
}
91+
for (const entry of entries) {
92+
const entryId = parseAttachmentIdFromRelativePath(entry);
93+
if (entryId !== normalizedId) {
94+
continue;
95+
}
8796
const maybePath = resolveAttachmentRelativePath({
8897
attachmentsDir: input.attachmentsDir,
89-
relativePath: `${normalizedId}${extension}`,
98+
relativePath: entry,
9099
});
91100
if (maybePath && existsSync(maybePath)) {
92101
return maybePath;

apps/server/src/attachmentText.ts

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
import {
2+
PROVIDER_SEND_TURN_MAX_INPUT_CHARS,
3+
type ChatFileAttachment,
4+
} from "@okcode/contracts";
5+
6+
/** Combined character budget for all attached-file blocks added to one message. */
const MAX_FILE_CONTEXT_TOTAL_CHARS = 80_000;

/** Character budget for the content taken from any single attached file. */
const MAX_FILE_CONTEXT_CHARS_PER_FILE = 24_000;

/**
 * Shared UTF-8 decoder in non-fatal mode: malformed byte sequences decode to
 * U+FFFD instead of throwing.
 */
const TEXT_DECODER = new TextDecoder("utf-8", { fatal: false });
9+
/**
 * Fragments that mark a MIME type as text-like when they occur anywhere in
 * the lower-cased type string.
 */
const TEXTUAL_MIME_SUBSTRINGS = [
  // Structured data and configuration formats.
  "json", "xml", "yaml", "toml",
  // Source code and markup.
  "javascript", "typescript", "markdown",
  // Tabular data and query languages.
  "csv", "graphql", "sql",
  // Shell scripts.
  "x-sh", "x-shellscript",
];
23+
/**
 * Lower-case file extensions (without the leading dot) that mark a file name
 * as text-like. Entries are kept in alphabetical order.
 */
const TEXTUAL_FILE_EXTENSIONS = new Set([
  "c", "cc", "cfg", "conf", "cpp", "cs", "css", "csv",
  "env",
  "go", "graphql",
  "h", "hpp", "html",
  "ini",
  "java", "js", "json", "jsx",
  "kt",
  "log", "lua",
  "md", "mjs",
  "php", "pl", "py",
  "rb", "rs",
  "scss", "sh", "sql", "svg", "swift",
  "toml", "ts", "tsx", "txt",
  "vue",
  "xml",
  "yaml", "yml",
  "zsh",
]);
68+
69+
/**
 * Extracts the extension of a file name, lower-cased and without the dot.
 *
 * Only a trailing run of 1-12 ASCII letters/digits after the last dot counts
 * as an extension; surrounding whitespace on the name is ignored.
 *
 * @param fileName - File name to inspect.
 * @returns The lower-cased extension, or `""` when there is none.
 */
function attachmentExtension(fileName: string): string {
  const trimmedName = fileName.trim();
  const captured = /\.([a-z0-9]{1,12})$/i.exec(trimmedName)?.[1];
  if (captured === undefined) {
    return "";
  }
  return captured.toLowerCase();
}
73+
74+
/**
 * Reports whether a MIME type suggests textual content.
 *
 * The type is trimmed and lower-cased first. It is text-like when it starts
 * with `text/` or contains any entry of `TEXTUAL_MIME_SUBSTRINGS`.
 *
 * @param mimeType - MIME type string to classify.
 * @returns `true` when the type looks textual.
 */
function looksTextLikeMimeType(mimeType: string): boolean {
  const canonical = mimeType.trim().toLowerCase();
  return (
    canonical.startsWith("text/") ||
    TEXTUAL_MIME_SUBSTRINGS.some((fragment) => canonical.includes(fragment))
  );
}
81+
82+
/**
 * Reports whether a file name carries an extension listed in
 * `TEXTUAL_FILE_EXTENSIONS`.
 *
 * @param fileName - File name to classify.
 * @returns `true` when the extension is a known text extension.
 */
function looksTextLikeFileName(fileName: string): boolean {
  const extension = attachmentExtension(fileName);
  return TEXTUAL_FILE_EXTENSIONS.has(extension);
}
85+
86+
/**
 * Heuristic check for binary content in decoded text.
 *
 * A NUL code unit is treated as binary immediately. Otherwise every code unit
 * below 32 other than tab, line feed and carriage return is counted as a
 * control character, and the text is suspicious when those exceed 2% of the
 * remaining code units (or when it consists of nothing but such characters).
 *
 * @param text - Decoded text to inspect.
 * @returns `true` when the text looks like binary data.
 */
function hasSuspiciousControlBytes(text: string): boolean {
  let controlUnits = 0;
  let ordinaryUnits = 0;

  for (let i = 0; i < text.length; i++) {
    const unit = text.charCodeAt(i);
    if (unit === 0) {
      return true;
    }
    const isAllowedWhitespace = unit === 9 || unit === 10 || unit === 13;
    if (unit >= 32 || isAllowedWhitespace) {
      ordinaryUnits++;
    } else {
      controlUnits++;
    }
  }

  if (ordinaryUnits === 0) {
    // Nothing but control characters (or an empty string).
    return controlUnits > 0;
  }
  return controlUnits / ordinaryUnits > 0.02;
}
105+
106+
/**
 * Decodes an attachment's bytes as UTF-8 text, or rejects it as non-text.
 *
 * Rejection (a `null` result) happens when the decoded text trips
 * `hasSuspiciousControlBytes`, or contains too many U+FFFD replacement
 * characters, or is whitespace-only while neither the MIME type nor the file
 * name looks textual. The tolerated share of replacement characters is 2%
 * when the MIME type or file name looks textual and 0.5% otherwise.
 *
 * @param input - The attachment's MIME type, file name and raw bytes.
 * @returns The decoded text with `\r\n` and lone `\r` normalised to `\n`,
 *   `""` for an empty payload, or `null` when the payload does not look
 *   like text.
 */
export function extractTextAttachmentContents(input: {
  readonly mimeType: string;
  readonly fileName: string;
  readonly bytes: Uint8Array;
}): string | null {
  const { bytes, mimeType, fileName } = input;
  if (bytes.byteLength === 0) {
    return "";
  }

  const text = TEXT_DECODER.decode(bytes);
  if (hasSuspiciousControlBytes(text)) {
    return null;
  }

  // Count U+FFFD occurrences, i.e. places where decoding hit malformed input.
  let replacementChars = 0;
  for (let i = 0; i < text.length; i++) {
    if (text.charCodeAt(i) === 0xfffd) {
      replacementChars++;
    }
  }

  const likelyText = looksTextLikeMimeType(mimeType) || looksTextLikeFileName(fileName);
  const toleratedRatio = likelyText ? 0.02 : 0.005;
  if (replacementChars / Math.max(text.length, 1) > toleratedRatio) {
    return null;
  }
  if (!likelyText && text.trim().length === 0) {
    return null;
  }

  return text.replace(/\r\n?/g, "\n");
}
130+
131+
/**
 * Appends the text of attached files to a message as `<attached_file>` blocks.
 *
 * The result is `baseText`, followed by an "Attached file context:" header and
 * one block per attachment holding its name, MIME type, byte size and content.
 * Nothing is appended that would push the result past `maxChars`; when
 * `baseText` alone is already over the limit it is returned unchanged.
 *
 * Budgets applied while appending:
 * - each file contributes at most `MAX_FILE_CONTEXT_CHARS_PER_FILE` content
 *   characters;
 * - all blocks together (wrappers included) stay within
 *   `MAX_FILE_CONTEXT_TOTAL_CHARS`;
 * - the whole result stays within `maxChars`.
 *
 * Content that does not fit is cut and marked with a truncation note. As soon
 * as one file cannot be included at all, it and every following file are
 * skipped, and a summary note is appended if there is still room for it.
 *
 * @param input.baseText - Message text the file context is appended to.
 * @param input.attachments - Attachments paired with their decoded text, in
 *   the order they should appear.
 * @param input.maxChars - Upper bound on the result length; defaults to
 *   `PROVIDER_SEND_TURN_MAX_INPUT_CHARS`. Floored, clamped to at least 1, and
 *   replaced by the default when it is `NaN`.
 * @returns `baseText` with as much file context appended as the budgets allow.
 */
export function buildFileAttachmentContextText(input: {
  readonly baseText: string;
  readonly attachments: ReadonlyArray<{
    readonly attachment: ChatFileAttachment;
    readonly text: string;
  }>;
  readonly maxChars?: number;
}): string {
  if (input.attachments.length === 0) {
    return input.baseText;
  }

  // A NaN limit would make every budget comparison false and silently produce
  // empty content blocks, so fall back to the default instead.
  const requestedMaxChars = input.maxChars ?? PROVIDER_SEND_TURN_MAX_INPUT_CHARS;
  const maxChars = Math.max(
    1,
    Math.floor(
      Number.isNaN(requestedMaxChars) ? PROVIDER_SEND_TURN_MAX_INPUT_CHARS : requestedMaxChars,
    ),
  );
  let result = input.baseText;
  let usedFileContextChars = 0;
  let omittedCount = 0;

  // Appends `chunk` only when the result stays within `maxChars`.
  const append = (chunk: string): boolean => {
    if (chunk.length === 0) {
      return true;
    }
    if (result.length + chunk.length > maxChars) {
      return false;
    }
    result += chunk;
    return true;
  };

  const header = `${result.length > 0 ? "\n\n" : ""}Attached file context:`;
  if (!append(header)) {
    return result;
  }

  const closeBlock = "\n</attached_file>";
  const truncationNote = "\n[content truncated to fit input limits]";

  for (const [index, entry] of input.attachments.entries()) {
    const openBlock =
      "\n\n<attached_file>\n" +
      `name: ${entry.attachment.name}\n` +
      `mime_type: ${entry.attachment.mimeType}\n` +
      `size_bytes: ${entry.attachment.sizeBytes}\n` +
      "content:\n";
    const wrapperChars = openBlock.length + closeBlock.length;
    const remainingContextBudget = MAX_FILE_CONTEXT_TOTAL_CHARS - usedFileContextChars - wrapperChars;
    const remainingTotalBudget = maxChars - result.length - wrapperChars;
    const maxContentChars = Math.min(
      MAX_FILE_CONTEXT_CHARS_PER_FILE,
      remainingContextBudget,
      remainingTotalBudget,
    );

    if (maxContentChars <= 0) {
      omittedCount = input.attachments.length - index;
      break;
    }

    const needsTruncation = entry.text.length > maxContentChars;
    // A truncated block must also leave room for the truncation note.
    const availableContentChars = needsTruncation
      ? Math.max(0, maxContentChars - truncationNote.length)
      : maxContentChars;
    if (availableContentChars <= 0) {
      omittedCount = input.attachments.length - index;
      break;
    }

    // Cut on a code point boundary: if the cut would fall between the two
    // halves of a surrogate pair, drop the high surrogate as well so the
    // output never contains a lone surrogate.
    let contentEnd = Math.min(entry.text.length, availableContentChars);
    if (contentEnd > 0 && contentEnd < entry.text.length) {
      const lastKeptUnit = entry.text.charCodeAt(contentEnd - 1);
      const firstDroppedUnit = entry.text.charCodeAt(contentEnd);
      const splitsSurrogatePair =
        lastKeptUnit >= 0xd800 &&
        lastKeptUnit <= 0xdbff &&
        firstDroppedUnit >= 0xdc00 &&
        firstDroppedUnit <= 0xdfff;
      if (splitsSurrogatePair) {
        contentEnd -= 1;
      }
    }

    const blockBody = entry.text.slice(0, contentEnd);
    const block = `${openBlock}${blockBody}${needsTruncation ? truncationNote : ""}${closeBlock}`;
    if (!append(block)) {
      omittedCount = input.attachments.length - index;
      break;
    }
    usedFileContextChars += block.length;
  }

  if (omittedCount > 0) {
    // Best effort: the note is dropped when it no longer fits.
    append(`\n\n[${omittedCount} attached file(s) omitted due to input size limits.]`);
  }

  return result;
}

apps/server/src/imageMime.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import Mime from "@effect/platform-node/Mime";
22

3+
const SAFE_ATTACHMENT_FILE_EXTENSION_PATTERN = /^[a-z0-9]{1,12}$/i;
4+
35
export const IMAGE_EXTENSION_BY_MIME_TYPE: Record<string, string> = {
46
"image/avif": ".avif",
57
"image/bmp": ".bmp",
@@ -29,6 +31,10 @@ export const SAFE_IMAGE_FILE_EXTENSIONS = new Set([
2931
".webp",
3032
]);
3133

34+
/**
 * Reports whether a MIME type belongs to the `image/` family.
 *
 * The comparison ignores surrounding whitespace and letter case.
 *
 * @param mimeType - MIME type string to classify.
 * @returns `true` when the normalised type begins with `image/`.
 */
export function isImageMimeType(mimeType: string): boolean {
  const normalized = mimeType.trim().toLowerCase();
  return normalized.slice(0, "image/".length) === "image/";
}
37+
3238
export function parseBase64DataUrl(
3339
dataUrl: string,
3440
): { readonly mimeType: string; readonly base64: string } | null {
@@ -77,3 +83,25 @@ export function inferImageExtension(input: { mimeType: string; fileName?: string
7783

7884
return ".bin";
7985
}
86+
87+
/**
 * Chooses a file extension (including the leading dot) for an attachment.
 *
 * Resolution order:
 * 1. image MIME types are delegated to `inferImageExtension`;
 * 2. the extension `Mime.getExtension` reports for the MIME type, when it
 *    passes `SAFE_ATTACHMENT_FILE_EXTENSION_PATTERN`;
 * 3. a trailing run of 1-12 ASCII letters/digits after the last dot of the
 *    trimmed file name, lower-cased;
 * 4. `".bin"` as the fallback.
 *
 * @param input - The attachment's MIME type and optional file name.
 * @returns The chosen extension, always starting with a dot.
 */
export function inferAttachmentExtension(input: { mimeType: string; fileName?: string }): string {
  if (isImageMimeType(input.mimeType)) {
    return inferImageExtension(input);
  }

  const fromMime = Mime.getExtension(input.mimeType);
  if (typeof fromMime === "string") {
    const withoutDot = fromMime.replace(/^\./, "");
    if (SAFE_ATTACHMENT_FILE_EXTENSION_PATTERN.test(withoutDot)) {
      return fromMime.startsWith(".") ? fromMime : `.${fromMime}`;
    }
  }

  const trimmedName = input.fileName?.trim() ?? "";
  const nameSuffix = /\.([a-z0-9]{1,12})$/i.exec(trimmedName)?.[1];
  if (nameSuffix) {
    return `.${nameSuffix.toLowerCase()}`;
  }

  return ".bin";
}

apps/server/src/orchestration/Layers/ProjectionPipeline.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,6 @@ function collectThreadAttachmentRelativePaths(
221221
const relativePaths = new Set<string>();
222222
for (const message of messages) {
223223
for (const attachment of message.attachments ?? []) {
224-
if (attachment.type !== "image") {
225-
continue;
226-
}
227224
const attachmentThreadSegment = parseThreadSegmentFromAttachmentId(attachment.id);
228225
if (!attachmentThreadSegment || attachmentThreadSegment !== threadSegment) {
229226
continue;

0 commit comments

Comments
 (0)