|
| 1 | +import { |
| 2 | + PROVIDER_SEND_TURN_MAX_INPUT_CHARS, |
| 3 | + type ChatFileAttachment, |
| 4 | +} from "@okcode/contracts"; |
| 5 | + |
/** Upper bound on the characters all attached-file blocks may add to one message. */
const MAX_FILE_CONTEXT_TOTAL_CHARS = 80_000;

/** Upper bound on the content characters taken from any single attached file. */
const MAX_FILE_CONTEXT_CHARS_PER_FILE = 24_000;

/**
 * Shared UTF-8 decoder. With `fatal: false`, malformed byte sequences decode
 * to U+FFFD instead of throwing.
 */
const TEXT_DECODER = new TextDecoder("utf-8", { fatal: false });

/** MIME type fragments that mark a non-`text/*` type as textual. */
const TEXTUAL_MIME_SUBSTRINGS: readonly string[] = [
  "json", "xml", "yaml", "toml",
  "javascript", "typescript", "markdown", "csv",
  "graphql", "sql", "x-sh", "x-shellscript",
];

/** Lower-case file extensions (without the dot) treated as textual. */
const TEXTUAL_FILE_EXTENSIONS: ReadonlySet<string> = new Set<string>([
  "c", "cc", "cfg", "conf", "cpp", "cs", "css", "csv",
  "env", "go", "graphql", "h", "hpp", "html", "ini", "java",
  "js", "json", "jsx", "kt", "log", "lua", "md", "mjs",
  "php", "pl", "py", "rb", "rs", "scss", "sh", "sql",
  "svg", "swift", "toml", "ts", "tsx", "txt", "vue", "xml",
  "yaml", "yml", "zsh",
]);
| 68 | + |
/**
 * Extracts the extension of a file name.
 *
 * @param fileName File name; surrounding whitespace is ignored.
 * @returns The lower-cased extension without the leading dot, or `""` when the
 *   trimmed name does not end in a dot followed by 1–12 ASCII letters/digits.
 */
function attachmentExtension(fileName: string): string {
  const trimmedName = fileName.trim();
  const found = trimmedName.match(/\.([a-z0-9]{1,12})$/i);
  if (found === null) {
    return "";
  }
  const extension = found[1];
  return extension === undefined ? "" : extension.toLowerCase();
}
| 73 | + |
/**
 * Reports whether a MIME type denotes textual content.
 *
 * The value is trimmed and lower-cased, then accepted when it starts with
 * `text/` or contains any entry of `TEXTUAL_MIME_SUBSTRINGS`.
 */
function looksTextLikeMimeType(mimeType: string): boolean {
  const candidate = mimeType.trim().toLowerCase();
  return (
    candidate.startsWith("text/") ||
    TEXTUAL_MIME_SUBSTRINGS.some((fragment) => candidate.includes(fragment))
  );
}
| 81 | + |
/**
 * Reports whether the extension of `fileName` is listed in
 * `TEXTUAL_FILE_EXTENSIONS`.
 */
function looksTextLikeFileName(fileName: string): boolean {
  const extension = attachmentExtension(fileName);
  return TEXTUAL_FILE_EXTENSIONS.has(extension);
}
| 85 | + |
/**
 * Heuristic check for binary-looking content in decoded text.
 *
 * Walks the string one UTF-16 code unit at a time. A NUL unit is an immediate
 * positive. Otherwise units below 32 other than tab, line feed and carriage
 * return are counted as control units and everything else as ordinary units.
 *
 * @returns `true` when a NUL is present, when the text consists solely of
 *   control units, or when control units exceed 2% of the ordinary units.
 */
function hasSuspiciousControlBytes(text: string): boolean {
  const TAB = 9;
  const LINE_FEED = 10;
  const CARRIAGE_RETURN = 13;

  let controlUnits = 0;
  let ordinaryUnits = 0;
  let position = 0;
  while (position < text.length) {
    const unit = text.charCodeAt(position);
    position += 1;
    if (unit === 0) {
      return true;
    }
    const isAllowedWhitespace = unit === TAB || unit === LINE_FEED || unit === CARRIAGE_RETURN;
    if (unit >= 32 || isAllowedWhitespace) {
      ordinaryUnits += 1;
    } else {
      controlUnits += 1;
    }
  }

  if (ordinaryUnits === 0) {
    return controlUnits > 0;
  }
  return controlUnits / ordinaryUnits > 0.02;
}
| 105 | + |
/**
 * Decodes attachment bytes as UTF-8 and returns the text when it looks like
 * genuine text.
 *
 * @param input.mimeType Declared MIME type of the attachment.
 * @param input.fileName Attachment file name, used for its extension.
 * @param input.bytes Raw attachment bytes.
 * @returns `""` for zero-length input; `null` when the decoded content is
 *   rejected; otherwise the decoded text with `\r\n` and lone `\r` rewritten
 *   to `\n`.
 */
export function extractTextAttachmentContents(input: {
  readonly mimeType: string;
  readonly fileName: string;
  readonly bytes: Uint8Array;
}): string | null {
  const { bytes, mimeType, fileName } = input;
  if (bytes.byteLength === 0) {
    return "";
  }

  const text = TEXT_DECODER.decode(bytes);
  if (hasSuspiciousControlBytes(text)) {
    return null;
  }

  // Each U+FFFD marks a byte sequence the decoder could not interpret.
  let replacementChars = 0;
  for (let i = 0; i < text.length; i += 1) {
    if (text.charCodeAt(i) === 0xfffd) {
      replacementChars += 1;
    }
  }

  // Content announced as text tolerates more decoding damage than unknown content.
  const isExpectedText = looksTextLikeMimeType(mimeType) || looksTextLikeFileName(fileName);
  const allowedRatio = isExpectedText ? 0.02 : 0.005;
  if (replacementChars / Math.max(text.length, 1) > allowedRatio) {
    return null;
  }

  // Whitespace-only content is only accepted when the attachment claims to be text.
  if (!isExpectedText && text.trim().length === 0) {
    return null;
  }

  return text.replace(/\r\n?/g, "\n");
}
| 130 | + |
/**
 * Appends the text of file attachments to a message as `<attached_file>` blocks.
 *
 * Each block carries the attachment's name, MIME type and size followed by its
 * content. Content is capped per file by `MAX_FILE_CONTEXT_CHARS_PER_FILE`, all
 * blocks together by `MAX_FILE_CONTEXT_TOTAL_CHARS`, and the whole result by
 * `maxChars`. Truncated content ends with a truncation note. As soon as one
 * attachment cannot be included, it and every later attachment are counted as
 * omitted and a summary note is appended if it still fits.
 *
 * `baseText` itself is never shortened: when even the header does not fit,
 * `baseText` is returned unchanged.
 *
 * @param input.baseText Message text the file context is appended to.
 * @param input.attachments Attachments paired with their extracted text.
 * @param input.maxChars Overall character limit for the result; defaults to
 *   `PROVIDER_SEND_TURN_MAX_INPUT_CHARS`. Floored and clamped to at least 1.
 * @returns `baseText`, followed by as much attachment context as the limits allow.
 */
export function buildFileAttachmentContextText(input: {
  readonly baseText: string;
  readonly attachments: ReadonlyArray<{
    readonly attachment: ChatFileAttachment;
    readonly text: string;
  }>;
  readonly maxChars?: number;
}): string {
  if (input.attachments.length === 0) {
    return input.baseText;
  }

  const maxChars = Math.max(
    1,
    Math.floor(input.maxChars ?? PROVIDER_SEND_TURN_MAX_INPUT_CHARS),
  );
  const closeBlock = "\n</attached_file>";
  const truncationNote = "\n[content truncated to fit input limits]";
  let result = input.baseText;
  let usedFileContextChars = 0;
  let omittedCount = 0;

  // Adds `chunk` only when the whole chunk fits within `maxChars`.
  const append = (chunk: string): boolean => {
    if (chunk.length === 0) {
      return true;
    }
    if (result.length + chunk.length > maxChars) {
      return false;
    }
    result += chunk;
    return true;
  };

  const header = `${result.length > 0 ? "\n\n" : ""}Attached file context:`;
  if (!append(header)) {
    return result;
  }

  for (const [index, entry] of input.attachments.entries()) {
    const openBlock =
      "\n\n<attached_file>\n" +
      `name: ${entry.attachment.name}\n` +
      `mime_type: ${entry.attachment.mimeType}\n` +
      `size_bytes: ${entry.attachment.sizeBytes}\n` +
      "content:\n";
    // The wrapper text counts against both budgets, so reserve it up front.
    const remainingContextBudget =
      MAX_FILE_CONTEXT_TOTAL_CHARS - usedFileContextChars - openBlock.length - closeBlock.length;
    const remainingTotalBudget = maxChars - result.length - openBlock.length - closeBlock.length;
    const maxContentChars = Math.min(
      MAX_FILE_CONTEXT_CHARS_PER_FILE,
      remainingContextBudget,
      remainingTotalBudget,
    );

    if (maxContentChars <= 0) {
      omittedCount = input.attachments.length - index;
      break;
    }

    const needsTruncation = entry.text.length > maxContentChars;
    const availableContentChars = needsTruncation
      ? Math.max(0, maxContentChars - truncationNote.length)
      : maxContentChars;
    if (availableContentChars <= 0) {
      omittedCount = input.attachments.length - index;
      break;
    }

    let blockBody = entry.text.slice(0, availableContentChars);
    if (needsTruncation) {
      // `slice` cuts on UTF-16 code units; if the cut landed between the two
      // halves of a surrogate pair, drop the dangling high surrogate so the
      // output stays well-formed.
      const lastUnit = blockBody.charCodeAt(blockBody.length - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        blockBody = blockBody.slice(0, -1);
      }
    }

    const block = `${openBlock}${blockBody}${needsTruncation ? truncationNote : ""}${closeBlock}`;
    if (!append(block)) {
      omittedCount = input.attachments.length - index;
      break;
    }
    usedFileContextChars += block.length;
  }

  if (omittedCount > 0) {
    // Best effort: the note is dropped when it no longer fits.
    append(`\n\n[${omittedCount} attached file(s) omitted due to input size limits.]`);
  }

  return result;
}
0 commit comments