Skip to content

Commit fae419d

Browse files
committed
take out media stripping to its own file
1 parent fac656c commit fae419d

2 files changed

Lines changed: 142 additions & 136 deletions

File tree

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/**
 * A message content part that carries media inline, i.e. as a base64 blob or
 * a `data:` URI that may be very large.
 *
 * Every variant is also a `Record<string, unknown>`, so any additional keys
 * on the part are allowed and are typed as `unknown`.
 */
export type ContentMedia = Record<string, unknown> &
  (
    // Raw payload accompanied by its media type.
    | { media_type: string; data: string }
    // `image_url` given directly as a `data:` URI string.
    | { image_url: `data:${string}` }
    // OpenAI vision format: { image_url: { url: "data:..." } }
    | { image_url: { url: `data:${string}` } }
    // Part tagged as a blob / base64 payload, with the payload in `content`.
    | { type: 'blob' | 'base64'; content: string }
    // Base64 payload stored under `b64_json`.
    | { b64_json: string }
    // `data:` URI stored under `uri`.
    | { uri: `data:${string}` }
    // OpenAI audio format: { type: "input_audio", input_audio: { data: "..." } }
    | { type: 'input_audio'; input_audio: { data: string } }
    // OpenAI file format: { type: "file", file: { file_data: "..." } }
    | { type: 'file'; file: { file_data?: string } }
  );
36+
37+
/**
 * Type guard: decides whether a content part carries inline media.
 *
 * Anything that is not a non-null object is rejected. An object is accepted
 * as soon as it matches one of the recognised shapes checked below.
 *
 * @param part - Arbitrary value taken from a message's content.
 * @returns `true` when `part` matches one of the inline-media shapes.
 */
export function isContentMedia(part: unknown): part is ContentMedia {
  if (typeof part !== 'object' || part === null) return false;

  // Shapes whose payload lives in a nested object; each has a dedicated helper.
  if (isContentMediaSource(part)) return true;
  if (hasInlineData(part)) return true;
  if (hasImageUrl(part)) return true;
  if (hasInputAudio(part)) return true;
  if (hasFileData(part)) return true;

  // `media_type` must be a string; `data` only has to be present.
  if ('media_type' in part && typeof part.media_type === 'string' && 'data' in part) return true;

  if ('type' in part) {
    // Tagged blob / base64 parts are accepted on the tag alone.
    if (part.type === 'blob' || part.type === 'base64') return true;
    // Image generation parts are accepted when a `result` key is present.
    if (part.type === 'image_generation' && 'result' in part) return true;
  }

  // Presence of the key is sufficient here; its value is not inspected.
  if ('b64_json' in part) return true;

  // A `uri` only counts when it is a `data:` URI.
  return 'uri' in part && typeof part.uri === 'string' && part.uri.startsWith('data:');
}
56+
57+
/**
 * Reports whether `part.image_url` holds a `data:` URI, either directly as a
 * string or nested as `{ url: "data:..." }`.
 *
 * @param part - Non-nullish content part (callers pass objects).
 * @returns `true` only when an inline `data:` image URL is found.
 */
function hasImageUrl(part: NonNullable<unknown>): boolean {
  if ('image_url' in part) {
    const imageUrl = part.image_url;
    // String form is checked here; every other form is delegated to the nested check.
    return typeof imageUrl === 'string' ? imageUrl.startsWith('data:') : hasNestedImageUrl(part);
  }
  return false;
}
62+
63+
/**
 * Type guard for the nested image form `{ image_url: { url: "data:..." } }`.
 *
 * @param part - Non-nullish content part (callers pass objects).
 * @returns `true` when `image_url` is a non-null object whose `url` is a
 *   string starting with `data:`.
 */
function hasNestedImageUrl(part: NonNullable<unknown>): part is { image_url: { url: string } } {
  if (!('image_url' in part)) return false;

  const imageUrl = part.image_url;
  if (!imageUrl || typeof imageUrl !== 'object') return false;
  if (!('url' in imageUrl)) return false;

  const url = imageUrl.url;
  return typeof url === 'string' && url.startsWith('data:');
}
73+
74+
/**
 * Reports whether `part` is a typed wrapper around inline media: it has a
 * string `type` and a `source` that itself passes `isContentMedia`.
 *
 * @param part - Non-nullish content part (callers pass objects).
 * @returns `true` when the wrapped `source` is inline media.
 */
function isContentMediaSource(part: NonNullable<unknown>): boolean {
  if (!('type' in part) || typeof part.type !== 'string') return false;
  return 'source' in part && isContentMedia(part.source);
}
77+
78+
/**
 * Type guard for Google GenAI inline data blob objects:
 * `{ inlineData: { data: "..." } }`.
 *
 * @param part - Non-nullish content part (callers pass objects).
 * @returns `true` when `inlineData` is a non-null object with a string `data`.
 */
function hasInlineData(part: NonNullable<unknown>): part is { inlineData: { data?: string } } {
  if (!('inlineData' in part)) return false;

  const inlineData = part.inlineData;
  if (!inlineData || typeof inlineData !== 'object') return false;

  return 'data' in inlineData && typeof inlineData.data === 'string';
}
87+
88+
/**
 * Type guard for the OpenAI audio format:
 * `{ type: "input_audio", input_audio: { data: "...", format: "wav" } }`.
 *
 * @param part - Non-nullish content part (callers pass objects).
 * @returns `true` when `type` is `'input_audio'` and `input_audio` is a
 *   non-null object with a string `data`.
 */
function hasInputAudio(part: NonNullable<unknown>): part is { type: 'input_audio'; input_audio: { data: string } } {
  if (!('type' in part) || part.type !== 'input_audio') return false;
  if (!('input_audio' in part)) return false;

  const audio = part.input_audio;
  if (!audio || typeof audio !== 'object') return false;

  return 'data' in audio && typeof audio.data === 'string';
}
99+
100+
/**
 * Type guard for the OpenAI file format:
 * `{ type: "file", file: { file_data: "...", filename: "..." } }`.
 *
 * @param part - Non-nullish content part (callers pass objects).
 * @returns `true` when `type` is `'file'` and `file` is a non-null object
 *   with a string `file_data`.
 */
function hasFileData(part: NonNullable<unknown>): part is { type: 'file'; file: { file_data: string } } {
  if (!('type' in part) || part.type !== 'file') return false;
  if (!('file' in part)) return false;

  const file = part.file;
  if (!file || typeof file !== 'object') return false;

  return 'file_data' in file && typeof file.file_data === 'string';
}
111+
112+
/** Placeholder written in place of stripped inline media payloads. */
const REMOVED_STRING = '[Blob substitute]';

/**
 * Top-level keys of a media part whose value is replaced with the
 * placeholder when that value is a string.
 */
const MEDIA_FIELDS = [
  'image_url',
  'data',
  'content',
  'b64_json',
  'result',
  'uri',
] as const;
115+
116+
/**
 * Returns a copy of a media content part in which inline binary payloads are
 * replaced by a placeholder string.
 *
 * The top level of `part` is shallow-copied, and each nested object that gets
 * rewritten is copied as well, so the input is not mutated. Keys that do not
 * hold a payload are carried over unchanged.
 *
 * @param part - A content part already identified as inline media.
 * @returns The copied part with its payloads substituted.
 */
export function stripInlineMediaFromSingleMessage(part: ContentMedia): ContentMedia {
  const sanitized = { ...part };

  // Wrapped media: strip the nested `source` recursively.
  const nestedSource = sanitized.source;
  if (isContentMedia(nestedSource)) {
    sanitized.source = stripInlineMediaFromSingleMessage(nestedSource);
  }

  // Google GenAI inline data blob objects.
  if (hasInlineData(part)) {
    sanitized.inlineData = { ...part.inlineData, data: REMOVED_STRING };
  }

  // OpenAI vision format: { image_url: { url: "data:..." } }
  if (hasNestedImageUrl(part)) {
    sanitized.image_url = { ...part.image_url, url: REMOVED_STRING };
  }

  // OpenAI audio format: { type: "input_audio", input_audio: { data: "...", format: "wav" } }
  if (hasInputAudio(part)) {
    sanitized.input_audio = { ...part.input_audio, data: REMOVED_STRING };
  }

  // OpenAI file format: { type: "file", file: { file_data: "...", filename: "..." } }
  if (hasFileData(part)) {
    sanitized.file = { ...part.file, file_data: REMOVED_STRING };
  }

  // Finally, blank out any top-level payload field that holds a string.
  for (const key of MEDIA_FIELDS) {
    if (typeof sanitized[key] === 'string') {
      sanitized[key] = REMOVED_STRING;
    }
  }

  return sanitized;
}

packages/core/src/tracing/ai/messageTruncation.ts

Lines changed: 2 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import { isContentMedia, stripInlineMediaFromSingleMessage } from './mediaStripping';
2+
13
/**
24
* Default maximum size in bytes for GenAI messages.
35
* Messages exceeding this limit will be truncated.
@@ -23,42 +25,6 @@ type ContentArrayMessage = {
2325
}[];
2426
};
2527

26-
/**
27-
* Inline media content source, with a potentially very large base64
28-
* blob or data: uri.
29-
*/
30-
type ContentMedia = Record<string, unknown> &
31-
(
32-
| {
33-
media_type: string;
34-
data: string;
35-
}
36-
| {
37-
image_url: `data:${string}`;
38-
}
39-
| {
40-
image_url: { url: `data:${string}` };
41-
}
42-
| {
43-
type: 'blob' | 'base64';
44-
content: string;
45-
}
46-
| {
47-
b64_json: string;
48-
}
49-
| {
50-
uri: `data:${string}`;
51-
}
52-
| {
53-
type: 'input_audio';
54-
input_audio: { data: string };
55-
}
56-
| {
57-
type: 'file';
58-
file: { file_data?: string };
59-
}
60-
);
61-
6228
/**
6329
* Message format used by Google GenAI API.
6430
* Parts can be strings or objects with a text property.
@@ -175,75 +141,6 @@ function isContentArrayMessage(message: unknown): message is ContentArrayMessage
175141
return message !== null && typeof message === 'object' && 'content' in message && Array.isArray(message.content);
176142
}
177143

178-
/**
179-
* Check if a content part is an OpenAI/Anthropic media source
180-
*/
181-
function isContentMedia(part: unknown): part is ContentMedia {
182-
if (!part || typeof part !== 'object') return false;
183-
184-
return (
185-
isContentMediaSource(part) ||
186-
hasInlineData(part) ||
187-
hasImageUrl(part) ||
188-
hasInputAudio(part) ||
189-
hasFileData(part) ||
190-
('media_type' in part && typeof part.media_type === 'string' && 'data' in part) ||
191-
('type' in part && (part.type === 'blob' || part.type === 'base64')) ||
192-
'b64_json' in part ||
193-
('type' in part && 'result' in part && part.type === 'image_generation') ||
194-
('uri' in part && typeof part.uri === 'string' && part.uri.startsWith('data:'))
195-
);
196-
}
197-
function hasImageUrl(part: NonNullable<unknown>): boolean {
198-
if (!('image_url' in part)) return false;
199-
if (typeof part.image_url === 'string') return part.image_url.startsWith('data:');
200-
return hasNestedImageUrl(part);
201-
}
202-
function hasNestedImageUrl(part: NonNullable<unknown>): part is { image_url: { url: string } } {
203-
return (
204-
'image_url' in part &&
205-
!!part.image_url &&
206-
typeof part.image_url === 'object' &&
207-
'url' in part.image_url &&
208-
typeof part.image_url.url === 'string' &&
209-
part.image_url.url.startsWith('data:')
210-
);
211-
}
212-
function isContentMediaSource(part: NonNullable<unknown>): boolean {
213-
return 'type' in part && typeof part.type === 'string' && 'source' in part && isContentMedia(part.source);
214-
}
215-
function hasInlineData(part: NonNullable<unknown>): part is { inlineData: { data?: string } } {
216-
return (
217-
'inlineData' in part &&
218-
!!part.inlineData &&
219-
typeof part.inlineData === 'object' &&
220-
'data' in part.inlineData &&
221-
typeof part.inlineData.data === 'string'
222-
);
223-
}
224-
function hasInputAudio(part: NonNullable<unknown>): part is { type: 'input_audio'; input_audio: { data: string } } {
225-
return (
226-
'type' in part &&
227-
part.type === 'input_audio' &&
228-
'input_audio' in part &&
229-
!!part.input_audio &&
230-
typeof part.input_audio === 'object' &&
231-
'data' in part.input_audio &&
232-
typeof part.input_audio.data === 'string'
233-
);
234-
}
235-
function hasFileData(part: NonNullable<unknown>): part is { type: 'file'; file: { file_data: string } } {
236-
return (
237-
'type' in part &&
238-
part.type === 'file' &&
239-
'file' in part &&
240-
!!part.file &&
241-
typeof part.file === 'object' &&
242-
'file_data' in part.file &&
243-
typeof part.file.file_data === 'string'
244-
);
245-
}
246-
247144
/**
248145
* Check if a message has the Google GenAI parts format.
249146
*/
@@ -368,37 +265,6 @@ function truncateSingleMessage(message: unknown, maxBytes: number): unknown[] {
368265
return [];
369266
}
370267

371-
const REMOVED_STRING = '[Blob substitute]';
372-
373-
const MEDIA_FIELDS = ['image_url', 'data', 'content', 'b64_json', 'result', 'uri'] as const;
374-
375-
function stripInlineMediaFromSingleMessage(part: ContentMedia): ContentMedia {
376-
const strip = { ...part };
377-
if (isContentMedia(strip.source)) {
378-
strip.source = stripInlineMediaFromSingleMessage(strip.source);
379-
}
380-
// google genai inline data blob objects
381-
if (hasInlineData(part)) {
382-
strip.inlineData = { ...part.inlineData, data: REMOVED_STRING };
383-
}
384-
// OpenAI vision format: { image_url: { url: "data:..." } }
385-
if (hasNestedImageUrl(part)) {
386-
strip.image_url = { ...part.image_url, url: REMOVED_STRING };
387-
}
388-
// OpenAI audio format: { type: "input_audio", input_audio: { data: "...", format: "wav" } }
389-
if (hasInputAudio(part)) {
390-
strip.input_audio = { ...part.input_audio, data: REMOVED_STRING };
391-
}
392-
// OpenAI file format: { type: "file", file: { file_data: "...", filename: "..." } }
393-
if (hasFileData(part)) {
394-
strip.file = { ...part.file, file_data: REMOVED_STRING };
395-
}
396-
for (const field of MEDIA_FIELDS) {
397-
if (typeof strip[field] === 'string') strip[field] = REMOVED_STRING;
398-
}
399-
return strip;
400-
}
401-
402268
/**
403269
* Strip the inline media from message arrays.
404270
*

0 commit comments

Comments
 (0)