Skip to content

Commit cfcc94d

Browse files
ae2079 and claude committed
fix mixed-case DATA: bypass in img transform; broaden MIME test
CodeRabbit pointed out that sanitize-html lowercases the URL scheme before its allowedSchemes check, so `DATA:`/`dAtA:` URLs reach transformTags.img — and the previous case-sensitive `attribs.src.startsWith('data:')` guard would skip MIME validation entirely on those. A payload like `<img src="DATA:text/html,<script>…">` would have slipped past the image-MIME allowlist.

Make the data: detection case-insensitive (only the scheme prefix, not the full URL — the SAFE_DATA_IMAGE_URL regex already uses /i for the MIME match).

Add regression tests covering DATA:, dAtA:, Data: with non-image payloads (must be stripped) and DATA:image/PNG (must be preserved). Also extend the existing image-MIME preservation test to cover bmp and x-icon, which were already in SAFE_DATA_IMAGE_URL but not exercised.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 53d7d8e commit cfcc94d

2 files changed

Lines changed: 48 additions & 4 deletions

File tree

src/utils/htmlSanitizer.test.ts

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,14 +159,16 @@ describe('sanitizeProjectRichText', () => {
159159
assert.equal(once, twice);
160160
});
161161

162-
it('preserves legacy base64 image src on <img> (PNG, JPEG, GIF, WebP, SVG)', () => {
162+
it('preserves legacy base64 image src on <img> (every allowed MIME)', () => {
163163
const inputs = [
164164
'<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB">',
165165
'<img src="data:image/jpeg;base64,/9j/4AAQSkZJRgAB">',
166166
'<img src="data:image/jpg;base64,/9j/4AAQSkZJRgAB">',
167167
'<img src="data:image/gif;base64,R0lGODlhAQABAIAAAP">',
168168
'<img src="data:image/webp;base64,UklGRiQAAABXRUJQ">',
169169
'<img src="data:image/svg+xml;base64,PHN2ZyB4bWxucz0i">',
170+
'<img src="data:image/bmp;base64,Qk0=">',
171+
'<img src="data:image/x-icon;base64,AAABAA==">',
170172
];
171173
for (const input of inputs) {
172174
const out = sanitizeProjectRichText(input);
@@ -204,6 +206,42 @@ describe('sanitizeProjectRichText', () => {
204206
const out = sanitizeProjectRichText(input);
205207
assert.notInclude(out, 'data:');
206208
});
209+
210+
it('strips mixed-case DATA: URLs that try to bypass MIME validation', () => {
211+
// sanitize-html lowercases the scheme for the allowedSchemes check, so
212+
// `DATA:`/`dAtA:` URLs reach transformTags.img. A case-sensitive
213+
// startsWith('data:') guard would let non-image payloads through.
214+
const inputs = [
215+
'<img src="DATA:text/html,<script>alert(1)</script>">',
216+
'<img src="dAtA:application/javascript;base64,YWxlcnQoMSk=">',
217+
'<img src="Data:text/plain,hello">',
218+
];
219+
for (const input of inputs) {
220+
const out = sanitizeProjectRichText(input);
221+
assert.notInclude(
222+
out.toLowerCase(),
223+
'text/html',
224+
`should strip text/html data URL: ${input}`,
225+
);
226+
assert.notInclude(
227+
out.toLowerCase(),
228+
'application/javascript',
229+
`should strip application/javascript data URL: ${input}`,
230+
);
231+
assert.notInclude(
232+
out.toLowerCase(),
233+
'text/plain',
234+
`should strip text/plain data URL: ${input}`,
235+
);
236+
}
237+
});
238+
239+
it('preserves mixed-case DATA: URLs when MIME is a valid image', () => {
240+
const input =
241+
'<img src="DATA:image/PNG;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB">';
242+
const out = sanitizeProjectRichText(input);
243+
assert.match(out, /data:image\/png/i);
244+
});
207245
});
208246

209247
describe('getRichTextPlainLength', () => {

src/utils/htmlSanitizer.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,11 +133,17 @@ export const sanitizeProjectRichText = (html: string = ''): string => {
133133
// only let through data URLs that declare an image MIME type. This
134134
// blocks data:text/html or data:application/javascript payloads that
135135
// could otherwise piggyback on the data: allowance.
136+
//
137+
// sanitize-html itself lowercases the scheme for the allowedSchemes
138+
// check (which lets `DATA:`/`Data:` pass scheme filtering), so the
139+
// `data:`-prefix detection here has to be case-insensitive too —
140+
// otherwise a mixed-case `DATA:text/html,<script>…</script>` would
141+
// skip MIME validation entirely.
136142
img: (tagName, attribs) => {
143+
const src = attribs.src ?? '';
137144
if (
138-
attribs.src &&
139-
attribs.src.startsWith('data:') &&
140-
!SAFE_DATA_IMAGE_URL.test(attribs.src)
145+
src.slice(0, 5).toLowerCase() === 'data:' &&
146+
!SAFE_DATA_IMAGE_URL.test(src)
141147
) {
142148
delete attribs.src;
143149
}

0 commit comments

Comments
 (0)