fix mixed-case DATA: bypass in img transform; broaden MIME test

ae2079 · claude · ae2079 · commit cfcc94d4819a · 2026-05-24T22:58:33.000+03:30
CodeRabbit pointed out that sanitize-html lowercases the URL scheme
before its allowedSchemes check, so `DATA:`/`dAtA:` URLs reach
transformTags.img — and the previous case-sensitive
`attribs.src.startsWith('data:')` guard would skip MIME validation
entirely on those. A payload like `<img src="DATA:text/html,<script>…">`
would have slipped past the image-MIME allowlist.

Make the data: detection case-insensitive (only the scheme prefix, not
the full URL — the SAFE_DATA_IMAGE_URL regex already uses /i for the
MIME match). Add regression tests covering DATA:, dAtA:, Data: with
non-image payloads (must be stripped) and DATA:image/PNG (must be
preserved).

Also extend the existing image-MIME preservation test to cover bmp and
x-icon, which were already in SAFE_DATA_IMAGE_URL but not exercised.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/utils/htmlSanitizer.test.ts b/src/utils/htmlSanitizer.test.ts
@@ -159,14 +159,16 @@ describe('sanitizeProjectRichText', () => {
     assert.equal(once, twice);
   });
 
-  it('preserves legacy base64 image src on <img> (PNG, JPEG, GIF, WebP, SVG)', () => {
+  it('preserves legacy base64 image src on <img> (every allowed MIME)', () => {
     const inputs = [
       '<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB">',
       '<img src="data:image/jpeg;base64,/9j/4AAQSkZJRgAB">',
       '<img src="data:image/jpg;base64,/9j/4AAQSkZJRgAB">',
       '<img src="data:image/gif;base64,R0lGODlhAQABAIAAAP">',
       '<img src="data:image/webp;base64,UklGRiQAAABXRUJQ">',
       '<img src="data:image/svg+xml;base64,PHN2ZyB4bWxucz0i">',
+      '<img src="data:image/bmp;base64,Qk0=">',
+      '<img src="data:image/x-icon;base64,AAABAA==">',
     ];
     for (const input of inputs) {
       const out = sanitizeProjectRichText(input);
@@ -204,6 +206,42 @@ describe('sanitizeProjectRichText', () => {
     const out = sanitizeProjectRichText(input);
     assert.notInclude(out, 'data:');
   });
+
+  it('strips mixed-case DATA: URLs that try to bypass MIME validation', () => {
+    // sanitize-html lowercases scheme for the allowedSchemes check, so
+    // `DATA:`/`dAtA:` URLs reach transformTags.img. A case-sensitive
+    // startsWith('data:') guard would let non-image payloads through.
+    const inputs = [
+      '<img src="DATA:text/html,<script>alert(1)</script>">',
+      '<img src="dAtA:application/javascript;base64,YWxlcnQoMSk=">',
+      '<img src="Data:text/plain,hello">',
+    ];
+    for (const input of inputs) {
+      const out = sanitizeProjectRichText(input);
+      assert.notInclude(
+        out.toLowerCase(),
+        'text/html',
+        `should strip text/html data URL: ${input}`,
+      );
+      assert.notInclude(
+        out.toLowerCase(),
+        'application/javascript',
+        `should strip application/javascript data URL: ${input}`,
+      );
+      assert.notInclude(
+        out.toLowerCase(),
+        'text/plain',
+        `should strip text/plain data URL: ${input}`,
+      );
+    }
+  });
+
+  it('preserves mixed-case DATA: URLs when MIME is a valid image', () => {
+    const input =
+      '<img src="DATA:image/PNG;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB">';
+    const out = sanitizeProjectRichText(input);
+    assert.match(out, /data:image\/png/i);
+  });
 });
 
 describe('getRichTextPlainLength', () => {
diff --git a/src/utils/htmlSanitizer.ts b/src/utils/htmlSanitizer.ts
@@ -133,11 +133,17 @@ export const sanitizeProjectRichText = (html: string = ''): string => {
       // only let through data URLs that declare an image MIME type. This
       // blocks data:text/html or data:application/javascript payloads that
       // could otherwise piggyback on the data: allowance.
+      //
+      // sanitize-html itself lowercases the scheme for the allowedSchemes
+      // check (so `DATA:`/`Data:` pass scheme filtering), so the
+      // `data:`-prefix detection here has to be case-insensitive too —
+      // otherwise a mixed-case `DATA:text/html,<script>…</script>` would
+      // skip MIME validation entirely.
       img: (tagName, attribs) => {
+        const src = attribs.src ?? '';
         if (
-          attribs.src &&
-          attribs.src.startsWith('data:') &&
-          !SAFE_DATA_IMAGE_URL.test(attribs.src)
+          src.slice(0, 5).toLowerCase() === 'data:' &&
+          !SAFE_DATA_IMAGE_URL.test(src)
         ) {
           delete attribs.src;
         }