fix: imports encoded in utf-16 break DocxZipper

harbournick · harbournick · commit 9bc488d40430 · 2025-09-02T21:42:17.000-07:00
diff --git a/packages/super-editor/src/core/DocxZipper.js b/packages/super-editor/src/core/DocxZipper.js
@@ -1,6 +1,7 @@
 import xmljs from 'xml-js';
 import JSZip from 'jszip';
 import { getContentTypesFromXml } from './super-converter/helpers.js';
+import { ensureXmlString, isXmlLike } from './encoding-helpers.js';
 
 /**
  * Class to handle unzipping and zipping of docx files
@@ -37,42 +38,37 @@ class DocxZipper {
     const extractedFiles = await this.unzip(file);
     const files = Object.entries(extractedFiles.files);
 
-    const mediaObjects = {};
-    const validTypes = ['xml', 'rels'];
-    for (const file of files) {
-      const [, zipEntry] = file;
-
-      if (validTypes.some((validType) => zipEntry.name.endsWith(validType))) {
-        const content = await zipEntry.async('string');
-        this.files.push({
-          name: zipEntry.name,
-          content,
-        });
+    for (const [, zipEntry] of files) {
+      const name = zipEntry.name;
+
+      if (isXmlLike(name)) {
+        // Read raw bytes and decode (handles UTF-8 & UTF-16)
+        const u8 = await zipEntry.async('uint8array');
+        const content = ensureXmlString(u8);
+        this.files.push({ name, content });
       } else if (
-        (zipEntry.name.startsWith('word/media') && zipEntry.name !== 'word/media/') ||
-        (zipEntry.name.startsWith('media') && zipEntry.name !== 'media/')
+        (name.startsWith('word/media') && name !== 'word/media/') ||
+        (name.startsWith('media') && name !== 'media/')
       ) {
-        // If we are in node, we need to convert the buffer to base64
+        // Media files
         if (isNode) {
           const buffer = await zipEntry.async('nodebuffer');
           const fileBase64 = buffer.toString('base64');
-          this.mediaFiles[zipEntry.name] = fileBase64;
-        }
-
-        // If we are in the browser, we can use the base64 directly
-        else {
+          this.mediaFiles[name] = fileBase64;
+        } else {
           const blob = await zipEntry.async('blob');
-          const extension = this.getFileExtension(zipEntry.name);
+          const extension = this.getFileExtension(name);
           const fileBase64 = await zipEntry.async('base64');
-          this.mediaFiles[zipEntry.name] = `data:image/${extension};base64,${fileBase64}`;
+          this.mediaFiles[name] = `data:image/${extension};base64,${fileBase64}`;
 
-          const file = new File([blob], zipEntry.name, { type: blob.type });
-          const imageUrl = URL.createObjectURL(file);
-          this.media[zipEntry.name] = imageUrl;
+          const fileObj = new File([blob], name, { type: blob.type });
+          const imageUrl = URL.createObjectURL(fileObj);
+          this.media[name] = imageUrl;
         }
-      } else if (zipEntry.name.startsWith('word/fonts') && zipEntry.name !== 'word/fonts/') {
+      } else if (name.startsWith('word/fonts') && name !== 'word/fonts/') {
+        // Font files
         const uint8array = await zipEntry.async('uint8array');
-        this.fonts[zipEntry.name] = uint8array;
+        this.fonts[name] = uint8array;
       }
     }
 
diff --git a/packages/super-editor/src/core/DocxZipper.test.js b/packages/super-editor/src/core/DocxZipper.test.js
@@ -2,6 +2,7 @@ import path from 'path';
 import fs from 'fs';
 import { describe, it, expect, beforeEach } from 'vitest';
 import DocxZipper from './DocxZipper';
+import JSZip from 'jszip';
 
 async function readFileAsBuffer(filePath) {
   const resolvedPath = path.resolve(__dirname, filePath);
@@ -48,3 +49,62 @@ describe('DocxZipper - file extraction', () => {
     expect(documentXml).toBeTruthy();
   });
 });
+
+// Encode `str` as UTF-16LE bytes and prefix the FF FE byte-order mark
+function utf16leWithBOM(str) {
+  const encoded = Buffer.from(str, 'utf16le');
+  const marker = Buffer.from([0xff, 0xfe]);
+  return Buffer.concat([marker, encoded]);
+}
+
+describe('DocxZipper - UTF-16 XML handling', () => {
+  let zipper;
+  beforeEach(() => {
+    zipper = new DocxZipper();
+  });
+
+  it('decodes a UTF-16LE customXml part correctly (was failing before fix)', async () => {
+    const zip = new JSZip();
+
+    // Minimal [Content_Types].xml so the archive looks like a docx package
+    const contentTypes = `<?xml version="1.0" encoding="UTF-8"?>
+      <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+        <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+        <Default Extension="xml" ContentType="application/xml"/>
+        <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+      </Types>`;
+    zip.file('[Content_Types].xml', contentTypes);
+
+    // A basic UTF-8 document.xml so there's at least one normal XML entry
+    const documentXml = `<?xml version="1.0" encoding="UTF-8"?>
+      <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+        <w:body><w:p><w:r><w:t>Hello</w:t></w:r></w:p></w:body>
+      </w:document>`;
+    zip.file('word/document.xml', documentXml);
+
+    // The problematic customXml item: declares utf-16 and is stored as UTF-16LE bytes behind a BOM
+    const customXmlUtf16 = `<?xml version="1.0" encoding="utf-16"?>
+<properties xmlns="http://www.imanage.com/work/xmlschema">
+  <documentid>TELEKOM!4176814.1</documentid>
+  <senderid>A675398</senderid>
+  <senderemail>GUDRUN.JORDAN@TELEKOM.DE</senderemail>
+  <lastmodified>2023-07-06T15:09:00.0000000+02:00</lastmodified>
+  <database>TELEKOM</database>
+</properties>`;
+    zip.file('customXml/item2.xml', utf16leWithBOM(customXmlUtf16));
+
+    // Generate the zip as a Node buffer and feed it to the zipper
+    const buf = await zip.generateAsync({ type: 'nodebuffer' });
+    const files = await zipper.getDocxData(buf /* isNode not needed for XML */);
+
+    // Look up the customXml item among the extracted { name, content } entries
+    const item2 = files.find((f) => f.name === 'customXml/item2.xml');
+    expect(item2).toBeTruthy();
+
+    // With byte-level decoding in place, content is a clean JS string:
+    expect(item2.content).toContain('<?xml'); // prolog present
+    expect(item2.content).toContain('<properties'); // real tag (no NULs interleaved)
+    expect(item2.content).not.toMatch(/\u0000/); // no embedded NULs
+    expect(item2.content.toLowerCase()).toContain('encoding="utf-16"');
+  });
+});
diff --git a/packages/super-editor/src/core/encoding-helpers.js b/packages/super-editor/src/core/encoding-helpers.js
@@ -0,0 +1,80 @@
+/**
+ * Case-insensitive test for a name that ends in ".xml" or ".rels"
+ * @param {string} name Name to test, e.g. a zip entry path
+ * @returns {boolean} True if the name has a .xml or .rels extension
+ */
+export const isXmlLike = (name) => /\.xml$|\.rels$/i.test(name);
+
+/**
+ * Render the leading bytes as space-separated, two-digit lowercase hex (debug aid)
+ * @param {Uint8Array|ArrayBuffer} bytes Bytes to dump
+ * @param {number} n Maximum number of bytes to include (default 32)
+ * @returns {string} Hex dump, e.g. "ff fe 3c 00"
+ */
+export function hex(bytes, n = 32) {
+  const view = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
+  const head = view.slice(0, n);
+  const pairs = Array.from(head, (byte) => byte.toString(16).padStart(2, '0'));
+  return pairs.join(' ');
+}
+
+/**
+ * Detect the text encoding from a BOM or, without one, from where NUL bytes fall
+ * @param {Uint8Array} u8 Raw bytes; only the first 64 are inspected by the heuristic
+ * @returns {string} 'utf-16le', 'utf-16be' or 'utf-8'
+ */
+export function sniffEncoding(u8) {
+  if (u8.length >= 2) {
+    if (u8[0] === 0xff && u8[1] === 0xfe) return 'utf-16le';
+    if (u8[0] === 0xfe && u8[1] === 0xff) return 'utf-16be';
+  }
+  // Heuristic: lots of NULs near the start → likely UTF-16 without a BOM.
+  // ASCII-range text puts them at odd offsets in LE ("<\0") and at even offsets in BE ("\0<").
+  const nulAt = [0, 0]; // NUL counts at [even, odd] byte offsets
+  const limit = Math.min(64, u8.length);
+  for (let i = 0; i < limit; i++) if (u8[i] === 0) nulAt[i % 2]++;
+  if (nulAt[0] + nulAt[1] > 16) return nulAt[0] > nulAt[1] ? 'utf-16be' : 'utf-16le';
+  return 'utf-8';
+}
+
+/**
+ * Drop a leading U+FEFF (byte-order mark) from an already-decoded JS string
+ * @param {string} str String to clean; falsy values are returned unchanged
+ * @returns {string} The input without its leading BOM
+ */
+export function stripBOM(str) {
+  return !str || str.charCodeAt(0) !== 0xfeff ? str : str.slice(1);
+}
+
+/**
+ * Produce a BOM-free JS string from XML/RELS content.
+ * Strings only have a leading BOM removed; binary input is decoded per sniffEncoding.
+ * @param {string|Uint8Array|ArrayBuffer} content Also accepts Buffer, DataView and other typed arrays
+ * @returns {string} Clean XML string
+ */
+export function ensureXmlString(content) {
+  if (typeof content === 'string') return stripBOM(content);
+
+  // View every supported binary container as a Uint8Array over the same bytes
+  const toBytes = (value) => {
+    if (!value || typeof value !== 'object') return null;
+    if (value instanceof Uint8Array) return value;
+
+    const isNodeBuffer = typeof Buffer !== 'undefined' && Buffer.isBuffer && Buffer.isBuffer(value);
+    if (isNodeBuffer || (ArrayBuffer.isView && ArrayBuffer.isView(value))) {
+      // Buffer, DataView or another TypedArray: keep its offset and length
+      return new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
+    }
+
+    const ctor = value.constructor;
+    const isArrayBuffer = ctor && (value instanceof ArrayBuffer || ctor.name === 'ArrayBuffer');
+    return isArrayBuffer ? new Uint8Array(value) : null;
+  };
+
+  const bytes = toBytes(content);
+  if (!bytes) throw new Error('Unsupported content type for XML');
+
+  const encoding = sniffEncoding(bytes);
+  const decoded = new TextDecoder(encoding).decode(bytes);
+  return stripBOM(decoded);
+}
diff --git a/packages/super-editor/src/core/encoding-helpers.test.js b/packages/super-editor/src/core/encoding-helpers.test.js
@@ -0,0 +1,142 @@
+import { describe, it, expect } from 'vitest';
+import { isXmlLike, hex, sniffEncoding, stripBOM, ensureXmlString } from './encoding-helpers.js';
+
+function utf16leWithBOM(str) {
+  const marker = [0xff, 0xfe]; // UTF-16LE byte-order mark
+  const encoded = Buffer.from(str, 'utf16le');
+  return Buffer.concat([Buffer.from(marker), encoded]);
+}
+
+function utf16beWithBOM(str) {
+  // Encode little-endian first, then exchange the two bytes of every 16-bit unit
+  const littleEndian = Buffer.from(str, 'utf16le');
+  const bigEndian = Buffer.alloc(littleEndian.length);
+  littleEndian.forEach((byte, index) => {
+    // index ^ 1 is the partner slot within the same unit (0<->1, 2<->3, ...)
+    bigEndian[index ^ 1] = byte;
+  });
+  return Buffer.concat([Buffer.from([0xfe, 0xff]), bigEndian]);
+}
+
+function noBOMUtf16leBytes(str) {
+  const unmarked = Buffer.from(str, 'utf16le'); // no BOM prefix, to trigger the NUL-heuristic
+  return unmarked;
+}
+
+describe('isXmlLike', () => {
+  it('matches .xml and .rels', () => {
+    // Names ending in .xml or .rels, at any folder depth
+    const xmlParts = ['word/document.xml', 'word/_rels/document.xml.rels', 'docProps/core.xml'];
+    xmlParts.forEach((part) => expect(isXmlLike(part)).toBe(true));
+  });
+  it('rejects non-xml', () => {
+    // The extension must end the name, so "item1.xml.bin" does not qualify
+    const otherParts = ['word/media/image1.png', 'customXml/item1.xml.bin', 'word/fonts/font1.odttf'];
+    otherParts.forEach((part) => expect(isXmlLike(part)).toBe(false));
+  });
+});
+
+describe('hex', () => {
+  it('renders hex dump of first N bytes', () => {
+    const u8 = new Uint8Array([0xff, 0xfe, 0x3c, 0x00, 0x3f, 0x00]);
+    expect([hex(u8, 6), hex(u8, 2)]).toEqual(['ff fe 3c 00 3f 00', 'ff fe']); // N < length truncates
+  });
+});
+
+describe('sniffEncoding', () => {
+  const prolog = '<?xml version="1.0"?>';
+
+  it('detects UTF-16LE by BOM', () => {
+    expect(sniffEncoding(utf16leWithBOM(prolog))).toBe('utf-16le');
+  });
+  it('detects UTF-16BE by BOM', () => {
+    expect(sniffEncoding(utf16beWithBOM(prolog))).toBe('utf-16be');
+  });
+  it('defaults to utf-8 for plain ASCII/UTF-8', () => {
+    const asciiBytes = new TextEncoder().encode(`${prolog}<a/>`);
+    expect(sniffEncoding(asciiBytes)).toBe('utf-8');
+  });
+  it('heuristically detects UTF-16 (no BOM) via NUL density', () => {
+    // 28 ASCII characters become 56 bytes, 28 of them NUL, which is enough for the heuristic
+    const unmarked = noBOMUtf16leBytes(`${prolog}<root/>`);
+    expect(sniffEncoding(unmarked)).toBe('utf-16le');
+  });
+});
+
+describe('stripBOM', () => {
+  const bare = '<?xml?><r/>';
+
+  it('removes U+FEFF if present', () => {
+    expect(stripBOM(`\uFEFF${bare}`)).toBe(bare);
+  });
+  it('no-ops when no BOM present', () => {
+    expect(stripBOM(bare)).toBe(bare);
+  });
+});
+
+describe('ensureXmlString', () => {
+  const utf16Doc = '<?xml version="1.0" encoding="utf-16"?><props><k>v</k></props>';
+
+  // Shared checks for the UTF-16 sample document once it has been decoded
+  const expectDecodedUtf16Doc = (text) => {
+    expect(text.toLowerCase()).toContain('encoding="utf-16"');
+    expect(text).toContain('<props>');
+    expect(text).not.toMatch(/\u0000/);
+  };
+
+  it('returns same string when given a plain XML string', () => {
+    const plain = '<?xml version="1.0"?><r/>';
+    expect(ensureXmlString(plain)).toBe(plain);
+  });
+
+  it('strips leading BOM from a decoded string', () => {
+    const withBom = '\uFEFF<?xml version="1.0"?><r/>';
+    expect(ensureXmlString(withBom)).toBe('<?xml version="1.0"?><r/>');
+  });
+
+  it('decodes UTF-8 bytes', () => {
+    const bytes = new TextEncoder().encode('<?xml version="1.0"?><root>héllo</root>');
+    const text = ensureXmlString(bytes);
+    expect(text).toContain('<?xml');
+    expect(text).toContain('héllo');
+  });
+
+  it('decodes UTF-16LE with BOM bytes', () => {
+    const text = ensureXmlString(utf16leWithBOM(utf16Doc));
+    expectDecodedUtf16Doc(text);
+  });
+
+  it('decodes UTF-16BE with BOM bytes', () => {
+    const text = ensureXmlString(utf16beWithBOM(utf16Doc));
+    expectDecodedUtf16Doc(text);
+  });
+
+  it('decodes UTF-16 (no BOM) via heuristic', () => {
+    const text = ensureXmlString(noBOMUtf16leBytes('<?xml version="1.0"?><root>NOBOM</root>'));
+    expect(text).toContain('<root>');
+    expect(text).toContain('NOBOM');
+    expect(text).not.toMatch(/\u0000/);
+  });
+
+  it('accepts ArrayBuffer input', () => {
+    const { buffer } = new TextEncoder().encode('<?xml version="1.0"?><r/>');
+    expect(ensureXmlString(buffer)).toContain('<r/>');
+  });
+
+  it('throws on unsupported content types', () => {
+    expect(() => ensureXmlString(12345)).toThrow(/Unsupported content type/);
+  });
+
+  it('decodes from Node Buffer (utf-8)', () => {
+    const nodeBuffer = Buffer.from('<?xml version="1.0"?><root/>', 'utf8');
+    expect(ensureXmlString(nodeBuffer)).toContain('<root/>');
+  });
+});
+
+describe('ensureXmlString cross-env', () => {
+  it('decodes a DataView over an offset slice of a larger buffer', () => {
+    const bytes = new TextEncoder().encode('junk<?xml version="1.0"?><root/>');
+    const view = new DataView(bytes.buffer, bytes.byteOffset + 4, bytes.byteLength - 4);
+    expect(ensureXmlString(view)).toBe('<?xml version="1.0"?><root/>'); // offset and length honoured
+  });
+});