Skip to content

Commit 9bc488d

Browse files
committed
fix: imports encoded in utf-16 break DocxZipper
1 parent ac7fe3c commit 9bc488d

File tree

4 files changed

+304
-26
lines changed

4 files changed

+304
-26
lines changed

packages/super-editor/src/core/DocxZipper.js

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import xmljs from 'xml-js';
22
import JSZip from 'jszip';
33
import { getContentTypesFromXml } from './super-converter/helpers.js';
4+
import { ensureXmlString, isXmlLike } from './encoding-helpers.js';
45

56
/**
67
* Class to handle unzipping and zipping of docx files
@@ -37,42 +38,37 @@ class DocxZipper {
3738
const extractedFiles = await this.unzip(file);
3839
const files = Object.entries(extractedFiles.files);
3940

40-
const mediaObjects = {};
41-
const validTypes = ['xml', 'rels'];
42-
for (const file of files) {
43-
const [, zipEntry] = file;
44-
45-
if (validTypes.some((validType) => zipEntry.name.endsWith(validType))) {
46-
const content = await zipEntry.async('string');
47-
this.files.push({
48-
name: zipEntry.name,
49-
content,
50-
});
41+
for (const [, zipEntry] of files) {
42+
const name = zipEntry.name;
43+
44+
if (isXmlLike(name)) {
45+
// Read raw bytes and decode (handles UTF-8 & UTF-16)
46+
const u8 = await zipEntry.async('uint8array');
47+
const content = ensureXmlString(u8);
48+
this.files.push({ name, content });
5149
} else if (
52-
(zipEntry.name.startsWith('word/media') && zipEntry.name !== 'word/media/') ||
53-
(zipEntry.name.startsWith('media') && zipEntry.name !== 'media/')
50+
(name.startsWith('word/media') && name !== 'word/media/') ||
51+
(name.startsWith('media') && name !== 'media/')
5452
) {
55-
// If we are in node, we need to convert the buffer to base64
53+
// Media files
5654
if (isNode) {
5755
const buffer = await zipEntry.async('nodebuffer');
5856
const fileBase64 = buffer.toString('base64');
59-
this.mediaFiles[zipEntry.name] = fileBase64;
60-
}
61-
62-
// If we are in the browser, we can use the base64 directly
63-
else {
57+
this.mediaFiles[name] = fileBase64;
58+
} else {
6459
const blob = await zipEntry.async('blob');
65-
const extension = this.getFileExtension(zipEntry.name);
60+
const extension = this.getFileExtension(name);
6661
const fileBase64 = await zipEntry.async('base64');
67-
this.mediaFiles[zipEntry.name] = `data:image/${extension};base64,${fileBase64}`;
62+
this.mediaFiles[name] = `data:image/${extension};base64,${fileBase64}`;
6863

69-
const file = new File([blob], zipEntry.name, { type: blob.type });
70-
const imageUrl = URL.createObjectURL(file);
71-
this.media[zipEntry.name] = imageUrl;
64+
const fileObj = new File([blob], name, { type: blob.type });
65+
const imageUrl = URL.createObjectURL(fileObj);
66+
this.media[name] = imageUrl;
7267
}
73-
} else if (zipEntry.name.startsWith('word/fonts') && zipEntry.name !== 'word/fonts/') {
68+
} else if (name.startsWith('word/fonts') && name !== 'word/fonts/') {
69+
// Font files
7470
const uint8array = await zipEntry.async('uint8array');
75-
this.fonts[zipEntry.name] = uint8array;
71+
this.fonts[name] = uint8array;
7672
}
7773
}
7874

packages/super-editor/src/core/DocxZipper.test.js

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import path from 'path';
22
import fs from 'fs';
33
import { describe, it, expect, beforeEach } from 'vitest';
44
import DocxZipper from './DocxZipper';
5+
import JSZip from 'jszip';
56

67
async function readFileAsBuffer(filePath) {
78
const resolvedPath = path.resolve(__dirname, filePath);
@@ -48,3 +49,62 @@ describe('DocxZipper - file extraction', () => {
4849
expect(documentXml).toBeTruthy();
4950
});
5051
});
52+
53+
// Helper to build a UTF-16LE Buffer with BOM
function utf16leWithBOM(str) {
  // FF FE is the little-endian byte-order mark; Buffer.from(..., 'utf16le') never emits one itself.
  const bom = Buffer.from([0xff, 0xfe]);
  const body = Buffer.from(str, 'utf16le');
  return Buffer.concat([bom, body]);
}
59+
60+
// Regression suite: a docx whose customXml part is stored as UTF-16LE must come out of
// getDocxData() as a normal JS string rather than NUL-interleaved text.
describe('DocxZipper - UTF-16 XML handling', () => {
  let zipper;
  beforeEach(() => {
    zipper = new DocxZipper();
  });

  it('decodes a UTF-16LE customXml part correctly (was failing before fix)', async () => {
    const zip = new JSZip();

    // Minimal [Content_Types].xml to look like a docx
    const contentTypes = `<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>`;
    zip.file('[Content_Types].xml', contentTypes);

    // A basic UTF-8 document.xml so there's at least one normal XML entry
    const documentXml = `<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>Hello</w:t></w:r></w:p></w:body>
</w:document>`;
    zip.file('word/document.xml', documentXml);

    // The problematic UTF-16LE customXml item
    const customXmlUtf16 = `<?xml version="1.0" encoding="utf-16"?>
<properties xmlns="http://www.imanage.com/work/xmlschema">
<documentid>TELEKOM!4176814.1</documentid>
<senderid>A675398</senderid>
<senderemail>GUDRUN.JORDAN@TELEKOM.DE</senderemail>
<lastmodified>2023-07-06T15:09:00.0000000+02:00</lastmodified>
<database>TELEKOM</database>
</properties>`;
    zip.file('customXml/item2.xml', utf16leWithBOM(customXmlUtf16));

    // Generate the zip as a Node buffer and feed it to the zipper
    const buf = await zip.generateAsync({ type: 'nodebuffer' });
    const files = await zipper.getDocxData(buf /* isNode not needed for XML */);

    // Find the customXml item
    const item2 = files.find((f) => f.name === 'customXml/item2.xml');
    expect(item2).toBeTruthy();

    // ✅ With the fix, content is a clean JS string:
    expect(item2.content).toContain('<?xml'); // prolog present
    expect(item2.content).toContain('<properties'); // real tag (no NULs interleaved)
    expect(item2.content).not.toMatch(/\u0000/); // no embedded NULs
    expect(item2.content.toLowerCase()).toContain('encoding="utf-16"');
  });
});
packages/super-editor/src/core/encoding-helpers.js

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/**
2+
* Quick check for .xml / .rels
3+
* @param {string} name
4+
* @returns {boolean} True if the name has a .xml or .rels extension
5+
*/
6+
export const isXmlLike = (name) => {
  // Case-insensitive suffix match: only names ending in ".xml" or ".rels" qualify.
  const xmlLikeSuffix = /\.xml$|\.rels$/i;
  return xmlLikeSuffix.test(name);
};
7+
8+
/**
 * Hex dump for optional debugging.
 *
 * Renders the first `n` bytes as two-digit lowercase hex values separated by single spaces.
 *
 * @param {Uint8Array|ArrayBuffer} bytes Bytes to dump; non-Uint8Array input is wrapped in a Uint8Array
 * @param {number} [n=32] Maximum number of leading bytes to render
 * @returns {string} Hex dump, e.g. "ff fe 3c 00" (empty string for empty input)
 */
export function hex(bytes, n = 32) {
  const u8 = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
  // subarray() is a view, so no bytes are copied just to print them; Array.from's
  // mapper yields a plain array of strings (a typed-array .map() would coerce to numbers).
  return Array.from(u8.subarray(0, n), (b) => b.toString(16).padStart(2, '0')).join(' ');
}
20+
21+
/**
 * Try to detect encoding by BOM / null density.
 *
 * Detection order:
 *  1. UTF-16 byte-order mark: FF FE → 'utf-16le', FE FF → 'utf-16be'.
 *  2. NUL density: more than 16 NUL bytes within the first 64 bytes is treated as
 *     BOM-less UTF-16. Byte order is taken from where the NULs sit: for ASCII-range
 *     text the zero high byte lands on even offsets in big-endian and on odd offsets
 *     in little-endian. A tie falls back to little-endian.
 *  3. Anything else is reported as 'utf-8'.
 *
 * @param {Uint8Array} u8 Raw bytes to inspect
 * @returns {string} Detected encoding: 'utf-16le', 'utf-16be' or 'utf-8'
 */
export function sniffEncoding(u8) {
  if (u8.length >= 2) {
    const b0 = u8[0];
    const b1 = u8[1];
    if (b0 === 0xff && b1 === 0xfe) return 'utf-16le';
    if (b0 === 0xfe && b1 === 0xff) return 'utf-16be';
  }

  // Heuristic: lots of NULs near the start → likely UTF-16
  let evenNuls = 0;
  let oddNuls = 0;
  const limit = Math.min(64, u8.length);
  for (let i = 0; i < limit; i++) {
    if (u8[i] !== 0) continue;
    if (i % 2 === 0) evenNuls++;
    else oddNuls++;
  }

  if (evenNuls + oddNuls > 16) {
    // NULs leading each code unit means the high byte comes first, i.e. big-endian.
    return evenNuls > oddNuls ? 'utf-16be' : 'utf-16le';
  }
  return 'utf-8';
}
39+
40+
/**
 * Remove leading BOM from already-decoded JS string.
 *
 * Only a single U+FEFF at index 0 is removed; the rest of the string is untouched.
 * Falsy input ('' / null / undefined) is returned as-is.
 *
 * @param {string} str Decoded text
 * @returns {string} Cleaned string without BOM
 */
export function stripBOM(str) {
  if (!str) return str;
  return str.charCodeAt(0) === 0xfeff ? str.slice(1) : str;
}
48+
49+
/**
 * Decode XML/RELS content to a clean JS string.
 *
 * Strings are returned with a leading BOM removed. Binary input is normalized to a
 * Uint8Array, its encoding is guessed with sniffEncoding(), and the bytes are decoded
 * with TextDecoder before any leading BOM is stripped.
 *
 * Accepts: string | Uint8Array (including Node Buffer) | any other ArrayBufferView
 * (DataView, other typed arrays) | ArrayBuffer
 *
 * @param {string|Uint8Array|ArrayBufferView|ArrayBuffer} content
 * @returns {string} Clean XML string
 * @throws {Error} 'Unsupported content type for XML' for any other input
 */
export function ensureXmlString(content) {
  if (typeof content === 'string') return stripBOM(content);

  let u8 = null;

  if (content && typeof content === 'object') {
    if (content instanceof Uint8Array) {
      // Node Buffers land here too: Buffer is a Uint8Array subclass.
      u8 = content;
    } else if (ArrayBuffer.isView(content)) {
      // Any other ArrayBufferView (DataView, other typed arrays, or a Uint8Array/Buffer
      // from another realm): re-wrap the same memory window as bytes.
      u8 = new Uint8Array(content.buffer, content.byteOffset, content.byteLength);
    } else if (content.constructor && (content instanceof ArrayBuffer || content.constructor.name === 'ArrayBuffer')) {
      // The constructor-name check also lets through ArrayBuffers that fail instanceof.
      u8 = new Uint8Array(content);
    }
  }

  if (!u8) throw new Error('Unsupported content type for XML');

  const enc = sniffEncoding(u8);
  const xml = new TextDecoder(enc).decode(u8);
  return stripBOM(xml);
}
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { isXmlLike, hex, sniffEncoding, stripBOM, ensureXmlString } from './encoding-helpers.js';
3+
4+
// Encode `str` as UTF-16LE and prefix it with the little-endian byte-order mark (FF FE).
function utf16leWithBOM(str) {
  const bom = Buffer.from([0xff, 0xfe]);
  const body = Buffer.from(str, 'utf16le');
  return Buffer.concat([bom, body]);
}
9+
10+
// Encode `str` as UTF-16BE and prefix it with the big-endian byte-order mark (FE FF).
function utf16beWithBOM(str) {
  // Node has no 'utf16be' Buffer encoding: encode little-endian, then swap every byte
  // pair in place. The utf16le output always has an even length, as swap16() requires.
  const body = Buffer.from(str, 'utf16le').swap16();
  const bom = Buffer.from([0xfe, 0xff]);
  return Buffer.concat([bom, body]);
}
20+
21+
function noBOMUtf16leBytes(str) {
  // UTF-16LE WITHOUT a BOM (to trigger the NUL-heuristic)
  return Buffer.from(str, 'utf16le');
}
25+
26+
// isXmlLike: suffix-based classification of zip entry names.
describe('isXmlLike', () => {
  it('matches .xml and .rels', () => {
    expect(isXmlLike('word/document.xml')).toBe(true);
    expect(isXmlLike('word/_rels/document.xml.rels')).toBe(true);
    expect(isXmlLike('docProps/core.xml')).toBe(true);
  });
  it('rejects non-xml', () => {
    expect(isXmlLike('word/media/image1.png')).toBe(false);
    expect(isXmlLike('customXml/item1.xml.bin')).toBe(false);
    expect(isXmlLike('word/fonts/font1.odttf')).toBe(false);
  });
});
38+
39+
// hex: debugging dump of leading bytes.
describe('hex', () => {
  it('renders hex dump of first N bytes', () => {
    const u8 = new Uint8Array([0xff, 0xfe, 0x3c, 0x00, 0x3f, 0x00]);
    expect(hex(u8, 6)).toBe('ff fe 3c 00 3f 00');
  });
});
45+
46+
// sniffEncoding: BOM detection first, NUL-density heuristic second, UTF-8 fallback.
describe('sniffEncoding', () => {
  it('detects UTF-16LE by BOM', () => {
    const u8 = utf16leWithBOM('<?xml version="1.0"?>');
    expect(sniffEncoding(u8)).toBe('utf-16le');
  });
  it('detects UTF-16BE by BOM', () => {
    const u8 = utf16beWithBOM('<?xml version="1.0"?>');
    expect(sniffEncoding(u8)).toBe('utf-16be');
  });
  it('defaults to utf-8 for plain ASCII/UTF-8', () => {
    const u8 = new TextEncoder().encode('<?xml version="1.0"?><a/>');
    expect(sniffEncoding(u8)).toBe('utf-8');
  });
  it('heuristically detects UTF-16 (no BOM) via NUL density', () => {
    const u8 = noBOMUtf16leBytes('<?xml version="1.0"?><root/>');
    // Our heuristic returns 'utf-16le' for lots of NULs
    expect(sniffEncoding(u8)).toBe('utf-16le');
  });
});
65+
66+
// stripBOM: removes a single leading U+FEFF from an already-decoded string.
describe('stripBOM', () => {
  it('removes U+FEFF if present', () => {
    const s = '\uFEFF<?xml?><r/>';
    expect(stripBOM(s)).toBe('<?xml?><r/>');
  });
  it('no-ops when no BOM present', () => {
    const s = '<?xml?><r/>';
    expect(stripBOM(s)).toBe(s);
  });
});
76+
77+
// ensureXmlString: string passthrough, byte decoding per detected encoding, input-type handling.
describe('ensureXmlString', () => {
  it('returns same string when given a plain XML string', () => {
    const s = '<?xml version="1.0"?><r/>';
    expect(ensureXmlString(s)).toBe(s);
  });

  it('strips leading BOM from a decoded string', () => {
    const s = '\uFEFF<?xml version="1.0"?><r/>';
    expect(ensureXmlString(s)).toBe('<?xml version="1.0"?><r/>');
  });

  it('decodes UTF-8 bytes', () => {
    const u8 = new TextEncoder().encode('<?xml version="1.0"?><root>héllo</root>');
    const out = ensureXmlString(u8);
    expect(out).toContain('<?xml');
    expect(out).toContain('héllo');
  });

  it('decodes UTF-16LE with BOM bytes', () => {
    const u8 = utf16leWithBOM('<?xml version="1.0" encoding="utf-16"?><props><k>v</k></props>');
    const out = ensureXmlString(u8);
    expect(out.toLowerCase()).toContain('encoding="utf-16"');
    expect(out).toContain('<props>');
    expect(out).not.toMatch(/\u0000/);
  });

  it('decodes UTF-16BE with BOM bytes', () => {
    const u8 = utf16beWithBOM('<?xml version="1.0" encoding="utf-16"?><props><k>v</k></props>');
    const out = ensureXmlString(u8);
    expect(out.toLowerCase()).toContain('encoding="utf-16"');
    expect(out).toContain('<props>');
    expect(out).not.toMatch(/\u0000/);
  });

  it('decodes UTF-16 (no BOM) via heuristic', () => {
    const u8 = noBOMUtf16leBytes('<?xml version="1.0"?><root>NOBOM</root>');
    const out = ensureXmlString(u8);
    expect(out).toContain('<root>');
    expect(out).toContain('NOBOM');
    expect(out).not.toMatch(/\u0000/);
  });

  it('accepts ArrayBuffer input', () => {
    const u8 = new TextEncoder().encode('<?xml version="1.0"?><r/>');
    const out = ensureXmlString(u8.buffer);
    expect(out).toContain('<r/>');
  });

  it('throws on unsupported content types', () => {
    expect(() => ensureXmlString(12345)).toThrow(/Unsupported content type/);
  });

  it('decodes from Node Buffer (utf-8)', () => {
    const buf = Buffer.from('<?xml version="1.0"?><root/>', 'utf8');
    const out = ensureXmlString(buf);
    expect(out).toContain('<root/>');
  });
});
135+
136+
// Input containers other than a plain Uint8Array: Node Buffer and generic ArrayBuffer views.
describe('ensureXmlString cross-env', () => {
  it('decodes from Node Buffer (utf-8)', () => {
    const buf = Buffer.from('<?xml version="1.0"?><root/>', 'utf8');
    const out = ensureXmlString(buf);
    expect(out).toContain('<root/>');
  });

  it('decodes from a DataView over UTF-8 bytes', () => {
    // A DataView is an ArrayBufferView but not a Uint8Array, so it takes the re-wrapping path.
    const u8 = new TextEncoder().encode('<?xml version="1.0"?><root/>');
    const view = new DataView(u8.buffer, u8.byteOffset, u8.byteLength);
    const out = ensureXmlString(view);
    expect(out).toContain('<root/>');
  });
});

0 commit comments

Comments
 (0)