Skip to content

Commit d08d272

Browse files
authored
Multiple encoding support (jameslan#134)
1 parent cd4b975 commit d08d272

File tree

8 files changed

+254
-52
lines changed

8 files changed

+254
-52
lines changed

src/document.mts

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ function parse<Input>(
229229
ctxt,
230230
source,
231231
url,
232-
null,
232+
options.encoding ?? null,
233233
xmlOptions,
234234
);
235235
try {
@@ -281,12 +281,21 @@ export class XmlDocument extends XmlDisposable<XmlDocument> {
281281

282282
/**
283283
* Parse and create an {@link XmlDocument} from an XML string.
284+
*
285+
* Note: Only UTF-8 encoding is supported for string input.
286+
* For other encodings, use {@link fromBuffer} instead.
287+
*
284288
* @param source The XML string
289+
* @param options Parsing options
290+
* @throws XmlError when `options.encoding` is specified and is not 'utf-8'
285291
*/
286292
static fromString(
287293
source: string,
288294
options: ParseOptions = {},
289295
): XmlDocument {
296+
if (options.encoding && options.encoding !== 'utf-8') {
297+
throw new XmlError('Non-UTF-8 encoding is not supported for string input, use fromBuffer instead');
298+
}
290299
return parse(xmlReadString, source, options.url ?? null, options);
291300
}
292301

@@ -304,13 +313,22 @@ export class XmlDocument extends XmlDisposable<XmlDocument> {
304313

305314
/**
306315
* Save the XmlDocument to a string
316+
*
317+
* By default, the document is serialized as utf-8.
318+
* `ascii` is the only other value allowed for `options.encoding`;
319+
* it converts non-ascii characters into numeric character references.
320+
*
307321
* @param options options to adjust the saving behavior
308322
* @see {@link save}
309323
* @see {@link XmlElement#toString}
310324
*/
311325
toString(options?: SaveOptions): string {
326+
const saveOptions = options ?? { format: true };
327+
if (saveOptions.encoding && saveOptions.encoding !== 'utf-8' && saveOptions.encoding !== 'ascii') {
328+
throw new XmlError('Only utf-8 or ascii is supported in toString(). For other encodings, use save().');
329+
}
312330
const handler = new XmlStringOutputBufferHandler();
313-
this.save(handler, options);
331+
this.save(handler, { encoding: 'utf-8', ...saveOptions });
314332

315333
return handler.result;
316334
}
@@ -327,13 +345,15 @@ export class XmlDocument extends XmlDisposable<XmlDocument> {
327345
/**
328346
* Save the XmlDocument to a buffer and invoke the callbacks to process.
329347
*
348+
* By default, the output uses the document's original encoding.
349+
*
330350
* @param handler handlers to process the content in the buffer
331351
* @param options options to adjust the saving behavior
332352
* @see {@link toString}
333353
* @see {@link XmlElement#save}
334354
*/
335355
save(handler: XmlOutputBufferHandler, options?: SaveOptions) {
336-
const ctxt = xmlSaveToIO(handler, null, xmlSaveOption(options));
356+
const ctxt = xmlSaveToIO(handler, options?.encoding ?? null, xmlSaveOption(options));
337357
if (options?.indentString) {
338358
if (xmlSaveSetIndentString(ctxt, options.indentString) < 0) {
339359
throw new XmlError('Failed to set indent string');

src/libxml2.mts

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,9 @@ export function xmlReadString(
131131
return withStringUTF8(
132132
xmlString,
133133
(xmlBuf, len) => withStrings(
134-
(urlBuf) => libxml2._xmlCtxtReadMemory(ctxt, xmlBuf, len, urlBuf, 0, options),
134+
(urlBuf, enc) => libxml2._xmlCtxtReadMemory(ctxt, xmlBuf, len, urlBuf, enc, options),
135135
url,
136+
encoding,
136137
),
137138
);
138139
}
@@ -147,8 +148,9 @@ export function xmlReadMemory(
147148
return withCString(
148149
xmlBuffer,
149150
(xmlBuf, len) => withStrings(
150-
(urlBuf) => libxml2._xmlCtxtReadMemory(ctxt, xmlBuf, len, urlBuf, 0, options),
151+
(urlBuf, enc) => libxml2._xmlCtxtReadMemory(ctxt, xmlBuf, len, urlBuf, enc, options),
151152
url,
153+
encoding,
152154
),
153155
);
154156
}
@@ -518,6 +520,12 @@ export interface SaveOptions {
518520
* @default Two spaces: " "
519521
*/
520522
indentString?: string;
523+
/**
524+
* The encoding to use for the output.
525+
*
526+
* @default The document's original encoding for `XmlDocument#save`; utf-8 for `XmlElement#save` and `toString`
527+
*/
528+
encoding?: string;
521529
}
522530

523531
export function xmlSaveOption(options?: SaveOptions): number {
@@ -580,8 +588,10 @@ export function xmlSaveToIO(
580588
format: number,
581589
): XmlSaveCtxtPtr {
582590
const index = outputHandlerStorage.allocate(handler); // will be freed in outputClose
583-
// Support only UTF-8 as of now
584-
return libxml2._xmlSaveToIO(outputWrite, outputClose, index, 0, format);
591+
return withStringUTF8(
592+
encoding,
593+
(encBuf) => libxml2._xmlSaveToIO(outputWrite, outputClose, index, encBuf, format),
594+
);
585595
}
586596

587597
enum XmlParserInputFlags {

src/nodes.mts

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -801,13 +801,15 @@ export class XmlElement extends XmlTreeNode {
801801
/**
802802
* Save the XmlElement to a buffer and invoke the callbacks to process.
803803
*
804+
* By default, it outputs utf-8 encoded bytes. Use `options.encoding` to change it.
805+
*
804806
* @param handler handlers to process the content in the buffer
805807
* @param options options to adjust the saving behavior
806808
* @see {@link toString}
807809
* @see {@link XmlDocument#save}
808810
*/
809811
save(handler: XmlOutputBufferHandler, options?: SaveOptions) {
810-
const ctxt = xmlSaveToIO(handler, null, xmlSaveOption(options));
812+
const ctxt = xmlSaveToIO(handler, options?.encoding ?? 'utf-8', xmlSaveOption(options));
811813
if (options?.indentString) {
812814
if (xmlSaveSetIndentString(ctxt, options.indentString) < 0) {
813815
throw new XmlError('Failed to set indent string');
@@ -819,13 +821,22 @@ export class XmlElement extends XmlTreeNode {
819821

820822
/**
821823
* Save the XmlElement to a string
824+
*
825+
* By default, the element is serialized as utf-8.
826+
* `ascii` is the only other value allowed for `options.encoding`;
827+
* it converts non-ascii characters into numeric character references.
828+
*
822829
* @param options options to adjust the saving behavior
823830
* @see {@link save}
824831
* @see {@link XmlDocument#toString}
825832
*/
826833
toString(options?: SaveOptions): string {
834+
const saveOptions = options ?? { format: true };
835+
if (saveOptions.encoding && saveOptions.encoding !== 'utf-8' && saveOptions.encoding !== 'ascii') {
836+
throw new XmlError('Only utf-8 or ascii is supported in toString(). For other encodings, use save().');
837+
}
827838
const handler = new XmlStringOutputBufferHandler();
828-
this.save(handler, options);
839+
this.save(handler, saveOptions);
829840

830841
return handler.result;
831842
}

test/backend/encoding.spec.mts

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import { expect } from 'chai';
2+
import { XmlDocument, XmlElement, XsdValidator } from '@libxml2-wasm/lib/index.mjs';
3+
import * as fs from 'node:fs/promises';
4+
5+
// These tests use iso8859-15 input, which has to be stored in a file and read with the node fs module
6+
describe('encoding', () => {
7+
let xmlBuffer: Buffer;
8+
9+
before(async () => {
10+
xmlBuffer = await fs.readFile('test/testfiles/iso8859-15.xml');
11+
});
12+
13+
describe('parse', () => {
14+
it('should support non-utf8 encoding', () => {
15+
using doc = XmlDocument.fromBuffer(
16+
xmlBuffer,
17+
{ encoding: 'iso8859-15' },
18+
);
19+
expect(doc.get('asdf/@RT')?.content).to.equal('Müller');
20+
});
21+
22+
it('should use encoding from xml declaration', () => {
23+
using doc = XmlDocument.fromBuffer(xmlBuffer);
24+
expect(doc.get('asdf/@RT')?.content).to.equal('Müller');
25+
});
26+
});
27+
28+
describe('validate', () => {
29+
it('should validate with xsd', () => {
30+
using doc = XmlDocument.fromBuffer(xmlBuffer);
31+
using schema = XmlDocument.fromString(`<?xml version="1.0" encoding="utf-8"?>
32+
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
33+
<xsd:simpleType name="RTType">
34+
<xsd:restriction base="xsd:string">
35+
<xsd:enumeration value="Müller"/>
36+
</xsd:restriction>
37+
</xsd:simpleType>
38+
<xsd:element name="levelone">
39+
<xsd:complexType>
40+
<xsd:sequence>
41+
<xsd:element name="asdf">
42+
<xsd:complexType>
43+
<xsd:attribute name="RT" type="RTType" use="required"/>
44+
</xsd:complexType>
45+
</xsd:element>
46+
</xsd:sequence>
47+
</xsd:complexType>
48+
</xsd:element>
49+
</xsd:schema>`);
50+
using validator = XsdValidator.fromDoc(schema);
51+
validator.validate(doc);
52+
});
53+
});
54+
55+
describe('document save', () => {
56+
it('saves to original encoding by default', () => {
57+
using doc = XmlDocument.fromBuffer(xmlBuffer);
58+
59+
const outputBuffer = Buffer.alloc(xmlBuffer.length);
60+
doc.save({
61+
write: (buf: Uint8Array) => { outputBuffer.set(buf); return buf.byteLength; },
62+
close: () => true,
63+
});
64+
expect(outputBuffer).to.deep.equal(xmlBuffer);
65+
});
66+
67+
it('saves to specified encoding', () => {
68+
using doc = XmlDocument.fromBuffer(xmlBuffer);
69+
70+
expect(doc.toString()).to.equal(`\
71+
<?xml version="1.0" encoding="utf-8"?>
72+
<levelone>
73+
<asdf RT="Müller"/>
74+
</levelone>
75+
`);
76+
});
77+
78+
it.skip('saves to specified encoding without format', () => {
79+
using doc = XmlDocument.fromBuffer(xmlBuffer);
80+
81+
expect(doc.toString({ format: false })).to.equal(`\
82+
<?xml version="1.0" encoding="utf-8"?>
83+
<levelone><asdf RT="Müller"/></levelone>
84+
`);
85+
});
86+
});
87+
88+
describe('element save', () => {
89+
it('save to utf-8 by default', () => {
90+
using doc = XmlDocument.fromBuffer(xmlBuffer);
91+
92+
const outputBuffer = Buffer.alloc(xmlBuffer.length);
93+
(doc.get('/levelone/asdf') as XmlElement).save({
94+
write: (buf: Uint8Array) => { outputBuffer.set(buf); return buf.byteLength; },
95+
close: () => true,
96+
});
97+
expect(outputBuffer.indexOf(Buffer.from('RT="Müller"'))).to.above(0);
98+
});
99+
100+
it('save utf-8 xml to other encoding', () => {
101+
using doc = XmlDocument.fromString(`\
102+
<?xml version="1.0" encoding="utf-8"?>
103+
<levelone>
104+
<asdf RT="Müller"/>
105+
</levelone>`);
106+
107+
const outputBuffer = Buffer.alloc(1024);
108+
(doc.get('/levelone/asdf') as XmlElement).save({
109+
write: (buf: Uint8Array) => { outputBuffer.set(buf); return buf.byteLength; },
110+
close: () => true,
111+
}, { encoding: 'iso8859-15' });
112+
const posRT = outputBuffer.indexOf(Buffer.from('RT="M'));
113+
expect(outputBuffer[posRT + 5]).to.equal(0xfc); // ü
114+
expect(outputBuffer[posRT + 6]).to.equal(0x6c); // l
115+
expect(outputBuffer[posRT + 7]).to.equal(0x6c); // l
116+
expect(outputBuffer[posRT + 8]).to.equal(0x65); // e
117+
});
118+
});
119+
});

test/crossplatform/document.spec.mts

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ describe('XmlDocument', () => {
5757
const newDoc = XmlDocument.create();
5858
newDoc.createRoot('docs');
5959
expect(newDoc.toString()).to.equal(`\
60-
<?xml version="1.0"?>
60+
<?xml version="1.0" encoding="utf-8"?>
6161
<docs/>
6262
`);
6363
});
@@ -66,7 +66,7 @@ describe('XmlDocument', () => {
6666
const newDoc = XmlDocument.create();
6767
newDoc.createRoot('docs', 'http://example.com');
6868
expect(newDoc.toString()).to.equal(`\
69-
<?xml version="1.0"?>
69+
<?xml version="1.0" encoding="utf-8"?>
7070
<docs xmlns="http://example.com"/>
7171
`);
7272
});
@@ -75,7 +75,7 @@ describe('XmlDocument', () => {
7575
const newDoc = XmlDocument.create();
7676
newDoc.createRoot('docs', 'http://example.com', 'ex');
7777
expect(newDoc.toString()).to.equal(`\
78-
<?xml version="1.0"?>
78+
<?xml version="1.0" encoding="utf-8"?>
7979
<ex:docs xmlns:ex="http://example.com"/>
8080
`);
8181
});
@@ -103,9 +103,18 @@ describe('XmlDocument', () => {
103103
});
104104

105105
describe('toString', () => {
106+
it('allows utf-8 or ascii', () => {
107+
expect(() => doc.toString({ encoding: 'utf-8' })).to.not.throw();
108+
expect(() => doc.toString({ encoding: 'ascii' })).to.not.throw();
109+
expect(() => doc.toString({ encoding: 'iso8859-1' })).to.throw(
110+
XmlError,
111+
'Only utf-8 or ascii is supported in toString(). For other encodings, use save().',
112+
);
113+
});
114+
106115
it('formats output by default', () => {
107116
expect(doc.toString()).to.equal(`\
108-
<?xml version="1.0"?>
117+
<?xml version="1.0" encoding="utf-8"?>
109118
<docs>
110119
<doc/>
111120
</docs>
@@ -114,14 +123,14 @@ describe('XmlDocument', () => {
114123

115124
it('not format when required', () => {
116125
expect(doc.toString({ format: false })).to.equal(`\
117-
<?xml version="1.0"?>
126+
<?xml version="1.0" encoding="utf-8"?>
118127
<docs><doc/></docs>
119128
`);
120129
});
121130

122131
it('can set indent string', () => {
123132
expect(doc.toString({ format: true, indentString: ' ' })).to.equal(`\
124-
<?xml version="1.0"?>
133+
<?xml version="1.0" encoding="utf-8"?>
125134
<docs>
126135
<doc/>
127136
</docs>
@@ -138,7 +147,7 @@ describe('XmlDocument', () => {
138147

139148
it('can avoid empty tags', () => {
140149
expect(doc.toString({ format: true, noEmptyTags: true })).to.equal(`\
141-
<?xml version="1.0"?>
150+
<?xml version="1.0" encoding="utf-8"?>
142151
<docs>
143152
<doc></doc>
144153
</docs>
@@ -164,6 +173,12 @@ describe('XmlDocument', () => {
164173
const text = d.toString();
165174
expect(text).to.contain('Jan Sedloň');
166175
});
176+
177+
it('generates numeric character references', () => {
178+
using d = XmlDocument.fromString('<root><name>Jan Sedloň</name></root>');
179+
const text = d.toString({ encoding: 'ascii' });
180+
expect(text).to.contain('Jan Sedlo&#328;');
181+
});
167182
});
168183

169184
describe('processXInclude', () => {
@@ -173,7 +188,7 @@ describe('XmlDocument', () => {
173188

174189
it('does nothing w/o XInclude nodes', () => {
175190
expect(doc.processXInclude()).to.equal(0);
176-
expect(doc.toString({ format: false })).to.equal('<?xml version="1.0"?>\n<docs><doc/></docs>\n');
191+
expect(doc.toString({ format: false })).to.equal('<?xml version="1.0" encoding="utf-8"?>\n<docs><doc/></docs>\n');
177192
});
178193

179194
it('processes XInclude nodes', () => {

0 commit comments

Comments
 (0)