Skip to content

Commit 9628331

Browse files
committed
Improved support for valid files
1 parent 4c9bd7b commit 9628331

4 files changed

Lines changed: 214 additions & 31 deletions

File tree

ts/DOMBuilder.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ export class DOMBuilder implements ContentHandler {
132132
const attList: AttListDecl = new AttListDecl(elementName, attributesText.trim());
133133
attList.getAttributes().forEach((attDecl: AttDecl, name: string) => {
134134
const defaultValue: string = attDecl.getDefaultValue();
135-
if (defaultValue) {
135+
if (defaultValue && !lexicalMapForElement?.has(name)) {
136136
lexicalMapForElement?.set(name, defaultValue);
137137
}
138138
});

ts/SAXParser.ts

Lines changed: 185 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ export class SAXParser {
6060
private namespaceStack: Array<Map<string, string>>;
6161
ignoreGrammars: boolean = false;
6262
private lastParsedAttributeLexical: Map<string, string> = new Map<string, string>();
63+
private characterRunPreservedCR: Set<number>;
6364

6465
static readonly MIN_BUFFER_SIZE: number = 2048;
6566

@@ -75,6 +76,7 @@ export class SAXParser {
7576
this.grammarHandler = new GrammarHandler();
7677
this.namespaceMap = new Map<string, string>();
7778
this.namespaceStack = [];
79+
this.characterRunPreservedCR = new Set<number>();
7880
this.resetNamespaceContext();
7981
}
8082

@@ -232,6 +234,11 @@ export class SAXParser {
232234
continue;
233235
}
234236
if (this.lookingAt('<')) {
237+
if (this.rootParsed && this.elementStack === 0) {
238+
// After the document element closes, only comments, processing instructions,
239+
// and whitespace are permitted. Any additional element markup is ill-formed.
240+
throw new Error('Malformed XML document: additional element found after the root element');
241+
}
235242
this.startElement();
236243
continue;
237244
}
@@ -421,9 +428,10 @@ export class SAXParser {
421428
if (entityValue.length === 1) {
422429
grammar.addEntityReferenceUsage('&' + name + ';', entityValue);
423430
}
424-
// Fully expand the entity replacement text (custom and character references)
425-
const expandedValue: string = this.expandEntities(entityValue);
426-
this.handleEntityContent(expandedValue);
431+
// Expand custom entities first, then numeric character references while preserving predefined entities
432+
const expandedCustom: string = this.expandCustomEntities(entityValue);
433+
const expandedCharacters: string = this.expandCharacterReferences(expandedCustom);
434+
this.handleEntityContent(expandedCharacters);
427435
}
428436
} else {
429437
// Entity not found - handle as skipped entity
@@ -642,39 +650,100 @@ export class SAXParser {
642650
}
643651

644652
cleanCharacterRun(): void {
653+
if (this.characterRun === '') {
654+
this.characterRunPreservedCR.clear();
655+
return;
656+
}
657+
645658
if (this.characterRun !== '') {
646659
// Note: Don't expand entities here since parseEntityReference already handles
647660
// entity expansion with full recursion. The characterRun contains regular
648661
// character data that doesn't need entity expansion.
649662
let content: string = this.characterRun;
663+
const normalizedContent: string = this.normalizeCharacterRun(content);
650664

651665
if (this.rootParsed) {
652666
if (this.elementStack === 0) {
653667
// document ended
654668
// Normalize line endings per XML 1.0 spec section 2.11
655-
const normalizedContent: string = XMLUtils.normalizeLines(content);
656669
this.contentHandler!.ignorableWhitespace(normalizedContent);
657670
} else {
658671
// in an element - check xml:space
659672
const preserveWhitespace: boolean = this.isXmlSpacePreserve();
660673
if (preserveWhitespace || !this.isWhitespaceOnly(content)) {
661674
// Preserve whitespace or contains non-whitespace - treat as significant
662675
// Normalize line endings per XML 1.0 spec section 2.11
663-
const normalizedContent: string = XMLUtils.normalizeLines(content);
664676
this.contentHandler!.characters(normalizedContent);
665677
} else {
666678
// Default mode and only whitespace - treat as ignorable
667679
// Normalize line endings per XML 1.0 spec section 2.11
668-
const normalizedContent: string = XMLUtils.normalizeLines(content);
669680
this.contentHandler!.ignorableWhitespace(normalizedContent);
670681
}
671682
}
672683
} else {
673684
// in prolog
674-
this.contentHandler!.ignorableWhitespace(this.characterRun);
685+
this.contentHandler!.ignorableWhitespace(normalizedContent);
675686
}
676687
this.characterRun = '';
688+
this.characterRunPreservedCR.clear();
689+
}
690+
}
691+
692+
/**
 * Converts the line breaks of a pending character run to LF.
 *
 * When no index is recorded in `characterRunPreservedCR`, the whole string is
 * handed to `XMLUtils.normalizeLines`. Otherwise the string is walked
 * character by character: a CR whose index is recorded is copied unchanged,
 * and every other CR is emitted as a single LF, swallowing a directly
 * following LF unless that LF's index is itself recorded.
 *
 * @param content the accumulated character run to normalize
 * @returns the content with its non-preserved line breaks rewritten as LF
 */
private normalizeCharacterRun(content: string): string {
    const preserved: Set<number> = this.characterRunPreservedCR;
    if (preserved.size === 0) {
        return XMLUtils.normalizeLines(content);
    }

    const pieces: string[] = [];
    let pos = 0;
    while (pos < content.length) {
        const current: string = content.charAt(pos);
        if (current !== '\r' || preserved.has(pos)) {
            // Ordinary character, or a CR that must survive as-is.
            pieces.push(current);
            pos++;
            continue;
        }

        // A CR that is not preserved always yields exactly one LF.
        pieces.push('\n');
        const next: number = pos + 1;
        const swallowLineFeed: boolean =
            next < content.length && content.charAt(next) === '\n' && !preserved.has(next);
        pos += swallowLineFeed ? 2 : 1;
    }
    return pieces.join('');
}
713+
714+
/**
 * Appends text to `characterRun` and records the absolute index of every CR
 * it contains in `characterRunPreservedCR`.
 *
 * @param text the text to append; empty input is ignored
 * @param options `decodePredefined` (default `true`) selects whether the text
 *        is first passed through `decodePredefinedEntities`
 */
private appendToCharacterRun(text: string, options?: { decodePredefined?: boolean }): void {
    if (!text) {
        return;
    }

    const requested: boolean | undefined = options?.decodePredefined;
    const shouldDecode: boolean = requested === undefined ? true : requested;

    let chunk: string = text;
    if (shouldDecode) {
        chunk = this.decodePredefinedEntities(text);
    }
    if (!chunk) {
        return;
    }

    // Indices are stored relative to the whole run, so capture its length before appending.
    const offset: number = this.characterRun.length;
    this.characterRun += chunk;

    let crAt: number = chunk.indexOf('\r');
    while (crAt !== -1) {
        this.characterRunPreservedCR.add(offset + crAt);
        crAt = chunk.indexOf('\r', crAt + 1);
    }
}
735+
736+
/**
 * Replaces the five predefined entity references (`&lt;`, `&gt;`, `&amp;`,
 * `&quot;`, `&apos;`) with the characters they stand for.
 *
 * Decoding is done in a single left-to-right pass so the output of one
 * replacement is never scanned again. Chained replaces that decode `&amp;`
 * before `&quot;` / `&apos;` would turn `&amp;quot;` into `"` instead of the
 * literal text `&quot;`.
 *
 * @param text the text that may contain predefined entity references
 * @returns the text with each predefined reference decoded exactly once
 */
private decodePredefinedEntities(text: string): string {
    if (text.indexOf('&') === -1) {
        return text;
    }

    return text.replace(/&(lt|gt|amp|quot|apos);/g, (_reference: string, name: string): string => {
        switch (name) {
            case 'lt': return '<';
            case 'gt': return '>';
            case 'amp': return '&';
            case 'quot': return '"';
            default: return "'"; // 'apos' is the only alternative left in the pattern
        }
    });
}
679748

680749
private isXmlSpacePreserve(): boolean {
@@ -1151,7 +1220,9 @@ export class SAXParser {
11511220
valueStart++;
11521221
}
11531222
// Skip opening quote
1223+
let quoteChar: string | undefined;
11541224
if (valueStart < pair.length && (pair[valueStart] === '"' || pair[valueStart] === "'")) {
1225+
quoteChar = pair[valueStart];
11551226
valueStart++;
11561227
}
11571228
// Find end (skip closing quote)
@@ -1168,8 +1239,18 @@ export class SAXParser {
11681239
// Check for unescaped ampersands (not part of valid entity references)
11691240
this.validateAttributeValueWellFormedness(value);
11701241
}
1242+
1243+
value = this.normalizeLiteralAttributeLineBreaks(value, lexicalValue);
1244+
1245+
if (quoteChar === undefined && quotedValue.length > 0) {
1246+
quoteChar = quotedValue.charAt(0);
1247+
}
1248+
if (quoteChar !== undefined) {
1249+
this.validateAttributeLexicalCharacters(value, quoteChar);
1250+
}
11711251
// Expand entity references in attribute values
11721252
value = this.expandEntities(value);
1253+
this.validateAttributeCharacterSet(value);
11731254

11741255
// Well-formedness check: detect duplicate attributes
11751256
if (map.has(name)) {
@@ -1183,6 +1264,27 @@ export class SAXParser {
11831264
return map;
11841265
}
11851266

1267+
/**
 * Normalizes the line breaks that appear literally in an attribute value.
 *
 * Every CRLF pair and every lone CR in `value` becomes a single LF, as XML 1.0
 * section 2.11 requires. Both forms are handled in one pass, so a value that
 * mixes CRLF pairs with lone CRs is fully normalized. Character references
 * such as `&#13;` are plain text here and are not affected by the replacement.
 *
 * @param value the attribute value to normalize
 * @param lexicalValue the attribute text as written in the source; `value` is
 *        returned unchanged when this contains no CR
 * @returns the value with literal CRLF / CR sequences replaced by LF
 */
private normalizeLiteralAttributeLineBreaks(value: string, lexicalValue: string): string {
    // Lone LF characters are already in normalized form, so only a CR triggers any work.
    if (lexicalValue.indexOf('\r') === -1) {
        return value;
    }

    return value.replace(/\r\n?/g, '\n');
}
1287+
11861288
private validateAttributeValueWellFormedness(value: string): void {
11871289
let i = 0;
11881290
while (i < value.length) {
@@ -1226,6 +1328,49 @@ export class SAXParser {
12261328
}
12271329
}
12281330
}
1331+
1332+
/**
 * Checks the raw text of an attribute value, one code point at a time.
 *
 * @param value the attribute value to inspect
 * @param quoteChar the quote character that delimits the value
 * @throws Error when the value contains a raw `<`, the delimiting quote
 *         character, or a code point rejected by `XMLUtils.isValidXml10Char`
 *         (when `xmlVersion` is `'1.0'`) or `XMLUtils.isValidXml11Char` (otherwise)
 */
private validateAttributeLexicalCharacters(value: string, quoteChar: string): void {
    // Array.from splits the string by code point, so surrogate pairs stay together.
    for (const symbol of Array.from(value)) {
        if (symbol === '<') {
            throw new Error("Well-formedness error: raw '<' is not allowed inside attribute values");
        }
        if (symbol === quoteChar) {
            throw new Error('Well-formedness error: attribute value contains unescaped ' + quoteChar);
        }

        const scalar: number = symbol.codePointAt(0)!;
        const permitted: boolean = this.xmlVersion === '1.0'
            ? XMLUtils.isValidXml10Char(scalar)
            : XMLUtils.isValidXml11Char(scalar);
        if (permitted) {
            continue;
        }

        const hex: string = scalar.toString(16).toUpperCase().padStart(4, '0');
        throw new Error('Invalid character in attribute value: U+' + hex + ' is not allowed in XML ' + this.xmlVersion);
    }
}
1357+
1358+
/**
 * Checks every code point of an attribute value against the character set of
 * the current XML version.
 *
 * @param value the attribute value to inspect
 * @throws Error when a code point is rejected by `XMLUtils.isValidXml10Char`
 *         (when `xmlVersion` is `'1.0'`) or `XMLUtils.isValidXml11Char` (otherwise)
 */
private validateAttributeCharacterSet(value: string): void {
    // Array.from splits the string by code point, so surrogate pairs stay together.
    for (const symbol of Array.from(value)) {
        const scalar: number = symbol.codePointAt(0)!;
        const permitted: boolean = this.xmlVersion === '1.0'
            ? XMLUtils.isValidXml10Char(scalar)
            : XMLUtils.isValidXml11Char(scalar);
        if (permitted) {
            continue;
        }

        const hex: string = scalar.toString(16).toUpperCase().padStart(4, '0');
        throw new Error('Invalid character in attribute value after entity expansion: U+' + hex + ' is not allowed in XML ' + this.xmlVersion);
    }
}
1373+
12291374
normalizeAndDefaultAttributes(elementName: string, attributesMap: Map<string, string>): AttributeNormalizationResult {
12301375
const attributeInfos: Map<string, AttributeInfo> = this.grammarHandler.getGrammar().getElementAttributes(elementName);
12311376

@@ -1280,7 +1425,7 @@ export class SAXParser {
12801425

12811426
normalizeAttributeByType(value: string, type: string): string {
12821427
if (type === 'CDATA') {
1283-
// For CDATA attributes, replace tabs, carriage returns, and line feeds with spaces while preserving other whitespace.
1428+
// For CDATA attributes, replace control whitespace characters with spaces per XML 1.0 normalization rules.
12841429
return value.replace(/[\t\r\n]/g, ' ');
12851430
} else {
12861431
// For non-CDATA attributes: normalize all whitespace and collapse
@@ -1569,6 +1714,36 @@ export class SAXParser {
15691714
let i: number = 0;
15701715

15711716
while (i < text.length) {
1717+
if (text.startsWith('<![CDATA[', i)) {
1718+
const end = text.indexOf(']]>', i);
1719+
if (end === -1) {
1720+
throw new Error('Malformed entity content: Unterminated CDATA section inside entity value');
1721+
}
1722+
result += text.substring(i, end + 3);
1723+
i = end + 3;
1724+
continue;
1725+
}
1726+
1727+
if (text.startsWith('<!--', i)) {
1728+
const end = text.indexOf('-->', i);
1729+
if (end === -1) {
1730+
throw new Error('Malformed entity content: Unterminated comment inside entity value');
1731+
}
1732+
result += text.substring(i, end + 3);
1733+
i = end + 3;
1734+
continue;
1735+
}
1736+
1737+
if (text.startsWith('<?', i)) {
1738+
const end = text.indexOf('?>', i);
1739+
if (end === -1) {
1740+
throw new Error('Malformed entity content: Unterminated processing instruction inside entity value');
1741+
}
1742+
result += text.substring(i, end + 2);
1743+
i = end + 2;
1744+
continue;
1745+
}
1746+
15721747
if (text.charAt(i) === '&') {
15731748
// Find the end of the entity reference
15741749
let endPos: number = text.indexOf(';', i);
@@ -1652,7 +1827,7 @@ export class SAXParser {
16521827

16531828
const flushText = () => {
16541829
if (textBuffer.length > 0) {
1655-
this.characterRun += textBuffer;
1830+
this.appendToCharacterRun(textBuffer, { decodePredefined: true });
16561831
textBuffer = '';
16571832
}
16581833
};
@@ -1667,7 +1842,7 @@ export class SAXParser {
16671842
}
16681843
const cdataContent = entityValue.substring(index + 9, endCdata);
16691844
this.contentHandler!.startCDATA();
1670-
this.characterRun += cdataContent;
1845+
this.appendToCharacterRun(cdataContent, { decodePredefined: false });
16711846
this.cleanCharacterRun();
16721847
this.contentHandler!.endCDATA();
16731848
index = endCdata + 3;

ts/XMLCanonicalizer.ts

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,16 @@ export class XMLCanonicalizer {
7575
const attributes: XMLAttribute[] = this.getSortedAttributes(element);
7676
for (const attr of attributes) {
7777
const lexicalValue: string | undefined = attr.getLexicalValue();
78-
if (lexicalValue !== undefined && (!attr.isSpecified() || lexicalValue.includes('&#'))) {
79-
result += ' ' + attr.getName() + '="' + lexicalValue + '"';
80-
} else {
81-
result += ' ' + attr.getName() + '="' + this.escapeAttributeValue(attr.getValue()) + '"';
78+
if (lexicalValue !== undefined) {
79+
const preserveLexical: boolean = attr.isSpecified()
80+
? this.shouldPreserveAttributeLexicalValue(lexicalValue)
81+
: this.shouldPreserveDefaultAttributeLexicalValue(lexicalValue);
82+
if (preserveLexical) {
83+
result += ' ' + attr.getName() + '="' + lexicalValue + '"';
84+
continue;
85+
}
8286
}
87+
result += ' ' + attr.getName() + '="' + this.escapeAttributeValue(attr.getValue()) + '"';
8388
}
8489

8590
result += '>';
@@ -147,19 +152,14 @@ export class XMLCanonicalizer {
147152
let result = '';
148153
for (let i = 0; i < text.length; i++) {
149154
const char = text.charAt(i);
150-
if (this.grammar) {
151-
const originalRef = this.grammar.consumeEntityReference(char);
152-
if (originalRef && this.shouldRestoreReference(originalRef)) {
153-
result += originalRef;
154-
continue;
155-
}
156-
}
157-
158155
switch (char) {
159156
case '&': result += '&amp;'; break;
160157
case '<': result += '&lt;'; break;
161158
case '>': result += '&gt;'; break;
159+
case '\t': result += '&#9;'; break;
160+
case '\n': result += '&#10;'; break;
162161
case '\r': result += '&#13;'; break;
162+
case '"': result += '&quot;'; break;
163163
default: result += char; break;
164164
}
165165
}
@@ -184,13 +184,12 @@ export class XMLCanonicalizer {
184184
return result;
185185
}
186186

187-
private shouldRestoreReference(reference: string): boolean {
188-
if (reference.startsWith('&#')) {
189-
return false;
190-
}
191-
if (reference === '&apos;') {
192-
return false;
193-
}
194-
return true;
187+
/**
 * Tells whether a lexical attribute value contains the sequence `&#`.
 *
 * @param lexicalValue the attribute value as written in the source
 * @returns `true` when `&#` occurs anywhere in the value
 */
private shouldPreserveAttributeLexicalValue(lexicalValue: string): boolean {
    const firstCharacterReference: number = lexicalValue.indexOf('&#');
    return firstCharacterReference !== -1;
}
190+
191+
/**
 * Tells whether a lexical attribute value contains an `&` or a `%` character.
 *
 * @param lexicalValue the attribute value as written in the source
 * @returns `true` when the value contains `&` (which also covers `&#`) or `%`
 */
private shouldPreserveDefaultAttributeLexicalValue(lexicalValue: string): boolean {
    // A single character class covers entity references (&...; and &#...;) and parameter references (%...;).
    return /[&%]/.test(lexicalValue);
}
196195
}

0 commit comments

Comments
 (0)