@@ -60,6 +60,7 @@ export class SAXParser {
6060 private namespaceStack : Array < Map < string , string > > ;
6161 ignoreGrammars : boolean = false ;
6262 private lastParsedAttributeLexical : Map < string , string > = new Map < string , string > ( ) ;
63+ private characterRunPreservedCR : Set < number > ;
6364
6465 static readonly MIN_BUFFER_SIZE : number = 2048 ;
6566
@@ -75,6 +76,7 @@ export class SAXParser {
7576 this . grammarHandler = new GrammarHandler ( ) ;
7677 this . namespaceMap = new Map < string , string > ( ) ;
7778 this . namespaceStack = [ ] ;
79+ this . characterRunPreservedCR = new Set < number > ( ) ;
7880 this . resetNamespaceContext ( ) ;
7981 }
8082
@@ -232,6 +234,11 @@ export class SAXParser {
232234 continue ;
233235 }
234236 if ( this . lookingAt ( '<' ) ) {
237+ if ( this . rootParsed && this . elementStack === 0 ) {
238+ // After the document element closes, only comments, processing instructions,
239+ // and whitespace are permitted. Any additional element markup is ill-formed.
240+ throw new Error ( 'Malformed XML document: additional element found after the root element' ) ;
241+ }
235242 this . startElement ( ) ;
236243 continue ;
237244 }
@@ -421,9 +428,10 @@ export class SAXParser {
421428 if ( entityValue . length === 1 ) {
422429 grammar . addEntityReferenceUsage ( '&' + name + ';' , entityValue ) ;
423430 }
424- // Fully expand the entity replacement text (custom and character references)
425- const expandedValue : string = this . expandEntities ( entityValue ) ;
426- this . handleEntityContent ( expandedValue ) ;
431+ // Expand custom entities first, then numeric character references while preserving predefined entities
432+ const expandedCustom : string = this . expandCustomEntities ( entityValue ) ;
433+ const expandedCharacters : string = this . expandCharacterReferences ( expandedCustom ) ;
434+ this . handleEntityContent ( expandedCharacters ) ;
427435 }
428436 } else {
429437 // Entity not found - handle as skipped entity
@@ -642,39 +650,100 @@ export class SAXParser {
642650 }
643651
/**
 * Flushes the buffered character run to the content handler and resets the buffer.
 *
 * The buffered text is line-ending-normalized via normalizeCharacterRun and then
 * reported either as significant character data or as ignorable whitespace:
 *  - before the root element has been parsed (prolog), or once the element stack
 *    is empty again (after the document element), it is always ignorable whitespace;
 *  - inside an element it is character data when xml:space is "preserve" or the
 *    run contains any non-whitespace; otherwise it is ignorable whitespace.
 *
 * An empty buffer only clears the preserved-CR bookkeeping and reports nothing.
 */
cleanCharacterRun(): void {
    const pending: string = this.characterRun;
    if (pending === '') {
        // Nothing to report; still drop any stale preserved-CR positions.
        this.characterRunPreservedCR.clear();
        return;
    }

    // No entity expansion happens here: per the parser's design, parseEntityReference
    // has already expanded references before their text reached the buffer.
    // Normalize line endings per XML 1.0 spec section 2.11.
    const normalized: string = this.normalizeCharacterRun(pending);

    // Text is significant only while inside an element, and then only when
    // whitespace is preserved or the run is not purely whitespace.
    // The whitespace test deliberately looks at the raw (un-normalized) run.
    const significant: boolean = this.rootParsed
        && this.elementStack !== 0
        && (this.isXmlSpacePreserve() || !this.isWhitespaceOnly(pending));

    if (significant) {
        this.contentHandler!.characters(normalized);
    } else {
        this.contentHandler!.ignorableWhitespace(normalized);
    }

    // Reset the buffer together with the CR positions that index into it.
    this.characterRun = '';
    this.characterRunPreservedCR.clear();
}
691+
/**
 * Normalizes line endings in a character run while leaving "preserved" carriage
 * returns untouched.
 *
 * Positions recorded in characterRunPreservedCR are indices into the run whose CR
 * must survive normalization. When no positions are recorded the work is delegated
 * to XMLUtils.normalizeLines. Otherwise every unprotected CR becomes LF, and a
 * directly following LF is folded into it.
 *
 * @param content the raw character run
 * @returns the run with unprotected CR and CRLF sequences replaced by LF
 */
private normalizeCharacterRun(content: string): string {
    const protectedCRs: Set<number> = this.characterRunPreservedCR;
    if (protectedCRs.size === 0) {
        return XMLUtils.normalizeLines(content);
    }

    const pieces: string[] = [];
    let pos = 0;
    while (pos < content.length) {
        const current: string = content.charAt(pos);

        // Anything that is not an unprotected CR is copied verbatim.
        if (current !== '\r' || protectedCRs.has(pos)) {
            pieces.push(current);
            pos++;
            continue;
        }

        // Unprotected CR: emit a single LF and swallow an immediately following LF.
        pieces.push('\n');
        const next: number = pos + 1;
        const swallowLineFeed: boolean = next < content.length
            && content.charAt(next) === '\n'
            && !protectedCRs.has(next);
        pos += swallowLineFeed ? 2 : 1;
    }
    return pieces.join('');
}
713+
/**
 * Appends text to the pending character run and records the position of every
 * carriage return it contributes, so that normalizeCharacterRun leaves those CRs
 * as they are.
 *
 * @param text    text to append; empty input is ignored
 * @param options decodePredefined (default true) controls whether the text is
 *                passed through decodePredefinedEntities before being appended
 */
private appendToCharacterRun(text: string, options?: { decodePredefined?: boolean }): void {
    if (!text) {
        return;
    }

    const requested: boolean | undefined = options?.decodePredefined;
    const shouldDecode: boolean = requested === undefined ? true : requested;
    const chunk: string = shouldDecode ? this.decodePredefinedEntities(text) : text;
    if (!chunk) {
        return;
    }

    // Positions are absolute indices into characterRun, hence the offset.
    const offset: number = this.characterRun.length;
    this.characterRun += chunk;

    let crAt: number = chunk.indexOf('\r');
    while (crAt !== -1) {
        this.characterRunPreservedCR.add(offset + crAt);
        crAt = chunk.indexOf('\r', crAt + 1);
    }
}
735+
/**
 * Replaces references to the five predefined XML entities
 * (&lt; &gt; &amp; &quot; &apos;) with the characters they stand for.
 *
 * Decoding is done in a single pass so that the output of one replacement is
 * never re-scanned. Chaining separate replacements would decode "&amp;quot;"
 * to "&quot;" and then again to '"', although the correct result is the
 * literal text "&quot;".
 *
 * @param text text that may contain predefined entity references
 * @returns the text with each predefined entity reference decoded exactly once
 */
private decodePredefinedEntities(text: string): string {
    // Fast path: no ampersand means no reference to decode.
    if (text.indexOf('&') === -1) {
        return text;
    }

    return text.replace(/&(lt|gt|amp|quot|apos);/g, (_reference: string, name: string): string => {
        switch (name) {
            case 'lt':
                return '<';
            case 'gt':
                return '>';
            case 'amp':
                return '&';
            case 'quot':
                return '"';
            default:
                // Only 'apos' remains, given the alternatives in the pattern.
                return "'";
        }
    });
}
679748
680749 private isXmlSpacePreserve ( ) : boolean {
@@ -1151,7 +1220,9 @@ export class SAXParser {
11511220 valueStart ++ ;
11521221 }
11531222 // Skip opening quote
1223+ let quoteChar : string | undefined ;
11541224 if ( valueStart < pair . length && ( pair [ valueStart ] === '"' || pair [ valueStart ] === "'" ) ) {
1225+ quoteChar = pair [ valueStart ] ;
11551226 valueStart ++ ;
11561227 }
11571228 // Find end (skip closing quote)
@@ -1168,8 +1239,18 @@ export class SAXParser {
11681239 // Check for unescaped ampersands (not part of valid entity references)
11691240 this . validateAttributeValueWellFormedness ( value ) ;
11701241 }
1242+
1243+ value = this . normalizeLiteralAttributeLineBreaks ( value , lexicalValue ) ;
1244+
1245+ if ( quoteChar === undefined && quotedValue . length > 0 ) {
1246+ quoteChar = quotedValue . charAt ( 0 ) ;
1247+ }
1248+ if ( quoteChar !== undefined ) {
1249+ this . validateAttributeLexicalCharacters ( value , quoteChar ) ;
1250+ }
11711251 // Expand entity references in attribute values
11721252 value = this . expandEntities ( value ) ;
1253+ this . validateAttributeCharacterSet ( value ) ;
11731254
11741255 // Well-formedness check: detect duplicate attributes
11751256 if ( map . has ( name ) ) {
@@ -1183,6 +1264,27 @@ export class SAXParser {
11831264 return map ;
11841265 }
11851266
/**
 * Normalizes line breaks that appear literally in an attribute value, following
 * XML 1.0 section 2.11: every CRLF pair and every CR not followed by LF becomes
 * a single LF.
 *
 * Only literal characters are affected. Character references such as &#13; are
 * still in their escaped form at this point, so they are left alone and can be
 * expanded later.
 *
 * CRLF pairs and lone CRs are handled in the same pass, so a value that mixes
 * both styles (for example "a\r\nb\rc") is fully normalized.
 *
 * @param value        the attribute value to normalize (before entity expansion)
 * @param lexicalValue the attribute text as written in the source; used to decide
 *                     whether any literal carriage return is present at all
 * @returns the value with literal CRLF / CR sequences replaced by LF
 */
private normalizeLiteralAttributeLineBreaks(value: string, lexicalValue: string): string {
    // Without a literal CR in the source there is nothing to translate;
    // literal LF characters are already in normalized form.
    if (lexicalValue.indexOf('\r') === -1) {
        return value;
    }

    // CR optionally followed by LF collapses to one LF.
    return value.replace(/\r\n?/g, '\n');
}
1287+
11861288 private validateAttributeValueWellFormedness ( value : string ) : void {
11871289 let i = 0 ;
11881290 while ( i < value . length ) {
@@ -1226,6 +1328,49 @@ export class SAXParser {
12261328 }
12271329 }
12281330 }
1331+
/**
 * Checks the literal (pre-expansion) text of an attribute value, one code point
 * at a time.
 *
 * @param value     the attribute value as written, without its surrounding quotes
 * @param quoteChar the quote character that delimits the value
 * @throws Error if the value contains a raw '<', the delimiting quote character,
 *         or a code point rejected by the validity check for the current XML version
 */
private validateAttributeLexicalCharacters(value: string, quoteChar: string): void {
    const isXml10: boolean = this.xmlVersion === '1.0';

    // Iterating a string yields whole code points, so surrogate pairs stay together.
    for (const symbol of value) {
        if (symbol === '<') {
            throw new Error(`Well-formedness error: raw '<' is not allowed inside attribute values`);
        }

        if (symbol === quoteChar) {
            throw new Error(`Well-formedness error: attribute value contains unescaped ${quoteChar}`);
        }

        const codePoint: number = symbol.codePointAt(0)!;
        const allowed: boolean = isXml10
            ? XMLUtils.isValidXml10Char(codePoint)
            : XMLUtils.isValidXml11Char(codePoint);
        if (!allowed) {
            const codeHex: string = codePoint.toString(16).toUpperCase().padStart(4, '0');
            throw new Error(`Invalid character in attribute value: U+${codeHex} is not allowed in XML ${this.xmlVersion}`);
        }
    }
}
1357+
/**
 * Checks that every code point of an attribute value is accepted by the validity
 * check for the current XML version. The error message indicates that this is
 * meant to run on the value after entity expansion.
 *
 * @param value the attribute value to check
 * @throws Error naming the first code point that is not allowed
 */
private validateAttributeCharacterSet(value: string): void {
    const isXml10: boolean = this.xmlVersion === '1.0';

    // Iterating a string yields whole code points, so surrogate pairs stay together.
    for (const symbol of value) {
        const codePoint: number = symbol.codePointAt(0)!;
        const allowed: boolean = isXml10
            ? XMLUtils.isValidXml10Char(codePoint)
            : XMLUtils.isValidXml11Char(codePoint);
        if (allowed) {
            continue;
        }

        const codeHex: string = codePoint.toString(16).toUpperCase().padStart(4, '0');
        throw new Error(`Invalid character in attribute value after entity expansion: U+${codeHex} is not allowed in XML ${this.xmlVersion}`);
    }
}
1373+
12291374 normalizeAndDefaultAttributes ( elementName : string , attributesMap : Map < string , string > ) : AttributeNormalizationResult {
12301375 const attributeInfos : Map < string , AttributeInfo > = this . grammarHandler . getGrammar ( ) . getElementAttributes ( elementName ) ;
12311376
@@ -1280,7 +1425,7 @@ export class SAXParser {
12801425
12811426 normalizeAttributeByType ( value : string , type : string ) : string {
12821427 if ( type === 'CDATA' ) {
1283- // For CDATA attributes, replace tabs, carriage returns, and line feeds with spaces while preserving other whitespace .
1428+ // For CDATA attributes, replace control whitespace characters with spaces per XML 1.0 normalization rules .
12841429 return value . replace ( / [ \t \r \n ] / g, ' ' ) ;
12851430 } else {
12861431 // For non-CDATA attributes: normalize all whitespace and collapse
@@ -1569,6 +1714,36 @@ export class SAXParser {
15691714 let i : number = 0 ;
15701715
15711716 while ( i < text . length ) {
1717+ if ( text . startsWith ( '<![CDATA[' , i ) ) {
1718+ const end = text . indexOf ( ']]>' , i ) ;
1719+ if ( end === - 1 ) {
1720+ throw new Error ( 'Malformed entity content: Unterminated CDATA section inside entity value' ) ;
1721+ }
1722+ result += text . substring ( i , end + 3 ) ;
1723+ i = end + 3 ;
1724+ continue ;
1725+ }
1726+
1727+ if ( text . startsWith ( '<!--' , i ) ) {
1728+ const end = text . indexOf ( '-->' , i ) ;
1729+ if ( end === - 1 ) {
1730+ throw new Error ( 'Malformed entity content: Unterminated comment inside entity value' ) ;
1731+ }
1732+ result += text . substring ( i , end + 3 ) ;
1733+ i = end + 3 ;
1734+ continue ;
1735+ }
1736+
1737+ if ( text . startsWith ( '<?' , i ) ) {
1738+ const end = text . indexOf ( '?>' , i ) ;
1739+ if ( end === - 1 ) {
1740+ throw new Error ( 'Malformed entity content: Unterminated processing instruction inside entity value' ) ;
1741+ }
1742+ result += text . substring ( i , end + 2 ) ;
1743+ i = end + 2 ;
1744+ continue ;
1745+ }
1746+
15721747 if ( text . charAt ( i ) === '&' ) {
15731748 // Find the end of the entity reference
15741749 let endPos : number = text . indexOf ( ';' , i ) ;
@@ -1652,7 +1827,7 @@ export class SAXParser {
16521827
16531828 const flushText = ( ) => {
16541829 if ( textBuffer . length > 0 ) {
1655- this . characterRun += textBuffer ;
1830+ this . appendToCharacterRun ( textBuffer , { decodePredefined : true } ) ;
16561831 textBuffer = '' ;
16571832 }
16581833 } ;
@@ -1667,7 +1842,7 @@ export class SAXParser {
16671842 }
16681843 const cdataContent = entityValue . substring ( index + 9 , endCdata ) ;
16691844 this . contentHandler ! . startCDATA ( ) ;
1670- this . characterRun += cdataContent ;
1845+ this . appendToCharacterRun ( cdataContent , { decodePredefined : false } ) ;
16711846 this . cleanCharacterRun ( ) ;
16721847 this . contentHandler ! . endCDATA ( ) ;
16731848 index = endCdata + 3 ;
0 commit comments