1717
1818import java .io .IOException ;
1919import java .io .Reader ;
20+ import java .util .Arrays ;
2021
2122/**
22- * Even though the html5 specification is working with codepoints, this input
23- * stream will only emit chars and "-1".
24- *
25- * This has some interesting consequences that we will need to fully explore:
26- * <ul>
27- * <li>Character position is not the same as the current position
28- * <li>other unknown issues??
29- * </ul>
23+ * Wrapped and abstracted input. Can most likely be optimized.
3024 */
3125abstract class ProcessedInputStream {
3226
33- private boolean crFound ;
3427 protected final ResizableIntBuffer buffer = new ResizableIntBuffer ();
3528
3629 protected abstract int read ();
@@ -40,22 +33,40 @@ static class StringProcessedInputStream extends ProcessedInputStream {
4033 protected final char [] input ;
4134
4235 StringProcessedInputStream (String input ) {
43- this .input = input .toCharArray ();
36+ this .input = normalize (input );
37+ }
38+
39+ private static char [] normalize (String s ) {
40+ char [] arr = s .toCharArray ();
41+ int n = arr .length ;
42+ int j = 0 ;
43+ for (int i = 0 ; i < n ; i ++) {
44+ char c = arr [i ];
45+ if (c == '\r' ) {
46+ arr [j ++] = '\n' ;
47+ if (i + 1 < n && arr [i + 1 ] == '\n' ) {
48+ i ++;
49+ }
50+ } else {
51+ arr [j ++] = c ;
52+ }
53+ }
54+ return j == n ? arr : Arrays .copyOf (arr , j );
4455 }
4556
4657 @ Override
4758 protected int read () {
48- try {
59+ if ( pos < input . length ) {
4960 return input [pos ++];
50- } catch (IndexOutOfBoundsException s ) {
51- return -1 ;
5261 }
62+ return -1 ;
5363 }
5464 }
5565
5666 static final class ReaderProcessedInputStream extends ProcessedInputStream {
5767
5868 private final Reader reader ;
69+ private boolean crFound ;
5970
6071 ReaderProcessedInputStream (Reader reader ) {
6172 this .reader = reader ;
@@ -64,37 +75,31 @@ static final class ReaderProcessedInputStream extends ProcessedInputStream {
6475 @ Override
6576 protected int read () {
6677 try {
67- return reader .read ();
78+ int chr = reader .read ();
79+ if (crFound ) {
80+ crFound = false ;
81+ if (chr == Characters .LF ) {
82+ chr = reader .read ();
83+ }
84+ }
85+
86+ if (chr == Characters .CR ) {
87+ crFound = true ;
88+ chr = Characters .LF ;
89+ }
90+ return chr ;
6891 } catch (IOException ioe ) {
6992 throw new ParserException (ioe );
7093 }
7194 }
7295 }
7396
7497 //
75- private int readWithCRHandling () {
76- int chr = read ();
77- if (crFound ) {
78- //chr = handleCrFoundInternal(chr);
79- crFound = false ;
80- if (chr == Characters .LF ) {
81- chr = read ();
82- }
83- }
84-
85- if (chr == Characters .CR ) {
86- // handleChrIsCR
87- crFound = true ;
88- chr = Characters .LF ;
89- }
90- return chr ;
91- }
92-
9398 int peekNextInputCharacter (int offset ) {
9499 if (buffer .length () < offset ) {
95100 // fill buffer
96101 for (int i = buffer .length (); i < offset ; i ++) {
97- buffer .add (readWithCRHandling ());
102+ buffer .add (read ());
98103 }
99104 }
100105 return buffer .getCharAt (offset );
@@ -111,7 +116,7 @@ int getNextInputCharacterAndConsume() {
111116 }
112117
113118 int consume () {
114- return buffer .isEmpty ? readWithCRHandling () : buffer .removeFirst ();
119+ return buffer .isEmpty ? read () : buffer .removeFirst ();
115120 }
116121
117122 void reconsume (int chr ) {
0 commit comments