Skip to content

Commit 004cac3

Browse files
committed
pre-process string input
1 parent d5570ad commit 004cac3

2 files changed

Lines changed: 47 additions & 34 deletions

File tree

src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java

Lines changed: 39 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,13 @@
1717

1818
import java.io.IOException;
1919
import java.io.Reader;
20+
import java.util.Arrays;
2021

2122
/**
22-
* Even though the html5 specification is working with codepoints, this input
23-
* stream will only emit chars and "-1".
24-
*
25-
* This has some interesting consequences that we will need to fully explore:
26-
* <ul>
27-
* <li>Character position is not the same as the current position
28-
* <li>other unknown issues??
29-
* </ul>
23+
* Wrapped and abstracted input. Can most likely be optimized.
3024
*/
3125
abstract class ProcessedInputStream {
3226

33-
private boolean crFound;
3427
protected final ResizableIntBuffer buffer = new ResizableIntBuffer();
3528

3629
protected abstract int read();
@@ -40,22 +33,40 @@ static class StringProcessedInputStream extends ProcessedInputStream {
4033
protected final char[] input;
4134

4235
StringProcessedInputStream(String input) {
43-
this.input = input.toCharArray();
36+
this.input = normalize(input);
37+
}
38+
39+
private static char[] normalize(String s) {
40+
char[] arr = s.toCharArray();
41+
int n = arr.length;
42+
int j = 0;
43+
for (int i = 0; i < n; i++) {
44+
char c = arr[i];
45+
if (c == '\r') {
46+
arr[j++] = '\n';
47+
if (i + 1 < n && arr[i + 1] == '\n') {
48+
i++;
49+
}
50+
} else {
51+
arr[j++] = c;
52+
}
53+
}
54+
return j == n ? arr : Arrays.copyOf(arr, j);
4455
}
4556

4657
@Override
4758
protected int read() {
48-
try {
59+
if (pos < input.length) {
4960
return input[pos++];
50-
} catch (IndexOutOfBoundsException s) {
51-
return -1;
5261
}
62+
return -1;
5363
}
5464
}
5565

5666
static final class ReaderProcessedInputStream extends ProcessedInputStream {
5767

5868
private final Reader reader;
69+
private boolean crFound;
5970

6071
ReaderProcessedInputStream(Reader reader) {
6172
this.reader = reader;
@@ -64,37 +75,31 @@ static final class ReaderProcessedInputStream extends ProcessedInputStream {
6475
@Override
6576
protected int read() {
6677
try {
67-
return reader.read();
78+
int chr = reader.read();
79+
if (crFound) {
80+
crFound = false;
81+
if (chr == Characters.LF) {
82+
chr = reader.read();
83+
}
84+
}
85+
86+
if (chr == Characters.CR) {
87+
crFound = true;
88+
chr = Characters.LF;
89+
}
90+
return chr;
6891
} catch (IOException ioe) {
6992
throw new ParserException(ioe);
7093
}
7194
}
7295
}
7396

7497
//
75-
private int readWithCRHandling() {
76-
int chr = read();
77-
if (crFound) {
78-
//chr = handleCrFoundInternal(chr);
79-
crFound = false;
80-
if (chr == Characters.LF) {
81-
chr = read();
82-
}
83-
}
84-
85-
if (chr == Characters.CR) {
86-
// handleChrIsCR
87-
crFound = true;
88-
chr = Characters.LF;
89-
}
90-
return chr;
91-
}
92-
9398
int peekNextInputCharacter(int offset) {
9499
if (buffer.length() < offset) {
95100
// fill buffer
96101
for (int i = buffer.length(); i < offset; i++) {
97-
buffer.add(readWithCRHandling());
102+
buffer.add(read());
98103
}
99104
}
100105
return buffer.getCharAt(offset);
@@ -111,7 +116,7 @@ int getNextInputCharacterAndConsume() {
111116
}
112117

113118
int consume() {
114-
return buffer.isEmpty ? readWithCRHandling() : buffer.removeFirst();
119+
return buffer.isEmpty ? read() : buffer.removeFirst();
115120
}
116121

117122
void reconsume(int chr) {

src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,5 +103,13 @@ char[] copyBackingCharArray() {
103103
return Arrays.copyOf(buff, pos);
104104
}
105105

106+
void append(char[] c, int offset, int length) {
107+
if (pos + length >= buff.length) {
108+
buff = Arrays.copyOf(buff, Math.max(pos + length, buff.length * 2 + 2));
109+
}
110+
System.arraycopy(c, offset, buff, pos, length);
111+
pos += length;
112+
}
113+
106114

107115
}

0 commit comments

Comments
 (0)