Skip to content

Commit 40bcbb0

Browse files
committed
tmp
1 parent e80967c commit 40bcbb0

4 files changed

Lines changed: 129 additions & 80 deletions

File tree

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@
8282
<dependency>
8383
<groupId>org.jsoup</groupId>
8484
<artifactId>jsoup</artifactId>
85-
<version>1.21.1</version>
85+
<version>1.22.2</version>
8686
<scope>test</scope>
8787
</dependency>
8888
<!--

src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java

Lines changed: 80 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,11 @@
1818
import java.io.DataInputStream;
1919
import java.io.IOException;
2020
import java.util.ArrayList;
21+
import java.util.HashMap;
22+
import java.util.LinkedList;
2123
import java.util.List;
2224
import java.util.Map;
25+
import java.util.Queue;
2326
import java.util.TreeMap;
2427
import java.util.zip.GZIPInputStream;
2528

@@ -36,10 +39,81 @@
3639
*/
3740
final class EntitiesPrefix {
3841

39-
static final EntitiesPrefix ENTITIES = prepare();
42+
static {
43+
prepare();
44+
}
4045

41-
private static EntitiesPrefix prepare() {
42-
EntitiesPrefix e = new EntitiesPrefix(null);
46+
private static void flatten(EntitiesPrefix root) {
47+
List<EntitiesPrefix> nodes = new ArrayList<>();
48+
Map<EntitiesPrefix, Integer> nodeToIndex = new HashMap<>();
49+
Queue<EntitiesPrefix> queue = new LinkedList<>();
50+
queue.add(root);
51+
nodeToIndex.put(root, 0);
52+
53+
while (!queue.isEmpty()) {
54+
EntitiesPrefix node = queue.poll();
55+
nodes.add(node);
56+
if (node.childsCompacted != null) {
57+
for (EntitiesPrefix child : node.childsCompacted) {
58+
if (child != null && !nodeToIndex.containsKey(child)) {
59+
nodeToIndex.put(child, nodes.size() + queue.size() + 1);
60+
queue.add(child);
61+
}
62+
}
63+
}
64+
}
65+
// re-indexing to ensure contiguous indices in BFS order
66+
nodeToIndex.clear();
67+
for (int i = 0; i < nodes.size(); i++) {
68+
nodeToIndex.put(nodes.get(i), i);
69+
}
70+
71+
int nodeCount = nodes.size();
72+
NODE_CHAR = new char[nodeCount];
73+
NODE_CHILDREN_BASE = new int[nodeCount];
74+
NODE_MIN_CHILD_CHAR = new char[nodeCount];
75+
NODE_CHILDREN_COUNT = new int[nodeCount];
76+
NODE_RESULTS = new char[nodeCount][];
77+
78+
List<Integer> childPointersList = new ArrayList<>();
79+
80+
for (int i = 0; i < nodeCount; i++) {
81+
EntitiesPrefix node = nodes.get(i);
82+
NODE_CHAR[i] = node.c;
83+
NODE_RESULTS[i] = node.chars;
84+
85+
if (node.childsCompacted != null) {
86+
NODE_CHILDREN_BASE[i] = childPointersList.size();
87+
NODE_MIN_CHILD_CHAR[i] = (char) node.offset;
88+
NODE_CHILDREN_COUNT[i] = node.childsCompacted.length;
89+
for (EntitiesPrefix child : node.childsCompacted) {
90+
if (child != null) {
91+
childPointersList.add(nodeToIndex.get(child));
92+
} else {
93+
childPointersList.add(-1);
94+
}
95+
}
96+
} else {
97+
NODE_CHILDREN_BASE[i] = -1;
98+
NODE_CHILDREN_COUNT[i] = 0;
99+
}
100+
}
101+
102+
CHILD_POINTERS = new int[childPointersList.size()];
103+
for (int i = 0; i < childPointersList.size(); i++) {
104+
CHILD_POINTERS[i] = childPointersList.get(i);
105+
}
106+
}
107+
108+
// Flattened trie tables, filled in by flatten(); every NODE_* array is indexed by node index.

// Character stored on each node (copied from the node's 'c' field).
static char[] NODE_CHAR;
109+
// Index in CHILD_POINTERS of the node's first child slot, or -1 when the node has no child table.
static int[] NODE_CHILDREN_BASE;
110+
// The node's 'offset' cast to char; subtracted from a looked-up character to get its slot in the child table.
static char[] NODE_MIN_CHILD_CHAR;
111+
// Number of slots in the node's child table (0 when it has none).
static int[] NODE_CHILDREN_COUNT;
112+
// The node's 'chars' value; parseEntity treats a non-null entry as a complete entity match.
static char[][] NODE_RESULTS;
113+
// Concatenated child tables: each slot holds a child node index, or -1 for an empty slot.
static int[] CHILD_POINTERS;
114+
115+
private static void prepare() {
116+
EntitiesPrefix e = new EntitiesPrefix();
43117
try (DataInputStream dais = new DataInputStream(new GZIPInputStream(EntitiesPrefix.class.getResourceAsStream("/ch/digitalfondue/jfiveparse/entities-with-1-2-codepoint")))) {
44118

45119
// number of entities with only one codepoint
@@ -53,7 +127,7 @@ private static EntitiesPrefix prepare() {
53127
}
54128

55129
e.compact();
56-
return e;
130+
flatten(e);
57131
} catch (IOException ioe) {
58132
throw new IllegalStateException(ioe);
59133
}
@@ -71,10 +145,8 @@ private static EntitiesPrefix prepare() {
71145
private int offset;
72146
private EntitiesPrefix[] childsCompacted;
73147
//
74-
private final EntitiesPrefix parent;
75148

76-
EntitiesPrefix(EntitiesPrefix prefix) {
77-
this.parent = prefix;
149+
EntitiesPrefix() {
78150
}
79151

80152
void compact() {
@@ -97,31 +169,6 @@ void compact() {
97169
}
98170
}
99171

100-
String getString() {
101-
StringBuilder sb = new StringBuilder();
102-
103-
sb.append(c);
104-
105-
var p = parent;
106-
while (p != null) {
107-
sb.append(p.c);
108-
p = p.parent;
109-
}
110-
111-
return sb.reverse().toString();
112-
}
113-
114-
EntitiesPrefix getMaybeCompleteParent() {
115-
var p = parent;
116-
while (p != null) {
117-
if (p.isComplete()) {
118-
return p;
119-
}
120-
p = p.parent;
121-
}
122-
return null;
123-
}
124-
125172
void addWord(String s, int[] codepoints) {
126173
c = s.charAt(0);
127174
if (s.length() == 1) {
@@ -144,26 +191,9 @@ void addWord(String s, int[] codepoints) {
144191
}
145192

146193
if (!tmpChilds.containsKey(nextVal)) {
147-
tmpChilds.put(nextVal, new EntitiesPrefix(this));
194+
tmpChilds.put(nextVal, new EntitiesPrefix());
148195
}
149196
tmpChilds.get(nextVal).addWord(s.substring(1), codepoints);
150197
}
151198
}
152-
153-
boolean isComplete() {
154-
return chars != null;
155-
}
156-
157-
EntitiesPrefix getNode(char c) {
158-
if (childsCompacted != null) {
159-
int idx = c - offset;
160-
if (idx < 0 || idx >= childsCompacted.length) {
161-
return null;
162-
} else {
163-
return childsCompacted[idx];
164-
}
165-
} else {
166-
return null;
167-
}
168-
}
169199
}

src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java

Lines changed: 47 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1721,36 +1721,57 @@ static char[] consumeCharacterReference(int additionalCharacter, boolean inAttri
17211721

17221722
private static char[] parseEntity(boolean inAttribute, ProcessedInputStream processedInputStream, Tokenizer tokenHandler, int chr) {
17231723
int matchedCount = 0;
1724-
var currentPrefix = EntitiesPrefix.ENTITIES;
1724+
int currentNodeIdx = 0;
1725+
int lastCompleteNodeIdx = -1;
1726+
int lastCompleteMatchedCount = 0;
1727+
17251728
ResizableCharBuilder tentativelyMatched = new ResizableCharBuilder();
17261729

17271730
for (;;) {
1731+
if (EntitiesPrefix.NODE_RESULTS[currentNodeIdx] != null) {
1732+
lastCompleteNodeIdx = currentNodeIdx;
1733+
lastCompleteMatchedCount = matchedCount;
1734+
}
1735+
17281736
int next = processedInputStream.peekNextInputCharacter(matchedCount + 1);
17291737
if (next != Characters.EOF) {
17301738
tentativelyMatched.append((char) next);
1731-
}
1732-
var tmpPrefix = currentPrefix.getNode((char) next);
1733-
if (tmpPrefix != null) {
1734-
currentPrefix = tmpPrefix;
1735-
matchedCount++;
17361739
} else {
17371740
break;
17381741
}
1739-
}
17401742

1741-
if (!currentPrefix.isComplete()) {
1742-
var maybeCompleteParent = currentPrefix.getMaybeCompleteParent();
1743-
if (maybeCompleteParent != null) {
1744-
currentPrefix = maybeCompleteParent;
1743+
char nextChar = (char) next;
1744+
int childrenBase = EntitiesPrefix.NODE_CHILDREN_BASE[currentNodeIdx];
1745+
boolean found = false;
1746+
if (childrenBase != -1) {
1747+
int offset = nextChar - EntitiesPrefix.NODE_MIN_CHILD_CHAR[currentNodeIdx];
1748+
if (offset >= 0 && offset < EntitiesPrefix.NODE_CHILDREN_COUNT[currentNodeIdx]) {
1749+
int nextNodeIdx = EntitiesPrefix.CHILD_POINTERS[childrenBase + offset];
1750+
if (nextNodeIdx != -1) {
1751+
currentNodeIdx = nextNodeIdx;
1752+
matchedCount++;
1753+
found = true;
1754+
}
1755+
}
1756+
}
1757+
1758+
if (!found) {
1759+
break;
17451760
}
17461761
}
17471762

1748-
if (currentPrefix.isComplete()) {
1749-
String entityMatched = currentPrefix.getString();
1763+
if (EntitiesPrefix.NODE_RESULTS[currentNodeIdx] == null) {
1764+
currentNodeIdx = lastCompleteNodeIdx;
1765+
matchedCount = lastCompleteMatchedCount;
1766+
}
1767+
1768+
if (currentNodeIdx != -1 && EntitiesPrefix.NODE_RESULTS[currentNodeIdx] != null) {
1769+
char nodeChar = EntitiesPrefix.NODE_CHAR[currentNodeIdx];
1770+
char[] result = EntitiesPrefix.NODE_RESULTS[currentNodeIdx];
17501771
if (inAttribute) {
1751-
return handleCompleteEntityInAttribute(processedInputStream, tokenHandler, currentPrefix, entityMatched);
1772+
return handleCompleteEntityInAttribute(processedInputStream, tokenHandler, nodeChar, matchedCount, result);
17521773
} else {
1753-
return handleCompleteEntityNotInAttribute(processedInputStream, tokenHandler, currentPrefix, entityMatched);
1774+
return handleCompleteEntityNotInAttribute(processedInputStream, tokenHandler, nodeChar, matchedCount, result);
17541775
}
17551776
} else {
17561777
// handleUncompleteEntity
@@ -1766,7 +1787,7 @@ private static char[] parseEntity(boolean inAttribute, ProcessedInputStream proc
17661787
boolean emitParseError = tentativelyMatchedLength > 1 && tentativelyMatched.at(tentativelyMatchedLength - 1) == Characters.SEMICOLON;
17671788
if (emitParseError) {
17681789
for (int i = 0; emitParseError && i < tentativelyMatchedLength - 1; i++) {
1769-
emitParseError = Common.isAlphaNumericASCII(chr);
1790+
emitParseError = Common.isAlphaNumericASCII(tentativelyMatched.at(i));
17701791
}
17711792
}
17721793

@@ -1777,32 +1798,30 @@ private static char[] parseEntity(boolean inAttribute, ProcessedInputStream proc
17771798
}
17781799
}
17791800

1780-
1781-
1782-
private static char[] handleCompleteEntityNotInAttribute(ProcessedInputStream processedInputStream, Tokenizer tokenHandler, EntitiesPrefix currentPrefix, String entityMatched) {
1783-
if ((currentPrefix.c) != Characters.SEMICOLON) {
1801+
private static char[] handleCompleteEntityNotInAttribute(ProcessedInputStream processedInputStream, Tokenizer tokenHandler, char nodeChar, int matchedCount, char[] chars) {
1802+
if (nodeChar != Characters.SEMICOLON) {
17841803
tokenHandler.emitParseError();
17851804
}
17861805

1787-
processedInputStream.consume(entityMatched.length() - 1);
1788-
return currentPrefix.chars;
1806+
processedInputStream.consume(matchedCount);
1807+
return chars;
17891808
}
17901809

1791-
private static char[] handleCompleteEntityInAttribute(ProcessedInputStream processedInputStream, Tokenizer tokenHandler, EntitiesPrefix currentPrefix, String entityMatched) {
1792-
if (currentPrefix.c != Characters.SEMICOLON) {
1793-
int nextCharacterAfterMatchedEntity = processedInputStream.peekNextInputCharacter(entityMatched.length());
1810+
private static char[] handleCompleteEntityInAttribute(ProcessedInputStream processedInputStream, Tokenizer tokenHandler, char nodeChar, int matchedCount, char[] chars) {
1811+
if (nodeChar != Characters.SEMICOLON) {
1812+
int nextCharacterAfterMatchedEntity = processedInputStream.peekNextInputCharacter(matchedCount + 1);
17941813
if (Common.isAlphaNumericASCII(nextCharacterAfterMatchedEntity)) {
17951814
return null;
17961815
} else if (Characters.EQUALS_SIGN == nextCharacterAfterMatchedEntity) {
17971816
tokenHandler.emitParseError();
17981817
return null;
17991818
} else {
1800-
return handleCompleteEntityNotInAttribute(processedInputStream, tokenHandler, currentPrefix, entityMatched);
1819+
return handleCompleteEntityNotInAttribute(processedInputStream, tokenHandler, nodeChar, matchedCount, chars);
18011820
}
18021821

18031822
} else {
1804-
processedInputStream.consume(entityMatched.length() - 1);
1805-
return currentPrefix.chars;
1823+
processedInputStream.consume(matchedCount);
1824+
return chars;
18061825
}
18071826
}
18081827

src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public static void main(String[] args) throws IOException {
4343
String json = Files.readString(Paths.get("src/test/resources/entities.json"));
4444
Map<String, EntityValues> m = new GsonBuilder().create().fromJson(json, type);
4545

46-
EntitiesPrefix p = new EntitiesPrefix(null);
46+
EntitiesPrefix p = new EntitiesPrefix();
4747

4848
ByteArrayOutputStream baosOneCodePoint = new ByteArrayOutputStream();
4949
GZIPOutputStream osOneCodePoint = new GZIPOutputStream(baosOneCodePoint);

0 commit comments

Comments
 (0)