Skip to content

Commit 3ce3fe1

Browse files
committed
introduce HTMLElementsProvider and HTMLElementsWithCache
1 parent f1ce618 commit 3ce3fe1

3 files changed

Lines changed: 219 additions & 19 deletions

File tree

src/main/java/org/htmlunit/cyberneko/HTMLConfiguration.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,14 +126,19 @@ public class HTMLConfiguration extends ParserConfigurationSettings implements XM
126126
/** Namespace binder. */
127127
private final NamespaceBinder namespaceBinder_ = new NamespaceBinder(this);
128128

129-
private final HTMLElements htmlElements_;
129+
private final HTMLElementsProvider htmlElements_;
130130

131131
/** Default constructor. */
132132
public HTMLConfiguration() {
133133
this(new HTMLElements());
134134
}
135135

136+
// for backward compatibility
136137
public HTMLConfiguration(final HTMLElements htmlElements) {
138+
this((HTMLElementsProvider) htmlElements);
139+
}
140+
141+
public HTMLConfiguration(final HTMLElementsProvider htmlElements) {
137142
htmlElements_ = htmlElements;
138143

139144
// add components
@@ -269,7 +274,7 @@ public XMLErrorHandler getErrorHandler() {
269274
/**
270275
* @return the configured {@link HTMLElementsProvider}
271276
*/
272-
public HTMLElements getHtmlElements() {
277+
public HTMLElementsProvider getHtmlElements() {
273278
return htmlElements_;
274279
}
275280

src/main/java/org/htmlunit/cyberneko/HTMLElements.java

Lines changed: 156 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,16 @@
2424

2525
/**
2626
* Collection of HTML element information.
27+
* This does not manage any state and therefore it is thread safe.
28+
* There is also an optimized but not thread-safe implementation
29+
* of the {@link HTMLElementsProvider} interface - {@link HTMLElementsWithCache}.
2730
*
2831
* @author Andy Clark
2932
* @author Ahmed Ashour
3033
* @author Marc Guillemot
3134
* @author Ronald Brill
3235
*/
33-
public class HTMLElements {
36+
public class HTMLElements implements HTMLElementsProvider {
3437

3538
// element codes
3639

@@ -199,7 +202,7 @@ public class HTMLElements {
199202
private final HashMap<String, Element> elementsByNameForReference_ = new HashMap<>();
200203

201204
// this is an optimized version which will be queried later
202-
private FastHashMap<String, Element>[] elementsByNamePerLength_;
205+
FastHashMap<String, Element>[] elementsByNamePerLength_;
203206

204207
public HTMLElements() {
205208
final Element[][] elementsArray = new Element[26][];
@@ -628,19 +631,17 @@ private void defineParents(final Element element) {
628631
}
629632

630633
/**
631-
* @return the element information for the specified element code.
632-
*
633-
* @param code The element code.
634+
* {@inheritDoc}
634635
*/
636+
@Override
635637
public final Element getElement(final short code) {
636638
return elementsByCode_[code];
637639
}
638640

639641
/**
640-
* @return the element information for the specified element name.
641-
*
642-
* @param ename the element name.
642+
* {@inheritDoc}
643643
*/
644+
@Override
644645
public final Element getElement(final String ename) {
645646
Element element = getElement(ename, NO_SUCH_ELEMENT);
646647
if (element == NO_SUCH_ELEMENT) {
@@ -656,11 +657,9 @@ public final Element getElement(final String ename) {
656657
}
657658

658659
/**
659-
* @return the element information for the specified element name.
660-
*
661-
* @param ename the element name.
662-
* @param elementIfNotFound the default element to return if not found.
660+
* {@inheritDoc}
663661
*/
662+
@Override
664663
public final Element getElement(final String ename, final Element elementIfNotFound) {
665664
int length = ename.length();
666665
if (length > elementsByNamePerLength_.length) {
@@ -689,11 +688,9 @@ public final Element getElement(final String ename, final Element elementIfNotFo
689688
}
690689

691690
/**
692-
* @return the element information for the specified element name.
693-
*
694-
* @param enameLC the element name in lower case
695-
* @param elementIfNotFound the default element to return if not found.
691+
* {@inheritDoc}
696692
*/
693+
@Override
697694
public final Element getElementLC(final String enameLC, final Element elementIfNotFound) {
698695
int length = enameLC.length();
699696
if (length > elementsByNamePerLength_.length) {
@@ -705,7 +702,6 @@ public final Element getElementLC(final String enameLC, final Element elementIfN
705702
return elementIfNotFound;
706703
}
707704

708-
// check the current form casing first, which is mostly lowercase only
709705
Element r = entry.get(enameLC);
710706
if (r == null) {
711707
return elementIfNotFound;
@@ -714,6 +710,149 @@ public final Element getElementLC(final String enameLC, final Element elementIfN
714710
return r;
715711
}
716712

713+
public static class HTMLElementsWithCache implements HTMLElementsProvider {
714+
715+
private final HTMLElements htmlElements_;
716+
717+
// this map helps us to know what elements we don't have and speed things up
718+
private final FastHashMap<String, Boolean> unknownElements_;
719+
720+
public HTMLElementsWithCache(final HTMLElements htmlElements) {
721+
htmlElements_ = htmlElements;
722+
unknownElements_ = new FastHashMap<>(11, 0.70f);
723+
}
724+
725+
@Override
726+
public Element getElement(short code) {
727+
return htmlElements_.getElement(code);
728+
}
729+
730+
@Override
731+
public Element getElement(String ename) {
732+
Element element = getElement(ename, htmlElements_.NO_SUCH_ELEMENT);
733+
if (element == htmlElements_.NO_SUCH_ELEMENT) {
734+
element = new Element(UNKNOWN,
735+
ename.toUpperCase(Locale.ROOT),
736+
htmlElements_.NO_SUCH_ELEMENT.flags,
737+
htmlElements_.NO_SUCH_ELEMENT.parentCodes_,
738+
htmlElements_.NO_SUCH_ELEMENT.bounds,
739+
htmlElements_.NO_SUCH_ELEMENT.closes);
740+
element.parent = htmlElements_.NO_SUCH_ELEMENT.parent;
741+
}
742+
return element;
743+
}
744+
745+
@Override
746+
public Element getElement(String ename, Element elementIfNotFound) {
747+
int length = ename.length();
748+
if (length > htmlElements_.elementsByNamePerLength_.length) {
749+
if (unknownElements_.get(ename) != null) {
750+
// we added it to the cache, so we know it has been
751+
// queried once unsuccessfully before
752+
return elementIfNotFound;
753+
}
754+
755+
// remember that we had a miss
756+
unknownElements_.put(ename, Boolean.TRUE);
757+
758+
return elementIfNotFound;
759+
}
760+
761+
FastHashMap<String, Element> entry = htmlElements_.elementsByNamePerLength_[length - 1];
762+
if (entry == null) {
763+
// check first if we know that we don't know and avoid the
764+
// lowercasing later
765+
if (unknownElements_.get(ename) != null) {
766+
// we added it to the cache, so we know it has been
767+
// queried once unsuccessfully before
768+
return elementIfNotFound;
769+
}
770+
771+
// remember that we had a miss
772+
unknownElements_.put(ename, Boolean.TRUE);
773+
774+
return elementIfNotFound;
775+
}
776+
777+
Element r = entry.get(ename);
778+
if (r == null) {
779+
// check first if we know that we don't know and avoid the
780+
// lowercasing later
781+
if (unknownElements_.get(ename) != null) {
782+
// we added it to the cache, so we know it has been
783+
// queried once unsuccessfully before
784+
return elementIfNotFound;
785+
}
786+
787+
// we have not found it in its current form, might be uppercase
788+
// or mixed case, so try all lowercase for sanity, we speculated that
789+
// good HTML is mostly all lowercase in the first place so this is the
790+
// fallback for atypical HTML
791+
// we also have not seen that element missing yet
792+
r = entry.get(ename.toLowerCase(Locale.ROOT));
793+
if (r == null) {
794+
// remember that we had a miss
795+
unknownElements_.put(ename, Boolean.TRUE);
796+
return elementIfNotFound;
797+
}
798+
}
799+
800+
return r;
801+
}
802+
803+
@Override
804+
public Element getElementLC(String enameLC, Element elementIfNotFound) {
805+
int length = enameLC.length();
806+
if (length > htmlElements_.elementsByNamePerLength_.length) {
807+
if (unknownElements_.get(enameLC) != null) {
808+
// we added it to the cache, so we know it has been
809+
// queried once unsuccessfully before
810+
return elementIfNotFound;
811+
}
812+
813+
// remember that we had a miss
814+
unknownElements_.put(enameLC, Boolean.TRUE);
815+
816+
return elementIfNotFound;
817+
}
818+
819+
FastHashMap<String, Element> entry = htmlElements_.elementsByNamePerLength_[length - 1];
820+
if (entry == null) {
821+
// check first if we know that we don't know and avoid the
822+
// lowercasing later
823+
if (unknownElements_.get(enameLC) != null) {
824+
// we added it to the cache, so we know it has been
825+
// queried once unsuccessfully before
826+
return elementIfNotFound;
827+
}
828+
829+
// remember that we had a miss
830+
unknownElements_.put(enameLC, Boolean.TRUE);
831+
832+
return elementIfNotFound;
833+
}
834+
835+
Element r = entry.get(enameLC);
836+
if (r == null) {
837+
// check first if we know that we don't know and avoid the
838+
// lowercasing later
839+
if (unknownElements_.get(enameLC) != null) {
840+
// we added it to the cache, so we know it has been
841+
// queried once unsuccessfully before
842+
return elementIfNotFound;
843+
}
844+
845+
// remember that we had a miss
846+
unknownElements_.put(enameLC, Boolean.TRUE);
847+
848+
return elementIfNotFound;
849+
}
850+
851+
return r;
852+
}
853+
854+
}
855+
717856
/**
718857
* Element information.
719858
*
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright (c) 2002-2009 Andy Clark, Marc Guillemot
3+
* Copyright (c) 2017-2024 Ronald Brill
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.htmlunit.cyberneko;
17+
18+
import org.htmlunit.cyberneko.HTMLElements.Element;
19+
20+
/**
21+
* Interface to support different {@link Element}'s providers
22+
*
23+
* @author Ronald Brill
24+
*/
25+
public interface HTMLElementsProvider {
26+
27+
/**
28+
* @return the element information for the specified element code.
29+
*
30+
* @param code The element code.
31+
*/
32+
Element getElement(final short code);
33+
34+
/**
35+
* @return the element information for the specified element name.
36+
*
37+
* @param ename the element name.
38+
*/
39+
Element getElement(final String ename);
40+
41+
/**
42+
* @return the element information for the specified element name.
43+
*
44+
* @param ename the element name.
45+
* @param elementIfNotFound the default element to return if not found.
46+
*/
47+
Element getElement(final String ename, final Element elementIfNotFound);
48+
49+
/**
50+
* @return the element information for the specified element name.
51+
*
52+
* @param enameLC the element name in lower case
53+
* @param elementIfNotFound the default element to return if not found.
54+
*/
55+
Element getElementLC(final String enameLC, final Element elementIfNotFound);
56+
}

0 commit comments

Comments
 (0)