2424
2525/**
2626 * Collection of HTML element information.
27+ * This does not manage any state and therefore it is thread safe.
28+ * There is also an optimized but not thread safe implementation
29+ * of the {@link HTMLElementsProvider} interface - {@link HTMLElementsWithCache}.
2730 *
2831 * @author Andy Clark
2932 * @author Ahmed Ashour
3033 * @author Marc Guillemot
3134 * @author Ronald Brill
3235 */
33- public class HTMLElements {
36+ public class HTMLElements implements HTMLElementsProvider {
3437
3538 // element codes
3639
@@ -199,7 +202,7 @@ public class HTMLElements {
199202 private final HashMap <String , Element > elementsByNameForReference_ = new HashMap <>();
200203
201204 // this is an optimized version which will be later queried
202- private FastHashMap <String , Element >[] elementsByNamePerLength_ ;
205+ FastHashMap <String , Element >[] elementsByNamePerLength_ ;
203206
204207 public HTMLElements () {
205208 final Element [][] elementsArray = new Element [26 ][];
@@ -628,19 +631,17 @@ private void defineParents(final Element element) {
628631 }
629632
630633 /**
631- * @return the element information for the specified element code.
632- *
633- * @param code The element code.
634+ * {@inheritDoc}
634635 */
636+ @ Override
635637 public final Element getElement (final short code ) {
636638 return elementsByCode_ [code ];
637639 }
638640
639641 /**
640- * @return the element information for the specified element name.
641- *
642- * @param ename the element name.
642+ * {@inheritDoc}
643643 */
644+ @ Override
644645 public final Element getElement (final String ename ) {
645646 Element element = getElement (ename , NO_SUCH_ELEMENT );
646647 if (element == NO_SUCH_ELEMENT ) {
@@ -656,11 +657,9 @@ public final Element getElement(final String ename) {
656657 }
657658
658659 /**
659- * @return the element information for the specified element name.
660- *
661- * @param ename the element name.
662- * @param elementIfNotFound the default element to return if not found.
660+ * {@inheritDoc}
663661 */
662+ @ Override
664663 public final Element getElement (final String ename , final Element elementIfNotFound ) {
665664 int length = ename .length ();
666665 if (length > elementsByNamePerLength_ .length ) {
@@ -689,11 +688,9 @@ public final Element getElement(final String ename, final Element elementIfNotFo
689688 }
690689
691690 /**
692- * @return the element information for the specified element name.
693- *
694- * @param enameLC the element name in lower case
695- * @param elementIfNotFound the default element to return if not found.
691+ * {@inheritDoc}
696692 */
693+ @ Override
697694 public final Element getElementLC (final String enameLC , final Element elementIfNotFound ) {
698695 int length = enameLC .length ();
699696 if (length > elementsByNamePerLength_ .length ) {
@@ -705,7 +702,6 @@ public final Element getElementLC(final String enameLC, final Element elementIfN
705702 return elementIfNotFound ;
706703 }
707704
708- // check the current form casing first, which is mostly lowercase only
709705 Element r = entry .get (enameLC );
710706 if (r == null ) {
711707 return elementIfNotFound ;
@@ -714,6 +710,149 @@ public final Element getElementLC(final String enameLC, final Element elementIfN
714710 return r ;
715711 }
716712
713+ public static class HTMLElementsWithCache implements HTMLElementsProvider {
714+
715+ private final HTMLElements htmlElements_ ;
716+
717+ // this map helps us to know what elements we don't have and speed things up
718+ private final FastHashMap <String , Boolean > unknownElements_ ;
719+
720+ public HTMLElementsWithCache (final HTMLElements htmlElements ) {
721+ htmlElements_ = htmlElements ;
722+ unknownElements_ = new FastHashMap <>(11 , 0.70f );
723+ }
724+
725+ @ Override
726+ public Element getElement (short code ) {
727+ return htmlElements_ .getElement (code );
728+ }
729+
730+ @ Override
731+ public Element getElement (String ename ) {
732+ Element element = getElement (ename , htmlElements_ .NO_SUCH_ELEMENT );
733+ if (element == htmlElements_ .NO_SUCH_ELEMENT ) {
734+ element = new Element (UNKNOWN ,
735+ ename .toUpperCase (Locale .ROOT ),
736+ htmlElements_ .NO_SUCH_ELEMENT .flags ,
737+ htmlElements_ .NO_SUCH_ELEMENT .parentCodes_ ,
738+ htmlElements_ .NO_SUCH_ELEMENT .bounds ,
739+ htmlElements_ .NO_SUCH_ELEMENT .closes );
740+ element .parent = htmlElements_ .NO_SUCH_ELEMENT .parent ;
741+ }
742+ return element ;
743+ }
744+
745+ @ Override
746+ public Element getElement (String ename , Element elementIfNotFound ) {
747+ int length = ename .length ();
748+ if (length > htmlElements_ .elementsByNamePerLength_ .length ) {
749+ if (unknownElements_ .get (ename ) != null ) {
750+ // we added it to the cache, so we know it has been
751+ // queried once unsuccessfully before
752+ return elementIfNotFound ;
753+ }
754+
755+ // remember that we had a miss
756+ unknownElements_ .put (ename , Boolean .TRUE );
757+
758+ return elementIfNotFound ;
759+ }
760+
761+ FastHashMap <String , Element > entry = htmlElements_ .elementsByNamePerLength_ [length - 1 ];
762+ if (entry == null ) {
763+ // check first if we know that we don't know and avoid the
764+ // lowercasing later
765+ if (unknownElements_ .get (ename ) != null ) {
766+ // we added it to the cache, so we know it has been
767+ // queried once unsuccessfully before
768+ return elementIfNotFound ;
769+ }
770+
771+ // remember that we had a miss
772+ unknownElements_ .put (ename , Boolean .TRUE );
773+
774+ return elementIfNotFound ;
775+ }
776+
777+ Element r = entry .get (ename );
778+ if (r == null ) {
779+ // check first if we know that we don't know and avoid the
780+ // lowercasing later
781+ if (unknownElements_ .get (ename ) != null ) {
782+ // we added it to the cache, so we know it has been
783+ // queried once unsuccessfully before
784+ return elementIfNotFound ;
785+ }
786+
787+ // we have not found it in its current form, might be uppercase
788+ // or mixed case, so try all lowercase for sanity, we speculated that
789+ // good HTML is mostly all lowercase in the first place so this is the
790+ // fallback for atypical HTML
791+ // we also have not seen that element missing yet
792+ r = entry .get (ename .toLowerCase (Locale .ROOT ));
793+ if (r == null ) {
794+ // remember that we had a miss
795+ unknownElements_ .put (ename , Boolean .TRUE );
796+ return elementIfNotFound ;
797+ }
798+ }
799+
800+ return r ;
801+ }
802+
803+ @ Override
804+ public Element getElementLC (String enameLC , Element elementIfNotFound ) {
805+ int length = enameLC .length ();
806+ if (length > htmlElements_ .elementsByNamePerLength_ .length ) {
807+ if (unknownElements_ .get (enameLC ) != null ) {
808+ // we added it to the cache, so we know it has been
809+ // queried once unsuccessfully before
810+ return elementIfNotFound ;
811+ }
812+
813+ // remember that we had a miss
814+ unknownElements_ .put (enameLC , Boolean .TRUE );
815+
816+ return elementIfNotFound ;
817+ }
818+
819+ FastHashMap <String , Element > entry = htmlElements_ .elementsByNamePerLength_ [length - 1 ];
820+ if (entry == null ) {
821+ // check first if we know that we don't know and avoid the
822+ // lowercasing later
823+ if (unknownElements_ .get (enameLC ) != null ) {
824+ // we added it to the cache, so we know it has been
825+ // queried once unsuccessfully before
826+ return elementIfNotFound ;
827+ }
828+
829+ // remember that we had a miss
830+ unknownElements_ .put (enameLC , Boolean .TRUE );
831+
832+ return elementIfNotFound ;
833+ }
834+
835+ Element r = entry .get (enameLC );
836+ if (r == null ) {
837+ // check first if we know that we don't know and avoid the
838+ // lowercasing later
839+ if (unknownElements_ .get (enameLC ) != null ) {
840+ // we added it to the cache, so we know it has been
841+ // queried once unsuccessfully before
842+ return elementIfNotFound ;
843+ }
844+
845+ // remember that we had a miss
846+ unknownElements_ .put (enameLC , Boolean .TRUE );
847+
848+ return elementIfNotFound ;
849+ }
850+
851+ return r ;
852+ }
853+
854+ }
855+
717856 /**
718857 * Element information.
719858 *
0 commit comments