2525import java .util .Objects ;
2626import java .util .Optional ;
2727import java .util .Set ;
28+ import java .util .function .Supplier ;
2829
2930import com .google .common .collect .BoundType ;
3031import com .google .common .collect .Range ;
4445import org .apache .calcite .rex .RexUtil ;
4546import org .apache .calcite .rex .RexVisitorImpl ;
4647import org .apache .calcite .sql .SqlKind ;
48+ import org .apache .calcite .sql .fun .SqlStdOperatorTable ;
4749import org .apache .calcite .sql .type .SqlTypeName ;
4850import org .apache .calcite .sql .type .SqlTypeUtil ;
4951import org .apache .calcite .util .ImmutableBitSet ;
52+ import org .apache .calcite .util .Sarg ;
5053import org .apache .datasketches .kll .KllFloatsSketch ;
5154import org .apache .datasketches .memory .Memory ;
5255import org .apache .datasketches .quantilescommon .QuantileSearchCriteria ;
5356import org .apache .hadoop .hive .ql .optimizer .calcite .HiveCalciteUtil ;
5457import org .apache .hadoop .hive .ql .optimizer .calcite .HiveConfPlannerContext ;
5558import org .apache .hadoop .hive .ql .optimizer .calcite .RelOptHiveTable ;
56- import org .apache .hadoop .hive .ql .optimizer .calcite .SearchTransformer ;
5759import org .apache .hadoop .hive .ql .optimizer .calcite .reloperators .HiveIn ;
5860import org .apache .hadoop .hive .ql .optimizer .calcite .reloperators .HiveTableScan ;
5961import org .apache .hadoop .hive .ql .plan .ColStatistics ;
@@ -65,6 +67,8 @@ public class FilterSelectivityEstimator extends RexVisitorImpl<Double> {
6567
6668 protected static final Logger LOG = LoggerFactory .getLogger (FilterSelectivityEstimator .class );
6769
70+ private static final double DEFAULT_COMPARISON_SELECTIVITY = 1.0 / 3.0 ;
71+
6872 private final RelNode childRel ;
6973 private final double childCardinality ;
7074 private final RelMetadataQuery mq ;
@@ -114,7 +118,8 @@ public Double visitCall(RexCall call) {
114118 break ;
115119 }
116120 case SEARCH :
117- return new SearchTransformer <>(rexBuilder , call , RexUnknownAs .FALSE ).transform ().accept (this );
121+ selectivity = computeSearchSelectivity (call );
122+ break ;
118123 case OR : {
119124 selectivity = computeDisjunctionSelectivity (call );
120125 break ;
@@ -159,7 +164,7 @@ public Double visitCall(RexCall call) {
159164 case GREATER_THAN_OR_EQUAL :
160165 case LESS_THAN :
161166 case GREATER_THAN : {
162- selectivity = computeRangePredicateSelectivity (call , call .getKind ());
167+ selectivity = computeComparisonPredicateSelectivity (call , call .getKind ());
163168 break ;
164169 }
165170
@@ -405,8 +410,8 @@ private static Range<Float> makeRange(float lower, float upper, BoundType upperT
405410 return lower > upper ? Range .closedOpen (0f , 0f ) : Range .range (lower , BoundType .CLOSED , upper , upperType );
406411 }
407412
408- private double computeRangePredicateSelectivity (RexCall call , SqlKind op ) {
409- double defaultSelectivity = (( double ) 1 / ( double ) 3 ) ;
413+ private double computeComparisonPredicateSelectivity (RexCall call , SqlKind op ) {
414+ double defaultSelectivity = DEFAULT_COMPARISON_SELECTIVITY ;
410415 if (!(childRel instanceof HiveTableScan )) {
411416 return defaultSelectivity ;
412417 }
@@ -440,34 +445,56 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
440445 boundaryValues [boundaryIdx ] = value ;
441446 inclusive [boundaryIdx ] = openBound ? BoundType .OPEN : BoundType .CLOSED ;
442447 Range <Float > boundaries = Range .range (boundaryValues [0 ], inclusive [0 ], boundaryValues [1 ], inclusive [1 ]);
443-
444- // extract the column index from the other operator
445- final HiveTableScan scan = (HiveTableScan ) childRel ;
446448 int inputRefOpIndex = 1 - literalOpIdx ;
447449 RexNode node = operands .get (inputRefOpIndex );
448- if (isRemovableCast (node , scan )) {
449- Range <Float > typeRange = getRangeOfType (node .getType ());
450- boundaries = adjustRangeToType (boundaries , node .getType (), typeRange );
450+ return computeRangePredicateSelectivity (() -> defaultSelectivity , node , boundaries );
451+ }
452+
453+ private Double computeRangePredicateSelectivity (Supplier <Double > defaultSelectivity , RexNode operand ,
454+ Range <Float > boundaries ) {
455+ return computeRangePredicateSelectivity (defaultSelectivity , operand , boundaries , false );
456+ }
457+
458+ /**
459+ * Computes the selectivity of an operand in a certain range trying to leverage the histogram information.
460+ * Returns the default selectivity if the histogram is not available.
461+ */
462+ private Double computeRangePredicateSelectivity (Supplier <Double > defaultSelectivity , RexNode operand ,
463+ Range <Float > boundaries , boolean inverseBool /* true only for NOT_BETWEEN */ ) {
464+ if (!(childRel instanceof HiveTableScan )) {
465+ return defaultSelectivity .get ();
466+ }
451467
452- node = RexUtil .removeCast (node );
468+ final HiveTableScan scan = (HiveTableScan ) childRel ;
469+ Range <Float > typeRange = inverseBool ? Range .closed (Float .NEGATIVE_INFINITY , Float .POSITIVE_INFINITY ) : null ;
470+ if (isRemovableCast (operand , scan )) {
471+ typeRange = getRangeOfType (operand .getType ());
472+ boundaries = adjustRangeToType (boundaries , operand .getType (), typeRange );
473+ operand = RexUtil .removeCast (operand );
453474 }
454475
455476 int inputRefIndex = -1 ;
456- if (node .getKind ().equals (SqlKind .INPUT_REF )) {
457- inputRefIndex = ((RexInputRef ) node ).getIndex ();
477+ if (operand .getKind ().equals (SqlKind .INPUT_REF )) {
478+ inputRefIndex = ((RexInputRef ) operand ).getIndex ();
458479 }
459480
460481 if (inputRefIndex < 0 ) {
461- return defaultSelectivity ;
482+ return defaultSelectivity . get () ;
462483 }
463484
464485 final List <ColStatistics > colStats = scan .getColStat (Collections .singletonList (inputRefIndex ));
465486 if (colStats .isEmpty () || !isHistogramAvailable (colStats .get (0 ))) {
466- return defaultSelectivity ;
487+ return defaultSelectivity . get () ;
467488 }
468489
469490 final KllFloatsSketch kll = KllFloatsSketch .heapify (Memory .wrap (colStats .get (0 ).getHistogram ()));
470491 double rawSelectivity = rangedSelectivity (kll , boundaries );
492+ if (inverseBool ) {
493+ // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
494+ // if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
495+ double typeRangeSelectivity = rangedSelectivity (kll , typeRange );
496+ rawSelectivity = typeRangeSelectivity - rawSelectivity ;
497+ }
471498 return scaleSelectivityToNullableValues (kll , rawSelectivity , scan );
472499 }
473500
@@ -511,7 +538,6 @@ private Double computeBetweenPredicateSelectivity(RexCall call) {
511538 Optional <Float > rightLiteral = extractLiteral (operands .get (3 ));
512539
513540 if (hasLiteralBool && leftLiteral .isPresent () && rightLiteral .isPresent ()) {
514- final HiveTableScan scan = (HiveTableScan ) childRel ;
515541 float leftValue = leftLiteral .get ();
516542 float rightValue = rightLiteral .get ();
517543
@@ -522,36 +548,9 @@ private Double computeBetweenPredicateSelectivity(RexCall call) {
522548 }
523549
524550 Range <Float > rangeBoundaries = makeRange (leftValue , rightValue , BoundType .CLOSED );
525- Range <Float > typeBoundaries = inverseBool ? Range .closed (Float .NEGATIVE_INFINITY , Float .POSITIVE_INFINITY ) : null ;
526-
527551 RexNode expr = operands .get (1 ); // expr to be checked by the BETWEEN
528- if (isRemovableCast (expr , scan )) {
529- typeBoundaries = getRangeOfType (expr .getType ());
530- rangeBoundaries = adjustRangeToType (rangeBoundaries , expr .getType (), typeBoundaries );
531- expr = RexUtil .removeCast (expr );
532- }
533-
534- int inputRefIndex = -1 ;
535- if (expr .getKind ().equals (SqlKind .INPUT_REF )) {
536- inputRefIndex = ((RexInputRef ) expr ).getIndex ();
537- }
538-
539- if (inputRefIndex < 0 ) {
540- return computeFunctionSelectivity (call );
541- }
542-
543- final List <ColStatistics > colStats = scan .getColStat (Collections .singletonList (inputRefIndex ));
544- if (!colStats .isEmpty () && isHistogramAvailable (colStats .get (0 ))) {
545- final KllFloatsSketch kll = KllFloatsSketch .heapify (Memory .wrap (colStats .get (0 ).getHistogram ()));
546- double rawSelectivity = rangedSelectivity (kll , rangeBoundaries );
547- if (inverseBool ) {
548- // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
549- // if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
550- double typeRangeSelectivity = rangedSelectivity (kll , typeBoundaries );
551- rawSelectivity = typeRangeSelectivity - rawSelectivity ;
552- }
553- return scaleSelectivityToNullableValues (kll , rawSelectivity , scan );
554- }
552+ return computeRangePredicateSelectivity (() -> computeFunctionSelectivity (call ), expr , rangeBoundaries ,
553+ inverseBool );
555554 }
556555 return computeFunctionSelectivity (call );
557556 }
@@ -603,6 +602,106 @@ private Optional<Float> extractLiteral(SqlTypeName typeName, Object boundValueOb
603602 return Optional .of (value );
604603 }
605604
605+ private double computeSearchSelectivity (RexCall search ) {
606+ return new SearchSelectivityHelper <>(search ).compute ();
607+ }
608+
609+ /**
610+ * Auxiliary class to compute the selectivity of a SEARCH expression.
611+ */
612+ private final class SearchSelectivityHelper <C extends Comparable <C >> {
613+ private final RexNode ref ;
614+ private final Sarg <C > sarg ;
615+ private final RelDataType operandType ;
616+
617+ private SearchSelectivityHelper (RexCall search ) {
618+ ref = search .getOperands ().get (0 );
619+ RexLiteral literal = (RexLiteral ) search .operands .get (1 );
620+ sarg = Objects .requireNonNull (literal .getValueAs (Sarg .class ), "Sarg" );
621+ operandType = literal .getType ();
622+ }
623+
624+ private RexNode makeLiteral (C value ) {
625+ return rexBuilder .makeLiteral (value , operandType , true , true );
626+ }
627+
628+ private double compute () {
629+ final List <RexNode > inLiterals = new ArrayList <>();
630+ final List <Double > rangeSelectivities = new ArrayList <>();
631+ for (Range <C > range : sarg .rangeSet .asRanges ()) {
632+ if (!range .hasLowerBound () && !range .hasUpperBound ()) {
633+ return 1.0 ; // "all" range
634+ }
635+ processRangeSelectivity (range , rangeSelectivities , inLiterals );
636+ }
637+
638+ final List <Double > searchSelectivities = new ArrayList <>();
639+ if (!rangeSelectivities .isEmpty () && rangeSelectivities .stream ().noneMatch (Objects ::isNull )) {
640+ // Aggregate all ranges selectivity, respecting the max value of 1
641+ double total = Math .min (1.0 , rangeSelectivities .stream ().mapToDouble (Double ::doubleValue ).sum ());
642+ if (total == 1.0 ) {
643+ return 1.0 ;
644+ }
645+ searchSelectivities .add (total );
646+ } else {
647+ searchSelectivities .addAll (rangeSelectivities );
648+ }
649+
650+ if (!inLiterals .isEmpty ()) {
651+ if (inLiterals .size () == 1 ) {
652+ searchSelectivities .add (rexBuilder .makeCall (SqlStdOperatorTable .EQUALS , ref , inLiterals .get (0 ))
653+ .accept (FilterSelectivityEstimator .this ));
654+ } else {
655+ List <RexNode > operands = new ArrayList <>(inLiterals .size () + 1 );
656+ operands .add (ref );
657+ operands .addAll (inLiterals );
658+ searchSelectivities .add (rexBuilder .makeCall (HiveIn .INSTANCE , operands ).accept (FilterSelectivityEstimator .this ));
659+ }
660+ }
661+
662+ if (sarg .nullAs == RexUnknownAs .TRUE ) {
663+ searchSelectivities .add (
664+ rexBuilder .makeCall (SqlStdOperatorTable .IS_NULL , ref ).accept (FilterSelectivityEstimator .this ));
665+ }
666+
667+ return searchSelectivities .size () == 1 ? searchSelectivities .get (0 ) : computeDisjunctionSelectivity (searchSelectivities );
668+ }
669+
670+ private void processRangeSelectivity (Range <C > range , List <Double > rangeSelectivities , List <RexNode > inLiterals ) {
671+ final boolean hasLower = range .hasLowerBound ();
672+ final boolean hasUpper = range .hasUpperBound ();
673+
674+ final BoundType lowerBoundType = hasLower ? range .lowerBoundType () : BoundType .CLOSED ;
675+ final BoundType upperBoundType = hasUpper ? range .upperBoundType () : BoundType .CLOSED ;
676+
677+ final RexNode lowerRex = hasLower ? makeLiteral (range .lowerEndpoint ()) : null ;
678+ final RexNode upperRex = hasUpper ? makeLiteral (range .upperEndpoint ()) : null ;
679+
680+ // map missing bounds to infinity
681+ final Optional <Float > lowerLiteral = hasLower ? extractLiteral (lowerRex ) : Optional .of (Float .NEGATIVE_INFINITY );
682+ final Optional <Float > upperLiteral = hasUpper ? extractLiteral (upperRex ) : Optional .of (Float .POSITIVE_INFINITY );
683+
684+ // check for single value ranges
685+ if (hasLower && hasUpper && lowerBoundType == BoundType .CLOSED && upperBoundType == BoundType .CLOSED
686+ && lowerLiteral .equals (upperLiteral )) {
687+ inLiterals .add (lowerRex );
688+ return ;
689+ }
690+
691+ // map the range to a selectivity
692+ final Supplier <Double > defaultSelectivity =
693+ hasLower && hasUpper ? () -> computeFunctionSelectivity (List .of (ref , lowerRex , upperRex ))
694+ : () -> DEFAULT_COMPARISON_SELECTIVITY ;
695+
696+ if (lowerLiteral .isEmpty () || upperLiteral .isEmpty ()) {
697+ rangeSelectivities .add (defaultSelectivity .get ());
698+ } else {
699+ rangeSelectivities .add (computeRangePredicateSelectivity (defaultSelectivity , ref ,
700+ Range .range (lowerLiteral .get (), lowerBoundType , upperLiteral .get (), upperBoundType )));
701+ }
702+ }
703+ }
704+
606705 /**
607706 * NDV of "f1(x, y, z) != f2(p, q, r)" ->
608707 * "(maxNDV(x,y,z,p,q,r) - 1)/maxNDV(x,y,z,p,q,r)".
@@ -633,7 +732,11 @@ private Double computeNotEqualitySelectivity(RexCall call) {
633732 * @return
634733 */
635734 private Double computeFunctionSelectivity (RexCall call ) {
636- Double tmpNDV = getMaxNDV (call );
735+ return computeFunctionSelectivity (call .getOperands ());
736+ }
737+
738+ private Double computeFunctionSelectivity (List <RexNode > operands ) {
739+ Double tmpNDV = getMaxNDV (operands );
637740 if (tmpNDV == null ) {
638741 // Could not be computed
639742 return null ;
@@ -653,12 +756,20 @@ private Double computeFunctionSelectivity(RexCall call) {
653756 * @return
654757 */
655758 private Double computeDisjunctionSelectivity (RexCall call ) {
759+ List <Double > selectivityList = new ArrayList <>(call .getOperands ().size ());
760+ for (RexNode dje : call .getOperands ()) {
761+ selectivityList .add (dje .accept (this ));
762+ }
763+ return computeDisjunctionSelectivity (selectivityList );
764+ }
765+
766+ private double computeDisjunctionSelectivity (List <Double > selectivityList ) {
656767 Double tmpCardinality ;
657768 Double tmpSelectivity ;
658769 double selectivity = 1 ;
659770
660- for (RexNode dje : call . getOperands () ) {
661- tmpSelectivity = dje . accept ( this ) ;
771+ for (Double sel : selectivityList ) {
772+ tmpSelectivity = sel ;
662773 if (tmpSelectivity == null ) {
663774 tmpSelectivity = 0.99 ;
664775 }
@@ -729,10 +840,14 @@ private long getMaxNulls(RexCall call, HiveTableScan t) {
729840 }
730841
731842 private Double getMaxNDV (RexCall call ) {
843+ return getMaxNDV (call .getOperands ());
844+ }
845+
846+ private Double getMaxNDV (List <RexNode > operands ) {
732847 Double tmpNDV ;
733848 double maxNDV = 1.0 ;
734849 InputReferencedVisitor irv ;
735- for (RexNode op : call . getOperands () ) {
850+ for (RexNode op : operands ) {
736851 if (op instanceof RexInputRef ) {
737852 tmpNDV = HiveRelMdDistinctRowCount .getDistinctRowCount (this .childRel , mq ,
738853 ((RexInputRef ) op ).getIndex ());
0 commit comments