Skip to content

Commit 6d9d9b4

Browse files
authored
HIVE-29479: Improve histogram-based selectivity estimation for two-sided range predicates (apache#6477)
1 parent 2310266 commit 6d9d9b4

3 files changed

Lines changed: 487 additions & 167 deletions

File tree

ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java

Lines changed: 165 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import java.util.Objects;
2626
import java.util.Optional;
2727
import java.util.Set;
28+
import java.util.function.Supplier;
2829

2930
import com.google.common.collect.BoundType;
3031
import com.google.common.collect.Range;
@@ -44,16 +45,17 @@
4445
import org.apache.calcite.rex.RexUtil;
4546
import org.apache.calcite.rex.RexVisitorImpl;
4647
import org.apache.calcite.sql.SqlKind;
48+
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
4749
import org.apache.calcite.sql.type.SqlTypeName;
4850
import org.apache.calcite.sql.type.SqlTypeUtil;
4951
import org.apache.calcite.util.ImmutableBitSet;
52+
import org.apache.calcite.util.Sarg;
5053
import org.apache.datasketches.kll.KllFloatsSketch;
5154
import org.apache.datasketches.memory.Memory;
5255
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
5356
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
5457
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveConfPlannerContext;
5558
import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
56-
import org.apache.hadoop.hive.ql.optimizer.calcite.SearchTransformer;
5759
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveIn;
5860
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
5961
import org.apache.hadoop.hive.ql.plan.ColStatistics;
@@ -65,6 +67,8 @@ public class FilterSelectivityEstimator extends RexVisitorImpl<Double> {
6567

6668
protected static final Logger LOG = LoggerFactory.getLogger(FilterSelectivityEstimator.class);
6769

70+
private static final double DEFAULT_COMPARISON_SELECTIVITY = 1.0 / 3.0;
71+
6872
private final RelNode childRel;
6973
private final double childCardinality;
7074
private final RelMetadataQuery mq;
@@ -114,7 +118,8 @@ public Double visitCall(RexCall call) {
114118
break;
115119
}
116120
case SEARCH:
117-
return new SearchTransformer<>(rexBuilder, call, RexUnknownAs.FALSE).transform().accept(this);
121+
selectivity = computeSearchSelectivity(call);
122+
break;
118123
case OR: {
119124
selectivity = computeDisjunctionSelectivity(call);
120125
break;
@@ -159,7 +164,7 @@ public Double visitCall(RexCall call) {
159164
case GREATER_THAN_OR_EQUAL:
160165
case LESS_THAN:
161166
case GREATER_THAN: {
162-
selectivity = computeRangePredicateSelectivity(call, call.getKind());
167+
selectivity = computeComparisonPredicateSelectivity(call, call.getKind());
163168
break;
164169
}
165170

@@ -405,8 +410,8 @@ private static Range<Float> makeRange(float lower, float upper, BoundType upperT
405410
return lower > upper ? Range.closedOpen(0f, 0f) : Range.range(lower, BoundType.CLOSED, upper, upperType);
406411
}
407412

408-
private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
409-
double defaultSelectivity = ((double) 1 / (double) 3);
413+
private double computeComparisonPredicateSelectivity(RexCall call, SqlKind op) {
414+
double defaultSelectivity = DEFAULT_COMPARISON_SELECTIVITY;
410415
if (!(childRel instanceof HiveTableScan)) {
411416
return defaultSelectivity;
412417
}
@@ -440,34 +445,56 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
440445
boundaryValues[boundaryIdx] = value;
441446
inclusive[boundaryIdx] = openBound ? BoundType.OPEN : BoundType.CLOSED;
442447
Range<Float> boundaries = Range.range(boundaryValues[0], inclusive[0], boundaryValues[1], inclusive[1]);
443-
444-
// extract the column index from the other operator
445-
final HiveTableScan scan = (HiveTableScan) childRel;
446448
int inputRefOpIndex = 1 - literalOpIdx;
447449
RexNode node = operands.get(inputRefOpIndex);
448-
if (isRemovableCast(node, scan)) {
449-
Range<Float> typeRange = getRangeOfType(node.getType());
450-
boundaries = adjustRangeToType(boundaries, node.getType(), typeRange);
450+
return computeRangePredicateSelectivity(() -> defaultSelectivity, node, boundaries);
451+
}
452+
453+
private Double computeRangePredicateSelectivity(Supplier<Double> defaultSelectivity, RexNode operand,
454+
Range<Float> boundaries) {
455+
return computeRangePredicateSelectivity(defaultSelectivity, operand, boundaries, false);
456+
}
457+
458+
/**
459+
* Computes the selectivity of an operand in a certain range trying to leverage the histogram information.
460+
* Returns the default selectivity if the histogram is not available.
461+
*/
462+
private Double computeRangePredicateSelectivity(Supplier<Double> defaultSelectivity, RexNode operand,
463+
Range<Float> boundaries, boolean inverseBool /* true only for NOT_BETWEEN */) {
464+
if (!(childRel instanceof HiveTableScan)) {
465+
return defaultSelectivity.get();
466+
}
451467

452-
node = RexUtil.removeCast(node);
468+
final HiveTableScan scan = (HiveTableScan) childRel;
469+
Range<Float> typeRange = inverseBool ? Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY) : null;
470+
if (isRemovableCast(operand, scan)) {
471+
typeRange = getRangeOfType(operand.getType());
472+
boundaries = adjustRangeToType(boundaries, operand.getType(), typeRange);
473+
operand = RexUtil.removeCast(operand);
453474
}
454475

455476
int inputRefIndex = -1;
456-
if (node.getKind().equals(SqlKind.INPUT_REF)) {
457-
inputRefIndex = ((RexInputRef) node).getIndex();
477+
if (operand.getKind().equals(SqlKind.INPUT_REF)) {
478+
inputRefIndex = ((RexInputRef) operand).getIndex();
458479
}
459480

460481
if (inputRefIndex < 0) {
461-
return defaultSelectivity;
482+
return defaultSelectivity.get();
462483
}
463484

464485
final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex));
465486
if (colStats.isEmpty() || !isHistogramAvailable(colStats.get(0))) {
466-
return defaultSelectivity;
487+
return defaultSelectivity.get();
467488
}
468489

469490
final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
470491
double rawSelectivity = rangedSelectivity(kll, boundaries);
492+
if (inverseBool) {
493+
// when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
494+
// if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
495+
double typeRangeSelectivity = rangedSelectivity(kll, typeRange);
496+
rawSelectivity = typeRangeSelectivity - rawSelectivity;
497+
}
471498
return scaleSelectivityToNullableValues(kll, rawSelectivity, scan);
472499
}
473500

@@ -511,7 +538,6 @@ private Double computeBetweenPredicateSelectivity(RexCall call) {
511538
Optional<Float> rightLiteral = extractLiteral(operands.get(3));
512539

513540
if (hasLiteralBool && leftLiteral.isPresent() && rightLiteral.isPresent()) {
514-
final HiveTableScan scan = (HiveTableScan) childRel;
515541
float leftValue = leftLiteral.get();
516542
float rightValue = rightLiteral.get();
517543

@@ -522,36 +548,9 @@ private Double computeBetweenPredicateSelectivity(RexCall call) {
522548
}
523549

524550
Range<Float> rangeBoundaries = makeRange(leftValue, rightValue, BoundType.CLOSED);
525-
Range<Float> typeBoundaries = inverseBool ? Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY) : null;
526-
527551
RexNode expr = operands.get(1); // expr to be checked by the BETWEEN
528-
if (isRemovableCast(expr, scan)) {
529-
typeBoundaries = getRangeOfType(expr.getType());
530-
rangeBoundaries = adjustRangeToType(rangeBoundaries, expr.getType(), typeBoundaries);
531-
expr = RexUtil.removeCast(expr);
532-
}
533-
534-
int inputRefIndex = -1;
535-
if (expr.getKind().equals(SqlKind.INPUT_REF)) {
536-
inputRefIndex = ((RexInputRef) expr).getIndex();
537-
}
538-
539-
if (inputRefIndex < 0) {
540-
return computeFunctionSelectivity(call);
541-
}
542-
543-
final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex));
544-
if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) {
545-
final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
546-
double rawSelectivity = rangedSelectivity(kll, rangeBoundaries);
547-
if (inverseBool) {
548-
// when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
549-
// if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
550-
double typeRangeSelectivity = rangedSelectivity(kll, typeBoundaries);
551-
rawSelectivity = typeRangeSelectivity - rawSelectivity;
552-
}
553-
return scaleSelectivityToNullableValues(kll, rawSelectivity, scan);
554-
}
552+
return computeRangePredicateSelectivity(() -> computeFunctionSelectivity(call), expr, rangeBoundaries,
553+
inverseBool);
555554
}
556555
return computeFunctionSelectivity(call);
557556
}
@@ -603,6 +602,106 @@ private Optional<Float> extractLiteral(SqlTypeName typeName, Object boundValueOb
603602
return Optional.of(value);
604603
}
605604

605+
private double computeSearchSelectivity(RexCall search) {
606+
return new SearchSelectivityHelper<>(search).compute();
607+
}
608+
609+
/**
610+
* Auxiliary class to compute the selectivity of a SEARCH expression.
611+
*/
612+
private final class SearchSelectivityHelper<C extends Comparable<C>> {
613+
private final RexNode ref;
614+
private final Sarg<C> sarg;
615+
private final RelDataType operandType;
616+
617+
private SearchSelectivityHelper(RexCall search) {
618+
ref = search.getOperands().get(0);
619+
RexLiteral literal = (RexLiteral) search.operands.get(1);
620+
sarg = Objects.requireNonNull(literal.getValueAs(Sarg.class), "Sarg");
621+
operandType = literal.getType();
622+
}
623+
624+
private RexNode makeLiteral(C value) {
625+
return rexBuilder.makeLiteral(value, operandType, true, true);
626+
}
627+
628+
private double compute() {
629+
final List<RexNode> inLiterals = new ArrayList<>();
630+
final List<Double> rangeSelectivities = new ArrayList<>();
631+
for (Range<C> range : sarg.rangeSet.asRanges()) {
632+
if (!range.hasLowerBound() && !range.hasUpperBound()) {
633+
return 1.0; // "all" range
634+
}
635+
processRangeSelectivity(range, rangeSelectivities, inLiterals);
636+
}
637+
638+
final List<Double> searchSelectivities = new ArrayList<>();
639+
if (!rangeSelectivities.isEmpty() && rangeSelectivities.stream().noneMatch(Objects::isNull)) {
640+
// Aggregate all ranges selectivity, respecting the max value of 1
641+
double total = Math.min(1.0, rangeSelectivities.stream().mapToDouble(Double::doubleValue).sum());
642+
if (total == 1.0) {
643+
return 1.0;
644+
}
645+
searchSelectivities.add(total);
646+
} else {
647+
searchSelectivities.addAll(rangeSelectivities);
648+
}
649+
650+
if (!inLiterals.isEmpty()) {
651+
if (inLiterals.size() == 1) {
652+
searchSelectivities.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, inLiterals.get(0))
653+
.accept(FilterSelectivityEstimator.this));
654+
} else {
655+
List<RexNode> operands = new ArrayList<>(inLiterals.size() + 1);
656+
operands.add(ref);
657+
operands.addAll(inLiterals);
658+
searchSelectivities.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands).accept(FilterSelectivityEstimator.this));
659+
}
660+
}
661+
662+
if (sarg.nullAs == RexUnknownAs.TRUE) {
663+
searchSelectivities.add(
664+
rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref).accept(FilterSelectivityEstimator.this));
665+
}
666+
667+
return searchSelectivities.size() == 1 ? searchSelectivities.get(0) : computeDisjunctionSelectivity(searchSelectivities);
668+
}
669+
670+
private void processRangeSelectivity(Range<C> range, List<Double> rangeSelectivities, List<RexNode> inLiterals) {
671+
final boolean hasLower = range.hasLowerBound();
672+
final boolean hasUpper = range.hasUpperBound();
673+
674+
final BoundType lowerBoundType = hasLower ? range.lowerBoundType() : BoundType.CLOSED;
675+
final BoundType upperBoundType = hasUpper ? range.upperBoundType() : BoundType.CLOSED;
676+
677+
final RexNode lowerRex = hasLower ? makeLiteral(range.lowerEndpoint()) : null;
678+
final RexNode upperRex = hasUpper ? makeLiteral(range.upperEndpoint()) : null;
679+
680+
// map missing bounds to infinity
681+
final Optional<Float> lowerLiteral = hasLower ? extractLiteral(lowerRex) : Optional.of(Float.NEGATIVE_INFINITY);
682+
final Optional<Float> upperLiteral = hasUpper ? extractLiteral(upperRex) : Optional.of(Float.POSITIVE_INFINITY);
683+
684+
// check for single value ranges
685+
if (hasLower && hasUpper && lowerBoundType == BoundType.CLOSED && upperBoundType == BoundType.CLOSED
686+
&& lowerLiteral.equals(upperLiteral)) {
687+
inLiterals.add(lowerRex);
688+
return;
689+
}
690+
691+
// map the range to a selectivity
692+
final Supplier<Double> defaultSelectivity =
693+
hasLower && hasUpper ? () -> computeFunctionSelectivity(List.of(ref, lowerRex, upperRex))
694+
: () -> DEFAULT_COMPARISON_SELECTIVITY;
695+
696+
if (lowerLiteral.isEmpty() || upperLiteral.isEmpty()) {
697+
rangeSelectivities.add(defaultSelectivity.get());
698+
} else {
699+
rangeSelectivities.add(computeRangePredicateSelectivity(defaultSelectivity, ref,
700+
Range.range(lowerLiteral.get(), lowerBoundType, upperLiteral.get(), upperBoundType)));
701+
}
702+
}
703+
}
704+
606705
/**
607706
* NDV of "f1(x, y, z) != f2(p, q, r)" ->
608707
* "(maxNDV(x,y,z,p,q,r) - 1)/maxNDV(x,y,z,p,q,r)".
@@ -633,7 +732,11 @@ private Double computeNotEqualitySelectivity(RexCall call) {
633732
* @return
634733
*/
635734
private Double computeFunctionSelectivity(RexCall call) {
636-
Double tmpNDV = getMaxNDV(call);
735+
return computeFunctionSelectivity(call.getOperands());
736+
}
737+
738+
private Double computeFunctionSelectivity(List<RexNode> operands) {
739+
Double tmpNDV = getMaxNDV(operands);
637740
if (tmpNDV == null) {
638741
// Could not be computed
639742
return null;
@@ -653,12 +756,20 @@ private Double computeFunctionSelectivity(RexCall call) {
653756
* @return
654757
*/
655758
private Double computeDisjunctionSelectivity(RexCall call) {
759+
List<Double> selectivityList = new ArrayList<>(call.getOperands().size());
760+
for (RexNode dje : call.getOperands()) {
761+
selectivityList.add(dje.accept(this));
762+
}
763+
return computeDisjunctionSelectivity(selectivityList);
764+
}
765+
766+
private double computeDisjunctionSelectivity(List<Double> selectivityList) {
656767
Double tmpCardinality;
657768
Double tmpSelectivity;
658769
double selectivity = 1;
659770

660-
for (RexNode dje : call.getOperands()) {
661-
tmpSelectivity = dje.accept(this);
771+
for (Double sel : selectivityList) {
772+
tmpSelectivity = sel;
662773
if (tmpSelectivity == null) {
663774
tmpSelectivity = 0.99;
664775
}
@@ -729,10 +840,14 @@ private long getMaxNulls(RexCall call, HiveTableScan t) {
729840
}
730841

731842
private Double getMaxNDV(RexCall call) {
843+
return getMaxNDV(call.getOperands());
844+
}
845+
846+
private Double getMaxNDV(List<RexNode> operands) {
732847
Double tmpNDV;
733848
double maxNDV = 1.0;
734849
InputReferencedVisitor irv;
735-
for (RexNode op : call.getOperands()) {
850+
for (RexNode op : operands) {
736851
if (op instanceof RexInputRef) {
737852
tmpNDV = HiveRelMdDistinctRowCount.getDistinctRowCount(this.childRel, mq,
738853
((RexInputRef) op).getIndex());

0 commit comments

Comments
 (0)