Skip to content

Commit 45bdea4

Browse files
authored
HIVE-28911: Improve SEARCH expansion to exploit <> operator (#6503)
1 parent 1cb455b commit 45bdea4

12 files changed

Lines changed: 133 additions & 77 deletions

File tree

iceberg/iceberg-handler/src/test/results/positive/delete_iceberg_copy_on_write_unpartitioned.q.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,10 @@ STAGE PLANS:
4848
Map Operator Tree:
4949
TableScan
5050
alias: tbl_ice
51-
filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22)) or (b) IN ('four', 'one') or (a = 22)) (type: boolean)
51+
filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22)) or (b) IN ('four', 'one') or (a = 22)) (type: boolean)
5252
Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE
5353
Filter Operator
54-
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) and FILE__PATH is not null) (type: boolean)
54+
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) (type: boolean)
5555
Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE
5656
Select Operator
5757
expressions: a (type: int), b (type: string), c (type: int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string)

iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_7.q.out

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -150,27 +150,27 @@ Stage-0
150150
File Output Operator [FS_61]
151151
Limit [LIM_60] (rows=20 width=447)
152152
Number of rows:20
153-
Select Operator [SEL_59] (rows=473 width=447)
153+
Select Operator [SEL_59] (rows=791 width=447)
154154
Output:["_col0","_col1","_col2","_col3","_col4"]
155155
<-Map 1 [SIMPLE_EDGE] vectorized, llap
156156
SHUFFLE [RS_58]
157-
Top N Key Operator [TNK_57] (rows=473 width=447)
157+
Top N Key Operator [TNK_57] (rows=791 width=447)
158158
keys:_col0,top n:20
159-
Map Join Operator [MAPJOIN_56] (rows=473 width=447)
159+
Map Join Operator [MAPJOIN_56] (rows=791 width=447)
160160
BucketMapJoin:true,Conds:SEL_55._col0, _col1=RS_53._col0, _col1(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
161161
<-Map 3 [CUSTOM_EDGE] vectorized, llap
162162
MULTICAST [RS_53]
163163
PartitionCols:_col0, _col1
164-
Select Operator [SEL_52] (rows=387 width=178)
164+
Select Operator [SEL_52] (rows=500 width=178)
165165
Output:["_col0","_col1"]
166-
Filter Operator [FIL_51] (rows=387 width=178)
167-
predicate:(((key < '0') or ((key > '0') and (key < '100')) or (key > '100')) and value is not null)
166+
Filter Operator [FIL_51] (rows=500 width=178)
167+
predicate:((key <> '0') and (key <> '100') and value is not null)
168168
TableScan [TS_3] (rows=500 width=178)
169169
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
170-
<-Select Operator [SEL_55] (rows=387 width=269)
170+
<-Select Operator [SEL_55] (rows=500 width=269)
171171
Output:["_col0","_col1","_col2"]
172-
Filter Operator [FIL_54] (rows=387 width=269)
173-
predicate:(((key1 < '0') or ((key1 > '0') and (key1 < '100')) or (key1 > '100')) and key2 is not null)
172+
Filter Operator [FIL_54] (rows=500 width=269)
173+
predicate:((key1 <> '0') and (key1 <> '100') and key2 is not null)
174174
TableScan [TS_0] (rows=500 width=269)
175175
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:8,Grouping Partition Columns:["key1","key2"],Output:["key1","key2","value"]
176176

@@ -346,27 +346,27 @@ Stage-0
346346
File Output Operator [FS_41]
347347
Limit [LIM_40] (rows=20 width=447)
348348
Number of rows:20
349-
Select Operator [SEL_39] (rows=473 width=447)
349+
Select Operator [SEL_39] (rows=791 width=447)
350350
Output:["_col0","_col1","_col2","_col3","_col4"]
351351
<-Map 1 [SIMPLE_EDGE] vectorized, llap
352352
SHUFFLE [RS_38]
353-
Top N Key Operator [TNK_37] (rows=473 width=447)
353+
Top N Key Operator [TNK_37] (rows=791 width=447)
354354
keys:_col0,top n:20
355-
Map Join Operator [MAPJOIN_36] (rows=473 width=447)
355+
Map Join Operator [MAPJOIN_36] (rows=791 width=447)
356356
BucketMapJoin:true,Conds:SEL_35._col0=RS_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
357357
<-Map 3 [CUSTOM_EDGE] vectorized, llap
358358
MULTICAST [RS_33]
359359
PartitionCols:_col0
360-
Select Operator [SEL_32] (rows=387 width=178)
360+
Select Operator [SEL_32] (rows=500 width=178)
361361
Output:["_col0","_col1"]
362-
Filter Operator [FIL_31] (rows=387 width=178)
363-
predicate:((key < '0') or (key > '100') or ((key > '0') and (key < '100')))
362+
Filter Operator [FIL_31] (rows=500 width=178)
363+
predicate:((key <> '0') and (key <> '100'))
364364
TableScan [TS_3] (rows=500 width=178)
365365
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
366-
<-Select Operator [SEL_35] (rows=387 width=269)
366+
<-Select Operator [SEL_35] (rows=500 width=269)
367367
Output:["_col0","_col1","_col2"]
368-
Filter Operator [FIL_34] (rows=387 width=269)
369-
predicate:((key1 < '0') or (key1 > '100') or ((key1 > '0') and (key1 < '100')))
368+
Filter Operator [FIL_34] (rows=500 width=269)
369+
predicate:((key1 <> '0') and (key1 <> '100'))
370370
TableScan [TS_0] (rows=500 width=269)
371371
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","key2","value"]
372372

@@ -435,40 +435,40 @@ POSTHOOK: Input: default@srcbucket_big
435435
Plan optimized by CBO.
436436

437437
Vertex dependency in root stage
438-
Map 2 <- Map 1 (BROADCAST_EDGE)
439-
Reducer 3 <- Map 2 (SIMPLE_EDGE)
438+
Map 1 <- Map 3 (CUSTOM_EDGE)
439+
Reducer 2 <- Map 1 (SIMPLE_EDGE)
440440

441441
Stage-0
442442
Fetch Operator
443443
limit:20
444444
Stage-1
445-
Reducer 3 vectorized, llap
445+
Reducer 2 vectorized, llap
446446
File Output Operator [FS_41]
447447
Limit [LIM_40] (rows=20 width=447)
448448
Number of rows:20
449-
Select Operator [SEL_39] (rows=612 width=447)
449+
Select Operator [SEL_39] (rows=791 width=447)
450450
Output:["_col0","_col1","_col2","_col3","_col4"]
451-
<-Map 2 [SIMPLE_EDGE] vectorized, llap
451+
<-Map 1 [SIMPLE_EDGE] vectorized, llap
452452
SHUFFLE [RS_38]
453-
Top N Key Operator [TNK_37] (rows=612 width=447)
453+
Top N Key Operator [TNK_37] (rows=791 width=447)
454454
keys:_col0,top n:20
455-
Map Join Operator [MAPJOIN_36] (rows=612 width=447)
456-
Conds:RS_33._col0=SEL_35._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
457-
<-Map 1 [BROADCAST_EDGE] vectorized, llap
458-
BROADCAST [RS_33]
455+
Map Join Operator [MAPJOIN_36] (rows=791 width=447)
456+
BucketMapJoin:true,Conds:SEL_35._col0=RS_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
457+
<-Map 3 [CUSTOM_EDGE] vectorized, llap
458+
MULTICAST [RS_33]
459459
PartitionCols:_col0
460-
Select Operator [SEL_32] (rows=387 width=269)
461-
Output:["_col0","_col1","_col2"]
462-
Filter Operator [FIL_31] (rows=387 width=269)
463-
predicate:(((key2 < 'val_0') or ((key2 > 'val_0') and (key2 < 'val_100')) or (key2 > 'val_100')) and key1 is not null)
464-
TableScan [TS_0] (rows=500 width=269)
465-
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","key2","value"]
466-
<-Select Operator [SEL_35] (rows=500 width=178)
467-
Output:["_col0","_col1"]
468-
Filter Operator [FIL_34] (rows=500 width=178)
469-
predicate:key is not null
470-
TableScan [TS_3] (rows=500 width=178)
471-
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
460+
Select Operator [SEL_32] (rows=500 width=178)
461+
Output:["_col0","_col1"]
462+
Filter Operator [FIL_31] (rows=500 width=178)
463+
predicate:key is not null
464+
TableScan [TS_3] (rows=500 width=178)
465+
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
466+
<-Select Operator [SEL_35] (rows=500 width=269)
467+
Output:["_col0","_col1","_col2"]
468+
Filter Operator [FIL_34] (rows=500 width=269)
469+
predicate:((key2 <> 'val_0') and (key2 <> 'val_100') and key1 is not null)
470+
TableScan [TS_0] (rows=500 width=269)
471+
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","key2","value"]
472472

473473
PREHOOK: query: SELECT *
474474
FROM srcbucket_big a

iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_partitioned.q.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,10 @@ STAGE PLANS:
7171
Map Operator Tree:
7272
TableScan
7373
alias: tbl_ice
74-
filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) (type: boolean)
74+
filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) (type: boolean)
7575
Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: PARTIAL
7676
Filter Operator
77-
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) and FILE__PATH is not null) (type: boolean)
77+
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) (type: boolean)
7878
Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: PARTIAL
7979
Select Operator
8080
expressions: a (type: int), b (type: string), c (type: int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string)

iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_unpartitioned.q.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ STAGE PLANS:
7171
Map Operator Tree:
7272
TableScan
7373
alias: tbl_ice
74-
filterExpr: ((a = 22) or (b) IN ('four', 'one') or ((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) (type: boolean)
74+
filterExpr: ((a = 22) or (b) IN ('four', 'one') or ((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) (type: boolean)
7575
Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE
7676
Filter Operator
7777
predicate: ((a = 22) or (b) IN ('four', 'one')) (type: boolean)
@@ -93,7 +93,7 @@ STAGE PLANS:
9393
Map-reduce partition columns: FILE__PATH (type: string)
9494
Statistics: Num rows: 4 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE
9595
Filter Operator
96-
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) and FILE__PATH is not null) (type: boolean)
96+
predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) (type: boolean)
9797
Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE
9898
Select Operator
9999
expressions: a (type: int), b (type: string), c (type: int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string)

ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/SearchTransformer.java

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -72,30 +72,44 @@ public SearchTransformer(RexBuilder rexBuilder, RexCall search, final RexUnknown
7272
this.unknownContext = unknownContext;
7373
}
7474

75+
/**
76+
* Transforms the SEARCH expression into an equivalent RexNode expression.
77+
* Warning: when called from a shuttle, callers of this method should consider flattening AND/OR expressions
78+
* afterward, to get the same result as applying {@link SearchTransformer.Shuttle}.
79+
*/
7580
public RexNode transform() {
7681
PerfLogger perfLogger = SessionState.getPerfLogger();
7782
perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.SEARCH_TRANSFORMER);
7883

79-
RangeConverter<C> consumer = new RangeConverter<>(rexBuilder, operandType, ref);
80-
RangeSets.forEach(sarg.rangeSet, consumer);
81-
8284
List<RexNode> orList = new ArrayList<>();
8385
if (sarg.nullAs == RexUnknownAs.TRUE && unknownContext != RexUnknownAs.TRUE) {
8486
orList.add(rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref));
8587
}
86-
switch (consumer.inLiterals.size()) {
87-
case 0:
88-
break;
89-
case 1:
90-
orList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, consumer.inLiterals.get(0)));
91-
break;
92-
default:
93-
List<RexNode> operands = new ArrayList<>(consumer.inLiterals.size() + 1);
94-
operands.add(ref);
95-
operands.addAll(consumer.inLiterals);
96-
orList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands));
88+
89+
if (sarg.isComplementedPoints()) {
90+
// Generate 'ref <> value1 AND ... AND ref <> valueN'
91+
List<RexNode> list = sarg.rangeSet.complement().asRanges().stream().map(
92+
range -> rexBuilder.makeCall(SqlStdOperatorTable.NOT_EQUALS, ref,
93+
rexBuilder.makeLiteral(range.lowerEndpoint(), operandType, true, true))).toList();
94+
orList.add(RexUtil.composeConjunction(rexBuilder, list));
95+
} else {
96+
RangeConverter<C> consumer = new RangeConverter<>(rexBuilder, operandType, ref);
97+
RangeSets.forEach(sarg.rangeSet, consumer);
98+
99+
switch (consumer.inLiterals.size()) {
100+
case 0:
101+
break;
102+
case 1:
103+
orList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, consumer.inLiterals.get(0)));
104+
break;
105+
default:
106+
List<RexNode> operands = new ArrayList<>(consumer.inLiterals.size() + 1);
107+
operands.add(ref);
108+
operands.addAll(consumer.inLiterals);
109+
orList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands));
110+
}
111+
orList.addAll(consumer.nodes);
97112
}
98-
orList.addAll(consumer.nodes);
99113
RexNode x = RexUtil.composeDisjunction(rexBuilder, orList);
100114

101115
if (sarg.nullAs == RexUnknownAs.FALSE && unknownContext != RexUnknownAs.FALSE) {

ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ExprNodeConverter.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
8383
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
8484
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
85+
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
8586
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
8687
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
8788
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
@@ -214,6 +215,12 @@ public ExprNodeDesc visitCall(RexCall call) {
214215
&& SqlTypeUtil.equalSansNullability(dTFactory, call.getType(),
215216
call.operands.get(0).getType())) {
216217
return args.get(0);
218+
} else if (call.isA(SqlKind.AND)) {
219+
// Make sure AND is flattened (we may have nested ANDs due to SearchTransformer conversion above)
220+
return ExprNodeDescUtils.and(args);
221+
} else if (call.isA(SqlKind.OR)) {
222+
// Make sure OR is flattened (we may have nested ORs due to SearchTransformer conversion above)
223+
return ExprNodeDescUtils.or(args);
217224
} else {
218225
GenericUDF hiveUdf = SqlFunctionConverter.getHiveUDF(call.getOperator(), call.getType(),
219226
args.size());

ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
import java.util.List;
6565
import java.util.Map;
6666
import java.util.Set;
67+
import java.util.function.Predicate;
6768

6869

6970
public class ExprNodeDescUtils {
@@ -243,6 +244,21 @@ public static ExprNodeGenericFuncDesc and(List<ExprNodeDesc> exps) {
243244
return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, new GenericUDFOPAnd(), "and", flatExps);
244245
}
245246

247+
/**
248+
* Creates a disjunction (OR) of the given expressions flattening nested disjunctions if possible.
249+
* <pre>
250+
* Input: AND(A, B), C, OR(D, OR(E, F))
251+
* Output: OR(AND(A, B), C, D, E, F)
252+
* </pre>
253+
*/
254+
public static ExprNodeGenericFuncDesc or(List<ExprNodeDesc> exps) {
255+
List<ExprNodeDesc> flatExps = new ArrayList<>();
256+
for (ExprNodeDesc e : exps) {
257+
split(e, flatExps, FunctionRegistry::isOpOr);
258+
}
259+
return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, new GenericUDFOPOr(), "or", flatExps);
260+
}
261+
246262
/**
247263
* Create an expression for computing a murmur hash by recursively hashing given expressions by two:
248264
* <pre>
@@ -305,9 +321,17 @@ public static List<ExprNodeDesc> split(ExprNodeDesc current) {
305321
* split predicates by AND op
306322
*/
307323
public static List<ExprNodeDesc> split(ExprNodeDesc current, List<ExprNodeDesc> splitted) {
308-
if (FunctionRegistry.isOpAnd(current)) {
324+
return split(current, splitted, FunctionRegistry::isOpAnd);
325+
}
326+
327+
/**
328+
* split predicates by a certain condition
329+
*/
330+
private static List<ExprNodeDesc> split(ExprNodeDesc current, List<ExprNodeDesc> splitted,
331+
Predicate<ExprNodeDesc> condition) {
332+
if (condition.test(current)) {
309333
for (ExprNodeDesc child : current.getChildren()) {
310-
split(child, splitted);
334+
split(child, splitted, condition);
311335
}
312336
return splitted;
313337
}

ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,17 @@ public void testBetweenSelectivityLeftEqualsRight_KO() {
371371
betweenSelectivity(KLL, 2, 2);
372372
}
373373

374+
@Test
375+
public void testComputeNotEqualsPredicateSelectivity() {
376+
RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.AND,
377+
REX_BUILDER.makeCall(SqlStdOperatorTable.NOT_EQUALS, inputRef0, int3),
378+
REX_BUILDER.makeCall(SqlStdOperatorTable.NOT_EQUALS, inputRef0, int7));
379+
filter = simplify(filter);
380+
Assert.assertEquals(SqlKind.SEARCH, filter.getKind());
381+
FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq);
382+
Assert.assertEquals(0.8095238095238095, estimator.estimateSelectivity(filter), DELTA);
383+
}
384+
374385
@Test
375386
public void testComputeRangePredicateSelectivityWhenNoStats() {
376387
RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN, inputRef0, int3);

ql/src/test/results/clientpositive/llap/folder_predicate.q.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ STAGE PLANS:
4141
Processor Tree:
4242
TableScan
4343
alias: predicate_fold_tb
44-
filterExpr: (value is null or (value < 3) or (value > 3)) (type: boolean)
44+
filterExpr: ((value <> 3) or value is null) (type: boolean)
4545
Filter Operator
46-
predicate: (value is null or (value < 3) or (value > 3)) (type: boolean)
46+
predicate: ((value <> 3) or value is null) (type: boolean)
4747
Select Operator
4848
expressions: value (type: int)
4949
outputColumnNames: _col0

0 commit comments

Comments
 (0)