Skip to content

Commit 1a16493

Browse files
authored
branch-2.1:[improvement](statistics)Eliminate null values while sample analyzing ndv. (#50574) (#51648)
backport: #50574
1 parent d4a5039 commit 1a16493

8 files changed

Lines changed: 177 additions & 22 deletions

File tree

fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -885,8 +885,15 @@ private ColumnStatistic getColumnStatistic(TableIf table, String colName, long i
885885
return ColumnStatistic.UNKNOWN;
886886
}
887887
} else {
888-
return Env.getCurrentEnv().getStatisticsCache().getColumnStatistics(
889-
catalogId, dbId, table.getId(), idxId, colName);
888+
ColumnStatistic columnStatistics = Env.getCurrentEnv().getStatisticsCache().getColumnStatistics(
889+
catalogId, dbId, table.getId(), idxId, colName);
890+
if (!columnStatistics.isUnKnown
891+
&& columnStatistics.ndv == 0
892+
&& (columnStatistics.minExpr != null || columnStatistics.maxExpr != null)
893+
&& columnStatistics.numNulls == columnStatistics.count) {
894+
return ColumnStatistic.UNKNOWN;
895+
}
896+
return columnStatistics;
890897
}
891898
}
892899

fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -237,16 +237,14 @@ protected String getMinFunction() {
237237
}
238238

239239
protected String getNdvFunction(String totalRows) {
240-
String sampleRows = "SUM(`t1`.`count`)";
241-
String onceCount = "SUM(IF(`t1`.`count` = 1, 1, 0))";
242-
String countDistinct = "COUNT(1)";
240+
String n = "SUM(`t1`.`count`)"; // sample rows
241+
String f1 = "SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0))";
242+
String d = "COUNT(`t1`.`column_key`)"; // sample ndv
243243
// DUJ1 estimator: n*d / (n - f1 + f1*n/N)
244244
// f1 is the number of elements that appear exactly once in the sample.
245245
// (https://github.com/postgres/postgres/blob/master/src/backend/commands/analyze.c)
246246
// (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.93.8637&rep=rep1&type=pdf)
247-
// sample_row * count_distinct / ( sample_row - once_count + once_count * sample_row / total_row)
248-
return MessageFormat.format("{0} * {1} / ({0} - {2} + {2} * {0} / {3})", sampleRows,
249-
countDistinct, onceCount, totalRows);
247+
return MessageFormat.format("{0} * {1} / ({0} - {2} + {2} * {0} / {3})", n, d, f1, totalRows);
250248
}
251249

252250
// The max value is not accurate when sampling, so set it to NULL to keep the optimizer from generating a bad plan.

fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -189,15 +189,19 @@ public boolean isNull(String value) {
189189

190190
public boolean isValid() {
191191
if (ndv > 10 * count) {
192-
LOG.debug("Ndv {} is much larger than count {}", ndv, count);
192+
String message = String.format("ColStatsData ndv too large. %s", toSQL(true));
193+
LOG.warn(message);
193194
return false;
194195
}
195-
if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit))) {
196-
LOG.debug("Ndv is 0 but min or max exists");
196+
if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit)) && nullCount != count) {
197+
String message = String.format("ColStatsData ndv 0 but min/max is not null and nullCount != count. %s",
198+
toSQL(true));
199+
LOG.warn(message);
197200
return false;
198201
}
199-
if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) && (nullCount == 0 || count > nullCount * 10)) {
200-
LOG.debug("count {} not 0, ndv is 0, min and max are all null, null count {} is too small", count, count);
202+
if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) && (count > nullCount * 10)) {
203+
LOG.warn("count {} not 0, ndv is 0, min and max are all null, null count {} is too small",
204+
count, nullCount);
201205
return false;
202206
}
203207
return true;

fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,12 @@ public void testGetFunctions() {
6262
Assertions.assertEquals("NULL", maxFunction);
6363

6464
String ndvFunction = olapAnalysisTask.getNdvFunction(String.valueOf(100));
65-
Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) "
66-
+ "+ SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 100)", ndvFunction);
65+
Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) * SUM(`t1`.`count`) / 100)", ndvFunction);
6766
System.out.println(ndvFunction);
6867
}
6968

7069
@Test
71-
public void testInvalidColStats() {
70+
public void testNdvTooLarge() {
7271
List<String> values = Lists.newArrayList();
7372
values.add("id");
7473
values.add("10000");
@@ -101,10 +100,49 @@ public List<ResultRow> executeInternalQuery() {
101100
} catch (Exception e) {
102101
Assertions.assertEquals(e.getMessage(),
103102
"ColStatsData is invalid, skip analyzing. "
104-
+ "('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')");
103+
+ "('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')");
105104
return;
106105
}
107106
Assertions.fail();
108107
}
109108

109+
@Test
110+
public void testNdv0MinMaxExistsNullNotEqualCount() {
111+
List<String> values = Lists.newArrayList();
112+
values.add("id");
113+
values.add("10000");
114+
values.add("20000");
115+
values.add("30000");
116+
values.add("0");
117+
values.add("col");
118+
values.add(null);
119+
values.add("500"); // count
120+
values.add("0"); // ndv
121+
values.add("300"); // null
122+
values.add("min");
123+
values.add("max");
124+
values.add("400");
125+
values.add("500");
126+
ResultRow row = new ResultRow(values);
127+
List<ResultRow> result = Lists.newArrayList();
128+
result.add(row);
129+
130+
new MockUp<StmtExecutor>() {
131+
@Mock
132+
public List<ResultRow> executeInternalQuery() {
133+
return result;
134+
}
135+
};
136+
BaseAnalysisTask task = new OlapAnalysisTask();
137+
task.info = new AnalysisInfoBuilder().setJobType(JobType.MANUAL).build();
138+
try {
139+
task.runQuery("test");
140+
} catch (Exception e) {
141+
Assertions.assertEquals(e.getMessage(),
142+
"ColStatsData is invalid, skip analyzing. "
143+
+ "('id',10000,20000,30000,0,'col',null,500,0,300,'min','max',400,'500')");
144+
return;
145+
}
146+
Assertions.fail();
147+
}
110148
}

fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ public void testIsValid() {
215215
data = new ColStatsData(row);
216216
Assertions.assertFalse(data.isValid());
217217

218-
// Set max to null, min/max is not null
218+
// Set max to null; min and max are now both null
219219
values.set(11, null);
220220
row = new ResultRow(values);
221221
data = new ColStatsData(row);

fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -370,15 +370,15 @@ protected boolean useLinearAnalyzeTemplate() {
370370
Assertions.assertTrue(task.scanFullTable());
371371
Assertions.assertEquals("1.0", params.get("scaleFactor"));
372372
Assertions.assertEquals("", params.get("sampleHints"));
373-
Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 10)", params.get("ndvFunction"));
373+
Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) * SUM(`t1`.`count`) / 10)", params.get("ndvFunction"));
374374
params.clear();
375375

376376
task = new OlapAnalysisTask();
377377
task.col = new Column("test", PrimitiveType.INT);
378378
task.getSampleParams(params, 1000);
379379
Assertions.assertEquals("10.0", params.get("scaleFactor"));
380380
Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints"));
381-
Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction"));
381+
Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction"));
382382
Assertions.assertEquals("SUM(t1.count) * 4", params.get("dataSizeFunction"));
383383
Assertions.assertEquals("`${colName}`", params.get("subStringColName"));
384384
params.clear();

regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,14 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_d
4141
result = sql """show column stats orc_all_types (int_col);"""
4242
assertEquals("int_col", result[0][0])
4343
assertEquals("3600.0", result[0][2])
44-
assertEquals("3240.0", result[0][3])
44+
assertEquals("3239.0", result[0][3])
4545
assertEquals("361.0", result[0][4])
4646
assertEquals("14400.0", result[0][5])
4747

4848
result = sql """show column stats orc_all_types (string_col);"""
4949
assertEquals("string_col", result[0][0])
5050
assertEquals("3600.0", result[0][2])
51-
assertEquals("3254.0", result[0][3])
51+
assertEquals("3253.0", result[0][3])
5252
assertEquals("347.0", result[0][4])
5353
assertEquals("453634.0", result[0][5])
5454

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
suite("test_analyze_all_null") {
19+
20+
def wait_row_count_reported = { db, table, row, column, expected ->
21+
def result = sql """show frontends;"""
22+
logger.info("show frontends result origin: " + result)
23+
def host
24+
def port
25+
for (int i = 0; i < result.size(); i++) {
26+
if (result[i][8] == "true") {
27+
host = result[i][1]
28+
port = result[i][4]
29+
}
30+
}
31+
def tokens = context.config.jdbcUrl.split('/')
32+
def url=tokens[0] + "//" + host + ":" + port
33+
logger.info("Master url is " + url)
34+
connect(context.config.jdbcUser, context.config.jdbcPassword, url) {
35+
sql """use ${db}"""
36+
result = sql """show frontends;"""
37+
logger.info("show frontends result master: " + result)
38+
for (int i = 0; i < 120; i++) {
39+
Thread.sleep(5000)
40+
result = sql """SHOW DATA FROM ${table};"""
41+
logger.info("result " + result)
42+
if (result[row][column] == expected) {
43+
return;
44+
}
45+
}
46+
throw new Exception("Row count report timeout.")
47+
}
48+
49+
}
50+
51+
sql """drop database if exists test_analyze_all_null"""
52+
sql """create database test_analyze_all_null"""
53+
sql """use test_analyze_all_null"""
54+
sql """set global enable_auto_analyze=false"""
55+
56+
sql """CREATE TABLE allnull (
57+
key1 int NULL,
58+
value1 varchar(25) NULL
59+
)ENGINE=OLAP
60+
DUPLICATE KEY(`key1`)
61+
COMMENT "OLAP"
62+
DISTRIBUTED BY HASH(`key1`) BUCKETS 2
63+
PROPERTIES (
64+
"replication_num" = "1"
65+
)
66+
"""
67+
sql """insert into allnull select null, null from numbers("number"="10000000")"""
68+
wait_row_count_reported("test_analyze_all_null", "allnull", 0, 4, "10000000")
69+
sql """analyze table allnull with sample rows 4000000 with sync"""
70+
71+
def result = sql """show column stats allnull(key1)"""
72+
assertEquals(1, result.size())
73+
assertEquals("1.0E7", result[0][2])
74+
assertEquals("0.0", result[0][3])
75+
result = sql """show column stats allnull(value1)"""
76+
assertEquals(1, result.size())
77+
assertEquals("1.0E7", result[0][2])
78+
assertEquals("0.0", result[0][3])
79+
80+
sql """CREATE TABLE invalidTest (
81+
col1 int NULL,
82+
col2 string NULL,
83+
col3 string NULL
84+
)ENGINE=OLAP
85+
DUPLICATE KEY(`col1`)
86+
COMMENT "OLAP"
87+
DISTRIBUTED BY HASH(`col1`) BUCKETS 2
88+
PROPERTIES (
89+
"replication_num" = "1"
90+
)
91+
"""
92+
sql """insert into invalidTest values(1, "1", "1")"""
93+
94+
sql """alter table invalidTest modify column col1 set stats ('row_count'='100', 'ndv'='100', 'num_nulls'='0.0', 'data_size'='3.2E8', 'min_value'='1', 'max_value'='20000000');"""
95+
sql """alter table invalidTest modify column col2 set stats ('row_count'='100', 'ndv'='0', 'num_nulls'='0.0', 'data_size'='3.2E8', 'min_value'='min', 'max_value'='max');"""
96+
sql """alter table invalidTest modify column col3 set stats ('row_count'='100', 'ndv'='0', 'num_nulls'='100', 'data_size'='3.2E8', 'min_value'='min', 'max_value'='max');"""
97+
result = sql """show column cached stats invalidTest"""
98+
assertEquals(3, result.size())
99+
100+
explain {
101+
sql("memo plan select * from invalidTest")
102+
contains "col1#0 -> ndv=100.0000"
103+
contains "col2#1 -> ndv=0.0000"
104+
contains "col3#2 -> unknown(100.0)"
105+
}
106+
107+
sql """drop database if exists test_analyze_all_null"""
108+
}

0 commit comments

Comments
 (0)