Skip to content

Commit 07a4200

Browse files
committed
add approx distinct count
Signed-off-by: xinyual <xinyual@amazon.com>
1 parent e6116bc commit 07a4200

8 files changed

Lines changed: 84 additions & 0 deletions

File tree

core/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ dependencies {
6363
api 'org.apache.calcite:calcite-linq4j:1.38.0'
6464
api project(':common')
6565
implementation "com.github.seancfoley:ipaddress:5.4.2"
66+
implementation "com.google.zetasketch:zetasketch:0.1.0"
6667

6768
annotationProcessor('org.immutables:value:2.8.8')
6869
compileOnly('org.immutables:value-annotations:2.8.8')
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.calcite.udf.udaf;
7+
8+
import com.google.zetasketch.HyperLogLogPlusPlus;
9+
import org.opensearch.sql.calcite.udf.UserDefinedAggFunction;
10+
11+
public class DistinctCountApproxAggFunction
12+
implements UserDefinedAggFunction<DistinctCountApproxAggFunction.HLLAccumulator> {
13+
14+
@Override
15+
public HLLAccumulator init() {
16+
return new HLLAccumulator();
17+
}
18+
19+
@Override
20+
public Object result(HLLAccumulator accumulator) {
21+
return accumulator.value();
22+
}
23+
24+
@Override
25+
public HLLAccumulator add(HLLAccumulator acc, Object... values) {
26+
for (Object value : values) {
27+
if (value != null) {
28+
acc.add(value.toString());
29+
}
30+
}
31+
return acc;
32+
}
33+
34+
public static class HLLAccumulator implements UserDefinedAggFunction.Accumulator {
35+
private final HyperLogLogPlusPlus<String> hll;
36+
37+
public HLLAccumulator() {
38+
this.hll = new HyperLogLogPlusPlus.Builder().buildForStrings();
39+
}
40+
41+
public void add(String value) {
42+
hll.add(value);
43+
}
44+
45+
@Override
46+
public Object value(Object... args) {
47+
return hll.result();
48+
}
49+
}
50+
}

core/src/main/java/org/opensearch/sql/calcite/utils/PlanUtils.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import org.opensearch.sql.ast.expression.WindowBound;
3434
import org.opensearch.sql.ast.expression.WindowFrame;
3535
import org.opensearch.sql.calcite.CalcitePlanContext;
36+
import org.opensearch.sql.calcite.udf.udaf.DistinctCountApproxAggFunction;
3637
import org.opensearch.sql.calcite.udf.udaf.PercentileApproxFunction;
3738
import org.opensearch.sql.calcite.udf.udaf.TakeAggFunction;
3839
import org.opensearch.sql.expression.function.BuiltinFunctionName;
@@ -218,6 +219,8 @@ static RelBuilder.AggCall makeAggCall(
218219
case AVG:
219220
return context.relBuilder.avg(distinct, null, field);
220221
case COUNT:
222+
// return context.relBuilder.aggregateCall(SqlStdOperatorTable.APPROX_COUNT_DISTINCT,
223+
// field);
221224
return context.relBuilder.count(
222225
distinct, null, field == null ? ImmutableList.of() : ImmutableList.of(field));
223226
case SUM:
@@ -238,6 +241,14 @@ static RelBuilder.AggCall makeAggCall(
238241
// case PERCENTILE_APPROX:
239242
// return
240243
// context.relBuilder.aggregateCall(SqlStdOperatorTable.PERCENTILE_CONT, field);
244+
case DISTINCT_COUNT_APPROX:
245+
return TransferUserDefinedAggFunction(
246+
DistinctCountApproxAggFunction.class,
247+
"APPROX_DISTINCT_COUNT",
248+
ReturnTypes.BIGINT_FORCE_NULLABLE,
249+
List.of(field),
250+
argList,
251+
context.relBuilder);
241252
case TAKE:
242253
return TransferUserDefinedAggFunction(
243254
TakeAggFunction.class,

core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ public enum BuiltinFunctionName {
188188
TAKE(FunctionName.of("take")),
189189
// t-digest percentile which is used in OpenSearch core by default.
190190
PERCENTILE_APPROX(FunctionName.of("percentile_approx")),
191+
DISTINCT_COUNT_APPROX(FunctionName.of("distinct_count_approx")),
191192
// Not always an aggregation query
192193
NESTED(FunctionName.of("nested")),
193194

@@ -310,6 +311,7 @@ public enum BuiltinFunctionName {
310311
.put("take", BuiltinFunctionName.TAKE)
311312
.put("percentile", BuiltinFunctionName.PERCENTILE_APPROX)
312313
.put("percentile_approx", BuiltinFunctionName.PERCENTILE_APPROX)
314+
.put("distinct_count_approx", BuiltinFunctionName.DISTINCT_COUNT_APPROX)
313315
.build();
314316

315317
private static final Map<String, BuiltinFunctionName> WINDOW_FUNC_MAPPING =

integ-test/src/test/java/org/opensearch/sql/calcite/standalone/CalcitePPLAggregationIT.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,17 @@ public void testCountDistinct() {
514514
verifyDataRows(actual, rows(3, "F"), rows(4, "M"));
515515
}
516516

517+
@Test
518+
public void testCountDistinctApprox() {
519+
JSONObject actual =
520+
executeQuery(
521+
String.format(
522+
"source=%s | stats distinct_count_approx(state) by gender", TEST_INDEX_BANK));
523+
verifySchema(
524+
actual, schema("gender", "string"), schema("distinct_count_approx(state)", "long"));
525+
verifyDataRows(actual, rows(3, "F"), rows(4, "M"));
526+
}
527+
517528
@Test
518529
public void testCountDistinctWithAlias() {
519530
JSONObject actual =

ppl/src/main/antlr/OpenSearchPPLLexer.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ BIT_XOR_OP: '^';
203203
AVG: 'AVG';
204204
COUNT: 'COUNT';
205205
DISTINCT_COUNT: 'DISTINCT_COUNT';
206+
DISTINCT_COUNT_APPROX: 'DISTINCT_COUNT_APPROX';
206207
ESTDC: 'ESTDC';
207208
ESTDC_ERROR: 'ESTDC_ERROR';
208209
MAX: 'MAX';

ppl/src/main/antlr/OpenSearchPPLParser.g4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ statsFunction
389389
: statsFunctionName LT_PRTHS valueExpression RT_PRTHS # statsFunctionCall
390390
| COUNT LT_PRTHS RT_PRTHS # countAllFunctionCall
391391
| (DISTINCT_COUNT | DC) LT_PRTHS valueExpression RT_PRTHS # distinctCountFunctionCall
392+
| DISTINCT_COUNT_APPROX LT_PRTHS valueExpression RT_PRTHS # distinctCountApproxFunctionCall
392393
| takeAggFunction # takeAggFunctionCall
393394
| percentileApproxFunction # percentileApproxFunctionCall
394395
;
@@ -1096,6 +1097,7 @@ keywordsCanBeId
10961097
| statsFunctionName
10971098
| windowFunctionName
10981099
| DISTINCT_COUNT
1100+
| DISTINCT_COUNT_APPROX
10991101
| ESTDC
11001102
| ESTDC_ERROR
11011103
| MEAN

ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,12 @@ public UnresolvedExpression visitDistinctCountFunctionCall(DistinctCountFunction
203203
return new AggregateFunction("count", visit(ctx.valueExpression()), true);
204204
}
205205

206+
@Override
207+
public UnresolvedExpression visitDistinctCountApproxFunctionCall(
208+
OpenSearchPPLParser.DistinctCountApproxFunctionCallContext ctx) {
209+
return new AggregateFunction("distinct_count_approx", visit(ctx.valueExpression()), true);
210+
}
211+
206212
@Override
207213
public UnresolvedExpression visitPercentileApproxFunctionCall(
208214
OpenSearchPPLParser.PercentileApproxFunctionCallContext ctx) {

0 commit comments

Comments
 (0)