Skip to content

Commit 525656d

Browse files
committed
Lower structured search predicates to native RexCall in visitSearch
Previously `visitSearch` discarded `Search.getOriginalExpression()` and emitted a single `query_string(...)` function call covering the whole predicate. On the analytics-engine route every such filter is delegation-routed to Lucene; on a parquet-only composite shard the delegation handle NPEs in `LuceneAnalyticsBackendPlugin.getFilterDelegationHandle` because there is no Lucene reader. Walk the typed `SearchExpression` AST instead. Structured fragments — field comparisons, AND, OR, NOT, IN, plus time-modifier-resolved comparisons (`earliest=` / `latest=`) — become native PPL filter AST (`Compare` / `And` / `Or` / `Not` / `In`) that the existing rexVisitor handles like any `where`-clause condition. DataFusion executes them natively against parquet without any Lucene round-trip. Free-text / phrase literals stay in `query_string` form because they have no native equivalent. Top-level `AND` mixes the two: the structured conjunct is lowered, the relevance conjunct stays in `query_string`, and the resulting `AND(native, query_string(...))` lets the planner route each clause independently — DataFusion prunes parquet, Lucene handles the relevance term against the secondary format. Three guards keep the lowering faithful to PPL search semantics: - `containsLuceneWildcard` — `*` / `?` in a comparison value forces the fallback so Lucene wildcards keep working (`severityText=ERR*`). - `isOpenSearchDateMath` — values produced by `visitTimeModifierExpression` (`now`, `now-1h`, `now+1y/M-1M`, epoch-millis strings) can't be parsed as Calcite timestamps; keep them in `query_string` so Lucene evaluates the date math. - `isLowerableField` — comparisons against fields absent from the relation's row type fall through (matches Lucene's "missing field → no docs" semantics; native would error out in the analyzer). `SearchNot` always falls back: Lucene `NOT field=value` matches docs where the field is missing OR field != value, but `NOT (field = value)` in Calcite simplifies to `field <> value`, which drops missing-field rows since `NULL <> value` evaluates to NULL. Net `CalciteSearchCommandIT` impact (analytics-engine route, all OS- side fixes in `opensearch-project/OpenSearch#21681` in place): 3/52 → 35/52. Tests that pass via the native path include all field-equality, AND, OR, IN, range, double-comparison, attribute-field, and mixed boolean tests. Signed-off-by: Kai Huang <ahkcs@amazon.com>
1 parent 41c2bc4 commit 525656d

1 file changed

Lines changed: 206 additions & 8 deletions

File tree

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 206 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,14 @@
102102
import org.opensearch.sql.ast.expression.PatternMethod;
103103
import org.opensearch.sql.ast.expression.PatternMode;
104104
import org.opensearch.sql.ast.expression.QualifiedName;
105+
import org.opensearch.sql.ast.expression.SearchAnd;
106+
import org.opensearch.sql.ast.expression.SearchComparison;
107+
import org.opensearch.sql.ast.expression.SearchExpression;
108+
import org.opensearch.sql.ast.expression.SearchGroup;
109+
import org.opensearch.sql.ast.expression.SearchIn;
110+
import org.opensearch.sql.ast.expression.SearchLiteral;
111+
import org.opensearch.sql.ast.expression.SearchNot;
112+
import org.opensearch.sql.ast.expression.SearchOr;
105113
import org.opensearch.sql.ast.expression.Span;
106114
import org.opensearch.sql.ast.expression.SpanUnit;
107115
import org.opensearch.sql.ast.expression.UnresolvedExpression;
@@ -280,17 +288,207 @@ private RelBuilder scan(RelOptTable tableSchema, CalcitePlanContext context) {
280288
public RelNode visitSearch(Search node, CalcitePlanContext context) {
281289
// Visit the Relation child to get the scan
282290
node.getChild().get(0).accept(this, context);
283-
// Create query_string function
284-
Function queryStringFunc =
285-
AstDSL.function(
286-
"query_string",
287-
AstDSL.unresolvedArg("query", AstDSL.stringLiteral(node.getQueryString())));
288-
RexNode queryStringRex = rexVisitor.analyze(queryStringFunc, context);
289-
290-
context.relBuilder.filter(queryStringRex);
291+
292+
// Lower the typed SearchExpression AST to a native filter where possible. The
293+
// structured fragments (field comparisons, AND / OR / NOT, IN, time-modifier-resolved
294+
// comparisons) become a plain Compare / And / Or / Not / In AST that
295+
// ExpressionAnalyzer/rexVisitor handles like any `where`-clause condition, so the
296+
// analytics-engine route can mark them as DataFusion-native predicates instead of
297+
// delegating the whole thing to Lucene as a `query_string(...)`. Free-text / phrase
298+
// literals — and any comparison whose value looks like an OpenSearch date-math
299+
// expression or references a field absent from the relation's row type — stay in
300+
// query_string form because they have no faithful native equivalent.
301+
Set<String> knownFields = new HashSet<>(context.relBuilder.peek().getRowType().getFieldNames());
302+
SearchExpression original = node.getOriginalExpression();
303+
UnresolvedExpression filter =
304+
original != null
305+
? buildSearchFilter(original, node.getQueryString(), knownFields)
306+
: buildQueryStringFunction(node.getQueryString());
307+
RexNode filterRex = rexVisitor.analyze(filter, context);
308+
309+
context.relBuilder.filter(filterRex);
291310
return context.relBuilder.peek();
292311
}
293312

313+
/**
314+
* Build the filter AST for a search predicate. Prefers a fully-native lowering of {@code e};
315+
* falls back to {@code AND}-splitting a top-level conjunction with one native and one free-text
316+
* side; finally falls back to wrapping the original query string in {@code query_string(...)}.
317+
*/
318+
private UnresolvedExpression buildSearchFilter(
319+
SearchExpression e, String fallbackQueryString, Set<String> knownFields) {
320+
Optional<UnresolvedExpression> structured = lowerSearchExpression(e, knownFields);
321+
if (structured.isPresent()) {
322+
return structured.get();
323+
}
324+
// Top-level AND-split: native(L) AND queryString(toQueryString(R)) (or symmetric).
325+
// Only applied at the outermost conjunction — deeper splits would change OR semantics.
326+
if (e instanceof SearchAnd) {
327+
SearchAnd and = (SearchAnd) e;
328+
Optional<UnresolvedExpression> leftNative = lowerSearchExpression(and.getLeft(), knownFields);
329+
Optional<UnresolvedExpression> rightNative =
330+
lowerSearchExpression(and.getRight(), knownFields);
331+
if (leftNative.isPresent() && rightNative.isEmpty()) {
332+
return AstDSL.and(
333+
leftNative.get(), buildQueryStringFunction(and.getRight().toQueryString()));
334+
}
335+
if (rightNative.isPresent() && leftNative.isEmpty()) {
336+
return AstDSL.and(
337+
buildQueryStringFunction(and.getLeft().toQueryString()), rightNative.get());
338+
}
339+
}
340+
// Fully relevance / mixed-OR / unsupported — keep the original query_string fallback.
341+
return buildQueryStringFunction(fallbackQueryString);
342+
}
343+
344+
/**
345+
* Recursively translate a {@link SearchExpression} to the equivalent generic-PPL filter AST.
346+
* Returns empty whenever any subtree contains a {@link SearchLiteral} (free text or phrase), a
347+
* {@link SearchComparison} whose value looks like an OpenSearch date-math expression (handled
348+
* correctly only by Lucene), or a comparison/IN against a field that isn't in the relation's row
349+
* type (which the analyzer would otherwise reject; PPL search semantics on unknown fields is
350+
* "match nothing", matched by the query_string fallback).
351+
*/
352+
private Optional<UnresolvedExpression> lowerSearchExpression(
353+
SearchExpression e, Set<String> knownFields) {
354+
if (e instanceof SearchGroup) {
355+
return lowerSearchExpression(((SearchGroup) e).getExpression(), knownFields);
356+
}
357+
if (e instanceof SearchAnd) {
358+
SearchAnd and = (SearchAnd) e;
359+
Optional<UnresolvedExpression> left = lowerSearchExpression(and.getLeft(), knownFields);
360+
Optional<UnresolvedExpression> right = lowerSearchExpression(and.getRight(), knownFields);
361+
if (left.isPresent() && right.isPresent()) {
362+
return Optional.of(AstDSL.and(left.get(), right.get()));
363+
}
364+
return Optional.empty();
365+
}
366+
if (e instanceof SearchOr) {
367+
SearchOr or = (SearchOr) e;
368+
Optional<UnresolvedExpression> left = lowerSearchExpression(or.getLeft(), knownFields);
369+
Optional<UnresolvedExpression> right = lowerSearchExpression(or.getRight(), knownFields);
370+
if (left.isPresent() && right.isPresent()) {
371+
return Optional.of(AstDSL.or(left.get(), right.get()));
372+
}
373+
return Optional.empty();
374+
}
375+
if (e instanceof SearchNot) {
376+
// Lucene `NOT field=value` matches docs where the field is missing OR field != value.
377+
// SQL `NOT (field = value)` (and Calcite's simplification to `field <> value`) drops
378+
// missing-field rows because NULL <> value evaluates to NULL → filtered. Routing NOT
379+
// through the query_string fallback preserves the documented PPL search semantics.
380+
return Optional.empty();
381+
}
382+
if (e instanceof SearchComparison) {
383+
SearchComparison comp = (SearchComparison) e;
384+
if (!isLowerableField(comp.getField(), knownFields)) {
385+
return Optional.empty();
386+
}
387+
if (isOpenSearchDateMath(comp.getValue().getLiteral())) {
388+
return Optional.empty();
389+
}
390+
if (containsLuceneWildcard(comp.getValue().getLiteral())) {
391+
// `severityText=ERR*` / `field=foo?` / `name=*-service` — Lucene-style wildcards in
392+
// the right-hand value. A native `=` lowering would compare literally and drop every
393+
// matching document. Keep the query in query_string form so Lucene evaluates the
394+
// wildcard.
395+
return Optional.empty();
396+
}
397+
return Optional.of(
398+
AstDSL.compare(
399+
comp.getOperator().getSymbol(), comp.getField(), comp.getValue().getLiteral()));
400+
}
401+
if (e instanceof SearchIn) {
402+
SearchIn in = (SearchIn) e;
403+
if (!isLowerableField(in.getField(), knownFields)) {
404+
return Optional.empty();
405+
}
406+
List<UnresolvedExpression> values =
407+
in.getValues().stream().map(SearchLiteral::getLiteral).collect(Collectors.toList());
408+
return Optional.of(AstDSL.in(in.getField(), values));
409+
}
410+
// SearchLiteral (standalone free-text / phrase) — no native equivalent.
411+
return Optional.empty();
412+
}
413+
414+
private static boolean isLowerableField(Field field, Set<String> knownFields) {
415+
QualifiedName qn = (QualifiedName) field.getField();
416+
return knownFields.contains(qn.toString());
417+
}
418+
419+
/**
420+
* Recognise OpenSearch date-math values produced by {@code visitTimeModifierExpression} — {@code
421+
* "now"}, {@code "now-1h"}, {@code "now+1y/M-1M"}, anchored expressions like {@code
422+
* "2024-01-15||+1d"}, and epoch-millis strings such as {@code "1754020060000"}. These strings
423+
* have to be evaluated as Lucene-style date math; Calcite would compare them lexically. Detecting
424+
* them lets the visitor keep the comparison in {@code query_string} form so the existing
425+
* semantics survive.
426+
*/
427+
private static boolean isOpenSearchDateMath(UnresolvedExpression value) {
428+
if (!(value instanceof Literal)) {
429+
return false;
430+
}
431+
Object raw = ((Literal) value).getValue();
432+
if (!(raw instanceof String)) {
433+
return false;
434+
}
435+
String s = ((String) raw).trim();
436+
if (s.isEmpty()) {
437+
return false;
438+
}
439+
if (s.regionMatches(true, 0, "now", 0, 3)) {
440+
return true;
441+
}
442+
if (s.contains("||")) {
443+
return true;
444+
}
445+
// Epoch-millis from `earliest=1754020060` / `latest=1754020060.123` — 12+ digits only.
446+
if (s.length() >= 12) {
447+
for (int i = 0; i < s.length(); i++) {
448+
char c = s.charAt(i);
449+
if (c < '0' || c > '9') {
450+
return false;
451+
}
452+
}
453+
return true;
454+
}
455+
return false;
456+
}
457+
458+
/**
459+
* Detects an unescaped Lucene wildcard ({@code *} or {@code ?}) in a string literal. A preceding
460+
* odd run of backslashes means the wildcard is already escaped (literal star / question mark in
461+
* the value), so it's safe to lower as a regular equality.
462+
*/
463+
private static boolean containsLuceneWildcard(UnresolvedExpression value) {
464+
if (!(value instanceof Literal)) {
465+
return false;
466+
}
467+
Object raw = ((Literal) value).getValue();
468+
if (!(raw instanceof String)) {
469+
return false;
470+
}
471+
String s = (String) raw;
472+
for (int i = 0; i < s.length(); i++) {
473+
char c = s.charAt(i);
474+
if (c == '*' || c == '?') {
475+
int backslashes = 0;
476+
for (int j = i - 1; j >= 0 && s.charAt(j) == '\\'; j--) {
477+
backslashes++;
478+
}
479+
if ((backslashes & 1) == 0) {
480+
return true;
481+
}
482+
}
483+
}
484+
return false;
485+
}
486+
487+
private Function buildQueryStringFunction(String queryString) {
488+
return AstDSL.function(
489+
"query_string", AstDSL.unresolvedArg("query", AstDSL.stringLiteral(queryString)));
490+
}
491+
294492
@Override
295493
public RelNode visitFilter(Filter node, CalcitePlanContext context) {
296494
visitChildren(node, context);

0 commit comments

Comments
 (0)