|
102 | 102 | import org.opensearch.sql.ast.expression.PatternMethod; |
103 | 103 | import org.opensearch.sql.ast.expression.PatternMode; |
104 | 104 | import org.opensearch.sql.ast.expression.QualifiedName; |
| 105 | +import org.opensearch.sql.ast.expression.SearchAnd; |
| 106 | +import org.opensearch.sql.ast.expression.SearchComparison; |
| 107 | +import org.opensearch.sql.ast.expression.SearchExpression; |
| 108 | +import org.opensearch.sql.ast.expression.SearchGroup; |
| 109 | +import org.opensearch.sql.ast.expression.SearchIn; |
| 110 | +import org.opensearch.sql.ast.expression.SearchLiteral; |
| 111 | +import org.opensearch.sql.ast.expression.SearchNot; |
| 112 | +import org.opensearch.sql.ast.expression.SearchOr; |
105 | 113 | import org.opensearch.sql.ast.expression.Span; |
106 | 114 | import org.opensearch.sql.ast.expression.SpanUnit; |
107 | 115 | import org.opensearch.sql.ast.expression.UnresolvedExpression; |
@@ -280,17 +288,207 @@ private RelBuilder scan(RelOptTable tableSchema, CalcitePlanContext context) { |
280 | 288 | public RelNode visitSearch(Search node, CalcitePlanContext context) { |
281 | 289 | // Visit the Relation child to get the scan |
282 | 290 | node.getChild().get(0).accept(this, context); |
283 | | - // Create query_string function |
284 | | - Function queryStringFunc = |
285 | | - AstDSL.function( |
286 | | - "query_string", |
287 | | - AstDSL.unresolvedArg("query", AstDSL.stringLiteral(node.getQueryString()))); |
288 | | - RexNode queryStringRex = rexVisitor.analyze(queryStringFunc, context); |
289 | | - |
290 | | - context.relBuilder.filter(queryStringRex); |
| 291 | + |
| 292 | + // Lower the typed SearchExpression AST to a native filter where possible. The |
| 293 | + // structured fragments (field comparisons, AND / OR / NOT, IN, time-modifier-resolved |
| 294 | + // comparisons) become a plain Compare / And / Or / Not / In AST that |
| 295 | + // ExpressionAnalyzer/rexVisitor handles like any `where`-clause condition, so the |
| 296 | + // analytics-engine route can mark them as DataFusion-native predicates instead of |
| 297 | + // delegating the whole thing to Lucene as a `query_string(...)`. Free-text / phrase |
| 298 | + // literals — and any comparison whose value looks like an OpenSearch date-math |
| 299 | + // expression or references a field absent from the relation's row type — stay in |
| 300 | + // query_string form because they have no faithful native equivalent. |
| 301 | + Set<String> knownFields = new HashSet<>(context.relBuilder.peek().getRowType().getFieldNames()); |
| 302 | + SearchExpression original = node.getOriginalExpression(); |
| 303 | + UnresolvedExpression filter = |
| 304 | + original != null |
| 305 | + ? buildSearchFilter(original, node.getQueryString(), knownFields) |
| 306 | + : buildQueryStringFunction(node.getQueryString()); |
| 307 | + RexNode filterRex = rexVisitor.analyze(filter, context); |
| 308 | + |
| 309 | + context.relBuilder.filter(filterRex); |
291 | 310 | return context.relBuilder.peek(); |
292 | 311 | } |
293 | 312 |
|
| 313 | + /** |
| 314 | + * Build the filter AST for a search predicate. Prefers a fully-native lowering of {@code e}; |
| 315 | + * falls back to {@code AND}-splitting a top-level conjunction with one native and one free-text |
| 316 | + * side; finally falls back to wrapping the original query string in {@code query_string(...)}. |
| 317 | + */ |
| 318 | + private UnresolvedExpression buildSearchFilter( |
| 319 | + SearchExpression e, String fallbackQueryString, Set<String> knownFields) { |
| 320 | + Optional<UnresolvedExpression> structured = lowerSearchExpression(e, knownFields); |
| 321 | + if (structured.isPresent()) { |
| 322 | + return structured.get(); |
| 323 | + } |
| 324 | + // Top-level AND-split: native(L) AND queryString(toQueryString(R)) (or symmetric). |
| 325 | + // Only applied at the outermost conjunction — deeper splits would change OR semantics. |
| 326 | + if (e instanceof SearchAnd) { |
| 327 | + SearchAnd and = (SearchAnd) e; |
| 328 | + Optional<UnresolvedExpression> leftNative = lowerSearchExpression(and.getLeft(), knownFields); |
| 329 | + Optional<UnresolvedExpression> rightNative = |
| 330 | + lowerSearchExpression(and.getRight(), knownFields); |
| 331 | + if (leftNative.isPresent() && rightNative.isEmpty()) { |
| 332 | + return AstDSL.and( |
| 333 | + leftNative.get(), buildQueryStringFunction(and.getRight().toQueryString())); |
| 334 | + } |
| 335 | + if (rightNative.isPresent() && leftNative.isEmpty()) { |
| 336 | + return AstDSL.and( |
| 337 | + buildQueryStringFunction(and.getLeft().toQueryString()), rightNative.get()); |
| 338 | + } |
| 339 | + } |
| 340 | + // Fully relevance / mixed-OR / unsupported — keep the original query_string fallback. |
| 341 | + return buildQueryStringFunction(fallbackQueryString); |
| 342 | + } |
| 343 | + |
| 344 | + /** |
| 345 | + * Recursively translate a {@link SearchExpression} to the equivalent generic-PPL filter AST. |
| 346 | + * Returns empty whenever any subtree contains a {@link SearchLiteral} (free text or phrase), a |
| 347 | + * {@link SearchComparison} whose value looks like an OpenSearch date-math expression (handled |
| 348 | + * correctly only by Lucene), or a comparison/IN against a field that isn't in the relation's row |
| 349 | + * type (which the analyzer would otherwise reject; PPL search semantics on unknown fields is |
| 350 | + * "match nothing", matched by the query_string fallback). |
| 351 | + */ |
| 352 | + private Optional<UnresolvedExpression> lowerSearchExpression( |
| 353 | + SearchExpression e, Set<String> knownFields) { |
| 354 | + if (e instanceof SearchGroup) { |
| 355 | + return lowerSearchExpression(((SearchGroup) e).getExpression(), knownFields); |
| 356 | + } |
| 357 | + if (e instanceof SearchAnd) { |
| 358 | + SearchAnd and = (SearchAnd) e; |
| 359 | + Optional<UnresolvedExpression> left = lowerSearchExpression(and.getLeft(), knownFields); |
| 360 | + Optional<UnresolvedExpression> right = lowerSearchExpression(and.getRight(), knownFields); |
| 361 | + if (left.isPresent() && right.isPresent()) { |
| 362 | + return Optional.of(AstDSL.and(left.get(), right.get())); |
| 363 | + } |
| 364 | + return Optional.empty(); |
| 365 | + } |
| 366 | + if (e instanceof SearchOr) { |
| 367 | + SearchOr or = (SearchOr) e; |
| 368 | + Optional<UnresolvedExpression> left = lowerSearchExpression(or.getLeft(), knownFields); |
| 369 | + Optional<UnresolvedExpression> right = lowerSearchExpression(or.getRight(), knownFields); |
| 370 | + if (left.isPresent() && right.isPresent()) { |
| 371 | + return Optional.of(AstDSL.or(left.get(), right.get())); |
| 372 | + } |
| 373 | + return Optional.empty(); |
| 374 | + } |
| 375 | + if (e instanceof SearchNot) { |
| 376 | + // Lucene `NOT field=value` matches docs where the field is missing OR field != value. |
| 377 | + // SQL `NOT (field = value)` (and Calcite's simplification to `field <> value`) drops |
| 378 | + // missing-field rows because NULL <> value evaluates to NULL → filtered. Routing NOT |
| 379 | + // through the query_string fallback preserves the documented PPL search semantics. |
| 380 | + return Optional.empty(); |
| 381 | + } |
| 382 | + if (e instanceof SearchComparison) { |
| 383 | + SearchComparison comp = (SearchComparison) e; |
| 384 | + if (!isLowerableField(comp.getField(), knownFields)) { |
| 385 | + return Optional.empty(); |
| 386 | + } |
| 387 | + if (isOpenSearchDateMath(comp.getValue().getLiteral())) { |
| 388 | + return Optional.empty(); |
| 389 | + } |
| 390 | + if (containsLuceneWildcard(comp.getValue().getLiteral())) { |
| 391 | + // `severityText=ERR*` / `field=foo?` / `name=*-service` — Lucene-style wildcards in |
| 392 | + // the right-hand value. A native `=` lowering would compare literally and drop every |
| 393 | + // matching document. Keep the query in query_string form so Lucene evaluates the |
| 394 | + // wildcard. |
| 395 | + return Optional.empty(); |
| 396 | + } |
| 397 | + return Optional.of( |
| 398 | + AstDSL.compare( |
| 399 | + comp.getOperator().getSymbol(), comp.getField(), comp.getValue().getLiteral())); |
| 400 | + } |
| 401 | + if (e instanceof SearchIn) { |
| 402 | + SearchIn in = (SearchIn) e; |
| 403 | + if (!isLowerableField(in.getField(), knownFields)) { |
| 404 | + return Optional.empty(); |
| 405 | + } |
| 406 | + List<UnresolvedExpression> values = |
| 407 | + in.getValues().stream().map(SearchLiteral::getLiteral).collect(Collectors.toList()); |
| 408 | + return Optional.of(AstDSL.in(in.getField(), values)); |
| 409 | + } |
| 410 | + // SearchLiteral (standalone free-text / phrase) — no native equivalent. |
| 411 | + return Optional.empty(); |
| 412 | + } |
| 413 | + |
| 414 | + private static boolean isLowerableField(Field field, Set<String> knownFields) { |
| 415 | + QualifiedName qn = (QualifiedName) field.getField(); |
| 416 | + return knownFields.contains(qn.toString()); |
| 417 | + } |
| 418 | + |
| 419 | + /** |
| 420 | + * Recognise OpenSearch date-math values produced by {@code visitTimeModifierExpression} — {@code |
| 421 | + * "now"}, {@code "now-1h"}, {@code "now+1y/M-1M"}, anchored expressions like {@code |
| 422 | + * "2024-01-15||+1d"}, and epoch-millis strings such as {@code "1754020060000"}. These strings |
| 423 | + * have to be evaluated as Lucene-style date math; Calcite would compare them lexically. Detecting |
| 424 | + * them lets the visitor keep the comparison in {@code query_string} form so the existing |
| 425 | + * semantics survive. |
| 426 | + */ |
| 427 | + private static boolean isOpenSearchDateMath(UnresolvedExpression value) { |
| 428 | + if (!(value instanceof Literal)) { |
| 429 | + return false; |
| 430 | + } |
| 431 | + Object raw = ((Literal) value).getValue(); |
| 432 | + if (!(raw instanceof String)) { |
| 433 | + return false; |
| 434 | + } |
| 435 | + String s = ((String) raw).trim(); |
| 436 | + if (s.isEmpty()) { |
| 437 | + return false; |
| 438 | + } |
| 439 | + if (s.regionMatches(true, 0, "now", 0, 3)) { |
| 440 | + return true; |
| 441 | + } |
| 442 | + if (s.contains("||")) { |
| 443 | + return true; |
| 444 | + } |
| 445 | + // Epoch-millis from `earliest=1754020060` / `latest=1754020060.123` — 12+ digits only. |
| 446 | + if (s.length() >= 12) { |
| 447 | + for (int i = 0; i < s.length(); i++) { |
| 448 | + char c = s.charAt(i); |
| 449 | + if (c < '0' || c > '9') { |
| 450 | + return false; |
| 451 | + } |
| 452 | + } |
| 453 | + return true; |
| 454 | + } |
| 455 | + return false; |
| 456 | + } |
| 457 | + |
| 458 | + /** |
| 459 | + * Detects an unescaped Lucene wildcard ({@code *} or {@code ?}) in a string literal. A preceding |
| 460 | + * odd run of backslashes means the wildcard is already escaped (literal star / question mark in |
| 461 | + * the value), so it's safe to lower as a regular equality. |
| 462 | + */ |
| 463 | + private static boolean containsLuceneWildcard(UnresolvedExpression value) { |
| 464 | + if (!(value instanceof Literal)) { |
| 465 | + return false; |
| 466 | + } |
| 467 | + Object raw = ((Literal) value).getValue(); |
| 468 | + if (!(raw instanceof String)) { |
| 469 | + return false; |
| 470 | + } |
| 471 | + String s = (String) raw; |
| 472 | + for (int i = 0; i < s.length(); i++) { |
| 473 | + char c = s.charAt(i); |
| 474 | + if (c == '*' || c == '?') { |
| 475 | + int backslashes = 0; |
| 476 | + for (int j = i - 1; j >= 0 && s.charAt(j) == '\\'; j--) { |
| 477 | + backslashes++; |
| 478 | + } |
| 479 | + if ((backslashes & 1) == 0) { |
| 480 | + return true; |
| 481 | + } |
| 482 | + } |
| 483 | + } |
| 484 | + return false; |
| 485 | + } |
| 486 | + |
| 487 | + private Function buildQueryStringFunction(String queryString) { |
| 488 | + return AstDSL.function( |
| 489 | + "query_string", AstDSL.unresolvedArg("query", AstDSL.stringLiteral(queryString))); |
| 490 | + } |
| 491 | + |
294 | 492 | @Override |
295 | 493 | public RelNode visitFilter(Filter node, CalcitePlanContext context) { |
296 | 494 | visitChildren(node, context); |
|
0 commit comments