Skip to content

Commit 24bf6e0

Browse files
committed
[BugFix] Lock in BETWEEN / NOT IN pushdown shapes for vectorSearch
Round 2 manual testing confirmed that WHERE clauses with BETWEEN and NOT IN predicates on a vectorSearch() relation push down as native OpenSearch DSL (range / bool.must_not) rather than falling back to a serialized script query. These two shapes were previously untested in VectorSearchExplainIT.

Add explain-plan integration tests that pin the current behaviour so any regression that causes BETWEEN or NOT IN to degrade to a script query, or to leak the predicate into the knn payload under POST filter mode, fails loudly in CI instead of silently regressing query performance.

- testBetweenPushesAsRange: asserts BETWEEN desugars to native range DSL with from=50 / to=200 bounds pushed outside the knn payload.
- testNotInPushesAsMustNotTerms: asserts NOT IN desugars to bool.must_not wrapping bool.should[term, term] on the keyword subfield, also outside the knn payload.

Pure test addition; no production code changes.

Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
1 parent ffbd1cf commit 24bf6e0

1 file changed

Lines changed: 127 additions & 0 deletions

File tree

integ-test/src/test/java/org/opensearch/sql/sql/VectorSearchExplainIT.java

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,4 +447,131 @@ public void testEfficientFilterWithOrderByScoreDescSucceeds() throws IOException
447447
"Efficient mode knn JSON should contain the WHERE predicate field:\n" + knnJson,
448448
knnJson.contains("state"));
449449
}
450+
451+
// ── BETWEEN / NOT IN pushdown regression guards ─────────────────────
452+
// These tests lock in the DSL shape currently produced for BETWEEN and NOT IN predicates
453+
// when pushed down through vectorSearch(). They exist to catch silent regressions where a
454+
// change in the v2 FilterQueryBuilder pipeline would fall back to a serialized script query
455+
// instead of the native range/bool shape the cluster can index-accelerate.
456+
457+
@Test
458+
public void testBetweenPushesAsRange() throws IOException {
459+
String explain =
460+
explainQuery(
461+
"SELECT v._id, v._score "
462+
+ "FROM vectorSearch(table='"
463+
+ TEST_INDEX
464+
+ "', field='embedding', "
465+
+ "vector='[1.0, 2.0, 3.0]', option='k=10') AS v "
466+
+ "WHERE v.balance BETWEEN 50 AND 200 "
467+
+ "LIMIT 10");
468+
469+
// BETWEEN is desugared by the analyzer into AND(>=, <=), which FilterQueryBuilder renders as
470+
// two range clauses combined under a bool. The goal here is regression lock-in: ensure the
471+
// pushed filter is native range DSL, not a serialized script query.
472+
String sourceBuilderJson = extractSourceBuilderJson(explain);
473+
assertTrue(
474+
"Explain should contain bool query:\n" + sourceBuilderJson,
475+
sourceBuilderJson.contains("\"bool\""));
476+
assertTrue(
477+
"Explain should contain must clause (knn in scoring context):\n" + sourceBuilderJson,
478+
sourceBuilderJson.contains("\"must\""));
479+
assertTrue(
480+
"Explain should contain filter clause (WHERE in non-scoring context):\n"
481+
+ sourceBuilderJson,
482+
sourceBuilderJson.contains("\"filter\""));
483+
assertTrue(
484+
"BETWEEN should push as native range DSL:\n" + sourceBuilderJson,
485+
sourceBuilderJson.contains("\"range\""));
486+
assertTrue(
487+
"Range should target balance field:\n" + sourceBuilderJson,
488+
sourceBuilderJson.contains("\"balance\""));
489+
// RangeQueryBuilder serializes inclusive bounds as from/to + include_lower/include_upper. Lock
490+
// both the lower bound (50) and upper bound (200) are present in the pushed DSL.
491+
assertTrue(
492+
"Range should contain lower bound 50:\n" + sourceBuilderJson,
493+
sourceBuilderJson.contains("\"from\" : 50") || sourceBuilderJson.contains("\"from\":50"));
494+
assertTrue(
495+
"Range should contain upper bound 200:\n" + sourceBuilderJson,
496+
sourceBuilderJson.contains("\"to\" : 200") || sourceBuilderJson.contains("\"to\":200"));
497+
// Script-query fallback sentinel: the CompoundedScriptEngine lang marker must NOT appear when
498+
// BETWEEN is pushed down natively.
499+
assertFalse(
500+
"BETWEEN must not fall back to a serialized script query:\n" + sourceBuilderJson,
501+
sourceBuilderJson.contains("\"script\""));
502+
503+
// POST-filter mode (default): the WHERE predicate must live OUTSIDE the knn payload.
504+
String knnJson = decodeSoleKnnJson(explain);
505+
assertTrue("knn JSON should contain knn key:\n" + knnJson, knnJson.contains("\"knn\""));
506+
assertFalse(
507+
"Post-filter mode must not embed the balance predicate inside knn:\n" + knnJson,
508+
knnJson.contains("balance"));
509+
assertFalse(
510+
"Post-filter mode must not embed a range inside knn:\n" + knnJson,
511+
knnJson.contains("range"));
512+
}
513+
514+
@Test
515+
public void testNotInPushesAsMustNotTerms() throws IOException {
516+
String explain =
517+
explainQuery(
518+
"SELECT v._id, v._score "
519+
+ "FROM vectorSearch(table='"
520+
+ TEST_INDEX
521+
+ "', field='embedding', "
522+
+ "vector='[1.0, 2.0, 3.0]', option='k=10') AS v "
523+
+ "WHERE v.gender NOT IN ('M', 'F') "
524+
+ "LIMIT 10");
525+
526+
// v2 analyzer desugars `x NOT IN (a, b)` into `NOT(x = a OR x = b)`. FilterQueryBuilder maps
527+
// NOT to bool.must_not and OR to bool.should, so the pushed DSL is must_not[should[term,term]]
528+
// rather than a single terms clause. The shape we're locking in is: native bool with must_not
529+
// on the keyword subfield, *not* a serialized script query.
530+
String sourceBuilderJson = extractSourceBuilderJson(explain);
531+
assertTrue(
532+
"Explain should contain bool query:\n" + sourceBuilderJson,
533+
sourceBuilderJson.contains("\"bool\""));
534+
assertTrue(
535+
"Explain should contain must clause (knn in scoring context):\n" + sourceBuilderJson,
536+
sourceBuilderJson.contains("\"must\""));
537+
assertTrue(
538+
"Explain should contain filter clause (WHERE in non-scoring context):\n"
539+
+ sourceBuilderJson,
540+
sourceBuilderJson.contains("\"filter\""));
541+
assertTrue(
542+
"NOT IN should push as bool.must_not:\n" + sourceBuilderJson,
543+
sourceBuilderJson.contains("\"must_not\""));
544+
// OR-of-equals desugaring means the two literals land in a bool.should of term clauses.
545+
assertTrue(
546+
"NOT IN should contain should clause for OR-of-equals desugaring:\n" + sourceBuilderJson,
547+
sourceBuilderJson.contains("\"should\""));
548+
assertTrue(
549+
"NOT IN should produce term clauses for each literal:\n" + sourceBuilderJson,
550+
sourceBuilderJson.contains("\"term\""));
551+
// Terms target the keyword subfield of gender (text field with .keyword multi-field).
552+
assertTrue(
553+
"NOT IN term clauses should target gender.keyword:\n" + sourceBuilderJson,
554+
sourceBuilderJson.contains("\"gender.keyword\""));
555+
// Both literals must be present in the pushed DSL.
556+
assertTrue(
557+
"NOT IN should contain the 'M' literal:\n" + sourceBuilderJson,
558+
sourceBuilderJson.contains("\"M\""));
559+
assertTrue(
560+
"NOT IN should contain the 'F' literal:\n" + sourceBuilderJson,
561+
sourceBuilderJson.contains("\"F\""));
562+
// Script-query fallback sentinel: native pushdown must not degrade to a serialized script.
563+
assertFalse(
564+
"NOT IN must not fall back to a serialized script query:\n" + sourceBuilderJson,
565+
sourceBuilderJson.contains("\"script\""));
566+
567+
// POST-filter mode (default): the WHERE predicate must live OUTSIDE the knn payload.
568+
String knnJson = decodeSoleKnnJson(explain);
569+
assertTrue("knn JSON should contain knn key:\n" + knnJson, knnJson.contains("\"knn\""));
570+
assertFalse(
571+
"Post-filter mode must not embed the gender predicate inside knn:\n" + knnJson,
572+
knnJson.contains("gender"));
573+
assertFalse(
574+
"Post-filter mode must not embed must_not inside knn:\n" + knnJson,
575+
knnJson.contains("must_not"));
576+
}
450577
}

0 commit comments

Comments
 (0)