Skip to content

Commit 5b39f8e

Browse files
committed
Reject outer WHERE on vectorSearch() subqueries
When vectorSearch() is wrapped in a subquery and the outer query has its own WHERE clause, for example: SELECT * FROM (SELECT v.firstname, v.state FROM vectorSearch(table='idx', field='emb', vector='[1,2]', option='k=5') AS v) t WHERE t.state = 'TX' the outer predicate never reaches the push-down contract. The plan shape after analysis is LogicalFilter over LogicalProject over the scan builder, so the PUSH_DOWN_FILTER rule (which matches filter directly above scanBuilder) does not fire. The filter is then applied in memory after k-NN has already returned top-k documents ranked by vector distance, which can silently yield zero rows with no explanation. This commit adds a post-optimization validation hook on TableScanBuilder (default no-op) and invokes it from Planner.plan() once all push-down rules have settled. VectorSearchIndexScanBuilder overrides the hook to walk the fully optimized plan and reject the outer-WHERE-over-subquery shape: a LogicalFilter whose descendant chain reaches the scan builder through one or more LogicalProject nodes. A filter directly above the scan builder (WHERE on vectorSearch() itself) is explicitly allowed because that path has already been exercised by push-down. The error message names the shape and tells the user to move the predicate inside the subquery so it is applied during the k-NN search. Tests: - VectorSearchIndexScanBuilderTest adds six cases covering the bad shape, nested subqueries, and the three positive controls (filter directly on scanBuilder, inner filter wrapped in outer project, no filter at all) plus a bare-scanBuilder defensive baseline. - VectorSearchSubqueryIT adds integration tests for the exact tracker query shape, the with-LIMIT variant, the _explain path, and positive controls for inner WHERE and no-WHERE subqueries. Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
1 parent fa444fe commit 5b39f8e

5 files changed

Lines changed: 355 additions & 8 deletions

File tree

core/src/main/java/org/opensearch/sql/planner/Planner.java

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import org.opensearch.sql.planner.optimizer.LogicalPlanOptimizer;
1515
import org.opensearch.sql.planner.physical.PhysicalPlan;
1616
import org.opensearch.sql.storage.Table;
17+
import org.opensearch.sql.storage.read.TableScanBuilder;
1718

1819
/** Planner that plans and chooses the optimal physical plan. */
1920
@RequiredArgsConstructor
@@ -34,7 +35,35 @@ public PhysicalPlan plan(LogicalPlan plan) {
3435
if (table == null) {
3536
return plan.accept(new DefaultImplementor<>(), null);
3637
}
37-
return table.implement(table.optimize(optimize(plan)));
38+
LogicalPlan optimized = table.optimize(optimize(plan));
39+
// Give scan builders a chance to reject shapes that push-down alone cannot express safely
40+
// (e.g. operators that land above the scan but outside its push-down contract).
41+
validateScanBuilders(optimized);
42+
return table.implement(optimized);
43+
}
44+
45+
/**
46+
* Walk the optimized plan and invoke {@link TableScanBuilder#validatePlan(LogicalPlan)} on every
47+
* scan builder, passing the fully optimized root so scan builders can inspect their ancestors.
48+
*/
49+
private void validateScanBuilders(LogicalPlan optimized) {
50+
optimized.accept(
51+
new LogicalPlanNodeVisitor<Void, Object>() {
52+
@Override
53+
public Void visitNode(LogicalPlan node, Object context) {
54+
for (LogicalPlan child : node.getChild()) {
55+
child.accept(this, context);
56+
}
57+
return null;
58+
}
59+
60+
@Override
61+
public Void visitTableScanBuilder(TableScanBuilder node, Object context) {
62+
node.validatePlan(optimized);
63+
return null;
64+
}
65+
},
66+
null);
3867
}
3968

4069
private Table findTable(LogicalPlan plan) {

core/src/main/java/org/opensearch/sql/storage/read/TableScanBuilder.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,19 @@ public boolean pushDownPageSize(LogicalPaginate paginate) {
119119
return false;
120120
}
121121

122+
/**
123+
* Post-optimization validation hook. Called once by the planner after all push-down rules have
124+
* run, with the fully optimized plan root. Subclasses may inspect the ancestors of this scan
125+
* builder to reject planner shapes that push-down alone cannot express safely (for example,
126+
* operators that land above the scan but outside its push-down contract and would be executed
127+
* after the scan has already returned a bounded result set). Default is no-op.
128+
*
129+
* @param root the fully optimized logical plan containing this scan builder
130+
*/
131+
public void validatePlan(LogicalPlan root) {
132+
// no-op by default
133+
}
134+
122135
@Override
123136
public <R, C> R accept(LogicalPlanNodeVisitor<R, C> visitor, C context) {
124137
return visitor.visitTableScanBuilder(this, context);
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.sql;
7+
8+
import static org.hamcrest.Matchers.containsString;
9+
10+
import java.io.IOException;
11+
import org.junit.Test;
12+
import org.opensearch.client.ResponseException;
13+
import org.opensearch.sql.legacy.SQLIntegTestCase;
14+
import org.opensearch.sql.legacy.TestsConstants;
15+
16+
/**
17+
* Integration tests for vectorSearch() used inside subqueries. Locks in the rejection of outer
18+
* WHERE on a vectorSearch() subquery, which would otherwise silently yield zero rows because the
19+
* outer predicate is applied only after the k-NN search has already selected top-k documents by
20+
* vector distance.
21+
*
22+
* <p>Uses _explain-only plus error-path queries, so the k-NN plugin is not required — the planner
23+
* validation fires during planning, before any k-NN execution.
24+
*/
25+
public class VectorSearchSubqueryIT extends SQLIntegTestCase {
26+
27+
@Override
28+
protected void init() throws Exception {
29+
loadIndex(Index.ACCOUNT);
30+
}
31+
32+
private static final String TEST_INDEX = TestsConstants.TEST_INDEX_ACCOUNT;
33+
34+
@Test
35+
public void testOuterWhereOnSubqueryRejected() throws IOException {
36+
// Without the guard the outer predicate is dropped from the pushed DSL and applied only in
37+
// memory after k-NN returned top-k, which can yield silent zero rows.
38+
ResponseException ex =
39+
expectThrows(
40+
ResponseException.class,
41+
() ->
42+
executeQuery(
43+
"SELECT * FROM (SELECT v.firstname, v.state "
44+
+ "FROM vectorSearch(table='"
45+
+ TEST_INDEX
46+
+ "', field='embedding', vector='[1.0, 2.0]', option='k=5') AS v) t "
47+
+ "WHERE t.state = 'TX'"));
48+
49+
assertThat(
50+
ex.getMessage(),
51+
containsString("Outer WHERE on a vectorSearch() subquery is not supported"));
52+
assertThat(ex.getMessage(), containsString("silently yield zero rows"));
53+
}
54+
55+
@Test
56+
public void testOuterWhereOnSubqueryRejectedWithLimit() throws IOException {
57+
// Same shape with an outer LIMIT — exercises a second planner path (LogicalLimit above
58+
// LogicalFilter above LogicalProject above scan builder).
59+
ResponseException ex =
60+
expectThrows(
61+
ResponseException.class,
62+
() ->
63+
executeQuery(
64+
"SELECT * FROM (SELECT v.firstname, v.state "
65+
+ "FROM vectorSearch(table='"
66+
+ TEST_INDEX
67+
+ "', field='embedding', vector='[1.0, 2.0]', option='k=5') AS v) t "
68+
+ "WHERE t.state = 'TX' "
69+
+ "LIMIT 3"));
70+
71+
assertThat(
72+
ex.getMessage(),
73+
containsString("Outer WHERE on a vectorSearch() subquery is not supported"));
74+
}
75+
76+
@Test
77+
public void testOuterWhereOnSubqueryRejectedExplain() throws IOException {
78+
// The guard must fire during planning, before any k-NN execution — so _explain must also
79+
// return the validation error rather than a silently dropped predicate in the DSL.
80+
ResponseException ex =
81+
expectThrows(
82+
ResponseException.class,
83+
() ->
84+
explainQuery(
85+
"SELECT * FROM (SELECT v.firstname, v.state "
86+
+ "FROM vectorSearch(table='"
87+
+ TEST_INDEX
88+
+ "', field='embedding', vector='[1.0, 2.0]', option='k=5') AS v) t "
89+
+ "WHERE t.state = 'TX'"));
90+
91+
assertThat(
92+
ex.getMessage(),
93+
containsString("Outer WHERE on a vectorSearch() subquery is not supported"));
94+
}
95+
96+
@Test
97+
public void testInnerWhereStillWorks() throws IOException {
98+
// Positive control: WHERE directly on vectorSearch() inside the subquery must still plan
99+
// successfully — the rejection is scoped to OUTER filters that cannot reach the push-down
100+
// contract. We use _explain because the default integ-test cluster has no k-NN plugin.
101+
String explain =
102+
explainQuery(
103+
"SELECT * FROM (SELECT v.firstname, v.state "
104+
+ "FROM vectorSearch(table='"
105+
+ TEST_INDEX
106+
+ "', field='embedding', vector='[1.0, 2.0]', option='k=5') AS v "
107+
+ "WHERE v.state = 'TX') t");
108+
109+
assertThat(explain, containsString("wrapper"));
110+
// Inner WHERE should push down, so the state predicate appears in the DSL.
111+
assertThat(explain, containsString("state"));
112+
}
113+
114+
@Test
115+
public void testInnerWhereWithOuterProjectStillWorks() throws IOException {
116+
// Another positive control: the outer layer can still project and limit columns from the
117+
// subquery without the guard firing — only outer WHERE is rejected.
118+
String explain =
119+
explainQuery(
120+
"SELECT t.firstname FROM (SELECT v.firstname, v.state "
121+
+ "FROM vectorSearch(table='"
122+
+ TEST_INDEX
123+
+ "', field='embedding', vector='[1.0, 2.0]', option='k=5') AS v "
124+
+ "WHERE v.state = 'TX') t "
125+
+ "LIMIT 3");
126+
127+
assertThat(explain, containsString("wrapper"));
128+
}
129+
130+
@Test
131+
public void testSubqueryNoWhereStillWorks() throws IOException {
132+
// Baseline: a subquery with no WHERE anywhere must not be rejected — the guard fires only
133+
// when an outer LogicalFilter sits above a subquery project boundary.
134+
String explain =
135+
explainQuery(
136+
"SELECT * FROM (SELECT v.firstname "
137+
+ "FROM vectorSearch(table='"
138+
+ TEST_INDEX
139+
+ "', field='embedding', vector='[1.0, 2.0]', option='k=5') AS v) t "
140+
+ "LIMIT 3");
141+
142+
assertThat(explain, containsString("wrapper"));
143+
}
144+
}

opensearch/src/main/java/org/opensearch/sql/opensearch/storage/scan/VectorSearchIndexScanBuilder.java

Lines changed: 76 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,28 @@
99
import org.opensearch.sql.exception.ExpressionEvaluationException;
1010
import org.opensearch.sql.opensearch.request.OpenSearchRequestBuilder;
1111
import org.opensearch.sql.planner.logical.LogicalAggregation;
12+
import org.opensearch.sql.planner.logical.LogicalFilter;
13+
import org.opensearch.sql.planner.logical.LogicalPlan;
14+
import org.opensearch.sql.planner.logical.LogicalProject;
1215

1316
/**
14-
* Scan builder for vector search relations. Rejects aggregation pushdown: aggregations over
15-
* vectorSearch() relations are not supported in the SQL surface. Native OpenSearch k-NN does
16-
* support aggregations alongside similarity search; this restriction is specific to the SQL layer.
17+
* Scan builder for vector search relations.
18+
*
19+
* <p>Rejects two planner shapes that the SQL surface cannot express safely:
20+
*
21+
* <ul>
22+
* <li><b>Aggregations</b> — native OpenSearch k-NN supports aggregations alongside similarity
23+
* search, but the SQL layer does not plumb them through, so we fail fast rather than return
24+
* silently unaggregated results.
25+
* <li><b>Outer WHERE over a vectorSearch() subquery</b> — when vectorSearch() is wrapped in a
26+
* subquery (e.g. {@code SELECT * FROM (SELECT v.id FROM vectorSearch(...) AS v) t WHERE
27+
* t.price < 150}), the outer predicate does not reach the push-down contract (the inner
28+
* {@link LogicalProject} sits between the outer {@link LogicalFilter} and this scan builder,
29+
* so the filter never matches the {@code filter(scanBuilder)} push-down pattern). The filter
30+
* is then applied in memory, but only <i>after</i> k-NN has already returned the top-k
31+
* documents ranked by vector distance, so the outer predicate can silently produce zero rows.
32+
* We detect this shape in {@link #validatePlan(LogicalPlan)} and reject with a clear error.
33+
* </ul>
1734
*/
1835
public class VectorSearchIndexScanBuilder extends OpenSearchIndexScanBuilder {
1936

@@ -28,4 +45,60 @@ public boolean pushDownAggregation(LogicalAggregation aggregation) {
2845
throw new ExpressionEvaluationException(
2946
"Aggregations are not supported on vectorSearch() relations.");
3047
}
48+
49+
/**
50+
* Walk the fully-optimized plan and reject the outer-WHERE-over-subquery shape. We look for a
51+
* {@link LogicalFilter} whose descendant chain reaches this scan builder through one or more
52+
* {@link LogicalProject} nodes (the subquery-boundary marker). A filter directly above this scan
53+
* builder is fine — that WHERE has already gone through the push-down contract.
54+
*/
55+
@Override
56+
public void validatePlan(LogicalPlan root) {
57+
checkForOuterFilter(root, false, false);
58+
}
59+
60+
/**
61+
* Recursive walker with two booleans that together encode the "outer filter separated by a
62+
* subquery boundary" pattern:
63+
*
64+
* <ul>
65+
* <li>{@code insideFilter} — true iff some ancestor on the current walk path is a {@link
66+
* LogicalFilter}. Projects only matter below a filter, so without a filter ancestor a
67+
* project is just the outer SELECT and should not trigger rejection.
68+
* <li>{@code sawProjectSinceFilter} — true iff a {@link LogicalProject} has been seen between
69+
* the <em>nearest enclosing</em> filter and the current position. That project is the
70+
* subquery-boundary marker that separates "outer WHERE on a subquery" (bad) from "WHERE
71+
* directly on vectorSearch()" (fine, handled by push-down).
72+
* </ul>
73+
*
74+
* <p>Entering a new {@link LogicalFilter} resets {@code sawProjectSinceFilter} to false because
75+
* the nearest enclosing filter is now this one, and the contract is evaluated from here down.
76+
*/
77+
private void checkForOuterFilter(
78+
LogicalPlan node, boolean insideFilter, boolean sawProjectSinceFilter) {
79+
if (node == this) {
80+
if (insideFilter && sawProjectSinceFilter) {
81+
throw new ExpressionEvaluationException(
82+
"Outer WHERE on a vectorSearch() subquery is not supported: the predicate does not"
83+
+ " push into the k-NN search and would be applied only after top-k results have"
84+
+ " been selected by vector distance, which can silently yield zero rows."
85+
+ " Move the predicate inside the subquery (WHERE directly on vectorSearch()) so"
86+
+ " it is applied during the k-NN search.");
87+
}
88+
return;
89+
}
90+
boolean nextInsideFilter = insideFilter;
91+
boolean nextSawProject = sawProjectSinceFilter;
92+
if (node instanceof LogicalFilter) {
93+
// Entering a new filter: nearest enclosing filter is now this one, reset project marker.
94+
nextInsideFilter = true;
95+
nextSawProject = false;
96+
} else if (node instanceof LogicalProject && insideFilter) {
97+
// A project between the nearest enclosing filter and a scan builder is the bug marker.
98+
nextSawProject = true;
99+
}
100+
for (LogicalPlan child : node.getChild()) {
101+
checkForOuterFilter(child, nextInsideFilter, nextSawProject);
102+
}
103+
}
31104
}

0 commit comments

Comments
 (0)