Skip to content

Commit 0ad6d2a

Browse files
committed
Strengthen VectorSearchExplainIT with structural DSL assertions
The explain suite previously relied on substring checks (contains("wrapper"), contains("bool"), contains("filter")) that could false-positive across distinct push-down shapes — notably, post-filter vs. efficient-mode filter_type output look similar at the substring level. Add shared helpers to (1) decode every base64 WrapperQueryBuilder payload into its inner k-NN JSON and (2) extract and unescape the outer sourceBuilder JSON from explain output. Strengthen the top-k, radial (max_distance, min_score), post-filter, compound-predicate, radial-with-WHERE, and filter_type=post tests to assert both the outer bool/must/filter shape and that WHERE predicates stay out of the inner k-NN payload. Refactor the efficient-mode test onto the same helpers so its no-outer-bool/must guard is verified against the unescaped sourceBuilder JSON rather than accidentally passing on escaped output. Leave the ORDER BY _score DESC and LIMIT <= k smoke tests untouched — those are guard-rail "does planning succeed" checks where shape is covered elsewhere. Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
1 parent 231d477 commit 0ad6d2a

1 file changed

Lines changed: 203 additions & 41 deletions

File tree

integ-test/src/test/java/org/opensearch/sql/sql/VectorSearchExplainIT.java

Lines changed: 203 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@
66
package org.opensearch.sql.sql;
77

88
import java.io.IOException;
9+
import java.nio.charset.StandardCharsets;
10+
import java.util.ArrayList;
11+
import java.util.Base64;
12+
import java.util.List;
13+
import java.util.regex.Matcher;
14+
import java.util.regex.Pattern;
915
import org.junit.Test;
1016
import org.opensearch.sql.legacy.SQLIntegTestCase;
1117
import org.opensearch.sql.legacy.TestsConstants;
@@ -17,6 +23,38 @@
1723
*/
1824
public class VectorSearchExplainIT extends SQLIntegTestCase {
1925

26+
// Matches WrapperQueryBuilder's base64 payload in explain JSON. The explain output escapes
27+
// quotes as \", so the regex tolerates both \" and " forms around the query key/value.
28+
private static final Pattern WRAPPER_PAYLOAD =
29+
Pattern.compile("\\\\?\"query\\\\?\":\\\\?\"([A-Za-z0-9+/=]+)\\\\?\"");
30+
private static final Pattern SOURCE_BUILDER_JSON =
31+
Pattern.compile("sourceBuilder=(\\{.*?\\}), pitId=", Pattern.DOTALL);
32+
33+
/** Decodes every base64-encoded wrapper payload in the explain output into its knn JSON. */
34+
private static List<String> decodeWrapperKnnJsons(String explain) {
35+
List<String> payloads = new ArrayList<>();
36+
Matcher m = WRAPPER_PAYLOAD.matcher(explain);
37+
while (m.find()) {
38+
payloads.add(new String(Base64.getDecoder().decode(m.group(1)), StandardCharsets.UTF_8));
39+
}
40+
return payloads;
41+
}
42+
43+
/** Returns the single wrapper knn JSON, asserting exactly one is present. */
44+
private static String decodeSoleKnnJson(String explain) {
45+
List<String> payloads = decodeWrapperKnnJsons(explain);
46+
assertEquals(
47+
"Expected exactly one wrapper query payload in explain:\n" + explain, 1, payloads.size());
48+
return payloads.get(0);
49+
}
50+
51+
/** Extracts and unescapes the sourceBuilder JSON embedded in the explain request string. */
52+
private static String extractSourceBuilderJson(String explain) {
53+
Matcher m = SOURCE_BUILDER_JSON.matcher(explain);
54+
assertTrue("Explain should contain sourceBuilder JSON:\n" + explain, m.find());
55+
return m.group(1).replace("\\\"", "\"");
56+
}
57+
2058
@Override
2159
protected void init() throws Exception {
2260
// _explain needs the index to exist for field resolution.
@@ -38,11 +76,20 @@ public void testExplainTopKProducesKnnQuery() throws IOException {
3876
+ "vector='[1.0, 2.0, 3.0]', option='k=5') AS v "
3977
+ "LIMIT 5");
4078

41-
// WrapperQueryBuilder wraps the knn JSON — verify the wrapper is present
42-
// and track_scores is enabled for score preservation.
43-
assertTrue("Explain should contain wrapper query:\n" + explain, explain.contains("wrapper"));
4479
assertTrue(
4580
"Explain should contain track_scores:\n" + explain, explain.contains("track_scores"));
81+
82+
String knnJson = decodeSoleKnnJson(explain);
83+
assertTrue("knn JSON should contain knn key:\n" + knnJson, knnJson.contains("\"knn\""));
84+
assertTrue(
85+
"knn JSON should target the embedding field:\n" + knnJson,
86+
knnJson.contains("\"embedding\""));
87+
assertTrue(
88+
"knn JSON should contain the vector values:\n" + knnJson,
89+
knnJson.contains("[1.0,2.0,3.0]"));
90+
assertTrue("knn JSON should contain k=5:\n" + knnJson, knnJson.contains("\"k\":5"));
91+
assertFalse(
92+
"Top-k without WHERE should not embed a filter:\n" + knnJson, knnJson.contains("filter"));
4693
}
4794

4895
@Test
@@ -56,7 +103,18 @@ public void testExplainRadialMaxDistanceProducesKnnQuery() throws IOException {
56103
+ "vector='[1.0, 2.0]', option='max_distance=10.5') AS v "
57104
+ "LIMIT 100");
58105

59-
assertTrue("Explain should contain wrapper query:\n" + explain, explain.contains("wrapper"));
106+
String knnJson = decodeSoleKnnJson(explain);
107+
assertTrue("knn JSON should contain knn key:\n" + knnJson, knnJson.contains("\"knn\""));
108+
assertTrue(
109+
"knn JSON should target the embedding field:\n" + knnJson,
110+
knnJson.contains("\"embedding\""));
111+
assertTrue(
112+
"knn JSON should contain the vector values:\n" + knnJson, knnJson.contains("[1.0,2.0]"));
113+
assertTrue(
114+
"knn JSON should contain max_distance=10.5:\n" + knnJson,
115+
knnJson.contains("\"max_distance\":10.5"));
116+
assertFalse(
117+
"Radial without WHERE should not embed a filter:\n" + knnJson, knnJson.contains("filter"));
60118
}
61119

62120
@Test
@@ -70,7 +128,18 @@ public void testExplainRadialMinScoreProducesKnnQuery() throws IOException {
70128
+ "vector='[1.0, 2.0]', option='min_score=0.8') AS v "
71129
+ "LIMIT 100");
72130

73-
assertTrue("Explain should contain wrapper query:\n" + explain, explain.contains("wrapper"));
131+
String knnJson = decodeSoleKnnJson(explain);
132+
assertTrue("knn JSON should contain knn key:\n" + knnJson, knnJson.contains("\"knn\""));
133+
assertTrue(
134+
"knn JSON should target the embedding field:\n" + knnJson,
135+
knnJson.contains("\"embedding\""));
136+
assertTrue(
137+
"knn JSON should contain the vector values:\n" + knnJson, knnJson.contains("[1.0,2.0]"));
138+
assertTrue(
139+
"knn JSON should contain min_score=0.8:\n" + knnJson,
140+
knnJson.contains("\"min_score\":0.8"));
141+
assertFalse(
142+
"Radial without WHERE should not embed a filter:\n" + knnJson, knnJson.contains("filter"));
74143
}
75144

76145
// ── Post-filter DSL shape ────────────────────────────────────────────
@@ -87,13 +156,35 @@ public void testExplainPostFilterProducesBoolQuery() throws IOException {
87156
+ "WHERE v.state = 'TX' "
88157
+ "LIMIT 10");
89158

90-
assertTrue("Explain should contain bool query:\n" + explain, explain.contains("bool"));
159+
// Post-filter shape: outer bool.must=[knn], bool.filter=[term] — WHERE lives OUTSIDE the knn
160+
// payload. Verify by decoding the wrapper and asserting the predicate field is NOT embedded.
161+
String sourceBuilderJson = extractSourceBuilderJson(explain);
162+
assertTrue(
163+
"Explain should contain bool query:\n" + sourceBuilderJson,
164+
sourceBuilderJson.contains("\"bool\""));
165+
assertTrue(
166+
"Explain should contain must clause (knn in scoring context):\n" + sourceBuilderJson,
167+
sourceBuilderJson.contains("\"must\""));
91168
assertTrue(
92-
"Explain should contain must clause (knn in scoring context):\n" + explain,
93-
explain.contains("must"));
169+
"Explain should contain filter clause (WHERE in non-scoring context):\n"
170+
+ sourceBuilderJson,
171+
sourceBuilderJson.contains("\"filter\""));
94172
assertTrue(
95-
"Explain should contain filter clause (WHERE in non-scoring context):\n" + explain,
96-
explain.contains("filter"));
173+
"Explain should contain the outer state predicate:\n" + sourceBuilderJson,
174+
sourceBuilderJson.contains("\"state.keyword\""));
175+
176+
String knnJson = decodeSoleKnnJson(explain);
177+
assertTrue("knn JSON should contain knn key:\n" + knnJson, knnJson.contains("\"knn\""));
178+
assertTrue(
179+
"knn JSON should target the embedding field:\n" + knnJson,
180+
knnJson.contains("\"embedding\""));
181+
assertTrue("knn JSON should contain k=10:\n" + knnJson, knnJson.contains("\"k\":10"));
182+
assertFalse(
183+
"Post-filter mode must not embed the WHERE predicate inside knn:\n" + knnJson,
184+
knnJson.contains("state"));
185+
assertFalse(
186+
"Post-filter mode must not embed a filter inside knn:\n" + knnJson,
187+
knnJson.contains("filter"));
97188
}
98189

99190
@Test
@@ -108,13 +199,41 @@ public void testExplainCompoundPredicateProducesBoolQuery() throws IOException {
108199
+ "WHERE v.state = 'TX' AND v.age > 30 "
109200
+ "LIMIT 10");
110201

111-
assertTrue("Explain should contain bool query:\n" + explain, explain.contains("bool"));
202+
// Compound post-filter still uses outer bool.must=[knn]/bool.filter=[predicates]. Both WHERE
203+
// predicates must stay outside the knn payload; otherwise efficient mode could false-positive.
204+
String sourceBuilderJson = extractSourceBuilderJson(explain);
205+
assertTrue(
206+
"Explain should contain bool query:\n" + sourceBuilderJson,
207+
sourceBuilderJson.contains("\"bool\""));
112208
assertTrue(
113-
"Explain should contain must clause (knn in scoring context):\n" + explain,
114-
explain.contains("must"));
209+
"Explain should contain must clause (knn in scoring context):\n" + sourceBuilderJson,
210+
sourceBuilderJson.contains("\"must\""));
115211
assertTrue(
116-
"Explain should contain filter clause (compound WHERE in non-scoring context):\n" + explain,
117-
explain.contains("filter"));
212+
"Explain should contain filter clause (compound WHERE in non-scoring context):\n"
213+
+ sourceBuilderJson,
214+
sourceBuilderJson.contains("\"filter\""));
215+
assertTrue(
216+
"Explain should contain the outer state predicate:\n" + sourceBuilderJson,
217+
sourceBuilderJson.contains("\"state.keyword\""));
218+
assertTrue(
219+
"Explain should contain the outer age predicate:\n" + sourceBuilderJson,
220+
sourceBuilderJson.contains("\"age\""));
221+
222+
String knnJson = decodeSoleKnnJson(explain);
223+
assertTrue("knn JSON should contain knn key:\n" + knnJson, knnJson.contains("\"knn\""));
224+
assertTrue(
225+
"knn JSON should target the embedding field:\n" + knnJson,
226+
knnJson.contains("\"embedding\""));
227+
assertTrue("knn JSON should contain k=10:\n" + knnJson, knnJson.contains("\"k\":10"));
228+
assertFalse(
229+
"Compound post-filter must not embed the state predicate inside knn:\n" + knnJson,
230+
knnJson.contains("state"));
231+
assertFalse(
232+
"Compound post-filter must not embed the age predicate inside knn:\n" + knnJson,
233+
knnJson.contains("age"));
234+
assertFalse(
235+
"Compound post-filter must not embed a filter inside knn:\n" + knnJson,
236+
knnJson.contains("filter"));
118237
}
119238

120239
@Test
@@ -129,13 +248,37 @@ public void testExplainRadialWithWhereProducesBoolQuery() throws IOException {
129248
+ "WHERE v.state = 'TX' "
130249
+ "LIMIT 100");
131250

132-
assertTrue("Explain should contain bool query:\n" + explain, explain.contains("bool"));
251+
// Radial + WHERE should also keep the WHERE predicate in the outer bool.filter rather than
252+
// embedding it into the radial knn payload.
253+
String sourceBuilderJson = extractSourceBuilderJson(explain);
254+
assertTrue(
255+
"Explain should contain bool query:\n" + sourceBuilderJson,
256+
sourceBuilderJson.contains("\"bool\""));
257+
assertTrue(
258+
"Explain should contain must clause (knn in scoring context):\n" + sourceBuilderJson,
259+
sourceBuilderJson.contains("\"must\""));
260+
assertTrue(
261+
"Explain should contain filter clause (WHERE in non-scoring context):\n"
262+
+ sourceBuilderJson,
263+
sourceBuilderJson.contains("\"filter\""));
264+
assertTrue(
265+
"Explain should contain the outer state predicate:\n" + sourceBuilderJson,
266+
sourceBuilderJson.contains("\"state.keyword\""));
267+
268+
String knnJson = decodeSoleKnnJson(explain);
269+
assertTrue("knn JSON should contain knn key:\n" + knnJson, knnJson.contains("\"knn\""));
133270
assertTrue(
134-
"Explain should contain must clause (knn in scoring context):\n" + explain,
135-
explain.contains("must"));
271+
"knn JSON should target the embedding field:\n" + knnJson,
272+
knnJson.contains("\"embedding\""));
136273
assertTrue(
137-
"Explain should contain filter clause (WHERE in non-scoring context):\n" + explain,
138-
explain.contains("filter"));
274+
"knn JSON should contain max_distance=10.5:\n" + knnJson,
275+
knnJson.contains("\"max_distance\":10.5"));
276+
assertFalse(
277+
"Radial post-filter must not embed the WHERE predicate inside knn:\n" + knnJson,
278+
knnJson.contains("state"));
279+
assertFalse(
280+
"Radial post-filter must not embed a filter inside knn:\n" + knnJson,
281+
knnJson.contains("filter"));
139282
}
140283

141284
// ── Sort + LIMIT explain ─────────────────────────────────────────────
@@ -185,9 +328,35 @@ public void testExplainFilterTypePostProducesBoolQuery() throws IOException {
185328
+ "WHERE v.state = 'TX' "
186329
+ "LIMIT 10");
187330

188-
assertTrue("Explain should contain bool query:\n" + explain, explain.contains("bool"));
189-
assertTrue("Explain should contain must:\n" + explain, explain.contains("must"));
190-
assertTrue("Explain should contain filter:\n" + explain, explain.contains("filter"));
331+
// Explicit filter_type=post must produce the same bool.must=[knn]/bool.filter=[term] shape as
332+
// the default, and the WHERE predicate must NOT leak into the knn payload (that would be
333+
// efficient mode). This is the key false-positive guard: substring-only checks would pass for
334+
// efficient mode too.
335+
String sourceBuilderJson = extractSourceBuilderJson(explain);
336+
assertTrue(
337+
"Explain should contain bool query:\n" + sourceBuilderJson,
338+
sourceBuilderJson.contains("\"bool\""));
339+
assertTrue(
340+
"Explain should contain must:\n" + sourceBuilderJson,
341+
sourceBuilderJson.contains("\"must\""));
342+
assertTrue(
343+
"Explain should contain filter:\n" + sourceBuilderJson,
344+
sourceBuilderJson.contains("\"filter\""));
345+
assertTrue(
346+
"Explain should contain the outer state predicate:\n" + sourceBuilderJson,
347+
sourceBuilderJson.contains("\"state.keyword\""));
348+
349+
String knnJson = decodeSoleKnnJson(explain);
350+
assertTrue("knn JSON should contain knn key:\n" + knnJson, knnJson.contains("\"knn\""));
351+
assertTrue(
352+
"knn JSON should target the embedding field:\n" + knnJson,
353+
knnJson.contains("\"embedding\""));
354+
assertFalse(
355+
"filter_type=post must not embed the WHERE predicate inside knn:\n" + knnJson,
356+
knnJson.contains("state"));
357+
assertFalse(
358+
"filter_type=post must not embed a filter inside knn:\n" + knnJson,
359+
knnJson.contains("filter"));
191360
}
192361

193362
@Test
@@ -204,26 +373,19 @@ public void testExplainFilterTypeEfficientProducesKnnWithFilter() throws IOExcep
204373

205374
// Efficient mode: knn rebuilt with filter inside, wrapped in WrapperQueryBuilder.
206375
// The knn JSON (including the embedded filter) is base64-encoded inside the wrapper,
207-
// so we verify structure by: (1) wrapper present, (2) no bool/must in plaintext
208-
// (that would be post-filter shape), (3) decode the base64 payload to confirm
209-
// the filter and predicate field are embedded inside the knn query.
210-
assertTrue("Explain should contain wrapper query:\n" + explain, explain.contains("wrapper"));
376+
// so we verify structure by: (1) no bool/must in plaintext (that would be post-filter shape),
377+
// (2) decode the base64 payload to confirm the filter and predicate field are embedded inside
378+
// the knn query.
379+
String sourceBuilderJson = extractSourceBuilderJson(explain);
211380
assertFalse(
212-
"Efficient mode should not produce bool query (that is post-filter shape):\n" + explain,
213-
explain.contains("\"bool\""));
381+
"Efficient mode should not produce bool query (that is post-filter shape):\n"
382+
+ sourceBuilderJson,
383+
sourceBuilderJson.contains("\"bool\""));
214384
assertFalse(
215-
"Efficient mode should not contain must clause:\n" + explain, explain.contains("\"must\""));
216-
217-
// Extract and decode the base64 knn payload to verify filter embedding.
218-
// The explain output escapes quotes as \", so match both \" and " forms.
219-
java.util.regex.Matcher m =
220-
java.util.regex.Pattern.compile("\\\\?\"query\\\\?\":\\\\?\"([A-Za-z0-9+/=]+)\\\\?\"")
221-
.matcher(explain);
222-
assertTrue("Explain should contain base64-encoded knn query:\n" + explain, m.find());
223-
String knnJson =
224-
new String(
225-
java.util.Base64.getDecoder().decode(m.group(1)),
226-
java.nio.charset.StandardCharsets.UTF_8);
385+
"Efficient mode should not contain must clause:\n" + sourceBuilderJson,
386+
sourceBuilderJson.contains("\"must\""));
387+
388+
String knnJson = decodeSoleKnnJson(explain);
227389
assertTrue(
228390
"Efficient mode knn JSON should contain filter:\n" + knnJson, knnJson.contains("filter"));
229391
assertTrue(

0 commit comments

Comments
 (0)