Tighten VectorSearchExecutionIT per reviewer feedback

mengweieric · mengweieric · commit 3be0b53d17d3 · 2026-04-17T14:25:52.000-07:00
Strengthen the happy-path IT so the efficient-filter test actually
discriminates EFFICIENT from POST and so the other tests assert
exact id sets instead of "at least one row, all in expected range".

- Pin the test index to Lucene HNSW + L2 so efficient filtering is
  deterministic (k-NN only supports efficient filtering on lucene+hnsw
  and faiss+hnsw/ivf) and the L2 -> 1/(1+d) scoring used by min_score
  is well-defined.
- Rework the efficient-filter test to query near TX with k=3 and
  WHERE state='CA'. A POST-filter implementation would return 0 rows
  here (the 3 nearest candidates are all TX and get filtered out), so
  the assertion on exactly {4,5,6} is discriminative between modes.
- Tighten POST and radial tests to exact id-set assertions.
- Drop the scripts/setup-knn-local.sh reference in the class javadoc
  (the helper is local-only working-tree tooling, not a tracked script).

Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
diff --git a/integ-test/src/test/java/org/opensearch/sql/sql/VectorSearchExecutionIT.java b/integ-test/src/test/java/org/opensearch/sql/sql/VectorSearchExecutionIT.java
@@ -10,6 +10,9 @@
 import static org.opensearch.sql.util.TestUtils.performRequest;
 
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
 import org.json.JSONArray;
 import org.json.JSONObject;
 import org.junit.Assume;
@@ -25,8 +28,8 @@
  *
  * <p>The k-NN plugin is not provisioned by the default integ-test cluster — each test calls {@link
  * Assume#assumeTrue} on {@link #isKnnPluginInstalled()} so the class is silently skipped when k-NN
- * is absent. Run locally after {@code scripts/setup-knn-local.sh} has wired k-NN into the test
- * cluster. Provisioning k-NN in CI is a separate follow-up.
+ * is absent. Run locally against a cluster that has opensearch-knn installed. Provisioning k-NN in
+ * CI is a separate follow-up.
  */
 public class VectorSearchExecutionIT extends SQLIntegTestCase {
 
@@ -35,12 +38,23 @@ public class VectorSearchExecutionIT extends SQLIntegTestCase {
   // 6 docs in 2D — two clusters so filter/radial tests have distinguishable results.
   // Cluster A near [1, 1]: docs 1-3 (state=TX, ages 25/30/40).
   // Cluster B near [9, 9]: docs 4-6 (state=CA, ages 28/35/45).
+  // Pin Lucene HNSW + L2 so efficient filtering is deterministic (k-NN supports efficient
+  // filtering only on lucene+hnsw and faiss+hnsw/ivf) and the L2 → 1/(1+d) scoring used by the
+  // radial min_score test is well-defined.
   private static final String MAPPING =
       "{"
           + "  \"settings\": {\"index\": {\"knn\": true}},"
           + "  \"mappings\": {"
           + "    \"properties\": {"
-          + "      \"embedding\": {\"type\": \"knn_vector\", \"dimension\": 2},"
+          + "      \"embedding\": {"
+          + "        \"type\": \"knn_vector\","
+          + "        \"dimension\": 2,"
+          + "        \"method\": {"
+          + "          \"name\": \"hnsw\","
+          + "          \"engine\": \"lucene\","
+          + "          \"space_type\": \"l2\""
+          + "        }"
+          + "      },"
           + "      \"state\": {\"type\": \"keyword\"},"
           + "      \"age\": {\"type\": \"integer\"}"
           + "    }"
@@ -119,8 +133,9 @@ public void testTopKReturnsNearestSortedByScore() throws IOException {
 
   @Test
   public void testPostFilterReturnsOnlyMatchingDocs() throws IOException {
-    // Query from cluster B with WHERE state='TX' should force the scan to find TX docs
-    // (cluster A) even though the vector is closer to cluster B. Proves filter is applied.
+    // Query from cluster B with WHERE state='TX' forces POST filtering to surface TX docs
+    // (cluster A) even though the vector is closer to cluster B. k=10 covers all 6 docs so
+    // post-filtering to state='TX' deterministically yields exactly {1,2,3}.
     JSONObject result =
         executeJdbcRequest(
             "SELECT v._id, v._score "
@@ -131,46 +146,37 @@ public void testPostFilterReturnsOnlyMatchingDocs() throws IOException {
                 + "WHERE v.state = 'TX' "
                 + "LIMIT 10");
 
-    JSONArray rows = result.getJSONArray("datarows");
-    assertTrue("Expected at least one row:\n" + result, rows.length() > 0);
-    for (int i = 0; i < rows.length(); i++) {
-      String id = rows.getJSONArray(i).getString(0);
-      assertTrue(
-          "Row " + i + " id=" + id + " should be from TX cluster (1,2,3):\n" + result,
-          id.equals("1") || id.equals("2") || id.equals("3"));
-    }
+    assertRowIdsEqual(result, "1", "2", "3");
   }
 
   // ── EFFICIENT filter happy path ─────────────────────────────────────
 
   @Test
   public void testEfficientFilterReturnsOnlyMatchingDocs() throws IOException {
+    // Query vector sits on cluster A (TX) but WHERE state='CA' forces EFFICIENT filtering to
+    // navigate HNSW toward CA docs. With k=3, a POST-filter implementation would return 0 rows
+    // (the 3 nearest candidates are all TX, which get filtered out); an efficient-filter
+    // implementation returns exactly the 3 CA docs {4,5,6}. This asymmetry makes the test
+    // discriminate between the two filter modes.
     JSONObject result =
         executeJdbcRequest(
             "SELECT v._id, v._score "
                 + "FROM vectorSearch(table='"
                 + TEST_INDEX
                 + "', field='embedding', "
-                + "vector='[1.0, 1.0]', option='k=5,filter_type=efficient') AS v "
+                + "vector='[1.0, 1.0]', option='k=3,filter_type=efficient') AS v "
                 + "WHERE v.state = 'CA' "
                 + "LIMIT 5");
 
-    JSONArray rows = result.getJSONArray("datarows");
-    assertTrue("Expected at least one row:\n" + result, rows.length() > 0);
-    for (int i = 0; i < rows.length(); i++) {
-      String id = rows.getJSONArray(i).getString(0);
-      assertTrue(
-          "Row " + i + " id=" + id + " should be from CA cluster (4,5,6):\n" + result,
-          id.equals("4") || id.equals("5") || id.equals("6"));
-    }
+    assertRowIdsEqual(result, "4", "5", "6");
   }
 
   // ── Radial happy paths ──────────────────────────────────────────────
 
   @Test
   public void testRadialMaxDistanceReturnsOnlyNearDocs() throws IOException {
-    // max_distance=1.0 (L2) centered on [1,1] should pick up cluster A docs and exclude
-    // cluster B which is ~11 units away.
+    // max_distance=1.0 (L2) centered on [1,1] includes all 3 cluster A docs (max L2 ≈ 0.22)
+    // and excludes cluster B which is ~11 units away.
     JSONObject result =
         executeJdbcRequest(
             "SELECT v._id "
@@ -180,20 +186,13 @@ public void testRadialMaxDistanceReturnsOnlyNearDocs() throws IOException {
                 + "vector='[1.0, 1.0]', option='max_distance=1.0') AS v "
                 + "LIMIT 10");
 
-    JSONArray rows = result.getJSONArray("datarows");
-    assertTrue("Expected at least one row:\n" + result, rows.length() > 0);
-    for (int i = 0; i < rows.length(); i++) {
-      String id = rows.getJSONArray(i).getString(0);
-      assertTrue(
-          "Row " + i + " id=" + id + " should be within max_distance of cluster A:\n" + result,
-          id.equals("1") || id.equals("2") || id.equals("3"));
-    }
+    assertRowIdsEqual(result, "1", "2", "3");
   }
 
   @Test
   public void testRadialMinScoreReturnsOnlyHighScoreDocs() throws IOException {
     // For L2 space, OpenSearch score = 1/(1+distance). Centered on [1,1], cluster A docs
-    // score ~0.8-1.0 and cluster B scores ~0.08. min_score=0.5 should exclude cluster B.
+    // score ~0.82-1.0 and cluster B scores ~0.08. min_score=0.5 yields exactly {1,2,3}.
     JSONObject result =
         executeJdbcRequest(
             "SELECT v._id, v._score "
@@ -204,16 +203,23 @@ public void testRadialMinScoreReturnsOnlyHighScoreDocs() throws IOException {
                 + "LIMIT 10");
 
     JSONArray rows = result.getJSONArray("datarows");
-    assertTrue("Expected at least one row:\n" + result, rows.length() > 0);
     for (int i = 0; i < rows.length(); i++) {
-      String id = rows.getJSONArray(i).getString(0);
       double score = rows.getJSONArray(i).getDouble(1);
-      assertTrue(
-          "Row " + i + " id=" + id + " score=" + score + " should be >= 0.5:\n" + result,
-          score >= 0.5);
-      assertTrue(
-          "Row " + i + " id=" + id + " should be from cluster A:\n" + result,
-          id.equals("1") || id.equals("2") || id.equals("3"));
+      assertTrue("Row " + i + " score=" + score + " should be >= 0.5:\n" + result, score >= 0.5);
+    }
+    assertRowIdsEqual(result, "1", "2", "3");
+  }
+
+  /** Asserts the result's datarows column 0 contains exactly the given ids (as a set). */
+  private static void assertRowIdsEqual(JSONObject result, String... expectedIds) {
+    JSONArray rows = result.getJSONArray("datarows");
+    assertEquals(
+        "Expected " + expectedIds.length + " rows:\n" + result, expectedIds.length, rows.length());
+    Set<String> expected = new HashSet<>(Arrays.asList(expectedIds));
+    Set<String> actual = new HashSet<>();
+    for (int i = 0; i < rows.length(); i++) {
+      actual.add(rows.getJSONArray(i).getString(0));
     }
+    assertEquals("Row id set mismatch:\n" + result, expected, actual);
   }
 }