lance-format · jackye1995 · Jun 6, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/.github/workflows/spark-search.yml b/.github/workflows/spark-search.yml
@@ -0,0 +1,179 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Spark Search Docker
+
+on:
+  pull_request:  # PR runs need both a listed event type and a change under a listed path
+    types:
+      - opened
+      - synchronize
+      - ready_for_review
+      - reopened
+    paths:
+      - ".github/workflows/spark-search.yml"
+      - "Makefile"
+      - "docker/**"
+      - "integration-tests/**"
+      - "lance-spark-base_2.12/src/main/java/org/lance/spark/search/**"
+      - "lance-spark-base_2.12/src/main/scala/org/lance/spark/search/**"
+      - "lance-spark-base_2.12/src/test/java/org/lance/spark/search/**"
+      - "lance-spark-*/src/main/scala/org/lance/spark/extensions/**"
+      - "pom.xml"
+      - "*/pom.xml"
+  workflow_dispatch:  # manual runs; the inputs are read through github.event.inputs
+    inputs:
+      spark-version:  # feeds env.SPARK_VERSION
+        description: "Spark version to test"
+        required: true
+        default: "3.5"
+      scala-version:  # feeds env.SCALA_VERSION
+        description: "Scala version to test"
+        required: true
+        default: "2.13"
+      backends:  # feeds env.SEARCH_TEST_BACKENDS, which gates the two test steps
+        description: "Comma-separated test backends: local or local,rest-dir"
+        required: true
+        default: "local,rest-dir"
+      rest-uri:  # read by the "Resolve REST namespace URI" step
+        description: "Optional REST namespace URI. If omitted, tests start a local REST directory namespace."
+        required: false
+        default: ""
+      rest-database:  # exported as LANCE_SPARK_REST_DATABASE for the REST test step
+        description: "Optional database header value for an external REST namespace"
+        required: false
+        default: ""
+      docker-run-args:  # forwarded to `make docker-test` as DOCKER_RUN_ARGS
+        description: "Extra docker run args for docker-test"
+        required: false
+        default: ""
+
+permissions:
+  contents: read  # GITHUB_TOKEN gets read-only access to repository contents
+
+concurrency:
+  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
+  cancel-in-progress: true  # a newer run in the same group cancels the older one
+
+env:  # github.event.inputs is empty on pull_request runs, so the `||` defaults apply
+  SPARK_VERSION: "${{ github.event.inputs['spark-version'] || '3.5' }}"
+  SCALA_VERSION: "${{ github.event.inputs['scala-version'] || '2.13' }}"
+  SEARCH_TEST_BACKENDS: "${{ github.event.inputs['backends'] || 'local,rest-dir' }}"
+  SEARCH_PYTEST_CMD: >-
+    pytest /home/lance/tests/test_lance_spark.py::TestDQLSearchTableFunctions
+    -v --timeout=180
+
+jobs:
+  search-docker-test:
+    name: Search Docker Test
+    runs-on: ubuntu-24.04
+    timeout-minutes: 90  # upper bound for the whole job, image builds included
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:  # pin to the PR head commit; non-PR runs fall back to github.sha
+          ref: "${{ github.event.pull_request.head.sha || github.sha }}"
+      - name: Set up Java
+        uses: actions/setup-java@v4
+        with:
+          distribution: "temurin"
+          java-version: "17"
+          cache: "maven"
+      - name: Resolve Docker build args
+        id: docker-args  # its outputs supply the build-args and LANCE_NAMESPACE_IMPL_VERSION used by the image build steps
+        run: |
+          make print-docker-build-args SPARK_VERSION="${SPARK_VERSION}" SCALA_VERSION="${SCALA_VERSION}" >> "$GITHUB_OUTPUT"
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build test-base image (cached)
+        uses: docker/build-push-action@v6
+        with:
+          context: docker
+          file: docker/Dockerfile.test-base
+          load: true  # export the result to the local Docker image store instead of pushing it
+          tags: lance-spark-test-base:${{ env.SPARK_VERSION }}_${{ env.SCALA_VERSION }}
+          build-args: |
+            SPARK_DOWNLOAD_VERSION=${{ steps.docker-args.outputs.spark-download-version }}
+            SPARK_MAJOR_VERSION=${{ env.SPARK_VERSION }}
+            SCALA_VERSION=${{ env.SCALA_VERSION }}
+            PY4J_VERSION=${{ steps.docker-args.outputs.py4j-version }}
+            SPARK_SCALA_SUFFIX=${{ steps.docker-args.outputs.spark-scala-suffix }}
+          cache-from: type=gha,scope=search-test-base-${{ env.SPARK_VERSION }}_${{ env.SCALA_VERSION }}  # GitHub Actions cache, one scope per Spark/Scala pair
+          cache-to: type=gha,mode=max,scope=search-test-base-${{ env.SPARK_VERSION }}_${{ env.SCALA_VERSION }}  # same scope as cache-from so later runs can reuse it
+      - name: Build bundle  # runs the Makefile `bundle` target for the selected Spark/Scala versions
+        run: make bundle SPARK_VERSION="${SPARK_VERSION}" SCALA_VERSION="${SCALA_VERSION}"
+      - name: Build test image  # runs the Makefile `docker-build-test` target
+        run: |
+          make docker-build-test \
+            SPARK_VERSION="${SPARK_VERSION}" \
+            SCALA_VERSION="${SCALA_VERSION}" \
+            LANCE_NAMESPACE_IMPL_VERSION="${{ steps.docker-args.outputs.lance-namespace-impl-version }}"
+      - name: Run directory namespace search tests
+        if: ${{ contains(env.SEARCH_TEST_BACKENDS, 'local') }}  # runs only when the backend list includes `local`
+        run: |
+          make docker-test \
+            SPARK_VERSION="${SPARK_VERSION}" \
+            SCALA_VERSION="${SCALA_VERSION}" \
+            TEST_BACKENDS=local \
+            PYTEST_CMD="${SEARCH_PYTEST_CMD}"
+      - name: Resolve REST namespace URI
+        id: rest  # its outputs populate the env of the REST test step
+        if: ${{ contains(env.SEARCH_TEST_BACKENDS, 'rest-dir') }}
+        env:
+          INPUT_REST_URI: ${{ github.event.inputs['rest-uri'] }}
+          INPUT_DOCKER_RUN_ARGS: ${{ github.event.inputs['docker-run-args'] }}
+        run: |
+          rest_uri="${INPUT_REST_URI}"
+          start_rest_dir="false"
+          rest_dir_root=""
+          rest_dir_port=""
+
+          if [ -z "${rest_uri}" ]; then  # no external URI given: ask the tests to start a local REST directory namespace
+            rest_dir_port="10024"
+            rest_dir_root="/home/lance/rest-data"
+            rest_uri="http://127.0.0.1:${rest_dir_port}"
+            start_rest_dir="true"
+          fi
+
+          echo "uri=${rest_uri}" >> "$GITHUB_OUTPUT"
+          echo "start_rest_dir=${start_rest_dir}" >> "$GITHUB_OUTPUT"
+          echo "rest_dir_root=${rest_dir_root}" >> "$GITHUB_OUTPUT"
+          echo "rest_dir_port=${rest_dir_port}" >> "$GITHUB_OUTPUT"
+          delimiter="$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"  # random, so the input cannot end the block early
+          {
+            echo "docker_run_args<<${delimiter}"
+            echo "${INPUT_DOCKER_RUN_ARGS}"
+            echo "${delimiter}"
+          } >> "$GITHUB_OUTPUT"
+      - name: Run REST directory namespace search tests
+        if: ${{ contains(env.SEARCH_TEST_BACKENDS, 'rest-dir') }}  # same gate as the `rest` step whose outputs are read below
+        env:
+          LANCE_SPARK_REST_URI: ${{ steps.rest.outputs.uri }}
+          LANCE_SPARK_REST_API_KEY: ${{ secrets.LANCE_SPARK_REST_API_KEY }}
+          LANCE_SPARK_REST_DATABASE: ${{ github.event.inputs['rest-database'] }}
+          LANCE_SPARK_START_REST_DIR: ${{ steps.rest.outputs.start_rest_dir }}
+          LANCE_SPARK_REST_DIR_ROOT: ${{ steps.rest.outputs.rest_dir_root }}
+          LANCE_SPARK_REST_DIR_PORT: ${{ steps.rest.outputs.rest_dir_port }}
+          DOCKER_RUN_ARGS: ${{ steps.rest.outputs.docker_run_args }}
+        run: |
+          make docker-test \
+            SPARK_VERSION="${SPARK_VERSION}" \
+            SCALA_VERSION="${SCALA_VERSION}" \
+            TEST_BACKENDS=rest-dir \
+            LANCE_SPARK_REST_URI="${LANCE_SPARK_REST_URI}" \
+            LANCE_SPARK_REST_API_KEY="${LANCE_SPARK_REST_API_KEY}" \
+            LANCE_SPARK_REST_DATABASE="${LANCE_SPARK_REST_DATABASE}" \
+            LANCE_SPARK_START_REST_DIR="${LANCE_SPARK_START_REST_DIR}" \
+            LANCE_SPARK_REST_DIR_ROOT="${LANCE_SPARK_REST_DIR_ROOT}" \
+            LANCE_SPARK_REST_DIR_PORT="${LANCE_SPARK_REST_DIR_PORT}" \
+            DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS}" \
+            PYTEST_CMD="${SEARCH_PYTEST_CMD}"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -58,6 +58,38 @@ To auto-format the code, run:
 make format
 ```
 
+## Docker Integration Tests
+
+Build the Spark bundle and Docker integration-test image before running Docker tests:
+
+```shell
+make bundle SPARK_VERSION=3.5 SCALA_VERSION=2.13
+make docker-build-test SPARK_VERSION=3.5 SCALA_VERSION=2.13
+make docker-test SPARK_VERSION=3.5 SCALA_VERSION=2.13
+```
+
+Use `PYTEST_CMD` to run a targeted pytest path in the Docker image. For example, run only the SQL search table-function tests against the directory namespace:
+
+```shell
+make docker-test SPARK_VERSION=3.5 SCALA_VERSION=2.13 \
+  TEST_BACKENDS=local \
+  PYTEST_CMD="pytest /home/lance/tests/test_lance_spark.py::TestDQLSearchTableFunctions -v --timeout=180"
+```
+
+To also validate a REST namespace backed by a directory namespace, let the Docker test container start the OSS Lance REST adapter:
+
+```shell
+make docker-test SPARK_VERSION=3.5 SCALA_VERSION=2.13 \
+  TEST_BACKENDS=local,rest-dir \
+  LANCE_SPARK_START_REST_DIR=true \
+  LANCE_SPARK_REST_URI=http://127.0.0.1:10024 \
+  PYTEST_CMD="pytest /home/lance/tests/test_lance_spark.py::TestDQLSearchTableFunctions -v --timeout=180"
+```
+
+To run against an already-running compatible REST namespace server instead, omit `LANCE_SPARK_START_REST_DIR` and pass that server's URI with `LANCE_SPARK_REST_URI`.
+
+The `Spark Search Docker` GitHub Actions workflow runs the same targeted Docker tests. Pull requests run directory namespace and REST-directory namespace coverage automatically. Use workflow dispatch with `rest-uri` only when validating against an external REST namespace server.
+
 ## Documentation
 
 ### Setup

diff --git a/Makefile b/Makefile
@@ -42,6 +42,7 @@ endif
 DOCKER_CACHE_FROM ?=
 DOCKER_CACHE_TO ?=
 LANCE_NAMESPACE_IMPL_VERSION ?= $(shell sed -n 's:.*<lance-namespace-impl.version>\(.*\)</lance-namespace-impl.version>.*:\1:p' pom.xml | head -n 1)
+PYTEST_CMD ?= pytest /home/lance/tests/ -v --timeout=180
 
 DOCKER_COMPOSE := $(shell \
 	if docker compose version >/dev/null 2>&1; then \
@@ -190,6 +191,12 @@ docker-test:
 		$(if $(LANCEDB_API_KEY),-e LANCEDB_API_KEY=$(LANCEDB_API_KEY)) \
 		$(if $(LANCEDB_HOST_OVERRIDE),-e LANCEDB_HOST_OVERRIDE=$(LANCEDB_HOST_OVERRIDE)) \
 		$(if $(LANCEDB_REGION),-e LANCEDB_REGION=$(LANCEDB_REGION)) \
+		$(if $(LANCE_SPARK_REST_URI),-e LANCE_SPARK_REST_URI=$(LANCE_SPARK_REST_URI)) \
+		$(if $(LANCE_SPARK_REST_API_KEY),-e LANCE_SPARK_REST_API_KEY=$(LANCE_SPARK_REST_API_KEY)) \
+		$(if $(LANCE_SPARK_REST_DATABASE),-e LANCE_SPARK_REST_DATABASE=$(LANCE_SPARK_REST_DATABASE)) \
+		$(if $(LANCE_SPARK_START_REST_DIR),-e LANCE_SPARK_START_REST_DIR=$(LANCE_SPARK_START_REST_DIR)) \
+		$(if $(LANCE_SPARK_REST_DIR_ROOT),-e LANCE_SPARK_REST_DIR_ROOT=$(LANCE_SPARK_REST_DIR_ROOT)) \
+		$(if $(LANCE_SPARK_REST_DIR_PORT),-e LANCE_SPARK_REST_DIR_PORT=$(LANCE_SPARK_REST_DIR_PORT)) \
 		$(if $(TEST_BACKENDS),-e TEST_BACKENDS=$(TEST_BACKENDS)) \
 		$(if $(LANCE_FTS_FORMAT_VERSION),-e LANCE_FTS_FORMAT_VERSION=$(LANCE_FTS_FORMAT_VERSION)) \
 		$(if $(AWS_REGION),-e AWS_REGION=$(AWS_REGION)) \
@@ -203,8 +210,9 @@ docker-test:
 		$(if $(AWS_SESSION_TOKEN),-e AWS_SESSION_TOKEN=$(AWS_SESSION_TOKEN)) \
 		$(if $(AWS_PROFILE),-e AWS_PROFILE=$(AWS_PROFILE)) \
 		$(if $(AWS_PROFILE),-v $(HOME)/.aws:/root/.aws:ro) \
+		$(DOCKER_RUN_ARGS) \
 		lance-spark-test:$(SPARK_VERSION)_$(SCALA_VERSION) \
-		"pytest /home/lance/tests/ -v --timeout=180"
+		"$(PYTEST_CMD)"
 
 # =============================================================================
 # Benchmark
@@ -295,6 +303,7 @@ help:
 	@echo "  docker-build-test-base - Build test base image (system deps + Spark)"
 	@echo "  docker-build-test      - Build test image (base + bundle JAR)"
 	@echo "  docker-test            - Run integration tests in lance-spark-test container"
+	@echo "                           Override PYTEST_CMD to run a targeted pytest command"
 	@echo ""
 	@echo "Benchmark:"
 	@echo "  benchmark-build         - Build benchmark jar (shared by TPC-DS and TPC-H)"

diff --git a/docker/Dockerfile.test b/docker/Dockerfile.test
@@ -35,6 +35,7 @@ RUN mkdir -p /home/lance/warehouse /home/lance/spark-events /home/lance/data
 # Copy tests
 RUN mkdir -p /home/lance/tests
 COPY integration-tests/ /home/lance/tests/
+RUN javac -cp "${SPARK_HOME}/jars/*" /home/lance/tests/LanceRestDirNamespaceServer.java
 
 WORKDIR ${SPARK_HOME}
 COPY docker/entrypoint.sh .

diff --git a/docs/src/config.md b/docs/src/config.md
@@ -49,6 +49,9 @@ Lance provides SQL extensions that add additional functionality beyond standard
 
 The following features require the Lance Spark SQL extension to be enabled:
 
+- [VECTOR_SEARCH](operations/dql/vector-search.md) - Run vector similarity search through Lance namespace execution
+- [SEARCH](operations/dql/search.md) - Run full-text search through Lance namespace execution
+- [HYBRID_SEARCH](operations/dql/hybrid-search.md) - Combine vector and full-text search with reciprocal rank fusion
 - [ADD COLUMNS with backfill](operations/dml/add-columns.md) - Add new columns and backfill existing rows with data
 - [UPDATE COLUMNS with backfill](operations/dml/update-columns.md) - Update existing columns using data from a source
 - [OPTIMIZE](operations/ddl/optimize.md) - Compact table fragments for improved query performance

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -17,6 +17,7 @@ Specifically, you can use the Apache Spark Connector for Lance to:
 * **Read & Write Lance Datasets**: Seamlessly read and write datasets stored in the Lance format using Spark.
 * **Distributed, Parallel Scans**: Leverage Spark's distributed computing capabilities to perform parallel scans on Lance datasets.
 * **Column and Filter Pushdown**: Optimize query performance by pushing down column selections and filters to the data source.
+* **SQL Search Table Functions**: Run [vector](operations/dql/vector-search.md), [full-text](operations/dql/search.md), and [hybrid](operations/dql/hybrid-search.md) search through Lance namespace execution.
 
 ## Quick Start
 
@@ -28,4 +29,4 @@ make docker-build
 make docker-up
 ```
 
-And then open the notebook at `http://localhost:8888`.
+And then open the notebook at `http://localhost:8888`.
diff --git a/docs/src/operations/ddl/create-index.md b/docs/src/operations/ddl/create-index.md
@@ -147,6 +147,8 @@ Create an FTS index on a text column:
     );
     ```
 
+Query the indexed column with the [SEARCH](../dql/search.md) table function.
+
 ## Output
 
 The `CREATE INDEX` command returns the following information about the operation:

diff --git a/docs/src/operations/dql/.pages b/docs/src/operations/dql/.pages
@@ -1,3 +1,6 @@
 title: DQL
 nav:
   - select.md
+  - vector-search.md
+  - search.md
+  - hybrid-search.md
diff --git a/docs/src/operations/dql/hybrid-search.md b/docs/src/operations/dql/hybrid-search.md
@@ -0,0 +1,86 @@
+# HYBRID_SEARCH
+
+Run vector search and full-text search together from Spark SQL, then rerank the combined results with reciprocal rank fusion.
+
+!!! warning "Spark Extension Required"
+    `HYBRID_SEARCH` requires the Lance Spark SQL extension to be enabled. See [Spark SQL Extensions](../../config.md#spark-sql-extensions) for configuration details.
+
+!!! note "Namespace Tables Required"
+    `HYBRID_SEARCH` resolves the `table` argument through a Spark catalog and executes both side queries through the Lance namespace `queryTable` API. Use a Lance namespace catalog table such as `lance.default.documents`, not a raw Lance dataset path.
+
+!!! note "Named Arguments"
+    Named arguments require Spark 3.5 or later. On Spark 3.4, use the positional form.
+
+## Basic Usage
+
+`HYBRID_SEARCH` returns the selected table columns plus `_distance`, `_score`, and `_relevance_score`. Rows that only match one side have null for the other side's metric.
+
+=== "SQL"
+    ```sql
+    SELECT id, body, _distance, _score, _relevance_score
+    FROM HYBRID_SEARCH(
+        table => 'lance.default.documents',
+        query_vector => array(0.12, 0.34, 0.56, 0.78),
+        query => 'vector database',
+        vector_column => 'embedding',
+        search_columns => array('body'),
+        columns => array('id', 'body'),
+        num_results => 10,
+        candidates => 50,
+        rrf_k => 60.0
+    )
+    ORDER BY _relevance_score DESC;
+    ```
+
+## Positional Form
+
+Use positional arguments for simple calls and Spark 3.4 compatibility.
+
+=== "SQL"
+    ```sql
+    SELECT *
+    FROM HYBRID_SEARCH('lance.default.documents', array(0.12, 0.34, 0.56), 'lance', 5);
+    ```
+
+## Arguments
+
+| Argument | Type | Required | Description |
+|----------|------|----------|-------------|
+| `table` | String | Yes | Catalog table name to search. |
+| `query_vector` | Array numeric literal | Yes | Query vector. |
+| `query` or `search_query` | String | Yes | Full-text query string. |
+| `vector_column` | String | No | Vector column name. Lance defaults to `vector` when omitted. |
+| `search_columns` | Array string literal | No | Text columns to search. When omitted, Lance uses the indexed columns configured for the FTS index. |
+| `num_results`, `limit`, or `k` | Integer | No | Number of final reranked results. Defaults to `10`. |
+| `candidates`, `num_candidates`, or `candidate_count` | Integer | No | Number of rows to fetch from each side before reranking. Defaults to `num_results + offset`. Values below `num_results + offset` are raised to that minimum. |
+| `rrf_k` | Float | No | Reciprocal rank fusion constant. Defaults to `60.0`. |
+| `columns` | Array string literal | No | Output table columns. `_distance`, `_score`, and `_relevance_score` are always included. Use `array('*')` or omit this argument for all table columns. |
+| `filter` | String | No | SQL filter expression evaluated by Lance on both side queries. |
+| `offset` | Integer | No | Number of reranked results to skip after fusion. Defaults to `0`. |
+| `version` | Long | No | Lance table version to search. |
+| `distance_type` | String | No | Distance metric such as `l2`, `cosine`, or `dot`. |
+| `nprobes`, `ef`, `refine_factor` | Integer | No | Vector index search tuning parameters. |
+| `lower_bound`, `upper_bound` | Float | No | Distance bounds. |
+| `bypass_vector_index`, `fast_search`, `prefilter`, `with_row_id` | Boolean | No | Lance query options. `with_row_id` adds `_rowid` to the output. |
+
+## Reranking
+
+Hybrid search performs reciprocal rank fusion in Spark:
+
+```text
+_relevance_score = sum(1.0 / (rank + rrf_k))
+```
+
+Ranks are zero-based in each side's result set, and the sum runs over the sides (vector and full-text) in which a row appears. `candidates` controls how many rows are fetched from each side before reranking.
+
+## Output
+
+The result includes the requested table columns plus nullable `_distance` and `_score` float columns and a non-null `_relevance_score` float column. If `with_row_id => true`, or if `_rowid` is listed in `columns`, the result also includes Lance row ids.
+
+## Execution
+
+Spark plans `HYBRID_SEARCH` as a DataSource V2 batch read with one input partition. The partition reader issues one vector `queryTable` request and one full-text `queryTable` request through the Lance namespace API, merges the two result sets in Spark with reciprocal rank fusion, and returns the final rows. With a REST namespace the two side searches can be handled by the REST server, while the final fusion currently happens in the Spark task.
+
+## Validation
+
+The Docker integration suite covers `HYBRID_SEARCH` against the directory namespace and a REST namespace backed by a directory namespace. The `Spark Search Docker` GitHub Actions workflow runs both backends for pull requests.
-Original file line number
+Diff line change
@@ Expand Up / @@ -147,6 +147,8 @@ Create an FTS index on a text column: @@
         );
         ```
+    Query the indexed column with the [SEARCH](../dql/search.md) table function.
     ## Output
     The `CREATE INDEX` command returns the following information about the operation:
@@ Expand Down @@