Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 179 additions & 0 deletions .github/workflows/spark-search.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Workflow that exercises the search table-function pytest class through
# `make docker-test` (see the job definition below the trigger section).
name: 'Spark Search Docker'

on:
  # Pull requests trigger the workflow only when the workflow itself, the
  # Docker test harness, search sources, or build files change.
  pull_request:
    types: [opened, synchronize, ready_for_review, reopened]
    paths:
      - '.github/workflows/spark-search.yml'
      - 'Makefile'
      - 'docker/**'
      - 'integration-tests/**'
      - 'lance-spark-base_2.12/src/main/java/org/lance/spark/search/**'
      - 'lance-spark-base_2.12/src/main/scala/org/lance/spark/search/**'
      - 'lance-spark-base_2.12/src/test/java/org/lance/spark/search/**'
      - 'lance-spark-*/src/main/scala/org/lance/spark/extensions/**'
      - 'pom.xml'
      - '*/pom.xml'
  # Manual runs can override the versions, the backend list, and the REST
  # namespace settings.
  workflow_dispatch:
    inputs:
      spark-version:
        description: 'Spark version to test'
        required: true
        default: '3.5'
      scala-version:
        description: 'Scala version to test'
        required: true
        default: '2.13'
      backends:
        description: 'Comma-separated test backends: local or local,rest-dir'
        required: true
        default: 'local,rest-dir'
      rest-uri:
        description: 'Optional REST namespace URI. If omitted, tests start a local REST directory namespace.'
        required: false
        default: ''
      rest-database:
        description: 'Optional database header value for an external REST namespace'
        required: false
        default: ''
      docker-run-args:
        description: 'Extra docker run args for docker-test'
        required: false
        default: ''

# The workflow token is limited to read-only access to repository contents.
permissions: {contents: read}

# Runs are grouped per pull request number (or per ref when there is no pull
# request); a newer run in the same group cancels the one in progress.
concurrency:
  cancel-in-progress: true
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"

# Workflow-wide settings shared by every step. Each value falls back to its
# default when no workflow_dispatch input is present.
env:
  SPARK_VERSION: "${{ github.event.inputs['spark-version'] || '3.5' }}"
  SCALA_VERSION: "${{ github.event.inputs['scala-version'] || '2.13' }}"
  # Comma-separated backend list; the test steps match on 'local' and 'rest-dir'.
  SEARCH_TEST_BACKENDS: "${{ github.event_name == 'workflow_dispatch' && github.event.inputs.backends || 'local,rest-dir' }}"
  # Targeted pytest invocation handed to `make docker-test` as PYTEST_CMD.
  SEARCH_PYTEST_CMD: >-
    pytest /home/lance/tests/test_lance_spark.py::TestDQLSearchTableFunctions
    -v --timeout=180

jobs:
  # Builds the bundle and the Docker test image, then runs the targeted search
  # pytest command once per requested backend (local and/or rest-dir).
  search-docker-test:
    name: Search Docker Test
    runs-on: ubuntu-24.04
    timeout-minutes: 90
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
          # No later step fetches or pushes, so do not leave the token in
          # the checked-out repository's git config.
          persist-credentials: false
      - name: Set up Java
        uses: actions/setup-java@v4
        with:
          distribution: temurin
          java-version: "17"
          cache: "maven"
      - name: Resolve Docker build args
        id: docker-args
        # The make target's stdout is appended to the step outputs; later
        # steps read spark-download-version, py4j-version, spark-scala-suffix
        # and lance-namespace-impl-version from it.
        run: |
          make print-docker-build-args \
            SPARK_VERSION="${SPARK_VERSION}" \
            SCALA_VERSION="${SCALA_VERSION}" >> "$GITHUB_OUTPUT"
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build test-base image (cached)
        uses: docker/build-push-action@v6
        with:
          context: docker
          file: docker/Dockerfile.test-base
          # Load into the local Docker daemon so the test image build can use it.
          load: true
          tags: lance-spark-test-base:${{ env.SPARK_VERSION }}_${{ env.SCALA_VERSION }}
          build-args: |
            SPARK_DOWNLOAD_VERSION=${{ steps.docker-args.outputs.spark-download-version }}
            SPARK_MAJOR_VERSION=${{ env.SPARK_VERSION }}
            SCALA_VERSION=${{ env.SCALA_VERSION }}
            PY4J_VERSION=${{ steps.docker-args.outputs.py4j-version }}
            SPARK_SCALA_SUFFIX=${{ steps.docker-args.outputs.spark-scala-suffix }}
          cache-from: type=gha,scope=search-test-base-${{ env.SPARK_VERSION }}_${{ env.SCALA_VERSION }}
          cache-to: type=gha,mode=max,scope=search-test-base-${{ env.SPARK_VERSION }}_${{ env.SCALA_VERSION }}
      - name: Build bundle
        run: make bundle SPARK_VERSION="${SPARK_VERSION}" SCALA_VERSION="${SCALA_VERSION}"
      - name: Build test image
        env:
          # Pass the step output through the environment instead of
          # interpolating the expression into the shell script.
          RESOLVED_NAMESPACE_IMPL_VERSION: ${{ steps.docker-args.outputs.lance-namespace-impl-version }}
        run: |
          make docker-build-test \
            SPARK_VERSION="${SPARK_VERSION}" \
            SCALA_VERSION="${SCALA_VERSION}" \
            LANCE_NAMESPACE_IMPL_VERSION="${RESOLVED_NAMESPACE_IMPL_VERSION}"
      - name: Run directory namespace search tests
        if: ${{ contains(env.SEARCH_TEST_BACKENDS, 'local') }}
        run: |
          make docker-test \
            SPARK_VERSION="${SPARK_VERSION}" \
            SCALA_VERSION="${SCALA_VERSION}" \
            TEST_BACKENDS=local \
            PYTEST_CMD="${SEARCH_PYTEST_CMD}"
      - name: Resolve REST namespace URI
        id: rest
        if: ${{ contains(env.SEARCH_TEST_BACKENDS, 'rest-dir') }}
        env:
          INPUT_REST_URI: ${{ github.event.inputs['rest-uri'] }}
          INPUT_DOCKER_RUN_ARGS: ${{ github.event.inputs['docker-run-args'] }}
        run: |
          rest_uri="${INPUT_REST_URI}"
          docker_run_args="${INPUT_DOCKER_RUN_ARGS}"
          start_rest_dir="false"
          rest_dir_root=""
          rest_dir_port=""

          # Without an external URI, ask the tests to start a local REST
          # directory namespace and point the URI at it.
          if [ -z "${rest_uri}" ]; then
            rest_dir_port="10024"
            rest_dir_root="/home/lance/rest-data"
            rest_uri="http://127.0.0.1:${rest_dir_port}"
            start_rest_dir="true"
          fi

          {
            echo "uri=${rest_uri}"
            echo "start_rest_dir=${start_rest_dir}"
            echo "rest_dir_root=${rest_dir_root}"
            echo "rest_dir_port=${rest_dir_port}"
          } >> "$GITHUB_OUTPUT"

          # docker_run_args is free-form user input: use a random heredoc
          # delimiter so no line of the value can terminate the block early,
          # and printf so the value is never parsed as echo options.
          delimiter="$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "docker_run_args<<${delimiter}"
            printf '%s\n' "${docker_run_args}"
            echo "${delimiter}"
          } >> "$GITHUB_OUTPUT"
      - name: Run REST directory namespace search tests
        if: ${{ contains(env.SEARCH_TEST_BACKENDS, 'rest-dir') }}
        env:
          LANCE_SPARK_REST_URI: ${{ steps.rest.outputs.uri }}
          LANCE_SPARK_REST_API_KEY: ${{ secrets.LANCE_SPARK_REST_API_KEY }}
          LANCE_SPARK_REST_DATABASE: ${{ github.event.inputs['rest-database'] }}
          LANCE_SPARK_START_REST_DIR: ${{ steps.rest.outputs.start_rest_dir }}
          LANCE_SPARK_REST_DIR_ROOT: ${{ steps.rest.outputs.rest_dir_root }}
          LANCE_SPARK_REST_DIR_PORT: ${{ steps.rest.outputs.rest_dir_port }}
          DOCKER_RUN_ARGS: ${{ steps.rest.outputs.docker_run_args }}
        run: |
          make docker-test \
            SPARK_VERSION="${SPARK_VERSION}" \
            SCALA_VERSION="${SCALA_VERSION}" \
            TEST_BACKENDS=rest-dir \
            LANCE_SPARK_REST_URI="${LANCE_SPARK_REST_URI}" \
            LANCE_SPARK_REST_API_KEY="${LANCE_SPARK_REST_API_KEY}" \
            LANCE_SPARK_REST_DATABASE="${LANCE_SPARK_REST_DATABASE}" \
            LANCE_SPARK_START_REST_DIR="${LANCE_SPARK_START_REST_DIR}" \
            LANCE_SPARK_REST_DIR_ROOT="${LANCE_SPARK_REST_DIR_ROOT}" \
            LANCE_SPARK_REST_DIR_PORT="${LANCE_SPARK_REST_DIR_PORT}" \
            DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS}" \
            PYTEST_CMD="${SEARCH_PYTEST_CMD}"
32 changes: 32 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,38 @@ To auto-format the code, run:
make format
```

## Docker Integration Tests

Build the Spark bundle and Docker integration-test image before running Docker tests:

```shell
make bundle SPARK_VERSION=3.5 SCALA_VERSION=2.13
make docker-build-test SPARK_VERSION=3.5 SCALA_VERSION=2.13
make docker-test SPARK_VERSION=3.5 SCALA_VERSION=2.13
```

Use `PYTEST_CMD` to run a targeted pytest path in the Docker image. For example, run only the SQL search table-function tests against the directory namespace:

```shell
make docker-test SPARK_VERSION=3.5 SCALA_VERSION=2.13 \
TEST_BACKENDS=local \
PYTEST_CMD="pytest /home/lance/tests/test_lance_spark.py::TestDQLSearchTableFunctions -v --timeout=180"
```

To also validate a REST namespace backed by a directory namespace, let the Docker test container start the OSS Lance REST adapter:

```shell
make docker-test SPARK_VERSION=3.5 SCALA_VERSION=2.13 \
TEST_BACKENDS=local,rest-dir \
LANCE_SPARK_START_REST_DIR=true \
LANCE_SPARK_REST_URI=http://127.0.0.1:10024 \
PYTEST_CMD="pytest /home/lance/tests/test_lance_spark.py::TestDQLSearchTableFunctions -v --timeout=180"
```

To run against an already-running compatible REST namespace server instead, omit `LANCE_SPARK_START_REST_DIR` and pass that server's URI with `LANCE_SPARK_REST_URI`.

The `Spark Search Docker` GitHub Actions workflow runs the same targeted Docker tests. Pull requests run directory namespace and REST-directory namespace coverage automatically. Use workflow dispatch with `rest-uri` only when validating against an external REST namespace server.

## Documentation

### Setup
Expand Down
11 changes: 10 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ endif
DOCKER_CACHE_FROM ?=
DOCKER_CACHE_TO ?=
LANCE_NAMESPACE_IMPL_VERSION ?= $(shell sed -n 's:.*<lance-namespace-impl.version>\(.*\)</lance-namespace-impl.version>.*:\1:p' pom.xml | head -n 1)
PYTEST_CMD ?= pytest /home/lance/tests/ -v --timeout=180

DOCKER_COMPOSE := $(shell \
if docker compose version >/dev/null 2>&1; then \
Expand Down Expand Up @@ -190,6 +191,12 @@ docker-test:
$(if $(LANCEDB_API_KEY),-e LANCEDB_API_KEY=$(LANCEDB_API_KEY)) \
$(if $(LANCEDB_HOST_OVERRIDE),-e LANCEDB_HOST_OVERRIDE=$(LANCEDB_HOST_OVERRIDE)) \
$(if $(LANCEDB_REGION),-e LANCEDB_REGION=$(LANCEDB_REGION)) \
$(if $(LANCE_SPARK_REST_URI),-e LANCE_SPARK_REST_URI=$(LANCE_SPARK_REST_URI)) \
$(if $(LANCE_SPARK_REST_API_KEY),-e LANCE_SPARK_REST_API_KEY=$(LANCE_SPARK_REST_API_KEY)) \
$(if $(LANCE_SPARK_REST_DATABASE),-e LANCE_SPARK_REST_DATABASE=$(LANCE_SPARK_REST_DATABASE)) \
$(if $(LANCE_SPARK_START_REST_DIR),-e LANCE_SPARK_START_REST_DIR=$(LANCE_SPARK_START_REST_DIR)) \
$(if $(LANCE_SPARK_REST_DIR_ROOT),-e LANCE_SPARK_REST_DIR_ROOT=$(LANCE_SPARK_REST_DIR_ROOT)) \
$(if $(LANCE_SPARK_REST_DIR_PORT),-e LANCE_SPARK_REST_DIR_PORT=$(LANCE_SPARK_REST_DIR_PORT)) \
$(if $(TEST_BACKENDS),-e TEST_BACKENDS=$(TEST_BACKENDS)) \
$(if $(LANCE_FTS_FORMAT_VERSION),-e LANCE_FTS_FORMAT_VERSION=$(LANCE_FTS_FORMAT_VERSION)) \
$(if $(AWS_REGION),-e AWS_REGION=$(AWS_REGION)) \
Expand All @@ -203,8 +210,9 @@ docker-test:
$(if $(AWS_SESSION_TOKEN),-e AWS_SESSION_TOKEN=$(AWS_SESSION_TOKEN)) \
$(if $(AWS_PROFILE),-e AWS_PROFILE=$(AWS_PROFILE)) \
$(if $(AWS_PROFILE),-v $(HOME)/.aws:/root/.aws:ro) \
$(DOCKER_RUN_ARGS) \
lance-spark-test:$(SPARK_VERSION)_$(SCALA_VERSION) \
"pytest /home/lance/tests/ -v --timeout=180"
"$(PYTEST_CMD)"

# =============================================================================
# Benchmark
Expand Down Expand Up @@ -295,6 +303,7 @@ help:
@echo " docker-build-test-base - Build test base image (system deps + Spark)"
@echo " docker-build-test - Build test image (base + bundle JAR)"
@echo " docker-test - Run integration tests in lance-spark-test container"
@echo " Override PYTEST_CMD to run a targeted pytest command"
@echo ""
@echo "Benchmark:"
@echo " benchmark-build - Build benchmark jar (shared by TPC-DS and TPC-H)"
Expand Down
1 change: 1 addition & 0 deletions docker/Dockerfile.test
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ RUN mkdir -p /home/lance/warehouse /home/lance/spark-events /home/lance/data
# Copy tests
RUN mkdir -p /home/lance/tests
COPY integration-tests/ /home/lance/tests/
RUN javac -cp "${SPARK_HOME}/jars/*" /home/lance/tests/LanceRestDirNamespaceServer.java

WORKDIR ${SPARK_HOME}
COPY docker/entrypoint.sh .
Expand Down
3 changes: 3 additions & 0 deletions docs/src/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ Lance provides SQL extensions that add additional functionality beyond standard

The following features require the Lance Spark SQL extension to be enabled:

- [VECTOR_SEARCH](operations/dql/vector-search.md) - Run vector similarity search through Lance namespace execution
- [SEARCH](operations/dql/search.md) - Run full-text search through Lance namespace execution
- [HYBRID_SEARCH](operations/dql/hybrid-search.md) - Combine vector and full-text search with reciprocal rank fusion
- [ADD COLUMNS with backfill](operations/dml/add-columns.md) - Add new columns and backfill existing rows with data
- [UPDATE COLUMNS with backfill](operations/dml/update-columns.md) - Update existing columns using data from a source
- [OPTIMIZE](operations/ddl/optimize.md) - Compact table fragments for improved query performance
Expand Down
3 changes: 2 additions & 1 deletion docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Specifically, you can use the Apache Spark Connector for Lance to:
* **Read & Write Lance Datasets**: Seamlessly read and write datasets stored in the Lance format using Spark.
* **Distributed, Parallel Scans**: Leverage Spark's distributed computing capabilities to perform parallel scans on Lance datasets.
* **Column and Filter Pushdown**: Optimize query performance by pushing down column selections and filters to the data source.
* **SQL Search Table Functions**: Run [vector](operations/dql/vector-search.md), [full-text](operations/dql/search.md), and [hybrid](operations/dql/hybrid-search.md) search through Lance namespace execution.

## Quick Start

Expand All @@ -28,4 +29,4 @@ make docker-build
make docker-up
```

And then open the notebook at `http://localhost:8888`.
And then open the notebook at `http://localhost:8888`.
2 changes: 2 additions & 0 deletions docs/src/operations/ddl/create-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ Create an FTS index on a text column:
);
```

Query the indexed column with the [SEARCH](../dql/search.md) table function.

## Output

The `CREATE INDEX` command returns the following information about the operation:
Expand Down
3 changes: 3 additions & 0 deletions docs/src/operations/dql/.pages
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Section title and page order for the DQL documentation folder.
title: 'DQL'
nav:
  - 'select.md'
  - 'vector-search.md'
  - 'search.md'
  - 'hybrid-search.md'
86 changes: 86 additions & 0 deletions docs/src/operations/dql/hybrid-search.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# HYBRID_SEARCH

Run vector search and full-text search together from Spark SQL, then rerank the combined results with reciprocal rank fusion.

!!! warning "Spark Extension Required"
`HYBRID_SEARCH` requires the Lance Spark SQL extension to be enabled. See [Spark SQL Extensions](../../config.md#spark-sql-extensions) for configuration details.

!!! note "Namespace Tables Required"
`HYBRID_SEARCH` resolves the `table` argument through a Spark catalog and executes both side queries through the Lance namespace `queryTable` API. Use a Lance namespace catalog table such as `lance.default.documents`, not a raw Lance dataset path.

!!! note "Named Arguments"
Named arguments require Spark 3.5 or later. On Spark 3.4, use the positional form.

## Basic Usage

`HYBRID_SEARCH` returns the selected table columns plus `_distance`, `_score`, and `_relevance_score`. A row that matches only one side has `NULL` in the other side's metric column.

=== "SQL"
```sql
SELECT id, body, _distance, _score, _relevance_score
FROM HYBRID_SEARCH(
table => 'lance.default.documents',
query_vector => array(0.12, 0.34, 0.56, 0.78),
query => 'vector database',
vector_column => 'embedding',
search_columns => array('body'),
columns => array('id', 'body'),
num_results => 10,
candidates => 50,
rrf_k => 60.0
)
ORDER BY _relevance_score DESC;
```

## Positional Form

Use positional arguments for simple calls and Spark 3.4 compatibility.

=== "SQL"
```sql
SELECT *
FROM HYBRID_SEARCH('lance.default.documents', array(0.12, 0.34, 0.56), 'lance', 5);
```

## Arguments

| Argument | Type | Required | Description |
|----------|------|----------|-------------|
| `table` | String | Yes | Catalog table name to search. |
| `query_vector` | Array numeric literal | Yes | Query vector. |
| `query` or `search_query` | String | Yes | Full-text query string. |
| `vector_column` | String | No | Vector column name. Lance defaults to `vector` when omitted. |
| `search_columns` | Array string literal | No | Text columns to search. When omitted, Lance uses the indexed columns configured for the FTS index. |
| `num_results`, `limit`, or `k` | Integer | No | Number of final reranked results. Defaults to `10`. |
| `candidates`, `num_candidates`, or `candidate_count` | Integer | No | Number of rows to fetch from each side before reranking. Defaults to `num_results + offset`. Values below `num_results + offset` are raised to that minimum. |
| `rrf_k` | Float | No | Reciprocal rank fusion constant. Defaults to `60.0`. |
| `columns` | Array string literal | No | Output table columns. `_distance`, `_score`, and `_relevance_score` are always included. Use `array('*')` or omit this argument for all table columns. |
| `filter` | String | No | SQL filter expression evaluated by Lance on both side queries. |
| `offset` | Integer | No | Number of reranked results to skip after fusion. Defaults to `0`. |
| `version` | Long | No | Lance table version to search. |
| `distance_type` | String | No | Distance metric such as `l2`, `cosine`, or `dot`. |
| `nprobes`, `ef`, `refine_factor` | Integer | No | Vector index search tuning parameters. |
| `lower_bound`, `upper_bound` | Float | No | Distance bounds. |
| `bypass_vector_index`, `fast_search`, `prefilter`, `with_row_id` | Boolean | No | Lance query options. `with_row_id` adds `_rowid` to the output. |

## Reranking

Hybrid search performs reciprocal rank fusion in Spark:

```text
_relevance_score = sum(1.0 / (rank + rrf_k))
```

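The sum runs over each side (vector and full-text) whose result set contains the row, and `rank` is the row's zero-based position in that side's result set. `candidates` controls how many rows are fetched from each side before reranking.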

## Output

The result includes the requested table columns plus nullable `_distance` and `_score` float columns and a non-null `_relevance_score` float column. If `with_row_id => true`, or if `_rowid` is listed in `columns`, the result also includes Lance row ids.

## Execution

Spark plans `HYBRID_SEARCH` as a DataSource V2 batch read with one input partition. The partition reader issues one vector `queryTable` request and one full-text `queryTable` request through the Lance namespace API, merges the two result sets in Spark with reciprocal rank fusion, and returns the final rows. With a REST namespace, the two side searches can be handled by the REST server, while the final fusion currently happens in the Spark task.

## Validation

The Docker integration suite covers `HYBRID_SEARCH` against the directory namespace and a REST namespace backed by a directory namespace. The `Spark Search Docker` GitHub Actions workflow runs both backends for pull requests.
Loading
Loading