Interval join, coverage, count-overlaps, nearest-neighbor, overlap, merge, cluster, complement, and subtract operations for Apache DataFusion.
This crate provides optimized genomic interval operations as DataFusion extensions:
- Interval join — SQL joins with range overlap conditions, optimized via multiple interval tree algorithms
- Coverage — base-pair overlap depth between two interval sets
- Count overlaps — number of overlapping intervals per region
- Nearest — nearest-neighbor interval matching
- Overlap — all pairs of overlapping intervals between two tables
- Merge — merge overlapping intervals within a single table
- Cluster — annotate intervals with cluster membership (cluster ID + boundaries)
- Complement — find gaps (uncovered regions) relative to an optional chromsizes view
- Subtract — basepair-level set difference between two interval sets
use datafusion_bio_function_ranges::{create_bio_session, register_ranges_functions};
// Option 1: Create a fully configured session (recommended)
let ctx = create_bio_session();
// Option 2: Register functions on an existing bio-configured session
use datafusion::config::ConfigOptions;
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_bio_function_ranges::{BioConfig, BioSessionExt};
let config = SessionConfig::from(ConfigOptions::new())
.with_option_extension(BioConfig::default())
.with_information_schema(true)
.with_repartition_joins(false);
let ctx = SessionContext::new_with_bio(config);
register_ranges_functions(&ctx);
register_ranges_functions registers the coverage, count_overlaps, nearest, overlap, merge, cluster, complement, and subtract SQL table functions on an existing SessionContext. This is analogous to register_pileup_functions in the pileup crate.
use datafusion_bio_function_ranges::register_ranges_functions;
register_ranges_functions(&ctx);
create_bio_session is a convenience function that creates a SessionContext with:
- Custom query planner for automatic interval join detection
- Physical optimizer rule that converts hash/nested-loop joins to interval joins
- BioConfig extension for algorithm selection via SET bio.* statements
- All SQL table functions: coverage(), count_overlaps(), nearest(), overlap(), merge(), cluster(), complement(), subtract()
use datafusion_bio_function_ranges::create_bio_session;
let ctx = create_bio_session();
coverage() computes base-pair coverage depth. It builds an interval tree from left_table, then for each row in right_table, computes the total overlap in base pairs with the merged intervals.
-- Default column names: contig, pos_start, pos_end
SELECT * FROM coverage('reads', 'targets')
-- Custom shared column names
SELECT * FROM coverage('reads', 'targets', 'chrom', 'start', 'end')
-- Separate column names for left and right tables
SELECT * FROM coverage('reads', 'targets', 'chrom', 'start', 'end', 'contig', 'pos_start', 'pos_end')
-- For 0-based half-open coordinates (adjusts boundaries with +1/-1)
SELECT * FROM coverage('reads', 'targets', 'contig', 'pos_start', 'pos_end', 'strict')
count_overlaps() counts overlapping intervals. It has the same interface as coverage, but returns the count of overlapping (non-merged) intervals instead of base-pair overlap.
SELECT * FROM count_overlaps('reads', 'targets')
nearest() returns up to k nearest left intervals for each right interval.
- k default: 1 (must be >= 1)
- overlap default: true
  - true: overlapping intervals are returned first, then nearest non-overlaps if needed
  - false: overlaps are ignored, only nearest non-overlaps are returned
- If no keyed candidate exists for a right row, a row is still emitted with NULL in left_* columns.
- Deterministic tie-break order is by (start, end, row_position) on the left side.
Output columns are prefixed with left_ and right_ to avoid ambiguity.
Accepted call forms:
- nearest('left', 'right')
- nearest('left', 'right', 3)
- nearest('left', 'right', false) (use default k=1, disable overlap candidates)
- nearest('left', 'right', 3, false)
- nearest('left', 'right', 3, false, 'contig', 'pos_start', 'pos_end')
- nearest('left', 'right', 3, false, 'l_contig', 'l_start', 'l_end', 'r_contig', 'r_start', 'r_end')
- append 'strict' or 'weak' to any columns form above
-- Default k=1, include overlaps
SELECT * FROM nearest('targets', 'reads')
-- Top-3 nearest per right interval, ignoring overlaps
SELECT * FROM nearest('targets', 'reads', 3, false)
-- Default k=1, ignore overlaps
SELECT * FROM nearest('targets', 'reads', false)
-- 0-based half-open coordinates
SELECT * FROM nearest('targets', 'reads', 1, true, 'contig', 'pos_start', 'pos_end', 'strict')
-- Custom column names for left and right
SELECT * FROM nearest(
'left_tbl',
'right_tbl',
5,
true,
'chrom', 'start', 'end',
'chr', 'from', 'to'
)
overlap() returns all pairs of overlapping intervals between two tables. Output columns are prefixed with left_ and right_.
SELECT * FROM overlap('reads', 'targets')
-- With 0-based half-open coordinates
SELECT * FROM overlap('reads', 'targets', 'contig', 'pos_start', 'pos_end', 'strict')
-- Separate column names for left and right tables
SELECT * FROM overlap('reads', 'targets', 'l_chr', 'l_s', 'l_e', 'r_chr', 'r_s', 'r_e')
merge() merges overlapping intervals within a single table. It returns merged intervals with an n_intervals count.
SELECT * FROM merge('intervals')
-- Merge intervals within distance 10
SELECT * FROM merge('intervals', 10)
-- With custom columns and strict mode
SELECT * FROM merge('intervals', 0, 'contig', 'pos_start', 'pos_end', 'strict')
cluster() annotates each interval with its cluster membership. It returns all original rows plus cluster (ID), cluster_start, and cluster_end columns. It takes the same argument pattern as merge.
SELECT * FROM cluster('intervals')
-- Cluster intervals within distance 5
SELECT * FROM cluster('intervals', 5)
-- With custom columns and strict mode
SELECT * FROM cluster('intervals', 0, 'contig', 'pos_start', 'pos_end', 'strict')
complement() finds gaps (uncovered regions) in the input intervals. Without a view table, gaps extend from 0 to i64::MAX per contig. With a view table (chromsizes), gaps are bounded to view coordinates.
-- Gaps relative to 0..MAX per contig
SELECT * FROM complement('intervals')
-- Gaps within chromsizes boundaries
SELECT * FROM complement('intervals', 'chromsizes')
-- With shared custom column names
SELECT * FROM complement('intervals', 'chromsizes', 'chrom', 'start', 'end')
-- With separate column names for input and view tables
SELECT * FROM complement('intervals', 'chromsizes', 'c1', 's1', 'e1', 'vc', 'vs', 've')
subtract() computes a basepair-level set difference: it removes portions of left intervals that overlap with right intervals, fragmenting left intervals at overlap boundaries.
SELECT * FROM subtract('left_table', 'right_table')
-- With custom columns and strict mode
SELECT * FROM subtract('left_table', 'right_table', 'chrom', 'start', 'end', 'strict')
-- Separate column names for left and right tables
SELECT * FROM subtract('left_table', 'right_table', 'l_chr', 'l_s', 'l_e', 'r_chr', 'r_s', 'r_e')
| Value | Description | Use when |
|---|---|---|
| 'weak' (default) | Standard overlap: left.start <= right.end AND left.end >= right.start | 1-based inclusive coordinates |
| 'strict' | Adjusted boundaries: queries with start+1, end-1 | 0-based half-open coordinates |
When using a bio-configured session (create_bio_session() or BioSessionExt::new_with_bio()), SQL joins with range overlap conditions are automatically optimized:
-- Automatically detected and optimized as interval join
SELECT *
FROM reads
JOIN targets
ON reads.contig = targets.contig
AND reads.pos_start <= targets.pos_end
AND reads.pos_end >= targets.pos_start
-- Select interval join algorithm
SET bio.interval_join_algorithm = Coitrees; -- default, best general performance
-- Available algorithms:
-- Coitrees, IntervalTree, ArrayIntervalTree, Lapper, SuperIntervals
-- CoitreesNearest (1 nearest match per right-side row)
SET bio.interval_join_algorithm = CoitreesNearest;
SELECT *
FROM targets
JOIN reads
ON targets.contig = reads.contig
AND targets.pos_start <= reads.pos_end
AND targets.pos_end >= reads.pos_start
With CoitreesNearest, the join returns exactly one match per right-side row: the overlapping interval if one exists, otherwise the nearest interval by distance. If there is no matching key group on the left side, the row is emitted with nulls on left columns.
For direct Rust usage without SQL:
use std::sync::Arc;
use datafusion_bio_function_ranges::{CountOverlapsProvider, FilterOp};
let provider = CountOverlapsProvider::new(
Arc::new(ctx.clone()),
"reads".to_string(), // left table (built into interval tree)
"targets".to_string(), // right table (gets count/coverage column)
targets_schema, // Schema of the right table
vec!["contig".into(), "pos_start".into(), "pos_end".into()], // left columns
vec!["contig".into(), "pos_start".into(), "pos_end".into()], // right columns
FilterOp::Weak, // or FilterOp::Strict for 0-based half-open
true, // true = coverage, false = count_overlaps
);
ctx.register_table("result", Arc::new(provider))?;
let df = ctx.sql("SELECT * FROM result").await?;
This crate replaces the sequila-core crate from the sequila-native repository. The functionality is identical; only names and the module structure have changed.
| sequila-native | datafusion-bio-function-ranges |
|---|---|
| sequila_core::session_context::SeQuiLaSessionExt | BioSessionExt |
| sequila_core::session_context::SequilaConfig | BioConfig |
| sequila_core::session_context::Algorithm | Algorithm |
| SessionContext::new_with_sequila(config) | SessionContext::new_with_bio(config) |
| sequila-native | datafusion-bio-function-ranges |
|---|---|
| SET sequila.prefer_interval_join = true | SET bio.prefer_interval_join = true |
| SET sequila.interval_join_algorithm = Coitrees | SET bio.interval_join_algorithm = Coitrees |
| SET sequila.interval_join_low_memory = true | SET bio.interval_join_low_memory = true |
Before (sequila-native):
use sequila_core::session_context::{SeQuiLaSessionExt, SequilaConfig};
let mut sequila_config = SequilaConfig::default();
sequila_config.prefer_interval_join = true;
let config = SessionConfig::from(options)
.with_option_extension(sequila_config);
let ctx = SessionContext::new_with_sequila(config);
After (datafusion-bio-function-ranges):
use datafusion_bio_function_ranges::{create_bio_session, register_ranges_functions};
// Simple: creates context with everything configured
let ctx = create_bio_session();
// Or manually:
use datafusion_bio_function_ranges::{BioConfig, BioSessionExt};
let config = SessionConfig::from(options)
.with_option_extension(BioConfig::default());
let ctx = SessionContext::new_with_bio(config);
register_ranges_functions(&ctx); // registers all SQL table functions: coverage(), count_overlaps(), nearest(), overlap(), merge(), cluster(), complement(), subtract()
All interval operations are available as SQL table functions:
SELECT * FROM coverage('reads', 'targets')
SELECT * FROM count_overlaps('reads', 'targets')
SELECT * FROM nearest('targets', 'reads')
SELECT * FROM overlap('reads', 'targets')
SELECT * FROM merge('intervals')
SELECT * FROM cluster('intervals')
SELECT * FROM complement('intervals', 'chromsizes')
SELECT * FROM subtract('left_table', 'right_table')
Before:
sequila-core = { git = "https://github.com/biodatageeks/sequila-native.git", rev = "..." }
After:
datafusion-bio-function-ranges = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "..." }
| Dependency | Version |
|---|---|
| DataFusion | 50.3.0 |
| Arrow | 56.1.0 |
| Rust edition | 2024 |
These versions must stay in sync with datafusion-bio-formats and polars-bio.
This crate is licensed under the Apache License 2.0, consistent with the rest of the datafusion-bio-functions workspace.
The vendored superintervals sub-crate (in superintervals/) is licensed under the MIT License by Kez Cleal. MIT is a permissive license fully compatible with Apache 2.0 — MIT-licensed code can be included in Apache 2.0 projects without restriction.