Skip to content

Latest commit

 

History

History

Folders and files

Name
Last commit message
Last commit date

parent directory

..
 
 
 
 
 
 
 
 
 
 

README.md

datafusion-bio-function-ranges

Interval join, coverage, count-overlaps, nearest-neighbor, overlap, merge, cluster, complement, and subtract operations for Apache DataFusion.

This crate provides optimized genomic interval operations as DataFusion extensions:

  • Interval join — SQL joins with range overlap conditions, optimized via multiple interval tree algorithms
  • Coverage — base-pair overlap depth between two interval sets
  • Count overlaps — number of overlapping intervals per region
  • Nearest — nearest-neighbor interval matching
  • Overlap — all pairs of overlapping intervals between two tables
  • Merge — merge overlapping intervals within a single table
  • Cluster — annotate intervals with cluster membership (cluster ID + boundaries)
  • Complement — find gaps (uncovered regions), optionally bounded by a chromsizes view table
  • Subtract — basepair-level set difference between two interval sets

Quick Start

use datafusion_bio_function_ranges::{create_bio_session, register_ranges_functions};

// Option 1: Create a fully configured session (recommended)
let ctx = create_bio_session();

// Option 2: Register functions on an existing bio-configured session
use datafusion::config::ConfigOptions;
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_bio_function_ranges::{BioConfig, BioSessionExt};

let config = SessionConfig::from(ConfigOptions::new())
    .with_option_extension(BioConfig::default())
    .with_information_schema(true)
    .with_repartition_joins(false);
let ctx = SessionContext::new_with_bio(config);
register_ranges_functions(&ctx);

Registering Functions

register_ranges_functions(ctx)

Registers the coverage, count_overlaps, nearest, overlap, merge, cluster, complement, and subtract SQL table functions on an existing SessionContext. This is analogous to register_pileup_functions in the pileup crate.

use datafusion_bio_function_ranges::register_ranges_functions;

register_ranges_functions(&ctx);

create_bio_session()

Convenience function that creates a SessionContext with:

  • Custom query planner for automatic interval join detection
  • Physical optimizer rule that converts hash/nested-loop joins to interval joins
  • BioConfig extension for algorithm selection via SET bio.* statements
  • All SQL table functions: coverage(), count_overlaps(), nearest(), overlap(), merge(), cluster(), complement(), subtract()

use datafusion_bio_function_ranges::create_bio_session;

let ctx = create_bio_session();

SQL Table Functions

coverage(left_table, right_table [, columns...] [, filter_op])

Computes base-pair coverage depth. Builds an interval tree from left_table, then for each row in right_table, computes the total overlap in base pairs with the merged intervals.

-- Default column names: contig, pos_start, pos_end
SELECT * FROM coverage('reads', 'targets')

-- Custom shared column names
SELECT * FROM coverage('reads', 'targets', 'chrom', 'start', 'end')

-- Separate column names for left and right tables
SELECT * FROM coverage('reads', 'targets', 'chrom', 'start', 'end', 'contig', 'pos_start', 'pos_end')

-- For 0-based half-open coordinates (adjusts boundaries with +1/-1)
SELECT * FROM coverage('reads', 'targets', 'contig', 'pos_start', 'pos_end', 'strict')

count_overlaps(left_table, right_table [, columns...] [, filter_op])

Counts overlapping intervals. Same interface as coverage, but returns the count of overlapping (non-merged) intervals instead of base-pair overlap.

SELECT * FROM count_overlaps('reads', 'targets')

nearest(left_table, right_table [, k] [, overlap] [, columns...] [, filter_op])

Returns up to k nearest left intervals for each right interval.

  • k default: 1 (must be >= 1)
  • overlap default: true
    • true: overlapping intervals are returned first, then nearest non-overlaps if needed
    • false: overlaps are ignored, only nearest non-overlaps are returned
  • If the left table has no candidate interval with a matching key for a right row, a row is still emitted with NULL in the left_* columns.
  • Deterministic tie-break order is by (start, end, row_position) on the left side.

Output columns are prefixed with left_ and right_ to avoid ambiguity.

Accepted call forms:

  • nearest('left', 'right')
  • nearest('left', 'right', 3)
  • nearest('left', 'right', false) (use default k=1, disable overlap candidates)
  • nearest('left', 'right', 3, false)
  • nearest('left', 'right', 3, false, 'contig', 'pos_start', 'pos_end')
  • nearest('left', 'right', 3, false, 'l_contig', 'l_start', 'l_end', 'r_contig', 'r_start', 'r_end')
  • append 'strict' or 'weak' to any columns form above

-- Default k=1, include overlaps
SELECT * FROM nearest('targets', 'reads')

-- Top-3 nearest per right interval, ignoring overlaps
SELECT * FROM nearest('targets', 'reads', 3, false)

-- Default k=1, ignore overlaps
SELECT * FROM nearest('targets', 'reads', false)

-- 0-based half-open coordinates
SELECT * FROM nearest('targets', 'reads', 1, true, 'contig', 'pos_start', 'pos_end', 'strict')

-- Custom column names for left and right
SELECT * FROM nearest(
  'left_tbl',
  'right_tbl',
  5,
  true,
  'chrom', 'start', 'end',
  'chr', 'from', 'to'
)

overlap(left_table, right_table [, columns...] [, filter_op])

Returns all pairs of overlapping intervals between two tables. Output columns are prefixed with left_ and right_.

SELECT * FROM overlap('reads', 'targets')

-- With 0-based half-open coordinates
SELECT * FROM overlap('reads', 'targets', 'contig', 'pos_start', 'pos_end', 'strict')

-- Separate column names for left and right tables
SELECT * FROM overlap('reads', 'targets', 'l_chr', 'l_s', 'l_e', 'r_chr', 'r_s', 'r_e')

merge(table [, min_dist] [, columns...] [, filter_op])

Merges overlapping intervals within a single table. Returns merged intervals with an n_intervals count.

SELECT * FROM merge('intervals')

-- Merge intervals within distance 10
SELECT * FROM merge('intervals', 10)

-- With custom columns and strict mode
SELECT * FROM merge('intervals', 0, 'contig', 'pos_start', 'pos_end', 'strict')

cluster(table [, min_dist] [, columns...] [, filter_op])

Annotates each interval with its cluster membership. Returns all original rows plus cluster (ID), cluster_start, and cluster_end columns. Same argument pattern as merge.

SELECT * FROM cluster('intervals')

-- Cluster intervals within distance 5
SELECT * FROM cluster('intervals', 5)

-- With custom columns and strict mode
SELECT * FROM cluster('intervals', 0, 'contig', 'pos_start', 'pos_end', 'strict')

complement(table [, view_table] [, columns...] [, filter_op])

Finds gaps (uncovered regions) in the input intervals. Without a view table, gaps extend from 0 to i64::MAX per contig. With a view table (chromsizes), gaps are bounded to view coordinates.

-- Gaps relative to 0..MAX per contig
SELECT * FROM complement('intervals')

-- Gaps within chromsizes boundaries
SELECT * FROM complement('intervals', 'chromsizes')

-- With shared custom column names
SELECT * FROM complement('intervals', 'chromsizes', 'chrom', 'start', 'end')

-- With separate column names for input and view tables
SELECT * FROM complement('intervals', 'chromsizes', 'c1', 's1', 'e1', 'vc', 'vs', 've')

subtract(left_table, right_table [, columns...] [, filter_op])

Basepair-level set difference: removes portions of left intervals that overlap with right intervals. Fragments left intervals at overlap boundaries.

SELECT * FROM subtract('left_table', 'right_table')

-- With custom columns and strict mode
SELECT * FROM subtract('left_table', 'right_table', 'chrom', 'start', 'end', 'strict')

-- Separate column names for left and right tables
SELECT * FROM subtract('left_table', 'right_table', 'l_chr', 'l_s', 'l_e', 'r_chr', 'r_s', 'r_e')

Filter Operations

| Value | Description | Use when |
|---|---|---|
| 'weak' (default) | Standard overlap: a.start <= b.end AND a.end >= b.start | 1-based inclusive coordinates |
| 'strict' | Adjusted boundaries: queries with start+1, end-1 | 0-based half-open coordinates |

Interval Join (SQL)

When using a bio-configured session (create_bio_session() or BioSessionExt::new_with_bio()), SQL joins with range overlap conditions are automatically optimized:

-- Automatically detected and optimized as interval join
SELECT *
FROM reads
JOIN targets
  ON reads.contig = targets.contig
  AND reads.pos_start <= targets.pos_end
  AND reads.pos_end >= targets.pos_start

Algorithm Selection

-- Select interval join algorithm
SET bio.interval_join_algorithm = Coitrees;  -- default, best general performance

-- Available algorithms:
-- Coitrees, IntervalTree, ArrayIntervalTree, Lapper, SuperIntervals
-- CoitreesNearest (1 nearest match per right-side row)

Nearest Join

SET bio.interval_join_algorithm = CoitreesNearest;

SELECT *
FROM targets
JOIN reads
  ON targets.contig = reads.contig
  AND targets.pos_start <= reads.pos_end
  AND targets.pos_end >= reads.pos_start

Returns exactly one match per right-side row: the overlapping interval if one exists, otherwise the nearest interval by distance. If there is no matching key group on the left side, the row is emitted with nulls on left columns.

Programmatic API

For direct Rust usage without SQL:

use std::sync::Arc;
use datafusion_bio_function_ranges::{CountOverlapsProvider, FilterOp};

let provider = CountOverlapsProvider::new(
    Arc::new(ctx.clone()),
    "reads".to_string(),          // left table (built into interval tree)
    "targets".to_string(),        // right table (gets count/coverage column)
    targets_schema,               // Schema of the right table
    vec!["contig".into(), "pos_start".into(), "pos_end".into()],  // left columns
    vec!["contig".into(), "pos_start".into(), "pos_end".into()],  // right columns
    FilterOp::Weak,               // or FilterOp::Strict for 0-based half-open
    true,                         // true = coverage, false = count_overlaps
);
ctx.register_table("result", Arc::new(provider))?;
let df = ctx.sql("SELECT * FROM result").await?;

Migration from sequila-native

This crate replaces the sequila-core crate from the sequila-native repository. Existing functionality is unchanged; only names and the module structure have changed, and additional SQL table functions are now available.

Type Renames

sequila-native datafusion-bio-function-ranges
sequila_core::session_context::SeQuiLaSessionExt BioSessionExt
sequila_core::session_context::SequilaConfig BioConfig
sequila_core::session_context::Algorithm Algorithm
SessionContext::new_with_sequila(config) SessionContext::new_with_bio(config)

Configuration Namespace

sequila-native datafusion-bio-function-ranges
SET sequila.prefer_interval_join = true SET bio.prefer_interval_join = true
SET sequila.interval_join_algorithm = Coitrees SET bio.interval_join_algorithm = Coitrees
SET sequila.interval_join_low_memory = true SET bio.interval_join_low_memory = true

Registration Pattern

Before (sequila-native):

use sequila_core::session_context::{SeQuiLaSessionExt, SequilaConfig};

let mut sequila_config = SequilaConfig::default();
sequila_config.prefer_interval_join = true;

let config = SessionConfig::from(options)
    .with_option_extension(sequila_config);

let ctx = SessionContext::new_with_sequila(config);

After (datafusion-bio-function-ranges):

use datafusion_bio_function_ranges::{create_bio_session, register_ranges_functions};

// Simple: creates context with everything configured
let ctx = create_bio_session();

// Or manually:
use datafusion_bio_function_ranges::{BioConfig, BioSessionExt};

let config = SessionConfig::from(options)
    .with_option_extension(BioConfig::default());
let ctx = SessionContext::new_with_bio(config);
register_ranges_functions(&ctx);  // registers all SQL table functions (coverage(), count_overlaps(), nearest(), overlap(), merge(), cluster(), complement(), subtract())

New SQL Table Functions

All interval operations are available as SQL table functions:

SELECT * FROM coverage('reads', 'targets')
SELECT * FROM count_overlaps('reads', 'targets')
SELECT * FROM nearest('targets', 'reads')
SELECT * FROM overlap('reads', 'targets')
SELECT * FROM merge('intervals')
SELECT * FROM cluster('intervals')
SELECT * FROM complement('intervals', 'chromsizes')
SELECT * FROM subtract('left_table', 'right_table')

Dependency Update

Before:

sequila-core = { git = "https://github.com/biodatageeks/sequila-native.git", rev = "..." }

After:

datafusion-bio-function-ranges = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "..." }

Version Compatibility

Dependency Version
DataFusion 50.3.0
Arrow 56.1.0
Rust edition 2024

These versions must stay in sync with datafusion-bio-formats and polars-bio.

License

This crate is licensed under the Apache License 2.0, consistent with the rest of the datafusion-bio-functions workspace.

The vendored superintervals sub-crate (in superintervals/) is licensed under the MIT License by Kez Cleal. MIT is a permissive license fully compatible with Apache 2.0 — MIT-licensed code can be included in Apache 2.0 projects without restriction.