Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
3bd5933
Optimization of the unnest -> transform -> aggregate pattern which wi…
psuszyns Mar 8, 2026
fbb381b
bio-function-vcftools: fix error with transformed column types
psuszyns Mar 9, 2026
8fb81ab
cargo fmt --all
psuszyns Mar 9, 2026
5a789a7
vcftools: changes after running 'cargo clippy --all-targets --all-fea…
psuszyns Mar 11, 2026
d43a3e8
fix incorrect handling of nulls and mismatched array lengths
psuszyns Mar 11, 2026
8989be9
making code more readable, extracting functions
psuszyns Mar 11, 2026
4036a98
cargo fmt --all
psuszyns Mar 11, 2026
b19be99
vcftools: ORDER BY, DISTINCT and FILTER modifiers inside array_agg() …
psuszyns Mar 11, 2026
0d167be
vcftools: address review comments
psuszyns Mar 11, 2026
c20dcf1
vcftools: update README.md
psuszyns Mar 11, 2026
e347544
vcftools: remove obsolete comment
psuszyns Mar 11, 2026
f562312
vcftools: address most important comments from second review
psuszyns Mar 11, 2026
0e6d41d
vcftools: reuse DefaultPhysicalPlanner
psuszyns Mar 11, 2026
64d16af
vcftools: address remaining review comments
psuszyns Mar 12, 2026
cef1eb8
vcftools: check for the presence of a row identifying column in the G…
psuszyns Mar 12, 2026
c5935a0
vcftools: address remaining review comments
psuszyns Mar 12, 2026
82c23f5
vcftools: add support for transforming passthrough columns
psuszyns Mar 12, 2026
65c23f1
vcftools: minor changes after review
psuszyns Mar 12, 2026
9f16ae8
vcftools: add test for multiple transforms
psuszyns Mar 13, 2026
87ee1dc
formatting
psuszyns Mar 14, 2026
9c42a46
datafusion-bio-query-planner: single QueryPlanner for both bio-functio…
psuszyns Mar 22, 2026
2612a2b
formatting + clippy
psuszyns Mar 22, 2026
f48e1bd
new bio-function module with unified BioSessionExt
psuszyns Mar 23, 2026
b227339
vcf_sample_qc TVF: vectorized sample QC with direct VCF output
mwiewior Mar 27, 2026
242f56e
register vcf_sample_qc TVF in with_config_rt_bio session setup
mwiewior Mar 27, 2026
417faf0
vcf_sample_qc: auto-detect gzip from output path extension
mwiewior Mar 27, 2026
735474d
vcf_sample_qc: use noodles-bgzf for tabix-compatible BGZF output
mwiewior Mar 27, 2026
236c74a
vcf_sample_qc: fix O(N*V) linear scan and per-sample allocation in PL…
mwiewior Mar 27, 2026
0c1c39d
vcf_sample_qc: fix remaining performance bottlenecks (#3-8)
mwiewior Mar 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[workspace]
resolver = "2"
members = ["datafusion/bio-function-pileup", "datafusion/bio-function-ranges", "datafusion/bio-function-vep"]
members = ["datafusion/bio-function-pileup", "datafusion/bio-function-ranges", "datafusion/bio-function-vep", "datafusion/bio-function-vcftools", "datafusion/bio-function"]
exclude = ["datafusion/bio-function-ranges/superintervals"]

[workspace.package]
Expand All @@ -15,3 +15,7 @@ datafusion = { version = "50.3.0" }
tokio = { version = "1.43.0", features = ["rt-multi-thread", "rt", "macros"] }
futures = "0.3.31"
log = "0.4.27"
serial_test = "3.1"
tempfile = "3.15"
parquet = "56.1"
async-trait = "0.1.88"
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ This workspace provides a collection of Rust crates that implement DataFusion UD

| Crate | Description | Status |
|-------|-------------|--------|
| **[datafusion-bio-function](datafusion/bio-function)** | Unified bio `SessionContext` setup (`BioSessionExt`, `create_bio_session`) that wires ranges + vcftools optimizations | ✅ |
| **[datafusion-bio-function-pileup](datafusion/bio-function-pileup)** | Depth-of-coverage (pileup) computation from BAM alignments | ✅ |
| **[datafusion-bio-function-ranges](datafusion/bio-function-ranges)** | Interval join, coverage, count-overlaps, nearest-neighbor, overlap, merge, cluster, complement, and subtract operations | ✅ |
| **[datafusion-bio-function-vcftools](datafusion/bio-function-vcftools)** | Fused array transform logical/physical optimization for unnest-transform-array_agg patterns | ✅ |
| **[datafusion-bio-function-vep](datafusion/bio-function-vep)** | VEP variant annotation via `lookup_variants()` table function with parquet + Fjall KV cache backends | ✅ |

## Features
Expand Down Expand Up @@ -347,7 +349,7 @@ Both schemas include metadata key `bio.coordinate_system_zero_based` (`"true"` o
Create a bio-configured session and write standard SQL — range overlap joins are automatically optimized:

```rust
use datafusion_bio_function_ranges::create_bio_session;
use datafusion_bio_function::create_bio_session;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
Expand Down
33 changes: 18 additions & 15 deletions datafusion/bio-function-ranges/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ This crate provides optimized genomic interval operations as DataFusion extensio
## Quick Start

```rust
use datafusion_bio_function_ranges::{create_bio_session, register_ranges_functions};
use datafusion_bio_function::{create_bio_session, BioSessionExt};
use datafusion_bio_function_ranges::{BioConfig, register_ranges_functions};

// Option 1: Create a fully configured session (recommended)
let ctx = create_bio_session();

// Option 2: Register functions on an existing bio-configured session
use datafusion::config::ConfigOptions;
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_bio_function_ranges::{BioConfig, BioSessionExt};

let config = SessionConfig::from(ConfigOptions::new())
.with_option_extension(BioConfig::default())
Expand All @@ -47,18 +47,21 @@ use datafusion_bio_function_ranges::register_ranges_functions;
register_ranges_functions(&ctx);
```

### `create_bio_session()`
### Unified Bio Session (recommended)

Convenience function that creates a `SessionContext` with:
- Custom query planner for automatic interval join detection
- Physical optimizer rule that converts hash/nested-loop joins to interval joins
Use `datafusion-bio-function` to create a `SessionContext` with:
- interval join optimization (from ranges)
- fused array transform optimization (from vcftools)
- `BioConfig` extension for algorithm selection via `SET bio.*` statements
- All SQL table functions: `coverage()`, `count_overlaps()`, `nearest()`, `overlap()`, `merge()`, `cluster()`, `complement()`, `subtract()`

Then register ranges table functions from this crate.

```rust
use datafusion_bio_function_ranges::create_bio_session;
use datafusion_bio_function::create_bio_session;
use datafusion_bio_function_ranges::register_ranges_functions;

let ctx = create_bio_session();
register_ranges_functions(&ctx);
```

## SQL Table Functions
Expand Down Expand Up @@ -218,7 +221,7 @@ SELECT * FROM subtract('left_table', 'right_table', 'l_chr', 'l_s', 'l_e', 'r_ch

## Interval Join (SQL)

When using a bio-configured session (`create_bio_session()` or `BioSessionExt::new_with_bio()`), SQL joins with range overlap conditions are automatically optimized:
When using a bio-configured session (`datafusion_bio_function::create_bio_session()` or `BioSessionExt::new_with_bio()` from `datafusion-bio-function`), SQL joins with range overlap conditions are automatically optimized:

```sql
-- Automatically detected and optimized as interval join
Expand Down Expand Up @@ -287,10 +290,10 @@ This crate replaces the `sequila-core` crate from the [sequila-native](https://g

| sequila-native | datafusion-bio-function-ranges |
|----------------|-------------------------------|
| `sequila_core::session_context::SeQuiLaSessionExt` | `BioSessionExt` |
| `sequila_core::session_context::SeQuiLaSessionExt` | `datafusion_bio_function::BioSessionExt` |
| `sequila_core::session_context::SequilaConfig` | `BioConfig` |
| `sequila_core::session_context::Algorithm` | `Algorithm` |
| `SessionContext::new_with_sequila(config)` | `SessionContext::new_with_bio(config)` |
| `SessionContext::new_with_sequila(config)` | `SessionContext::new_with_bio(config)` (from `datafusion-bio-function`) |

### Configuration Namespace

Expand All @@ -317,18 +320,18 @@ let ctx = SessionContext::new_with_sequila(config);

**After (datafusion-bio-function-ranges):**
```rust
use datafusion_bio_function_ranges::{create_bio_session, register_ranges_functions};
use datafusion_bio_function::{create_bio_session, BioSessionExt};
use datafusion_bio_function_ranges::{BioConfig, register_ranges_functions};

// Simple: creates context with everything configured
let ctx = create_bio_session();
register_ranges_functions(&ctx);

// Or manually:
use datafusion_bio_function_ranges::{BioConfig, BioSessionExt};

let config = SessionConfig::from(options)
.with_option_extension(BioConfig::default());
let ctx = SessionContext::new_with_bio(config);
register_ranges_functions(&ctx); // registers coverage() and count_overlaps() UDTFs
register_ranges_functions(&ctx); // registers ranges table functions
```

### New SQL Table Functions
Expand Down
77 changes: 77 additions & 0 deletions datafusion/bio-function-ranges/src/algorithms.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
use datafusion::config::ConfigField;
use std::str::FromStr;

/// Selects which interval-join algorithm implementation is used.
///
/// A value can be parsed from its variant name, ignoring letter case, through the
/// `FromStr` implementation in this module, and is printed with the exact variant
/// spelling through `Display`. [`Algorithm::Coitrees`] is the `Default`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum Algorithm {
    /// The default algorithm.
    #[default]
    Coitrees,
    /// Parsed from `"intervaltree"`.
    IntervalTree,
    /// Parsed from `"arrayintervaltree"`.
    ArrayIntervalTree,
    /// Parsed from `"lapper"`.
    Lapper,
    /// Parsed from `"superintervals"`.
    SuperIntervals,
    /// Parsed from `"coitreesnearest"`.
    CoitreesNearest,
    /// Parsed from `"coitreescountoverlaps"`.
    CoitreesCountOverlaps,
}

/// Error produced when a string cannot be parsed into an `Algorithm`.
///
/// The wrapped `String` is the complete, human-readable message; `Display`
/// emits it verbatim.
#[derive(Debug)]
pub struct ParseAlgorithmError(String);

impl std::error::Error for ParseAlgorithmError {}

impl std::fmt::Display for ParseAlgorithmError {
    /// Writes the stored message as-is, without applying any padding or
    /// alignment requested by the caller's format spec.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(message) = self;
        out.write_str(message)
    }
}

impl FromStr for Algorithm {
    type Err = ParseAlgorithmError;

    /// Parses an algorithm from its variant name, ignoring letter case.
    ///
    /// The input is lowercased and compared against the lowercase spelling of
    /// every variant; nothing else (such as surrounding whitespace) is stripped.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseAlgorithmError`] quoting the original, un-lowercased
    /// input when it matches no variant.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let normalized = s.to_lowercase();
        let recognized = match normalized.as_str() {
            "coitrees" => Some(Self::Coitrees),
            "intervaltree" => Some(Self::IntervalTree),
            "arrayintervaltree" => Some(Self::ArrayIntervalTree),
            "lapper" => Some(Self::Lapper),
            "superintervals" => Some(Self::SuperIntervals),
            "coitreesnearest" => Some(Self::CoitreesNearest),
            "coitreescountoverlaps" => Some(Self::CoitreesCountOverlaps),
            _ => None,
        };
        recognized.ok_or_else(|| ParseAlgorithmError(format!("Can't parse '{s}' as Algorithm")))
    }
}

impl std::fmt::Display for Algorithm {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let val = match self {
Algorithm::Coitrees => "Coitrees",
Algorithm::IntervalTree => "IntervalTree",
Algorithm::ArrayIntervalTree => "ArrayIntervalTree",
Algorithm::Lapper => "Lapper",
Algorithm::SuperIntervals => "SuperIntervals",
Algorithm::CoitreesNearest => "CoitreesNearest",
Algorithm::CoitreesCountOverlaps => "CoitreesCountOverlaps",
};
write!(f, "{val}")
}
}

impl From<ParseAlgorithmError> for datafusion::error::DataFusionError {
    /// Wraps the parse failure in the `External` variant so that `?` can be
    /// used on `Algorithm` parsing inside functions returning a DataFusion result.
    fn from(parse_err: ParseAlgorithmError) -> Self {
        Self::External(Box::new(parse_err))
    }
}

impl ConfigField for Algorithm {
    /// Overwrites `self` with the algorithm named by `value`.
    ///
    /// The key is not inspected. `self` is left untouched when parsing fails.
    ///
    /// # Errors
    ///
    /// Propagates the parse failure, converted into a DataFusion error.
    fn set(&mut self, _key: &str, value: &str) -> datafusion::common::Result<()> {
        let parsed: Algorithm = value.parse()?;
        *self = parsed;
        Ok(())
    }

    /// Reports the current value to `visitor` under `name` with description `doc`.
    fn visit<V>(&self, visitor: &mut V, name: &str, doc: &'static str)
    where
        V: datafusion::config::Visit,
    {
        visitor.some(name, self, doc)
    }
}
16 changes: 16 additions & 0 deletions datafusion/bio-function-ranges/src/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
use datafusion::common::extensions_options;
use datafusion::config::ConfigExtension;

use crate::algorithms::Algorithm;

// Configuration extension struct for the bio functions, declared through
// DataFusion's `extensions_options!` macro. It defines three options:
//   * `prefer_interval_join`     - bool, defaults to `true`
//   * `interval_join_algorithm`  - `Algorithm`, defaults to `Algorithm::default()`
//   * `interval_join_low_memory` - bool, defaults to `false`
// NOTE(review): how the planner consumes these options is not visible in this
// file - confirm against the physical planner before documenting their semantics.
extensions_options! {
pub struct BioConfig {
pub prefer_interval_join: bool, default = true
pub interval_join_algorithm: Algorithm, default = Algorithm::default()
pub interval_join_low_memory: bool, default = false
}
}

// Registers `BioConfig` as a DataFusion config extension.
impl ConfigExtension for BioConfig {
// Namespace prefix for this extension's option keys (the `bio` in `SET bio.*`).
const PREFIX: &'static str = "bio";
}
10 changes: 7 additions & 3 deletions datafusion/bio-function-ranges/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
pub mod algorithms;
pub mod array_utils;
pub mod cluster;
pub mod complement;
pub mod config;
pub mod count_overlaps;
pub mod filter_op;
pub mod grouped_stream;
Expand All @@ -10,21 +12,23 @@ pub mod nearest;
pub mod nearest_index;
pub mod overlap;
pub mod physical_planner;
pub mod session_context;
pub mod session_builder;
pub mod subtract;
pub mod table_function;

// Re-export key types
pub use algorithms::Algorithm;
pub use cluster::ClusterProvider;
pub use complement::ComplementProvider;
pub use config::BioConfig;
pub use count_overlaps::CountOverlapsProvider;
pub use filter_op::FilterOp;
pub use merge::MergeProvider;
pub use nearest::NearestProvider;
pub use overlap::OverlapProvider;
pub use physical_planner::BioQueryPlanner;
pub use physical_planner::IntervalJoinPhysicalOptimizationRule;
pub use physical_planner::RangesQueryPlanner;
pub use physical_planner::joins::interval_join::IntervalJoinExec;
pub use session_context::{Algorithm, BioConfig, BioSessionExt, create_bio_session};
pub use session_builder::{create_ranges_session, create_ranges_session_with_config};
pub use subtract::SubtractProvider;
pub use table_function::register_ranges_functions;
Loading