|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +//! Built-in [`RelationPlanner`] for `TABLESAMPLE SYSTEM(p%)`. |
| 19 | +//! |
| 20 | +//! Auto-registered via [`SessionStateDefaults::default_relation_planners`] |
| 21 | +//! so SQL `TABLESAMPLE SYSTEM (10) [REPEATABLE (n)]` works out of the |
| 22 | +//! box on any default `SessionContext`. Other `TABLESAMPLE` flavours |
| 23 | +//! (`BERNOULLI`, `ROW`, `BUCKET ... OUT OF ...`, `OFFSET`) are rejected |
| 24 | +//! at planning time — implementing those is left to a downstream |
| 25 | +//! `RelationPlanner` (see `datafusion-examples/examples/relation_planner/`). |
| 26 | +//! |
| 27 | +//! `SessionStateBuilder::register_relation_planner` inserts new planners |
| 28 | +//! at the front of the chain, so a downstream planner that returns |
| 29 | +//! `Planned` for the same `TABLESAMPLE` syntax wins. Returning |
| 30 | +//! `Original` falls through to this default. |
| 31 | +//! |
| 32 | +//! [`SessionStateDefaults::default_relation_planners`]: ../../datafusion/execution/session_state/struct.SessionStateDefaults.html |
| 33 | +
|
| 34 | +use std::sync::Arc; |
| 35 | + |
| 36 | +use datafusion_common::{Result, not_impl_err, plan_datafusion_err, plan_err}; |
| 37 | +use datafusion_expr::logical_plan::sample::{SampleMethod, sample_plan}; |
| 38 | +use datafusion_expr::planner::{ |
| 39 | + PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning, |
| 40 | +}; |
| 41 | +use sqlparser::ast::{ |
| 42 | + self, TableFactor, TableSampleKind, TableSampleMethod, TableSampleUnit, |
| 43 | +}; |
| 44 | + |
/// Default [`RelationPlanner`] for `TABLESAMPLE SYSTEM(p%)`, optionally
/// followed by `REPEATABLE(seed)`.
///
/// A matching table reference is planned without its sample clause and
/// the result is wrapped in the core
/// [`Sample`](datafusion_expr::logical_plan::sample::Sample) extension
/// node, which lets the `SamplePushdown` optimizer rule absorb the
/// sample into the scan.
///
/// `BLOCK` and an omitted method are accepted as equivalent to `SYSTEM`.
/// `BUCKET`, `OFFSET`, row-count quantities, and any other sampling
/// method are refused with a `not_impl_err`; a missing, non-literal, or
/// unparsable quantity or seed is reported as a planning error.
///
/// To support `BERNOULLI`, row counts, or `BUCKET`, register your own
/// `RelationPlanner` ahead of this one — `register_relation_planner`
/// pushes to the front and the first `Planned` wins.
#[derive(Debug, Default)]
pub struct TableSampleSystemPlanner;
| 57 | + |
| 58 | +impl RelationPlanner for TableSampleSystemPlanner { |
| 59 | + fn plan_relation( |
| 60 | + &self, |
| 61 | + relation: TableFactor, |
| 62 | + context: &mut dyn RelationPlannerContext, |
| 63 | + ) -> Result<RelationPlanning> { |
| 64 | + // Only act on Table relations carrying a `TABLESAMPLE` clause. |
| 65 | + // Everything else (derived, function, unnest, join) falls |
| 66 | + // through to the next planner / DataFusion's default logic. |
| 67 | + let TableFactor::Table { |
| 68 | + sample: Some(sample), |
| 69 | + alias, |
| 70 | + name, |
| 71 | + args, |
| 72 | + with_hints, |
| 73 | + version, |
| 74 | + with_ordinality, |
| 75 | + partitions, |
| 76 | + json_path, |
| 77 | + index_hints, |
| 78 | + } = relation |
| 79 | + else { |
| 80 | + return Ok(RelationPlanning::Original(Box::new(relation))); |
| 81 | + }; |
| 82 | + |
| 83 | + let ts = match sample { |
| 84 | + TableSampleKind::BeforeTableAlias(s) |
| 85 | + | TableSampleKind::AfterTableAlias(s) => *s, |
| 86 | + }; |
| 87 | + |
| 88 | + if ts.bucket.is_some() { |
| 89 | + return not_impl_err!( |
| 90 | + "TABLESAMPLE BUCKET is not supported (only SYSTEM PERCENT). \ |
| 91 | + Register a custom RelationPlanner before the built-in \ |
| 92 | + TableSampleSystemPlanner to handle other forms." |
| 93 | + ); |
| 94 | + } |
| 95 | + if ts.offset.is_some() { |
| 96 | + return not_impl_err!( |
| 97 | + "TABLESAMPLE OFFSET is not supported (only SYSTEM PERCENT)" |
| 98 | + ); |
| 99 | + } |
| 100 | + match ts.name { |
| 101 | + // The built-in planner only handles SYSTEM (and BLOCK as an |
| 102 | + // alias for SYSTEM, matching Hive). Anything else is a |
| 103 | + // semantics commitment we don't want to make in core. |
| 104 | + Some(TableSampleMethod::System) | Some(TableSampleMethod::Block) | None => {} |
| 105 | + Some(other) => { |
| 106 | + return not_impl_err!( |
| 107 | + "TABLESAMPLE method {other} is not supported (only SYSTEM). \ |
| 108 | + Register a custom RelationPlanner before the built-in \ |
| 109 | + TableSampleSystemPlanner to handle other methods." |
| 110 | + ); |
| 111 | + } |
| 112 | + } |
| 113 | + |
| 114 | + let quantity = ts.quantity.ok_or_else(|| { |
| 115 | + plan_datafusion_err!("TABLESAMPLE without a quantity is not supported") |
| 116 | + })?; |
| 117 | + let raw = match &quantity.value { |
| 118 | + ast::Expr::Value(vs) => match &vs.value { |
| 119 | + ast::Value::Number(n, _) => n.parse::<f64>().map_err(|_| { |
| 120 | + plan_datafusion_err!("invalid TABLESAMPLE quantity: {n}") |
| 121 | + })?, |
| 122 | + v => return plan_err!("TABLESAMPLE quantity must be numeric; got {v:?}"), |
| 123 | + }, |
| 124 | + other => { |
| 125 | + return plan_err!("TABLESAMPLE quantity must be a literal; got {other}"); |
| 126 | + } |
| 127 | + }; |
| 128 | + let fraction = match quantity.unit { |
| 129 | + Some(TableSampleUnit::Percent) | None => raw / 100.0, |
| 130 | + Some(TableSampleUnit::Rows) => { |
| 131 | + return not_impl_err!( |
| 132 | + "TABLESAMPLE with ROWS count is not supported (only SYSTEM PERCENT)" |
| 133 | + ); |
| 134 | + } |
| 135 | + }; |
| 136 | + |
| 137 | + let seed = ts |
| 138 | + .seed |
| 139 | + .map(|s| match s.value { |
| 140 | + ast::Value::Number(n, _) => n |
| 141 | + .parse::<u64>() |
| 142 | + .map_err(|_| plan_datafusion_err!("invalid REPEATABLE seed: {n}")), |
| 143 | + v => Err(plan_datafusion_err!( |
| 144 | + "REPEATABLE seed must be an integer; got {v:?}" |
| 145 | + )), |
| 146 | + }) |
| 147 | + .transpose()?; |
| 148 | + |
| 149 | + // Replan the bare table without the sample clause, then wrap |
| 150 | + // the resulting plan in a `Sample` extension node. |
| 151 | + let bare = TableFactor::Table { |
| 152 | + sample: None, |
| 153 | + alias: alias.clone(), |
| 154 | + name, |
| 155 | + args, |
| 156 | + with_hints, |
| 157 | + version, |
| 158 | + with_ordinality, |
| 159 | + partitions, |
| 160 | + json_path, |
| 161 | + index_hints, |
| 162 | + }; |
| 163 | + let input = context.plan(bare)?; |
| 164 | + let plan = sample_plan(Arc::new(input), SampleMethod::System, fraction, seed)?; |
| 165 | + Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new( |
| 166 | + plan, alias, |
| 167 | + )))) |
| 168 | + } |
| 169 | +} |
0 commit comments