Skip to content

Commit 92962a4

Browse files
authored
simplify projection expression for SELECT * in duckdb (#7885)
In case we have a SELECT * with possible virtual columns, use root() and not select(names, root()) in duckdb Signed-off-by: Mikhail Kot <to@myrrc.dev>
1 parent e160125 commit 92962a4

3 files changed

Lines changed: 142 additions & 28 deletions

File tree

vortex-duckdb/src/datasource.rs

Lines changed: 125 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -594,8 +594,6 @@ struct ProjectionWithVirtualColumns {
594594
file_row_number_column_pos: Option<usize>,
595595
}
596596

597-
/// Creates a projection expression from raw projection/column ID slices and
598-
/// column names.
599597
fn extract_projection_expr(
600598
projection_ids: Option<&[u64]>,
601599
column_ids: &[u64],
@@ -610,33 +608,62 @@ fn extract_projection_expr(
610608

611609
let mut file_index_column_pos = None;
612610
let mut file_row_number_column_pos = None;
611+
let mut is_star = true;
612+
let mut real_column_count = 0;
613+
614+
// DuckDB uses u64 as column indices but Rust uses usize
615+
for (column_pos, &column_id) in ids.iter().enumerate() {
616+
let column_id = if has_projection_ids {
617+
let column_id: usize = column_id.as_();
618+
column_ids[column_id]
619+
} else {
620+
column_id
621+
};
613622

614-
#[expect(clippy::cast_possible_truncation)]
615-
let names = ids
616-
.iter()
617-
.enumerate()
618-
.map(|(column_pos, &column_id)| {
619-
let column_id = if has_projection_ids {
620-
column_ids[column_id as usize]
621-
} else {
622-
column_id
623-
};
623+
if column_id == FILE_INDEX_COLUMN_IDX {
624+
file_index_column_pos = Some(column_pos);
625+
continue;
626+
}
627+
if column_id == FILE_ROW_NUMBER_COLUMN_IDX {
628+
file_row_number_column_pos = Some(column_pos);
629+
continue;
630+
}
624631

625-
if column_id == FILE_INDEX_COLUMN_IDX {
626-
file_index_column_pos = Some(column_pos);
627-
}
628-
if column_id == FILE_ROW_NUMBER_COLUMN_IDX {
629-
file_row_number_column_pos = Some(column_pos);
630-
}
632+
// In SELECT * DuckDB requests all columns from 0 to column_fields in
633+
// increasing order. After removing virtual columns, compare column_id
634+
// with (0..column_fields.len()) range.
635+
is_star &= column_id == real_column_count;
636+
real_column_count += 1;
637+
}
638+
// Duckdb can request less columns than there are in table i.e. [0, 1] with
639+
// 5 columns total.
640+
is_star &= real_column_count == column_fields.len() as u64;
631641

632-
column_id
633-
})
634-
.filter(|&col_id| !is_virtual_column(col_id))
635-
.map(|col_id| Arc::from(column_fields[col_id as usize].name.as_str()))
636-
.collect::<FieldNames>();
642+
let select = if is_star {
643+
root()
644+
} else {
645+
let names = ids
646+
.iter()
647+
.map(|&column_id| {
648+
if has_projection_ids {
649+
let column_id: usize = column_id.as_();
650+
column_ids[column_id]
651+
} else {
652+
column_id
653+
}
654+
})
655+
.filter(|&col_id| !is_virtual_column(col_id))
656+
.map(|column_id| {
657+
let column_id: usize = column_id.as_();
658+
Arc::from(column_fields[column_id].name.as_str())
659+
})
660+
.collect::<FieldNames>();
661+
662+
select(names, root())
663+
};
637664

638665
// file_index column will be filled later when exporting the chunk.
639-
let select = select(names, root());
666+
640667
let projection = if file_row_number_column_pos.is_some() {
641668
// row_idx will be rearranged to correct position in scan(), prepend
642669
// here
@@ -723,7 +750,20 @@ mod tests {
723750
use std::sync::atomic::AtomicU64;
724751
use std::sync::atomic::Ordering::Relaxed;
725752

753+
use vortex::dtype::DType;
754+
use vortex::dtype::PType;
755+
use vortex::expr::cast;
756+
use vortex::expr::merge;
757+
use vortex::expr::pack;
758+
use vortex::expr::root;
759+
use vortex::layout::layouts::row_idx::row_idx;
760+
726761
use super::progress;
762+
use crate::datasource::DuckdbField;
763+
use crate::datasource::FILE_INDEX_COLUMN_IDX;
764+
use crate::datasource::FILE_ROW_NUMBER_COLUMN_IDX;
765+
use crate::datasource::extract_projection_expr;
766+
use crate::duckdb::LogicalType;
727767

728768
#[test]
729769
fn test_table_scan_progress() {
@@ -738,4 +778,65 @@ mod tests {
738778
bytes_total.fetch_add(100, Relaxed);
739779
assert!((progress(&bytes_read, &bytes_total) - 50.).abs() < f64::EPSILON);
740780
}
781+
782+
#[test]
783+
fn test_select_star() {
784+
let ids = [0, 1, 2];
785+
let fields = [
786+
DuckdbField {
787+
name: "".to_owned(),
788+
logical_type: LogicalType::null(),
789+
dtype: DType::Null,
790+
},
791+
DuckdbField {
792+
name: "".to_owned(),
793+
logical_type: LogicalType::null(),
794+
dtype: DType::Null,
795+
},
796+
DuckdbField {
797+
name: "".to_owned(),
798+
logical_type: LogicalType::null(),
799+
dtype: DType::Null,
800+
},
801+
];
802+
803+
assert_eq!(
804+
extract_projection_expr(None, &ids, &fields).projection,
805+
root()
806+
);
807+
808+
let ids = [FILE_ROW_NUMBER_COLUMN_IDX, 0, 1, FILE_INDEX_COLUMN_IDX, 2];
809+
let exprs = extract_projection_expr(None, &ids, &fields);
810+
let row_idx = cast(row_idx(), DType::Primitive(PType::I64, false.into()));
811+
let row_idx_struct = pack([("file_row_number", row_idx)], false.into());
812+
let root_with_virtual_cols = merge([row_idx_struct, root()]);
813+
814+
assert_eq!(exprs.projection, root_with_virtual_cols);
815+
assert_eq!(exprs.file_index_column_pos, Some(3));
816+
assert_eq!(exprs.file_row_number_column_pos, Some(0));
817+
818+
// projections can't be set in SELECT *.
819+
assert_ne!(
820+
extract_projection_expr(Some(&[0, 1]), &ids, &fields).projection,
821+
root()
822+
);
823+
824+
let ids = [0, 1];
825+
assert_ne!(
826+
extract_projection_expr(None, &ids, &fields).projection,
827+
root()
828+
);
829+
830+
let ids = [0, 2, 2];
831+
assert_ne!(
832+
extract_projection_expr(None, &ids, &fields).projection,
833+
root()
834+
);
835+
836+
let ids = [2, 1, 0];
837+
assert_ne!(
838+
extract_projection_expr(None, &ids, &fields).projection,
839+
root()
840+
);
841+
}
741842
}

vortex-duckdb/src/duckdb/table_function/init.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,10 @@ impl<'a, T: TableFunction> TableInitInput<'a, T> {
8484
}
8585

8686
pub fn projection_ids(&self) -> Option<&[u64]> {
87-
if self.input.projection_ids.is_null() {
87+
// Passed pointer is std::vector's .data(). However, C++ doesn't
88+
// guarantee an empty vector's pointer is nullptr so we need to check
89+
// both conditions
90+
if self.input.projection_ids.is_null() || self.input.projection_ids_count == 0 {
8891
return None;
8992
}
9093
Some(unsafe {

vortex-sqllogictest/slt/duckdb/file_index.slt

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ SELECT file_index, str FROM '$__TEST_DIR__/file_index_2.vortex';
2727
0 3
2828

2929
query TB
30-
SELECT *, file_index < 2 FROM '$__TEST_DIR__/*.vortex'
30+
SELECT *, file_index < 2 FROM '$__TEST_DIR__/file_index_*.vortex'
3131
ORDER BY str;
3232
----
3333
1 true
@@ -36,14 +36,24 @@ ORDER BY str;
3636
Hello true
3737
Hi true
3838

39+
query BT
40+
SELECT file_index < 2, * FROM '$__TEST_DIR__/file_index_*.vortex'
41+
ORDER BY str;
42+
----
43+
true 1
44+
true 2
45+
true 3
46+
true Hello
47+
true Hi
48+
3949
query IB
40-
SELECT count(*) AS cnt, sum(file_index) <= 3 FROM '$__TEST_DIR__/*.vortex'
50+
SELECT count(*) AS cnt, sum(file_index) <= 3 FROM '$__TEST_DIR__/file_index_*.vortex'
4151
ORDER BY cnt;
4252
----
4353
5 true
4454

4555
query B
46-
SELECT file_index < 2 FROM '$__TEST_DIR__/*.vortex'
56+
SELECT file_index < 2 FROM '$__TEST_DIR__/file_index_*.vortex'
4757
WHERE len(str) > 1;
4858
----
4959
true

0 commit comments

Comments
 (0)