Skip to content

Commit 1d4d456

Browse files
committed
Add more tests and docs for heap size estimation
1 parent c8b784a commit 1d4d456

1 file changed

Lines changed: 207 additions & 5 deletions

File tree

datafusion/common/src/heap_size.rs

Lines changed: 207 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,30 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
//! Estimating the heap-allocated memory owned by a value.
19+
//!
20+
//! The [`DFHeapSize`] trait reports the number of bytes a value owns on the
21+
//! heap, **excluding** the stack size of the value itself.
22+
//!
23+
//! Implementations must pass the same [`DFHeapSizeCtx`] down through every
24+
//! nested call. The context records which allocations have already been measured
25+
//! so that shared allocations are only counted once.
26+
//!
27+
//! # Example
28+
//!
29+
//! ```
30+
//! use datafusion_common::heap_size::{DFHeapSize, DFHeapSizeCtx};
31+
//! use std::sync::Arc;
32+
//!
33+
//! let shared: Arc<String> = Arc::new("hello".to_string());
34+
//! let alias = Arc::clone(&shared);
35+
//!
36+
//! let mut ctx = DFHeapSizeCtx::default();
37+
//! // The shared allocation is counted once even when reached twice.
38+
//! let total = shared.heap_size(&mut ctx) + alias.heap_size(&mut ctx);
39+
//! assert_eq!(total, shared.heap_size(&mut DFHeapSizeCtx::default()));
40+
//! ```
41+
1842
use crate::stats::Precision;
1943
use crate::{ColumnStatistics, ScalarValue, Statistics, TableReference};
2044
use arrow::array::{
@@ -32,12 +56,15 @@ use std::collections::HashMap;
3256
use std::fmt::Debug;
3357
use std::sync::Arc;
3458

35-
/// This is a temporary solution until <https://github.com/apache/datafusion/pull/19599> and
36-
/// <https://github.com/apache/arrow-rs/pull/9138> are resolved.
37-
/// Trait for calculating the size of various containers
59+
/// Trait for computing how many bytes a value has allocated on the heap.
60+
///
61+
/// Implementations must pass the same [`DFHeapSizeCtx`] down through every
62+
/// nested call. The context records which allocations have already been measured
63+
/// so that shared allocations are only counted once.
64+
///
3865
pub trait DFHeapSize {
39-
/// Return the size of any bytes allocated on the heap by this object,
40-
/// including heap memory in those structures
66+
/// Return the number of bytes this value has allocated on the heap,
67+
/// including heap memory owned transitively by nested values.
4168
///
4269
/// Note that the size of the type itself is not included in the result --
4370
/// instead, that size is added by the caller (e.g. container).
@@ -521,6 +548,10 @@ impl DFHeapSize for usize {
521548
mod tests {
522549
use super::*;
523550

551+
fn size<T: DFHeapSize + ?Sized>(v: &T) -> usize {
552+
v.heap_size(&mut DFHeapSizeCtx::default())
553+
}
554+
524555
#[test]
525556
fn test_heap_size_arc_avoid_double_accounting() {
526557
let a1 = Arc::new(vec![1, 2, 3]);
@@ -558,4 +589,175 @@ mod tests {
558589

559590
assert_eq!(heap_size, heap_size_with_clones);
560591
}
592+
593+
/// A trait-object `Arc` reached twice through one context must be counted
/// exactly once.
#[test]
fn test_arc_dyn() {
    let original: Arc<dyn DFHeapSize> = Arc::new(String::from("hello"));
    let expected = size(&original);

    let alias = Arc::clone(&original);
    let mut shared_ctx = DFHeapSizeCtx::default();
    // Both handles are measured against the same context on purpose.
    let first_visit = original.heap_size(&mut shared_ctx);
    let second_visit = alias.heap_size(&mut shared_ctx);
    assert_eq!(expected, first_visit + second_visit);
}
603+
604+
/// Every fixed-width scalar type is expected to report zero heap bytes.
#[test]
fn test_primitives() {
    // Expands to one `assert_eq!(size(&value), 0)` per argument, in order.
    macro_rules! assert_no_heap {
        ($($value:expr),+ $(,)?) => {
            $( assert_eq!(size(&$value), 0); )+
        };
    }

    assert_no_heap!(true);
    assert_no_heap!(0u8, 0u16, 0u32, 0u64, 0usize);
    assert_no_heap!(0i8, 0i16, 0i32, 0i64, 0i128, i256::ZERO);
    assert_no_heap!(0f32, 0f64, f16::from_f32(0.0));
}
622+
623+
/// A `String` reports its allocated capacity (not its length), and an
/// unallocated `String` reports zero.
#[test]
fn test_string() {
    let mut s = String::with_capacity(32);
    s.push_str("hello");
    // `String::with_capacity` only guarantees *at least* the requested
    // capacity, so compare against the actual capacity rather than the
    // literal 32 to keep the test independent of allocator behavior.
    assert!(s.capacity() >= 32);
    assert_eq!(size(&s), s.capacity());

    let empty = String::new();
    assert_eq!(size(&empty), 0);
}
632+
633+
/// A non-empty `str` reports a positive size; the empty `str` reports zero.
#[test]
fn test_str() {
    let non_empty: &str = "hello";
    let measured = size(non_empty);
    assert!(measured > 0);

    let blank: &str = "";
    assert_eq!(size(blank), 0);
}
639+
640+
/// `Some(String)` reports the inner string's capacity; `None` reports zero.
#[test]
fn test_option() {
    let present: Option<String> = Some(String::from("hi"));
    let inner_capacity = present.as_ref().map(String::capacity).unwrap();
    assert_eq!(size(&present), inner_capacity);

    let missing: Option<String> = None;
    assert_eq!(size(&missing), 0);
}
648+
649+
/// Populated vectors report a positive size; a never-allocated vector
/// reports zero.
#[test]
fn test_vec() {
    let numbers: Vec<i32> = vec![1, 2, 3];
    let numbers_size = size(&numbers);
    assert!(numbers_size > 0);

    let words = vec![String::from("ab"), String::from("cdef")];
    let words_size = size(&words);
    assert!(words_size > 0);

    let unallocated: Vec<i32> = Vec::new();
    assert_eq!(size(&unallocated), 0);
}
660+
661+
/// A `Box` reports a positive size both for a plain integer payload and for
/// a payload that itself owns heap memory.
#[test]
fn test_box() {
    let boxed_int: Box<i32> = Box::new(42);
    let boxed_int_size = size(&boxed_int);
    assert!(boxed_int_size > 0);

    let boxed_string: Box<String> = Box::new(String::from("hello"));
    let boxed_string_size = size(&boxed_string);
    assert!(boxed_string_size > 0);
}
669+
670+
/// A tuple of primitives reports zero; a tuple of strings reports a
/// positive size.
#[test]
fn test_tuple() {
    let primitives_only = (1i32, 2i64);
    assert_eq!(size(&primitives_only), 0);

    let string_pair = (String::from("hello"), String::from("world"));
    let pair_size = size(&string_pair);
    assert!(pair_size > 0);
}
678+
679+
/// A freshly created map reports zero; a map holding one string entry
/// reports a positive size.
#[test]
fn test_hashmap() {
    let untouched: HashMap<i32, i32> = HashMap::new();
    assert_eq!(size(&untouched), 0);

    let mut populated: HashMap<String, String> = HashMap::new();
    populated.insert(String::from("key"), String::from("value"));

    let populated_size = size(&populated);
    assert!(populated_size > 0);
}
689+
690+
691+
/// No `Precision<usize>` variant owns heap memory.
#[test]
fn test_precision() {
    let variants: [Precision<usize>; 3] = [
        Precision::Exact(42),
        Precision::Inexact(99),
        Precision::Absent,
    ];

    for variant in &variants {
        assert_eq!(size(variant), 0);
    }
}
702+
703+
/// Scalars without an owned buffer report zero; a UTF-8 scalar reports the
/// byte length of its string.
#[test]
fn test_scalar_values() {
    let heapless = [
        ScalarValue::Null,
        ScalarValue::Int32(Some(42)),
        ScalarValue::Boolean(Some(true)),
        ScalarValue::Float64(None),
    ];
    for scalar in &heapless {
        assert_eq!(size(scalar), 0);
    }

    let text = ScalarValue::Utf8(Some(String::from("hello")));
    assert_eq!(size(&text), "hello".len());

    let null_text = ScalarValue::Utf8(None);
    assert_eq!(size(&null_text), 0);
}
716+
717+
/// Field-less `DataType` variants report zero heap bytes.
#[test]
fn test_data_type_primitive() {
    let flat_types = [
        DataType::Int32,
        DataType::Utf8,
        DataType::Boolean,
        DataType::Null,
    ];

    for data_type in &flat_types {
        assert_eq!(size(data_type), 0);
    }
}
724+
725+
/// A `DataType::List` wrapping an `Arc<Field>` reports a positive size.
#[test]
fn test_data_type_with_field() {
    let item_field = Field::new("item", DataType::Int32, true);
    let list_type = DataType::List(Arc::new(item_field));

    let list_size = size(&list_type);
    assert!(list_size > 0);
}
730+
731+
/// Both bare and fully qualified table references report a positive size.
#[test]
fn test_table_reference() {
    // Presumably this covers the `Arc<str>` overhead (two usize counts) plus
    // the bytes of "users"; the test only pins that the total is non-zero.
    let bare = TableReference::bare("users");
    let bare_size = size(&bare);
    assert!(bare_size > 0);

    let fully_qualified = TableReference::full("cat", "schema", "users");
    let qualified_size = size(&fully_qualified);
    assert!(qualified_size > 0);
}
739+
740+
/// Column statistics report the bytes of their string bounds, and a
/// `Statistics` value holding such a column reports a positive size.
#[test]
fn test_column_statistics() {
    // Wraps `text` as an exact UTF-8 scalar bound.
    let exact_utf8 = |text: &str| Precision::Exact(ScalarValue::Utf8(Some(text.into())));

    let mut bounded = ColumnStatistics::new_unknown();
    bounded.max_value = exact_utf8("hello");
    bounded.min_value = exact_utf8("ab");
    assert_eq!(size(&bounded), "hello".len() + "ab".len());

    let mut max_only = ColumnStatistics::new_unknown();
    max_only.max_value = exact_utf8("hello");
    let table_stats = Statistics {
        num_rows: Precision::Exact(10),
        total_byte_size: Precision::Absent,
        column_statistics: vec![max_only],
    };
    let stats_size = size(&table_stats);
    assert!(stats_size > 0);
}
756+
757+
/// A `Field` reports at least as many bytes as its name.
#[test]
fn test_field() {
    let name = "temperature";
    let field = Field::new(name, DataType::Float64, true);

    let measured = size(&field);
    assert!(measured >= name.len());
}
762+
561763
}

0 commit comments

Comments
 (0)