Skip to content

Commit 674cc10

Browse files
authored
feat: add ExpLinkedList (#3721)
This data structure is useful when we need to append a lot of values and finally iterate over all of them. Compared to `Vec<T>`, this data structure doesn't waste much memory like `Vec<T>`, and doesn't need to copy the data when recap. This is introduced for FTS indexing process, will apply this in next PR --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
1 parent 68ca98f commit 674cc10

3 files changed

Lines changed: 293 additions & 0 deletions

File tree

rust/lance-core/src/container.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright The Lance Authors
3+
4+
pub mod list;
Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright The Lance Authors
3+
4+
use std::collections::LinkedList;
5+
6+
use deepsize::DeepSizeOf;
7+
8+
/// A linked list that grows exponentially. It is used to store a large number of
9+
/// elements in a memory-efficient way. The list grows by doubling the capacity of
10+
/// the last element when it's full, the capacity can be limited by the `limit`
11+
/// parameter. The default value is 0, which means no limit.
12+
#[derive(Debug, Clone, Default)]
13+
pub struct ExpLinkedList<T> {
14+
inner: LinkedList<Vec<T>>,
15+
len: usize,
16+
// The maximum capacity of single node in the list.
17+
// If the limit is 0, there is no limit.
18+
// We use u16 to save memory because ExpLinkedList should not
19+
// be used if the limit is that large.
20+
limit: u16,
21+
}
22+
23+
impl<T> ExpLinkedList<T> {
24+
/// Creates a new empty `ExpLinkedList`.
25+
pub fn new() -> Self {
26+
Self {
27+
inner: LinkedList::new(),
28+
len: 0,
29+
limit: 0,
30+
}
31+
}
32+
33+
pub fn with_capacity(capacity: usize) -> Self {
34+
let mut inner = LinkedList::new();
35+
inner.push_back(Vec::with_capacity(capacity));
36+
Self {
37+
inner,
38+
len: 0,
39+
limit: 0,
40+
}
41+
}
42+
43+
/// Creates a new `ExpLinkedList` with a specified capacity limit.
44+
/// The limit is the maximum capacity of a single node in the list.
45+
/// If the limit is 0, there is no limit.
46+
pub fn with_capacity_limit(limit: u16) -> Self {
47+
Self {
48+
inner: LinkedList::new(),
49+
len: 0,
50+
limit,
51+
}
52+
}
53+
54+
/// Pushes a new element into the list. If the last element in the list
55+
/// reaches its capacity, a new node is created with double capacity.
56+
pub fn push(&mut self, v: T) {
57+
match self.inner.back() {
58+
Some(last) => {
59+
if last.len() == last.capacity() {
60+
let new_cap = if self.limit > 0 && last.capacity() * 2 >= self.limit as usize {
61+
self.limit as usize
62+
} else {
63+
last.capacity() * 2
64+
};
65+
self.inner.push_back(Vec::with_capacity(new_cap));
66+
}
67+
}
68+
None => {
69+
self.inner.push_back(Vec::with_capacity(1));
70+
}
71+
}
72+
self.do_push(v);
73+
}
74+
75+
fn do_push(&mut self, v: T) {
76+
self.inner.back_mut().unwrap().push(v);
77+
self.len += 1;
78+
}
79+
80+
/// Removes the last element from the list.
81+
pub fn pop(&mut self) -> Option<T> {
82+
match self.inner.back_mut() {
83+
Some(last) => {
84+
if last.is_empty() {
85+
self.inner.pop_back();
86+
self.pop()
87+
} else {
88+
self.len -= 1;
89+
last.pop()
90+
}
91+
}
92+
None => None,
93+
}
94+
}
95+
96+
/// Clears the list, removing all elements.
97+
/// This will free the memory used by the list.
98+
pub fn clear(&mut self) {
99+
self.inner.clear();
100+
self.len = 0;
101+
}
102+
103+
/// Returns the number of elements in the list.
104+
pub fn len(&self) -> usize {
105+
self.len
106+
}
107+
108+
/// Returns whether the list is empty.
109+
pub fn is_empty(&self) -> bool {
110+
self.inner.is_empty()
111+
}
112+
113+
/// Returns the size of list, including the size of the elements and the
114+
/// size of the list itself, and the unused space.
115+
/// The element size is calculated using `std::mem::size_of::<T>()`,
116+
/// so it is not accurate for all types.
117+
/// For example, for `String`, it will return the size of the pointer,
118+
/// not the size of the string itself. For that you need to use `DeepSizeOf`.
119+
pub fn size(&self) -> usize {
120+
let unused_space = match self.inner.back() {
121+
Some(last) => last.capacity() - last.len(),
122+
None => 0,
123+
};
124+
(self.len() + unused_space) * std::mem::size_of::<T>()
125+
+ std::mem::size_of::<Self>()
126+
+ self.inner.len() * std::mem::size_of::<Vec<T>>()
127+
}
128+
129+
/// Returns an iterator over the elements in the list.
130+
pub fn iter(&self) -> ExpLinkedListIter<'_, T> {
131+
ExpLinkedListIter::new(self)
132+
}
133+
}
134+
135+
impl<T: DeepSizeOf> DeepSizeOf for ExpLinkedList<T> {
136+
fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize {
137+
self.inner
138+
.iter()
139+
.map(|v| v.deep_size_of_children(context))
140+
.sum()
141+
}
142+
}
143+
144+
impl<T> FromIterator<T> for ExpLinkedList<T> {
145+
fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
146+
let iter = iter.into_iter();
147+
let size_hint = iter.size_hint().0;
148+
let cap = if size_hint > 0 { size_hint } else { 1 };
149+
let mut list = Self::with_capacity(cap);
150+
for item in iter {
151+
list.push(item);
152+
}
153+
list
154+
}
155+
}
156+
157+
pub struct ExpLinkedListIter<'a, T> {
158+
inner: std::collections::linked_list::Iter<'a, Vec<T>>,
159+
inner_iter: Option<std::slice::Iter<'a, T>>,
160+
}
161+
162+
impl<'a, T> ExpLinkedListIter<'a, T> {
163+
pub fn new(inner: &'a ExpLinkedList<T>) -> Self {
164+
Self {
165+
inner: inner.inner.iter(),
166+
inner_iter: None,
167+
}
168+
}
169+
}
170+
171+
impl<'a, T> Iterator for ExpLinkedListIter<'a, T> {
172+
type Item = &'a T;
173+
174+
fn next(&mut self) -> Option<Self::Item> {
175+
if let Some(inner_iter) = &mut self.inner_iter {
176+
if let Some(v) = inner_iter.next() {
177+
return Some(v);
178+
}
179+
}
180+
if let Some(inner) = self.inner.next() {
181+
self.inner_iter = Some(inner.iter());
182+
return self.next();
183+
}
184+
None
185+
}
186+
}
187+
188+
pub struct ExpLinkedListIntoIter<T> {
189+
inner: std::collections::linked_list::IntoIter<Vec<T>>,
190+
inner_iter: Option<std::vec::IntoIter<T>>,
191+
len: usize,
192+
}
193+
194+
impl<T> ExpLinkedListIntoIter<T> {
195+
pub fn new(list: ExpLinkedList<T>) -> Self {
196+
let len = list.len();
197+
Self {
198+
inner: list.inner.into_iter(),
199+
inner_iter: None,
200+
len,
201+
}
202+
}
203+
}
204+
205+
impl<T> Iterator for ExpLinkedListIntoIter<T> {
206+
type Item = T;
207+
208+
fn next(&mut self) -> Option<Self::Item> {
209+
if let Some(inner_iter) = &mut self.inner_iter {
210+
if let Some(v) = inner_iter.next() {
211+
return Some(v);
212+
}
213+
}
214+
if let Some(inner) = self.inner.next() {
215+
self.inner_iter = Some(inner.into_iter());
216+
return self.next();
217+
}
218+
None
219+
}
220+
221+
fn size_hint(&self) -> (usize, Option<usize>) {
222+
(self.len, Some(self.len))
223+
}
224+
}
225+
226+
impl<T> IntoIterator for ExpLinkedList<T> {
227+
type Item = T;
228+
type IntoIter = ExpLinkedListIntoIter<T>;
229+
230+
fn into_iter(self) -> Self::IntoIter {
231+
ExpLinkedListIntoIter::new(self)
232+
}
233+
}
234+
235+
#[cfg(test)]
236+
mod tests {
237+
use super::*;
238+
239+
fn test_exp_linked_list(list: &mut ExpLinkedList<usize>) {
240+
assert_eq!(list.len(), 100);
241+
assert!(!list.is_empty());
242+
243+
// removes the last 50 elements
244+
for i in 0..50 {
245+
assert_eq!(list.pop(), Some(99 - i));
246+
}
247+
assert_eq!(list.len(), 50);
248+
assert!(!list.is_empty());
249+
250+
// iterate over the list
251+
for (i, v) in list.iter().enumerate() {
252+
assert_eq!(*v, i);
253+
}
254+
255+
// clear the list
256+
list.clear();
257+
assert_eq!(list.len(), 0);
258+
assert!(list.is_empty());
259+
assert_eq!(list.pop(), None);
260+
}
261+
262+
#[test]
263+
fn test_exp_linked_list_basic() {
264+
let mut list = ExpLinkedList::new();
265+
for i in 0..100 {
266+
list.push(i);
267+
assert_eq!(list.len(), i + 1);
268+
}
269+
test_exp_linked_list(&mut list);
270+
}
271+
272+
#[test]
273+
fn test_exp_linked_list_from() {
274+
let mut list = (0..100).collect();
275+
test_exp_linked_list(&mut list);
276+
}
277+
278+
#[test]
279+
fn test_exp_linked_list_with_capacity_limit() {
280+
let mut list = ExpLinkedList::with_capacity_limit(10);
281+
for i in 0..100 {
282+
list.push(i);
283+
assert_eq!(list.len(), i + 1);
284+
}
285+
assert_eq!(list.inner.back().unwrap().capacity(), 10);
286+
test_exp_linked_list(&mut list);
287+
}
288+
}

rust/lance-core/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
use arrow_schema::{DataType, Field as ArrowField};
55

66
pub mod cache;
7+
pub mod container;
78
pub mod datatypes;
89
pub mod error;
910
pub mod traits;

0 commit comments

Comments
 (0)