-
Notifications
You must be signed in to change notification settings - Fork 56
Expand file tree
/
Copy pathfreelist.rs
More file actions
291 lines (269 loc) · 11.5 KB
/
Copy pathfreelist.rs
File metadata and controls
291 lines (269 loc) · 11.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
//! Persisted free-page list (SQLR-6).
//!
//! After `DROP TABLE` / `DROP INDEX` / `ALTER TABLE DROP COLUMN`, the next
//! `save_database` no longer references the dropped object's pages. Without
//! a freelist those pages would either be re-numbered (every later table
//! shifts down → high write amplification) or stranded as orphans on disk.
//!
//! The freelist solves both: pages no longer referenced from `sqlrite_master`
//! go on a persisted stack rooted at `header.freelist_head`, so subsequent
//! saves can pull from there before extending the file. The unrelated tables
//! that didn't change keep their page numbers and their pages re-stage byte-
//! identical → the diff pager skips writing them.
//!
//! ## On-disk layout
//!
//! Each freelist trunk page carries the standard 7-byte page header with
//! `page_type = PAGE_TYPE_FREELIST_TRUNK (5)` and `next_page` pointing at the
//! next trunk in the chain (`0` = end). The 4089-byte payload area holds:
//!
//! ```text
//! 0..2 count: u16 LE — number of free leaf-page IDs that follow
//! 2..2+4*N page_ids[N]: u32 LE — free leaf-page numbers, ascending
//! ```
//!
//! A trunk holds up to `(PAYLOAD_PER_PAGE - 2) / 4 = 1021` free page IDs.
//! Larger freelists chain across multiple trunks. The trunk pages themselves
//! are part of the live page set (they hold metadata) and are *not* on the
//! freelist they encode.
use std::collections::VecDeque;
use crate::error::{Result, SQLRiteError};
use crate::sql::pager::page::{PAGE_HEADER_SIZE, PAGE_SIZE, PAYLOAD_PER_PAGE};
use crate::sql::pager::pager::Pager;
/// Page-type tag for a freelist trunk page. Distinct from existing page tags
/// (`2 = TableLeaf`, `3 = Overflow`, `4 = InteriorNode`); `1` was retired.
pub const PAGE_TYPE_FREELIST_TRUNK: u8 = 5;
/// Maximum number of free page IDs a single trunk page can hold.
/// `PAYLOAD_PER_PAGE` is 4089; we reserve 2 bytes for the count and 4 bytes
/// per ID, giving `(4089 - 2) / 4 = 1021`.
pub const FREELIST_IDS_PER_TRUNK: usize = (PAYLOAD_PER_PAGE - 2) / 4;
/// Encodes a single freelist trunk page into the given buffer.
///
/// `next_trunk` is the page number of the next trunk in the chain, or `0`
/// to mark the end. `page_ids` must have at most `FREELIST_IDS_PER_TRUNK`
/// entries.
pub fn encode_trunk(buf: &mut [u8; PAGE_SIZE], next_trunk: u32, page_ids: &[u32]) -> Result<()> {
if page_ids.len() > FREELIST_IDS_PER_TRUNK {
return Err(SQLRiteError::Internal(format!(
"freelist trunk overflow: {} ids exceeds capacity {}",
page_ids.len(),
FREELIST_IDS_PER_TRUNK
)));
}
// Zero out the buffer; trailing payload bytes after the encoded IDs are
// unused and must be deterministic so the diff pager can skip an
// unchanged trunk.
buf.fill(0);
buf[0] = PAGE_TYPE_FREELIST_TRUNK;
buf[1..5].copy_from_slice(&next_trunk.to_le_bytes());
// Per-page `payload_length` field (bytes 5..7) is unused for trunks —
// the count field inside the payload self-describes the entries.
buf[5..7].copy_from_slice(&0u16.to_le_bytes());
let count = page_ids.len() as u16;
buf[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + 2].copy_from_slice(&count.to_le_bytes());
let mut off = PAGE_HEADER_SIZE + 2;
for id in page_ids {
buf[off..off + 4].copy_from_slice(&id.to_le_bytes());
off += 4;
}
Ok(())
}
/// Decodes one freelist trunk page. Returns `(next_trunk, page_ids)`.
fn decode_trunk(buf: &[u8; PAGE_SIZE]) -> Result<(u32, Vec<u32>)> {
if buf[0] != PAGE_TYPE_FREELIST_TRUNK {
return Err(SQLRiteError::General(format!(
"expected freelist trunk page (tag {PAGE_TYPE_FREELIST_TRUNK}), got tag {}",
buf[0]
)));
}
let next_trunk = u32::from_le_bytes(buf[1..5].try_into().unwrap());
let count = u16::from_le_bytes(
buf[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + 2]
.try_into()
.unwrap(),
) as usize;
if count > FREELIST_IDS_PER_TRUNK {
return Err(SQLRiteError::General(format!(
"freelist trunk count {count} exceeds capacity {FREELIST_IDS_PER_TRUNK}"
)));
}
let mut ids = Vec::with_capacity(count);
let mut off = PAGE_HEADER_SIZE + 2;
for _ in 0..count {
let id = u32::from_le_bytes(buf[off..off + 4].try_into().unwrap());
ids.push(id);
off += 4;
}
Ok((next_trunk, ids))
}
/// Walks the freelist chain rooted at `head` and returns every free leaf-page
/// ID. The trunk pages themselves are *not* included — they're live metadata.
/// Returns the trunk page numbers separately so the caller can release them
/// (the next save re-encodes the freelist from scratch and frees the old
/// trunks for reuse).
///
/// `head == 0` → empty freelist; returns `(vec![], vec![])`.
pub fn read_freelist(pager: &Pager, head: u32) -> Result<(Vec<u32>, Vec<u32>)> {
let mut leaves: Vec<u32> = Vec::new();
let mut trunks: Vec<u32> = Vec::new();
let mut cursor = head;
let mut visited: std::collections::HashSet<u32> = std::collections::HashSet::new();
while cursor != 0 {
if !visited.insert(cursor) {
return Err(SQLRiteError::General(format!(
"freelist cycle detected at trunk page {cursor}"
)));
}
let buf = pager.read_page(cursor).ok_or_else(|| {
SQLRiteError::General(format!(
"freelist trunk page {cursor} is past page_count or unreadable"
))
})?;
let (next, ids) = decode_trunk(buf)?;
leaves.extend(ids);
trunks.push(cursor);
cursor = next;
}
Ok((leaves, trunks))
}
/// Stages the freelist chain into the pager. `free_pages` is the full set
/// of pages that need persisted-on-the-freelist treatment — both the trunk
/// pages (metadata) and the leaf entries (the actually-free leaves).
///
/// The chain consumes some of `free_pages` as its own trunks: a trunk
/// page IS a free page that's been temporarily borrowed for metadata.
/// Returns the new `freelist_head` (or `0` if `free_pages` is empty).
///
/// Encoding: `T = ceil(N / (IDS_PER_TRUNK + 1))` trunks; each trunk
/// (except possibly the last) carries `IDS_PER_TRUNK` leaf IDs. Leaves
/// are stored ascending within each trunk for deterministic on-disk
/// bytes (so an unchanged freelist re-stages byte-identical → diff skip).
pub fn stage_freelist(pager: &mut Pager, free_pages: Vec<u32>) -> Result<u32> {
if free_pages.is_empty() {
return Ok(0);
}
// Sort + dedup so the on-disk ordering is deterministic. A duplicate
// in the freelist would be a serious bug elsewhere (double-free), but
// dedupping here is a cheap defensive guardrail.
let mut ids = free_pages;
ids.sort_unstable();
ids.dedup();
// Solve N = T + L where L = number of leaf slots used, T = number of
// trunks, and L ≤ T * IDS_PER_TRUNK. Smallest T satisfying that is
// ceil(N / (IDS_PER_TRUNK + 1)) — the +1 accounts for the trunk
// page itself absorbing one of the N pages.
let n = ids.len();
let t = n.div_ceil(FREELIST_IDS_PER_TRUNK + 1);
// Take the highest-numbered T pages as trunks. This keeps the leaf
// IDs ascending within each trunk and matches the "drain low first"
// policy of the allocator on subsequent saves.
let leaves_count = n - t;
let trunk_pages: Vec<u32> = ids.split_off(leaves_count);
let leaves = ids;
// Lay out leaves across trunks, IDS_PER_TRUNK per trunk.
let mut chunks: Vec<&[u32]> = leaves.chunks(FREELIST_IDS_PER_TRUNK).collect();
// If there are more trunks than chunks (e.g. N=1 → T=1, L=0), pad
// with empty chunks so every trunk gets staged.
while chunks.len() < trunk_pages.len() {
chunks.push(&[]);
}
for (i, chunk) in chunks.iter().enumerate() {
let next = if i + 1 < trunk_pages.len() {
trunk_pages[i + 1]
} else {
0
};
let mut buf = [0u8; PAGE_SIZE];
encode_trunk(&mut buf, next, chunk)?;
pager.stage_page(trunk_pages[i], buf);
}
Ok(trunk_pages[0])
}
/// Helper: collect a freelist into a `VecDeque<u32>` sorted ascending —
/// the ordering the `PageAllocator` uses to draw next free pages from.
pub fn freelist_to_deque(leaves: Vec<u32>) -> VecDeque<u32> {
let mut sorted = leaves;
sorted.sort_unstable();
sorted.dedup();
VecDeque::from(sorted)
}
/// Auto-VACUUM (SQLR-10) does not fire on databases below this many
/// pages. The 4 KiB page size makes 16 pages = 64 KiB — small enough
/// that the cost of a full-file rewrite is negligible if the user
/// genuinely wants it (manual `VACUUM;` still works), but large enough
/// that single-table churn doesn't blow past the threshold and trigger
/// a noisy compact every few statements.
pub const MIN_PAGES_FOR_AUTO_VACUUM: u32 = 16;
/// Returns `true` if the on-disk freelist (counting both leaf and
/// trunk pages — they're all reclaimable bytes) exceeds `threshold`
/// of `header.page_count`, i.e. the file is bloated enough to be
/// worth compacting. Returns `false` for tiny databases (under
/// [`MIN_PAGES_FOR_AUTO_VACUUM`]) and for empty freelists, both as
/// fast paths to keep the auto-VACUUM hook cheap on the common case.
///
/// This is a read-only inspection of the pager's current header and
/// freelist chain — it does not mutate any state. The caller is
/// responsible for actually invoking
/// [`crate::sql::pager::vacuum_database`] when this returns `true`.
pub fn should_auto_vacuum(pager: &Pager, threshold: f32) -> Result<bool> {
let header = pager.header();
if header.page_count < MIN_PAGES_FOR_AUTO_VACUUM {
return Ok(false);
}
if header.freelist_head == 0 {
return Ok(false);
}
let (leaves, trunks) = read_freelist(pager, header.freelist_head)?;
let free_pages = leaves.len() + trunks.len();
let ratio = free_pages as f32 / header.page_count as f32;
Ok(ratio > threshold)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn empty_freelist_round_trip() {
let mut buf = [0u8; PAGE_SIZE];
encode_trunk(&mut buf, 0, &[]).unwrap();
let (next, ids) = decode_trunk(&buf).unwrap();
assert_eq!(next, 0);
assert!(ids.is_empty());
}
#[test]
fn single_chunk_round_trip() {
let mut buf = [0u8; PAGE_SIZE];
let pages = [3u32, 7, 12, 99];
encode_trunk(&mut buf, 42, &pages).unwrap();
let (next, ids) = decode_trunk(&buf).unwrap();
assert_eq!(next, 42);
assert_eq!(ids, pages);
}
#[test]
fn full_chunk_fits_capacity() {
let mut buf = [0u8; PAGE_SIZE];
let pages: Vec<u32> = (1..=FREELIST_IDS_PER_TRUNK as u32).collect();
encode_trunk(&mut buf, 0, &pages).unwrap();
let (next, ids) = decode_trunk(&buf).unwrap();
assert_eq!(next, 0);
assert_eq!(ids.len(), FREELIST_IDS_PER_TRUNK);
assert_eq!(ids[0], 1);
assert_eq!(
ids[FREELIST_IDS_PER_TRUNK - 1],
FREELIST_IDS_PER_TRUNK as u32
);
}
#[test]
fn over_capacity_errors() {
let mut buf = [0u8; PAGE_SIZE];
let pages: Vec<u32> = (1..=(FREELIST_IDS_PER_TRUNK as u32 + 1)).collect();
let err = encode_trunk(&mut buf, 0, &pages).unwrap_err();
assert!(format!("{err}").contains("freelist trunk overflow"));
}
#[test]
fn wrong_tag_errors_on_decode() {
let mut buf = [0u8; PAGE_SIZE];
buf[0] = 2; // TableLeaf, not freelist
let err = decode_trunk(&buf).unwrap_err();
assert!(format!("{err}").contains("expected freelist trunk page"));
}
}