rust_sqlite/src/sql/pager/fts_cell.rs at main · joaoh82/rust_sqlite · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
//! On-disk format for one FTS posting list (Phase 8c).
//!
//! Each cell carries either a posting list for one term — `(term,
//! [(rowid, term_freq), ...])` — or, in a single sidecar cell with
//! `term.is_empty()`, the per-doc length map `(rowid, doc_len)`. Cells
//! live on `TableLeaf` pages identical to regular table data trees, so
//! the slot directory + sibling `next_page` chain + interior-page
//! mechanics from Phase 3d work without FTS-specific page plumbing.
//!
//! Reusing the table-tree shape lets `Cell::peek_rowid` work uniformly
//! across cell kinds: it skips `cell_length | kind_tag` and reads the
//! first varint, which is `cell_id` here. `cell_id` is a sequential
//! integer assigned at save time (1, 2, 3 …), not a row identifier —
//! the B-Tree just needs an ordered key for slot directory binary
//! search; the actual data is keyed by `term`.
//!
//! ```text
//!   cell_length   varint          bytes after this field
//!   kind_tag      u8 = 0x06       (KIND_FTS_POSTING)
//!   cell_id       zigzag varint   sequential B-Tree slot key
//!   term_len      varint          length of `term` in bytes; 0 → sidecar
//!   term          term_len bytes  ASCII-lowercased term per Phase 8 Q3
//!   count         varint          number of (rowid, value) pairs
//!   for each:
//!     rowid       zigzag varint   the row this entry belongs to
//!     value       varint          term frequency, or doc length when
//!                                 term_len == 0 (sidecar cell)
//! ```
//!
//! One sidecar cell suffices for the entire index: it lists every
//! indexed doc with its tokenized length, including the zero-length
//! corner case (a row whose text tokenizes to nothing — still indexed
//! so `len()` and `total_docs` round-trip). Posting cells follow.
//!
//! No null bitmap, no per-field type tag — every field has a fixed
//! type. The encoding is deliberately minimal because long posting
//! lists dominate disk usage on real corpora.

use crate::error::{Result, SQLRiteError};
use crate::sql::pager::cell::KIND_FTS_POSTING;
use crate::sql::pager::varint;

/// One FTS posting list cell — either a per-term postings entry or the
/// single doc-lengths sidecar (when `term.is_empty()`).
#[derive(Debug, Clone, PartialEq)]
pub struct FtsPostingCell {
    /// Sequential id assigned at save time. Acts as the B-Tree slot
    /// directory key; never persisted as part of the index logic.
    pub cell_id: i64,
    /// Lowercased ASCII term. Empty on the doc-lengths sidecar.
    pub term: String,
    /// `(rowid, value)` pairs. `value` is term frequency for posting
    /// cells, doc length for the sidecar.
    pub entries: Vec<(i64, u32)>,
}

impl FtsPostingCell {
    pub fn posting(cell_id: i64, term: String, entries: Vec<(i64, u32)>) -> Self {
        Self {
            cell_id,
            term,
            entries,
        }
    }

    /// Constructs the doc-lengths sidecar cell (term left empty).
    pub fn doc_lengths(cell_id: i64, entries: Vec<(i64, u32)>) -> Self {
        Self {
            cell_id,
            term: String::new(),
            entries,
        }
    }

    /// Encodes the cell into a freshly-allocated `Vec<u8>`. The result
    /// starts with the shared `cell_length | kind_tag` prefix and is
    /// directly usable as a slot-directory entry on a `TableLeaf`-style
    /// page.
    pub fn encode(&self) -> Result<Vec<u8>> {
        // Body capacity guess: 1 (kind) + 10 (cell_id) + 5 (term_len)
        // + term + 5 (count) + per-pair 10 (rowid) + 5 (value).
        let pair_bytes = self.entries.len() * 15;
        let mut body = Vec::with_capacity(1 + 10 + 5 + self.term.len() + 5 + pair_bytes);

        body.push(KIND_FTS_POSTING);
        varint::write_i64(&mut body, self.cell_id);
        varint::write_u64(&mut body, self.term.len() as u64);
        body.extend_from_slice(self.term.as_bytes());
        varint::write_u64(&mut body, self.entries.len() as u64);
        for (rowid, value) in &self.entries {
            varint::write_i64(&mut body, *rowid);
            varint::write_u64(&mut body, *value as u64);
        }

        let mut out = Vec::with_capacity(body.len() + varint::MAX_VARINT_BYTES);
        varint::write_u64(&mut out, body.len() as u64);
        out.extend_from_slice(&body);
        Ok(out)
    }

    /// Decodes one cell starting at `pos`. Returns the cell plus the
    /// total bytes consumed (including the leading length varint).
    pub fn decode(buf: &[u8], pos: usize) -> Result<(FtsPostingCell, usize)> {
        let (body_len, len_bytes) = varint::read_u64(buf, pos)?;
        let body_start = pos + len_bytes;
        let body_end = body_start
            .checked_add(body_len as usize)
            .ok_or_else(|| SQLRiteError::Internal("FTS cell length overflow".to_string()))?;
        if body_end > buf.len() {
            return Err(SQLRiteError::Internal(format!(
                "FTS cell extends past buffer: needs {body_start}..{body_end}, have {}",
                buf.len()
            )));
        }
        let body = &buf[body_start..body_end];
        if body.first().copied() != Some(KIND_FTS_POSTING) {
            return Err(SQLRiteError::Internal(format!(
                "FtsPostingCell::decode called on non-FTS entry (kind_tag = {:#x})",
                body.first().copied().unwrap_or(0)
            )));
        }

        let mut cur = 1usize;
        let (cell_id, n) = varint::read_i64(body, cur)?;
        cur += n;

        let (term_len, n) = varint::read_u64(body, cur)?;
        cur += n;
        // Sanity: a single term shouldn't exceed a few KB even with
        // pathological input. The whole cell body sits inside one page
        // (~4 KiB), so a giant term length is almost certainly a
        // corrupt cell — bail before allocating.
        if term_len as usize > body.len().saturating_sub(cur) {
            return Err(SQLRiteError::Internal(format!(
                "FTS cell {cell_id}: term_len {term_len} exceeds remaining body \
                 ({}) — corrupt cell?",
                body.len() - cur
            )));
        }
        let term_bytes = &body[cur..cur + term_len as usize];
        cur += term_len as usize;
        let term = std::str::from_utf8(term_bytes)
            .map_err(|e| {
                SQLRiteError::Internal(format!("FTS cell {cell_id}: term not valid UTF-8: {e}"))
            })?
            .to_string();

        let (count, n) = varint::read_u64(body, cur)?;
        cur += n;
        // Sanity: a single posting list shouldn't exceed corpus size.
        // 8 GiB worth of entries (8 bytes per rowid alone) is firmly in
        // "corrupt cell" territory.
        if count > 1 << 28 {
            return Err(SQLRiteError::Internal(format!(
                "FTS cell {cell_id}: claims {count} entries (>2^28) — corrupt cell?"
            )));
        }
        let mut entries = Vec::with_capacity(count as usize);
        for _ in 0..count {
            let (rowid, n) = varint::read_i64(body, cur)?;
            cur += n;
            let (value_u64, n) = varint::read_u64(body, cur)?;
            cur += n;
            // Term frequencies and doc lengths fit in u32 (a doc with
            // 4 billion tokens is implausible). Reject with a clean
            // error instead of silently truncating.
            if value_u64 > u32::MAX as u64 {
                return Err(SQLRiteError::Internal(format!(
                    "FTS cell {cell_id}: value {value_u64} exceeds u32::MAX — corrupt cell?"
                )));
            }
            entries.push((rowid, value_u64 as u32));
        }

        if cur != body.len() {
            return Err(SQLRiteError::Internal(format!(
                "FTS cell {cell_id} had {} trailing bytes",
                body.len() - cur
            )));
        }

        Ok((
            FtsPostingCell {
                cell_id,
                term,
                entries,
            },
            len_bytes + body_len as usize,
        ))
    }

    /// True iff this cell is the doc-lengths sidecar.
    pub fn is_doc_lengths(&self) -> bool {
        self.term.is_empty()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn round_trip(cell: &FtsPostingCell) {
        let bytes = cell.encode().expect("encode");
        let (decoded, consumed) = FtsPostingCell::decode(&bytes, 0).expect("decode");
        assert_eq!(
            consumed,
            bytes.len(),
            "decode should consume the whole cell"
        );
        assert_eq!(&decoded, cell);
    }

    #[test]
    fn posting_cell_round_trips() {
        let cell = FtsPostingCell::posting(7, "rust".to_string(), vec![(1, 2), (3, 1), (5, 7)]);
        round_trip(&cell);
    }

    #[test]
    fn doc_lengths_sidecar_round_trips() {
        let cell = FtsPostingCell::doc_lengths(1, vec![(1, 12), (2, 20), (3, 0), (4, 7)]);
        assert!(cell.is_doc_lengths());
        round_trip(&cell);
    }

    #[test]
    fn empty_postings_round_trips() {
        // Edge case: an FTS cell with zero entries shouldn't be
        // emitted in practice (the term would be pruned by remove()),
        // but the format must still round-trip.
        let cell = FtsPostingCell::posting(2, "ghost".to_string(), vec![]);
        round_trip(&cell);
    }

    #[test]
    fn negative_and_large_rowids_round_trip() {
        // Rowids are zigzag-encoded; cover both signs.
        round_trip(&FtsPostingCell::posting(
            3,
            "x".to_string(),
            vec![(-1, 1), (i64::MAX, 99), (i64::MIN, 1)],
        ));
    }

    #[test]
    fn long_term_round_trips() {
        // A 1024-byte term — well within page capacity. Tokenizer
        // wouldn't actually emit this in practice, but encode/decode
        // must still survive.
        let term = "a".repeat(1024);
        let cell = FtsPostingCell::posting(4, term, vec![(1, 1)]);
        round_trip(&cell);
    }

    #[test]
    fn long_posting_list_round_trips() {
        // 5000 entries — exercises the count + pair-loop paths.
        let entries: Vec<(i64, u32)> = (0..5000_i64).map(|i| (i, ((i * 3) as u32) + 1)).collect();
        let cell = FtsPostingCell::posting(5, "common".to_string(), entries);
        round_trip(&cell);
    }

    #[test]
    fn decode_rejects_wrong_kind_tag() {
        let mut bad = Vec::new();
        varint::write_u64(&mut bad, 1); // body_len
        bad.push(0x01); // KIND_LOCAL, not KIND_FTS_POSTING
        let err = FtsPostingCell::decode(&bad, 0).unwrap_err();
        assert!(format!("{err}").contains("non-FTS entry"));
    }

    #[test]
    fn decode_rejects_truncated_buffer() {
        let cell = FtsPostingCell::posting(1, "rust".to_string(), vec![(1, 2), (5, 3)]);
        let bytes = cell.encode().expect("encode");
        for chop in 1..=3 {
            let truncated = &bytes[..bytes.len() - chop];
            assert!(
                FtsPostingCell::decode(truncated, 0).is_err(),
                "expected error chopping {chop} byte(s) from end of {} byte cell",
                bytes.len()
            );
        }
    }

    #[test]
    fn decode_rejects_invalid_utf8_term() {
        // Hand-craft a cell whose term bytes aren't valid UTF-8.
        let mut body = Vec::new();
        body.push(KIND_FTS_POSTING);
        varint::write_i64(&mut body, 1); // cell_id
        varint::write_u64(&mut body, 2); // term_len
        body.extend_from_slice(&[0xFF, 0xFE]); // not valid UTF-8
        varint::write_u64(&mut body, 0); // count = 0
        let mut out = Vec::new();
        varint::write_u64(&mut out, body.len() as u64);
        out.extend_from_slice(&body);
        let err = FtsPostingCell::decode(&out, 0).unwrap_err();
        assert!(format!("{err}").to_lowercase().contains("utf-8"));
    }

    #[test]
    fn decode_rejects_implausible_count() {
        // Hand-craft a cell with count = 2^29 (above the corruption sanity bound).
        let mut body = Vec::new();
        body.push(KIND_FTS_POSTING);
        varint::write_i64(&mut body, 1);
        varint::write_u64(&mut body, 4);
        body.extend_from_slice(b"term");
        varint::write_u64(&mut body, 1u64 << 29);
        let mut out = Vec::new();
        varint::write_u64(&mut out, body.len() as u64);
        out.extend_from_slice(&body);
        let err = FtsPostingCell::decode(&out, 0).unwrap_err();
        assert!(format!("{err}").to_lowercase().contains("corrupt"));
    }
}