|
| 1 | +//! The order-preserving key encoding. |
| 2 | +//! |
| 3 | +//! For any two keyable values (and any two composite keys), |
| 4 | +//! `bytewise_cmp(encode(a), encode(b)) == logical_cmp(a, b)`. Every encoded |
| 5 | +//! value is **prefix-free**, so composite keys are plain concatenation and the |
| 6 | +//! encoding decodes back exactly — Phase 7 relies on that to recover PK |
| 7 | +//! suffixes from index entries. |
| 8 | +//! |
| 9 | +//! Layout per value: one tag byte (fixing the cross-type rank, nulls first), |
| 10 | +//! then a payload: |
| 11 | +//! |
| 12 | +//! | type | payload | |
| 13 | +//! |---|---| |
| 14 | +//! | `null` | none | |
| 15 | +//! | `bool` | `0x00` / `0x01` | |
| 16 | +//! | `i64`, `timestamp` | 8 bytes big-endian, sign bit flipped | |
| 17 | +//! | `f64` | 8 bytes big-endian IEEE-754 total-order mapping | |
| 18 | +//! | `text`, `blob` | bytes with `0x00` → `0x00 0xFF`, terminated `0x00 0x00` | |
| 19 | +//! | `uuid` | 16 raw bytes | |
| 20 | +//! |
| 21 | +//! `json` is opaque in v1 and cannot be a key component. |
| 22 | +
|
| 23 | +use crate::value::{f64_from_total_key, f64_total_key, Value}; |
| 24 | +use crate::{KeyCorruption, Result, TypeError}; |
| 25 | + |
// Type tags. Each encoded value starts with exactly one of these bytes, so
// when two encoded values of different types are compared bytewise the tag
// decides the result: the numeric order below is the cross-type sort order,
// with `null` lowest.
// NOTE(review): decoded keys are described as stored bytes, so renumbering
// these presumably changes the on-disk format — confirm before editing.
const TAG_NULL: u8 = 0x01;
const TAG_BOOL: u8 = 0x02;
const TAG_I64: u8 = 0x03;
const TAG_F64: u8 = 0x04;
const TAG_TEXT: u8 = 0x05;
const TAG_BLOB: u8 = 0x06;
const TAG_UUID: u8 = 0x07;
const TAG_TIMESTAMP: u8 = 0x08;
| 34 | + |
| 35 | +/// Encode a composite key (one or more components) into its order-preserving |
| 36 | +/// byte form. A single-column key is a one-element slice. |
| 37 | +/// |
| 38 | +/// Returns [`TypeError::NotKeyable`] if a component is `json` (opaque in v1). |
| 39 | +/// |
| 40 | +/// # Examples |
| 41 | +/// |
| 42 | +/// ``` |
| 43 | +/// use types::{encode_key, Value}; |
| 44 | +/// |
| 45 | +/// let lo = encode_key(&[Value::I64(-5)]).unwrap(); |
| 46 | +/// let hi = encode_key(&[Value::I64(3)]).unwrap(); |
| 47 | +/// assert!(lo < hi); |
| 48 | +/// ``` |
| 49 | +pub fn encode_key(components: &[Value]) -> Result<Vec<u8>> { |
| 50 | + let mut out = Vec::new(); |
| 51 | + for value in components { |
| 52 | + encode_into(&mut out, value)?; |
| 53 | + } |
| 54 | + Ok(out) |
| 55 | +} |
| 56 | + |
| 57 | +fn encode_into(out: &mut Vec<u8>, value: &Value) -> Result<()> { |
| 58 | + match value { |
| 59 | + Value::Null => out.push(TAG_NULL), |
| 60 | + Value::Bool(b) => { |
| 61 | + out.push(TAG_BOOL); |
| 62 | + out.push(u8::from(*b)); |
| 63 | + } |
| 64 | + Value::I64(v) => { |
| 65 | + out.push(TAG_I64); |
| 66 | + out.extend_from_slice(&flip_sign(*v).to_be_bytes()); |
| 67 | + } |
| 68 | + Value::F64(v) => { |
| 69 | + out.push(TAG_F64); |
| 70 | + out.extend_from_slice(&f64_total_key(*v).to_be_bytes()); |
| 71 | + } |
| 72 | + Value::Text(s) => { |
| 73 | + out.push(TAG_TEXT); |
| 74 | + escape_into(out, s.as_bytes()); |
| 75 | + } |
| 76 | + Value::Blob(b) => { |
| 77 | + out.push(TAG_BLOB); |
| 78 | + escape_into(out, b); |
| 79 | + } |
| 80 | + Value::Uuid(u) => { |
| 81 | + out.push(TAG_UUID); |
| 82 | + out.extend_from_slice(u); |
| 83 | + } |
| 84 | + Value::Timestamp(v) => { |
| 85 | + out.push(TAG_TIMESTAMP); |
| 86 | + out.extend_from_slice(&flip_sign(*v).to_be_bytes()); |
| 87 | + } |
| 88 | + Value::Json(_) => return Err(TypeError::NotKeyable { kind: "json" }), |
| 89 | + } |
| 90 | + Ok(()) |
| 91 | +} |
| 92 | + |
| 93 | +/// Decode a key produced by [`encode_key`] back into its components. |
| 94 | +/// |
| 95 | +/// The input is stored bytes, so every malformation is a typed |
| 96 | +/// [`TypeError::KeyCorrupt`] — never a panic. |
| 97 | +pub fn decode_key(bytes: &[u8]) -> Result<Vec<Value>> { |
| 98 | + let mut components = Vec::new(); |
| 99 | + let mut rest = bytes; |
| 100 | + while let Some((&tag, after_tag)) = rest.split_first() { |
| 101 | + let (value, after_value) = decode_one(tag, after_tag)?; |
| 102 | + components.push(value); |
| 103 | + rest = after_value; |
| 104 | + } |
| 105 | + Ok(components) |
| 106 | +} |
| 107 | + |
| 108 | +fn decode_one(tag: u8, rest: &[u8]) -> Result<(Value, &[u8])> { |
| 109 | + match tag { |
| 110 | + TAG_NULL => Ok((Value::Null, rest)), |
| 111 | + TAG_BOOL => { |
| 112 | + let (&byte, rest) = rest |
| 113 | + .split_first() |
| 114 | + .ok_or_else(|| corrupt(KeyCorruption::Truncated))?; |
| 115 | + match byte { |
| 116 | + 0 => Ok((Value::Bool(false), rest)), |
| 117 | + 1 => Ok((Value::Bool(true), rest)), |
| 118 | + _ => Err(corrupt(KeyCorruption::BadBool { byte })), |
| 119 | + } |
| 120 | + } |
| 121 | + TAG_I64 => { |
| 122 | + let (word, rest) = take_u64(rest)?; |
| 123 | + Ok((Value::I64(unflip_sign(word)), rest)) |
| 124 | + } |
| 125 | + TAG_F64 => { |
| 126 | + let (word, rest) = take_u64(rest)?; |
| 127 | + Ok((Value::F64(f64_from_total_key(word)), rest)) |
| 128 | + } |
| 129 | + TAG_TEXT => { |
| 130 | + let (bytes, rest) = unescape(rest)?; |
| 131 | + let text = String::from_utf8(bytes).map_err(|_| corrupt(KeyCorruption::InvalidUtf8))?; |
| 132 | + Ok((Value::Text(text), rest)) |
| 133 | + } |
| 134 | + TAG_BLOB => { |
| 135 | + let (bytes, rest) = unescape(rest)?; |
| 136 | + Ok((Value::Blob(bytes), rest)) |
| 137 | + } |
| 138 | + TAG_UUID => { |
| 139 | + if rest.len() < 16 { |
| 140 | + return Err(corrupt(KeyCorruption::Truncated)); |
| 141 | + } |
| 142 | + let (head, rest) = rest.split_at(16); |
| 143 | + let mut uuid = [0u8; 16]; |
| 144 | + uuid.copy_from_slice(head); |
| 145 | + Ok((Value::Uuid(uuid), rest)) |
| 146 | + } |
| 147 | + TAG_TIMESTAMP => { |
| 148 | + let (word, rest) = take_u64(rest)?; |
| 149 | + Ok((Value::Timestamp(unflip_sign(word)), rest)) |
| 150 | + } |
| 151 | + _ => Err(corrupt(KeyCorruption::BadTag { tag })), |
| 152 | + } |
| 153 | +} |
| 154 | + |
/// Convert an `i64` into a `u64` that sorts the same way: inverting the sign
/// bit moves negatives below non-negatives in unsigned (and therefore
/// big-endian bytewise) order while keeping the order within each half.
fn flip_sign(v: i64) -> u64 {
    // `i64::MIN` has only the sign bit set.
    (v ^ i64::MIN) as u64
}
| 160 | + |
/// Inverse of the sign-bit flip used when encoding: invert the top bit of
/// `word` and reinterpret the result as an `i64`.
fn unflip_sign(word: u64) -> i64 {
    // `i64::MIN` has only the sign bit set.
    (word as i64) ^ i64::MIN
}
| 164 | + |
| 165 | +fn take_u64(rest: &[u8]) -> Result<(u64, &[u8])> { |
| 166 | + if rest.len() < 8 { |
| 167 | + return Err(corrupt(KeyCorruption::Truncated)); |
| 168 | + } |
| 169 | + let (head, rest) = rest.split_at(8); |
| 170 | + let mut word = [0u8; 8]; |
| 171 | + word.copy_from_slice(head); |
| 172 | + Ok((u64::from_be_bytes(word), rest)) |
| 173 | +} |
| 174 | + |
/// Append `bytes` to `out` in a form that is both prefix-free and
/// order-preserving: each `0x00` is written as `0x00 0xFF`, and the value is
/// closed with `0x00 0x00`.
///
/// Where a shorter value ends (`0x00 0x00`), any longer value sharing that
/// prefix continues with either an escaped zero (`0x00 0xFF`) or a non-zero
/// byte, both of which compare greater — so a proper prefix sorts first,
/// matching the logical bytewise order.
fn escape_into(out: &mut Vec<u8>, bytes: &[u8]) {
    // Splitting on zero yields the zero-free runs; each gap between two runs
    // stands for exactly one zero byte in the input.
    let mut runs = bytes.split(|&byte| byte == 0x00);
    if let Some(first) = runs.next() {
        out.extend_from_slice(first);
    }
    for run in runs {
        out.extend_from_slice(&[0x00, 0xFF]);
        out.extend_from_slice(run);
    }
    // Terminator.
    out.extend_from_slice(&[0x00, 0x00]);
}
| 190 | + |
| 191 | +fn unescape(mut rest: &[u8]) -> Result<(Vec<u8>, &[u8])> { |
| 192 | + let mut out = Vec::new(); |
| 193 | + while let Some((&b, tail)) = rest.split_first() { |
| 194 | + if b != 0x00 { |
| 195 | + out.push(b); |
| 196 | + rest = tail; |
| 197 | + continue; |
| 198 | + } |
| 199 | + match tail.split_first() { |
| 200 | + Some((&0x00, after)) => return Ok((out, after)), |
| 201 | + Some((&0xFF, after)) => { |
| 202 | + out.push(0x00); |
| 203 | + rest = after; |
| 204 | + } |
| 205 | + Some((&escape, _)) => return Err(corrupt(KeyCorruption::BadEscape { escape })), |
| 206 | + None => return Err(corrupt(KeyCorruption::Truncated)), |
| 207 | + } |
| 208 | + } |
| 209 | + Err(corrupt(KeyCorruption::Truncated)) |
| 210 | +} |
| 211 | + |
/// Wrap a [`KeyCorruption`] detail in [`TypeError::KeyCorrupt`], the error
/// variant the decoding functions in this file return for malformed input.
fn corrupt(kind: KeyCorruption) -> TypeError {
    TypeError::KeyCorrupt(kind)
}
| 215 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode a key made of a single text component.
    fn text_key(text: &str) -> Vec<u8> {
        encode_key(&[Value::Text(text.into())]).unwrap()
    }

    /// A string sorts before any extension of it, including one that
    /// extends it with a NUL character.
    #[test]
    fn prefix_sorts_before_extension() {
        let bare = text_key("ab");
        let with_nul = text_key("ab\u{0}");
        let longer = text_key("abc");
        assert!(bare < with_nul && with_nul < longer);
    }

    /// The first component decides the order even when the second component
    /// would sort the other way.
    #[test]
    fn composite_orders_component_wise() {
        let short_first = encode_key(&[Value::Text("a".into()), Value::I64(9)]).unwrap();
        let long_first = encode_key(&[Value::Text("ab".into()), Value::I64(0)]).unwrap();
        assert!(
            short_first < long_first,
            "shorter first component must dominate"
        );
    }

    /// `json` values are rejected instead of being encoded.
    #[test]
    fn json_is_not_keyable() {
        let outcome = encode_key(&[Value::Json(vec![0xC0])]);
        let err = outcome.unwrap_err();
        assert!(matches!(err, TypeError::NotKeyable { kind: "json" }));
    }
}
0 commit comments