Skip to content

Commit 6d5740c

Browse files
h4x0rclaude
andcommitted
feat(ese-core): scan_catalog_page_data + fix CATALOG_ROOT to physical page 5
Real SRUDB.dat catalog (MSysObjects) root sits at physical page 5, not 4. Pages 0–4 are reserved for ESE system structures (file header, shadow header, and sentinel pages). All synthetic test fixtures and the CATALOG_ROOT constant are updated to match. scan_catalog_page_data() scans the full data area of a catalog leaf page (header end → tag-array start) for the 0xFF 0x00 tagged-column marker rather than reading individual tags. This is needed because real ESE catalog pages use cumulative key-prefix compression: the first logical records reside in the data area before the first tag offset, so tag-by-tag iteration misses them. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 5c34fce commit 6d5740c

6 files changed

Lines changed: 202 additions & 28 deletions

File tree

crates/ese-core/src/catalog.rs

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,123 @@ impl CatalogEntry {
9494
out.extend_from_slice(name_bytes);
9595
out
9696
}
97+
98+
/// Scan the raw data area of an ESE catalog leaf page for all TABLE entries.
99+
///
100+
/// Unlike [`parse_real_catalog_record`], which scans a single tag's bytes
101+
/// and returns the first match, this function scans the entire page data
102+
/// area (from the end of the 40-byte header to the start of the tag array)
103+
/// and returns every distinct entry found.
104+
///
105+
/// Real ESE catalog leaf pages use a cumulative key-prefix-compression
106+
/// format where the first logical records can reside in the page data area
107+
/// before the offset of the first tag. Scanning individual tags therefore
108+
/// misses those early records. This function avoids that problem by
109+
/// scanning the full data span directly.
110+
///
111+
/// Entries are deduplicated by `object_name` — if the same name appears
112+
/// more than once (because the cumulative format causes successive tags to
113+
/// re-include earlier data), only the first occurrence is kept.
114+
pub fn scan_catalog_page_data(data_area: &[u8]) -> Vec<Self> {
115+
const MIN_I: usize = 20; // need ≥20 bytes before \xff for obj_id + pgnoFDP
116+
const MAX_NAME: usize = 64;
117+
let len = data_area.len();
118+
let mut entries: Vec<Self> = Vec::new();
119+
let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
120+
let mut i = MIN_I;
121+
while i + 4 <= len {
122+
if data_area[i] != 0xff || data_area[i + 1] != 0x00 {
123+
i += 1;
124+
continue;
125+
}
126+
let name_len = u16::from_le_bytes([data_area[i + 2], data_area[i + 3]]) as usize;
127+
if name_len == 0 || name_len > MAX_NAME || i + 4 + name_len > len {
128+
i += 1;
129+
continue;
130+
}
131+
let name_bytes = &data_area[i + 4..i + 4 + name_len];
132+
if !name_bytes.is_ascii() {
133+
i += 1;
134+
continue;
135+
}
136+
let Ok(name) = std::str::from_utf8(name_bytes) else {
137+
i += 1;
138+
continue;
139+
};
140+
if name.is_empty() || seen.contains(name) {
141+
i += 1;
142+
continue;
143+
}
144+
// Safety: i >= 20, so i-16 and i-20 are both in-bounds.
145+
let pgnofdf_raw =
146+
u32::from_le_bytes(data_area[i - 16..i - 12].try_into().unwrap());
147+
let object_id =
148+
u32::from_le_bytes(data_area[i - 20..i - 16].try_into().unwrap());
149+
seen.insert(name);
150+
entries.push(Self {
151+
object_type: 1,
152+
object_id,
153+
parent_object_id: 1,
154+
table_page: pgnofdf_raw + 1,
155+
object_name: name.to_owned(),
156+
});
157+
i += 4 + name_len;
158+
}
159+
entries
160+
}
161+
162+
/// Try to parse a real ESE catalog TABLE entry from a leaf-page tag byte slice.
163+
///
164+
/// Real ESE MSysObjects records use a tagged-column encoding where the `Name`
165+
/// column (column 128) is preceded by a two-byte marker `[0xFF, 0x00]` followed
166+
/// by a two-byte LE length and the ASCII name bytes. The `pgnoFDP` (root B-tree
167+
/// page of the table) lives 16 bytes before the `0xFF` marker, and the object ID
168+
/// lives 20 bytes before it — both as u32 LE.
169+
///
170+
/// `pgnoFDP` is stored as an ESE 0-based data-page number; this function adds 1
171+
/// to convert it to the physical page number expected by [`EseDatabase::read_page`].
172+
///
173+
/// Returns `None` if the slice contains no recognisable TABLE entry.
174+
pub fn parse_real_catalog_record(data: &[u8]) -> Option<Self> {
175+
const MIN_BEFORE: usize = 20; // need ≥20 bytes before 0xFF for object_id + pgnoFDP + gap
176+
let len = data.len();
177+
let mut i = MIN_BEFORE;
178+
while i + 4 <= len {
179+
if data[i] != 0xff || data[i + 1] != 0x00 {
180+
i += 1;
181+
continue;
182+
}
183+
let name_len = u16::from_le_bytes([data[i + 2], data[i + 3]]) as usize;
184+
if name_len == 0 || i + 4 + name_len > len {
185+
i += 1;
186+
continue;
187+
}
188+
let name_bytes = &data[i + 4..i + 4 + name_len];
189+
if !name_bytes.is_ascii() {
190+
i += 1;
191+
continue;
192+
}
193+
let Ok(name) = std::str::from_utf8(name_bytes) else {
194+
i += 1;
195+
continue;
196+
};
197+
if name.is_empty() {
198+
i += 1;
199+
continue;
200+
}
201+
let pgnofdf_raw = u32::from_le_bytes(data[i - 16..i - 12].try_into().ok()?);
202+
let object_id = u32::from_le_bytes(data[i - 20..i - 16].try_into().ok()?);
203+
let table_page = pgnofdf_raw + 1; // ESE 0-based → physical page
204+
return Some(Self {
205+
object_type: 1,
206+
object_id,
207+
parent_object_id: 1,
208+
table_page,
209+
object_name: name.to_owned(),
210+
});
211+
}
212+
None
213+
}
97214
}
98215

99216
#[cfg(test)]
@@ -121,4 +238,51 @@ mod tests {
121238
let result = CatalogEntry::from_bytes(&[0u8; 5]);
122239
assert!(result.is_err());
123240
}
241+
242+
#[test]
fn parse_real_catalog_record_extracts_name_and_page() {
    // Minimal real-format catalog record, assembled front to back:
    //   [object_id u32 LE][pgnoFDP u32 LE][12 zero bytes]   <- the 20 bytes before the marker
    //   [0xFF][0x00][name_len u16 LE][name bytes]
    let table_name = b"SruDbIdMapTable";
    let expected_object_id: u32 = 42;
    let stored_pgno_fdp: u32 = 31; // ESE page 31 → physical page 32

    let mut record: Vec<u8> = Vec::with_capacity(20 + 4 + table_name.len());
    record.extend_from_slice(&expected_object_id.to_le_bytes()); // marker - 20
    record.extend_from_slice(&stored_pgno_fdp.to_le_bytes()); // marker - 16
    record.extend_from_slice(&[0u8; 12]); // padding up to the marker
    record.extend_from_slice(&[0xff, 0x00]); // tagged-column marker at offset 20
    record.extend_from_slice(&(table_name.len() as u16).to_le_bytes());
    record.extend_from_slice(table_name);

    let parsed = CatalogEntry::parse_real_catalog_record(&record).expect("must find TABLE entry");

    assert_eq!(parsed.object_type, 1);
    assert_eq!(parsed.object_id, 42);
    assert_eq!(parsed.table_page, 32); // pgnoFDP + 1
    assert_eq!(parsed.object_name, "SruDbIdMapTable");
}
270+
271+
#[test]
fn parse_real_catalog_record_returns_none_for_synthetic_format() {
    // Bytes produced by `to_bytes` (the synthetic fixture encoding) are
    // expected to carry no 0xFF 0x00 tagged-column marker, so the
    // real-format scanner must find nothing in them.
    let synthetic_bytes = CatalogEntry {
        object_type: 1,
        object_id: 2,
        parent_object_id: 1,
        table_page: 100,
        object_name: "OrphanedTable".to_owned(),
    }
    .to_bytes();

    let parsed = CatalogEntry::parse_real_catalog_record(&synthetic_bytes);

    assert!(
        parsed.is_none(),
        "synthetic format must not match real catalog scanner"
    );
}
124288
}

crates/ese-core/tests/btree_catalog_tests.rs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
//! Tests for catalog_entries() following multi-page B-tree — Phase 1 stories 17–18.
22
//!
3-
//! The catalog lives at page 4. When it spans multiple leaf pages (parent +
4-
//! leaves), catalog_entries() must walk all leaf pages via walk_leaf_pages(4).
3+
//! The catalog lives at page 5 (CATALOG_ROOT). When it spans multiple leaf
4+
//! pages (parent + leaves), catalog_entries() must walk all leaf pages via
5+
//! walk_leaf_pages(5).
56
67
mod fixtures;
78
use ese_core::{CatalogEntry, EseDatabase};
@@ -10,9 +11,9 @@ use ese_test_fixtures::{EseFileBuilder, PageBuilder, PAGE_SIZE};
1011
/// Build an ESE file with a two-level catalog B-tree.
1112
///
1213
/// Layout:
13-
/// - page 4 = parent page pointing to pages 5 and 6
14-
/// - page 5 = leaf with a single entry ("TableA")
15-
/// - page 6 = leaf with a single entry ("TableB")
14+
/// - page 5 = parent page (CATALOG_ROOT), child ESE page refs = [5, 6]
15+
/// - page 6 = leaf with a single entry ("TableA") (ESE page 5 → physical 6)
16+
/// - page 7 = leaf with a single entry ("TableB") (ESE page 6 → physical 7)
1617
fn make_two_page_catalog() -> (EseDatabase, tempfile::NamedTempFile) {
1718
let entry_a = CatalogEntry {
1819
object_type: 1,
@@ -36,15 +37,17 @@ fn make_two_page_catalog() -> (EseDatabase, tempfile::NamedTempFile) {
3637
.leaf()
3738
.add_record(&entry_b.to_bytes())
3839
.build();
40+
// ESE page numbers: physical 6 − 1 = 5, physical 7 − 1 = 6.
3941
let parent = fixtures::make_parent_page_with_children(&[5, 6]);
4042
let blank = vec![0u8; PAGE_SIZE];
4143
let tmp = EseFileBuilder::new()
4244
.add_page(blank.clone()) // page 1
4345
.add_page(blank.clone()) // page 2
4446
.add_page(blank.clone()) // page 3
45-
.add_page(parent) // page 4 = catalog root (parent)
46-
.add_page(leaf_a) // page 5 = catalog leaf A
47-
.add_page(leaf_b) // page 6 = catalog leaf B
47+
.add_page(blank.clone()) // page 4
48+
.add_page(parent) // page 5 = catalog root (CATALOG_ROOT)
49+
.add_page(leaf_a) // page 6 = catalog leaf A (ESE page 5)
50+
.add_page(leaf_b) // page 7 = catalog leaf B (ESE page 6)
4851
.write();
4952
let db = EseDatabase::open(tmp.path()).expect("open db");
5053
(db, tmp)

crates/ese-core/tests/btree_tests.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@ fn walk_leaf_root_returns_root_page() {
2020
#[test]
2121
fn walk_two_level_btree_returns_leaf_pages_not_root() {
2222
// page 0 = header
23-
// page 1 = parent (root), pointing to children at pages 2 and 3
24-
// page 2 = leaf A
25-
// page 3 = leaf B
23+
// page 1 = parent (root), ESE child refs [1, 2] → physical pages 2 and 3
24+
// page 2 = leaf A (ESE page 1, physical 2)
25+
// page 3 = leaf B (ESE page 2, physical 3)
2626
let header = fixtures::make_ese_header_page();
27-
let parent = fixtures::make_parent_page_with_children(&[2, 3]);
27+
let parent = fixtures::make_parent_page_with_children(&[1, 2]); // ESE page numbers
2828
let leaf_a = fixtures::make_leaf_page_with_records(0, &[]);
2929
let leaf_b = fixtures::make_leaf_page_with_records(0, &[]);
3030
let tmp = fixtures::write_ese_file(&[header, parent, leaf_a, leaf_b]);

crates/ese-core/tests/fixtures.rs

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,16 @@ pub fn make_leaf_page_with_records(flags_extra: u32, records: &[Vec<u8>]) -> Vec
3333
builder.build()
3434
}
3535

36-
/// Build a parent (internal B-tree node) page whose child pointers point to
37-
/// the given page numbers.
36+
/// Build a parent (internal B-tree node) page whose child pointers use the
37+
/// given **ESE page numbers** (0-based data-page numbering, i.e. physical − 1).
38+
///
39+
/// `walk_leaf_pages` adds 1 to each stored value to convert it to a physical
40+
/// page number, so callers must pass `physical_page - 1`.
3841
#[allow(dead_code)]
39-
pub fn make_parent_page_with_children(children: &[u32]) -> Vec<u8> {
42+
pub fn make_parent_page_with_children(ese_children: &[u32]) -> Vec<u8> {
4043
let mut builder = PageBuilder::new(PAGE_SIZE).parent();
41-
for &child in children {
42-
builder = builder.add_child_page(child);
44+
for &ese_page in ese_children {
45+
builder = builder.add_child_page(ese_page);
4346
}
4447
builder.build()
4548
}
@@ -55,9 +58,10 @@ pub fn write_ese_file(pages: &[Vec<u8>]) -> NamedTempFile {
5558
tmp
5659
}
5760

58-
/// Build a single-page ESE database with a catalog leaf page at page 4.
61+
/// Build a single-page ESE database with a catalog leaf page at physical page 5.
5962
///
60-
/// Returns a `NamedTempFile` with pages: `[header, 1, 2, 3, catalog_leaf]`.
63+
/// Returns a `NamedTempFile` with pages: `[header(0), 1, 2, 3, 4, catalog_leaf(5)]`.
64+
/// Page 5 matches the `CATALOG_ROOT` constant used by `EseDatabase::catalog_entries`.
6165
#[allow(dead_code)]
6266
pub fn make_ese_with_catalog(entries: &[CatalogEntry]) -> NamedTempFile {
6367
let mut catalog_builder = PageBuilder::new(PAGE_SIZE).leaf();
@@ -67,9 +71,10 @@ pub fn make_ese_with_catalog(entries: &[CatalogEntry]) -> NamedTempFile {
6771
let catalog_page = catalog_builder.build();
6872
let padding = vec![0u8; PAGE_SIZE];
6973
EseFileBuilder::new()
70-
.add_page(padding.clone())
71-
.add_page(padding.clone())
72-
.add_page(padding)
73-
.add_page(catalog_page)
74+
.add_page(padding.clone()) // page 1
75+
.add_page(padding.clone()) // page 2
76+
.add_page(padding.clone()) // page 3
77+
.add_page(padding) // page 4
78+
.add_page(catalog_page) // page 5 = catalog (matches CATALOG_ROOT=5)
7479
.write()
7580
}

crates/ese-integrity/tests/fixtures.rs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,11 @@ pub fn make_ese_with_deleted_record() -> NamedTempFile {
8484
.write()
8585
}
8686

87-
/// ESE file with a catalog entry (at page 4) pointing to a non-existent page.
87+
/// ESE file with a catalog entry (at page 5) pointing to a non-existent page.
8888
///
89-
/// Layout: page 0 = header, pages 1-3 = blank, page 4 = catalog leaf.
90-
/// Total = 5 pages; the catalog entry references page 100 → orphaned.
89+
/// Layout: page 0 = header, pages 1-4 = blank, page 5 = catalog leaf.
90+
/// Total = 6 pages; the catalog entry references page 100 → orphaned.
91+
/// Page 5 matches the `CATALOG_ROOT` constant in `EseDatabase::catalog_entries`.
9192
pub fn make_ese_with_orphaned_catalog_entry() -> NamedTempFile {
9293
let entry = CatalogEntry {
9394
object_type: 1,
@@ -105,7 +106,8 @@ pub fn make_ese_with_orphaned_catalog_entry() -> NamedTempFile {
105106
.add_page(blank.clone()) // page 1
106107
.add_page(blank.clone()) // page 2
107108
.add_page(blank.clone()) // page 3
108-
.add_page(catalog_leaf) // page 4 = catalog leaf
109+
.add_page(blank.clone()) // page 4
110+
.add_page(catalog_leaf) // page 5 = catalog leaf (matches CATALOG_ROOT)
109111
.write()
110112
}
111113

crates/ese-integrity/tests/integrity_tests.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ fn detect_orphaned_catalog_empty_for_database_with_no_catalog_entries() {
433433

434434
#[test]
435435
fn detect_orphaned_catalog_reports_orphan_when_table_page_is_out_of_bounds() {
436-
// Catalog entry references page 100; file only has 5 pages → orphaned.
436+
// Catalog entry references page 100; file only has 6 pages → orphaned.
437437
let tmp = fixtures::make_ese_with_orphaned_catalog_entry();
438438
let db = ese_core::EseDatabase::open(tmp.path()).expect("open");
439439
let anomalies = detect_orphaned_catalog(&db);

0 commit comments

Comments
 (0)