huggingface · hoytak · May 18, 2026 · Apr 24, 2026 · Apr 24, 2026 · May 18, 2026
diff --git a/api_changes/update_260424_next_stable_chunk_boundary.md b/api_changes/update_260424_next_stable_chunk_boundary.md
@@ -0,0 +1,16 @@
+This update adds a new public deduplication helper for computing restart-safe chunk boundaries from existing chunk boundary metadata.
+
+What changed
+- Added `xet_data::deduplication::next_stable_chunk_boundary(starting_position, chunk_boundaries) -> Option<usize>`.
+- Re-exported it from `xet_data::deduplication` so downstream crates can use it directly.
+- The function scans forward from `starting_position` and returns the next chunk boundary that satisfies the stable-boundary condition:
+  - two consecutive chunk sizes in `[2 * min_chunk, max_chunk - min_chunk)`,
+  - where `min_chunk` and `max_chunk` are derived from chunking constants.
+
+Why this matters
+- Callers that already have chunk-boundary metadata can locate a stable resume boundary without re-reading file bytes.
+- This enables deterministic alignment behavior for resumed/partial workflows that need chunk boundaries robust to prefix changes.
+
+Usage notes
+- `chunk_boundaries` should be monotonically increasing chunk-end offsets produced by the same chunking configuration.
+- `starting_position` may be any byte offset (not necessarily a chunk boundary) and is used as the reference offset from which to search for the next stable chunk boundary.
diff --git a/xet_data/src/deduplication/chunking.rs b/xet_data/src/deduplication/chunking.rs
@@ -282,9 +282,15 @@ impl Chunker {
 /// partition_scan_bytes is the number of bytes to scan at each
 /// proposed partition boundary in search of a valid chunk.
 ///
-/// Due to a known issue in how we do chunking, note that these
-/// partitions are not 100% guaranteed to align. See the
-/// parallel_chunking.pdf for details.
+/// Partition alignment is guaranteed by the hash warmup fix: the
+/// chunker feeds `min_chunk - 64 - 1` bytes before scanning for
+/// boundaries, ensuring the gear hash window is fully warmed (purely
+/// data-dependent) at all accepted trigger positions. This function
+/// additionally verifies the absence of hidden triggers by re-chunking
+/// with `min_chunk = 0`. See `parallel chunking.lyx` for the proof.
+///
+/// For finding stable chunk boundaries from existing chunk boundaries (without
+/// data access), see [`next_stable_chunk_boundary`].
 pub fn find_partitions<R: Read + Seek>(
     reader: &mut R,
     file_size: usize,
@@ -353,6 +359,53 @@ pub fn find_partitions<R: Read + Seek>(
     Ok(partitions)
 }
 
+/// Given a list of chunk boundaries in a file and an arbitrary reference position,
+/// returns the next stable chunk boundary at or after that position.
+///
+/// `starting_position` may be any byte offset in the file; it does not need to
+/// be an existing chunk boundary. The search starts at the first chunk boundary
+/// `>= starting_position`.
+///
+/// A stable chunk boundary is defined such that any possible changes in the data
+/// before `starting_position` would produce the same chunk boundaries at the
+/// stable boundary and later. The fixed data between `starting_position` and
+/// the returned stable boundary is always sufficient to restore the chunker to
+/// its original chunk boundaries.
+///
+/// The stability condition requires two consecutive chunks after `starting_position`,
+/// both with sizes in `[2 * min_chunk, max_chunk - min_chunk)`. The boundary
+/// at the end of the second such chunk is the stable chunk boundary.
+///
+/// The lower bound is `2 * min_chunk` rather than `min_chunk` (as used in
+/// [`find_partitions`]) because this function operates on existing chunk
+/// boundaries without data access, and cannot verify the absence of hidden
+/// hash triggers in the `[c_k, c_k + min_chunk)` skip zone. A shadow-zone
+/// trigger can advance a modified chunker by up to `min_chunk`, so the next
+/// chunk must be at least `2 * min_chunk` to remain reachable.
+///
+/// See `parallel chunking.lyx` for the full proof and `find_stable_start` in
+/// `merkle_hash_subtree.rs` for the analogous construction in merkle hashing.
+pub fn next_stable_chunk_boundary(starting_position: usize, chunk_boundaries: &[usize]) -> Option<usize> {
+    let minimum_chunk = *TARGET_CHUNK_SIZE / *MINIMUM_CHUNK_DIVISOR;
+    let maximum_chunk = *TARGET_CHUNK_SIZE * *MAXIMUM_CHUNK_MULTIPLIER;
+
+    let start_idx = chunk_boundaries.partition_point(|&x| x < starting_position);
+
+    for i in start_idx..chunk_boundaries.len().saturating_sub(2) {
+        let size_a = chunk_boundaries[i + 1] - chunk_boundaries[i];
+        let size_b = chunk_boundaries[i + 2] - chunk_boundaries[i + 1];
+
+        if size_a >= 2 * minimum_chunk
+            && size_a < maximum_chunk - minimum_chunk
+            && size_b >= 2 * minimum_chunk
+            && size_b < maximum_chunk - minimum_chunk
+        {
+            return Some(chunk_boundaries[i + 2]);
+        }
+    }
+    None
+}
+
 #[cfg(test)]
 mod tests {
     use std::collections::HashSet;

diff --git a/xet_data/src/deduplication/mod.rs b/xet_data/src/deduplication/mod.rs
@@ -5,7 +5,7 @@ mod defrag_prevention;
 mod file_deduplication;
 mod interface;
 
-pub use chunking::{Chunker, find_partitions};
+pub use chunking::{Chunker, find_partitions, next_stable_chunk_boundary};
 pub use data_aggregator::DataAggregator;
 pub use dedup_metrics::DeduplicationMetrics;
 pub use file_deduplication::FileDeduper;

diff --git a/xet_data/src/deduplication/parallel chunking.lyx b/xet_data/src/deduplication/parallel chunking.lyx
@@ -827,9 +827,58 @@ Now, with this implementation, can we still perform parallel chunking?
 \end_layout
 
 \begin_layout Standard
-I do not believe so.
- Due to the hash disagreement, under adverserial settings it is possible
- to construct two chunk sequences which will *never* align.
+
+\series bold
+Update:
+\series default
+ The implementation has been fixed.
+ Instead of skipping 
+\begin_inset Formula $m$
+\end_inset
+
+ bytes (setting HashStreamStart = 
+\begin_inset Formula $i+m$
+\end_inset
+
+), the chunker now starts feeding the hash at 
+\begin_inset Formula $m-k-1$
+\end_inset
+
+ bytes from the chunk start.
+ This ensures the hash window has been fed at least 
+\begin_inset Formula $k+1$
+\end_inset
+
+ bytes by position 
+\begin_inset Formula $m$
+\end_inset
+
+, so the hash output at every accepted trigger position (
+\begin_inset Formula $\ge m$
+\end_inset
+
+ from chunk start) depends only on the last 
+\begin_inset Formula $k$
+\end_inset
+
+ bytes of data, independent of the chunk starting point.
+ Therefore 
+\begin_inset Formula $H(a_{1},b)=H(a_{2},b)$
+\end_inset
+
+ for all 
+\begin_inset Formula $b$
+\end_inset
+
+ where 
+\begin_inset Formula $b-a_{1}\ge m$
+\end_inset
+
+ and 
+\begin_inset Formula $b-a_{2}\ge m$
+\end_inset
+
+, and the parallel chunking proof in Section 2 holds.
 \end_layout
 
 \end_body

diff --git a/xet_data/src/deduplication/parallel chunking.pdf b/xet_data/src/deduplication/parallel chunking.pdf