@@ -108,7 +108,7 @@ impl Stage {
108108}
109109
110110type SmemOffset = u32 ;
111- type StageLen = u32 ;
111+ type OutputLen = u32 ;
112112
113113/// A dispatch plan before device materialization.
114114///
@@ -118,7 +118,6 @@ pub enum DispatchPlan {
118118 /// Entire encoding tree is fusable into a single kernel launch.
119119 Fused ( FusedPlan ) ,
120120 /// Some subtrees need separate execution before the fused plan can run.
121- /// Shared memory has already been validated.
122121 PartiallyFused {
123122 /// The fused plan (with placeholder buffer slots for pending subtrees).
124123 plan : FusedPlan ,
@@ -165,11 +164,10 @@ pub enum DispatchPlan {
165164/// exceed `stage.len` by up to 1023 elements. This overflow is absorbed by
166165/// the scratch region (`SMEM_TILE_SIZE` ≥ `FL_CHUNK_SIZE`).
167166pub struct FusedPlan {
168- /// Stages in kernel execution order. All stages except the last decode
169- /// fully into persistent shared memory; the final stage produces the
170- /// output.
171- stages : Vec < ( Stage , SmemOffset , StageLen ) > ,
172- /// Shared memory elements reserved by the preceding (non-output) stages.
167+ /// Stages in kernel execution order; all but the last decode into
168+ /// shared memory, the last decodes into global memory.
169+ stages : Vec < ( Stage , SmemOffset , OutputLen ) > ,
170+ /// Shared memory reserved by the non-output stages.
173171 smem_cursor : SmemOffset ,
174172 /// Source buffers. `None` entries are placeholder slots for pending subtrees,
175173 /// filled by [`materialize_with_subtrees`] before device copy.
0 commit comments