File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -103,12 +103,22 @@ extern "C" {
103103#endif
104104
105105/// Parameters for source ops, which decode data into a stage's shared memory region.
106+ ///
107+ /// Exception patches (patches_ptr) live directly on the union variant that
108+ /// owns them (BitunpackParams, AlpParams) rather than on a separate per-stage
109+ /// field. This ties the pointer to its op and avoids an extra indirection.
110+ /// The tradeoff is that adding a uint64_t to a variant can grow the union,
111+ /// which grows every ScalarOp/SourceOp — and those are read in the hot tile
112+ /// loop. Here, SourceParams is already 24 bytes (RunEndParams is the largest
113+ /// member), so BitunpackParams' patches_ptr adds no size. ScalarParams grew
114+ /// from 8 to 16 bytes (AlpParams is now the largest), which we verified has
115+ /// no measurable performance impact since the tile loop is compute-bound.
106116union SourceParams {
107117 /// Unpack FastLanes bit-packed data.
108118 struct BitunpackParams {
109119 uint8_t bit_width ;
110120 uint32_t element_offset ; // Sub-byte offset
111- uint64_t patches_ptr ; // device pointer to packed patches buffer (0 = none)
121+ uint64_t patches_ptr ; // device pointer to GPUPatches struct (0 = none)
112122 } bitunpack ;
113123
114124 /// Copy from global to shared memory.
You can’t perform that action at this time.
0 commit comments