[ET-VK][quantized] Store dq8ca per-token zero-point as fp32

ssjia · ssjia · commit ddb5aeede2de · 2026-06-24T11:03:54.000-07:00
The per-token dynamic-activation-quant (`dq8ca`) zero-point was corrupted by a tensor-allocation vs shader-access dtype mismatch. The per-token zero-point tensor is created with a float dtype -- fp32, or fp16 under `USE_VULKAN_FP16_INFERENCE` -- so its backing image uses a float texel format (`rgba32f` / `rgba16f`). But the shader declared and accessed that image with an integer dtype (`int8`, an integer image format `rgba8i`). Reading a float-format image through an integer-format binding is the bug. On ARM Mali (Valhall) GPUs this mismatch corrupted the per-token zero-points: negative zero-points came back as garbage (`-k` read as `-2^23 - k`), driving the quantized activation to the int8 floor, the per-group sums to `-4096`, and the GEMM output to garbage, producing garbled, runaway generation for 8da4w models (e.g. the Llama4-mini TISO TTS backbone on Mali-G715/G710). Adreno happened to tolerate the same mismatch and read correct values, so the corruption was Mali-specific even though the mismatch itself is general. The per-token zero-point is serialized as fp32 by torchao design: `Int8DynamicActivationIntxWeightConfig` (8da4w) uses asymmetric per-token activation quant with an explicit fp32 `zero_point_dtype`. Decoding the serialized `.pte` confirms the zero-point tensor is FLOAT32, and (like the scale) it is stored in a texture as an `rgba32f` texel -- never `rgba8i`. The float allocation is the truth; the int8 shader access was the mismatched side. The fix is to declare, store, and read the per-token zero-point as fp32 across the dq8ca qparams shaders, so the shader access dtype matches the tensor's allocation dtype and the texture is read as the `rgba32f` image it actually is. The zero-point value is integer-valued (nudged to `[-128, 127]`), so fp32 represents it exactly and the consumer's `int(zp)` conversion for the integer dequant-correction is lossless. 
This touches the dq8ca qparams shaders -- `choose_qparams_per_row`, `quantize_and_pack_4h4w_with_group_sums`, `linear_dq8ca_q4gsw_tiled`, the shared `linear_int8_input_scales_zps_load` helper, and the `linear_q4gsw_coop` variant (whose zero-point binding exists only to match the descriptor-set layout and is never read) -- plus a documentation comment in `ChooseQParams.cpp` and the matching zero-point dtype updates (int8 -> fp32) in the `test_choose_qparams_per_row.cpp` and `test_q4gsw_linear.cpp` tests. Because the per-token qparams remain in texture storage (unchanged from before) and only the zero-point dtype changes, this is a pure runtime shader fix: existing texture-qparams 8da4w `.pte` files are corrected without re-export, since the texture already bakes the zero-point as `rgba32f` and the shader now reads it as such. Authored with Claude Code. Differential Revision: [D109595977](https://our.internmc.facebook.com/intern/diff/D109595977/) ghstack-source-id: 396618146 Pull-Request: #20491
diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl
@@ -30,7 +30,7 @@ layout(std430) buffer;
 #include "common.glslh"
 
 ${layout_declare_tensor(B, "w", "t_scales", DTYPE, "texture3d")}
-${layout_declare_tensor(B, "w", "t_zps", "int8", "texture3d")}
+${layout_declare_tensor(B, "w", "t_zps", "float", "texture3d")}
 ${layout_declare_tensor(B, "r", "t_input", DTYPE, STORAGE, is_scalar_array=False)}
 
 ${layout_declare_ubo(B, "ivec4", "input_sizes")}
@@ -196,7 +196,7 @@ void main() {
 
   if (worker_id == 0) {
     imageStore(t_scales, ivec3(output_y4, 0, 0), scales_out);
-    imageStore(t_zps, ivec3(output_y4, 0, 0), zps_out);
+    imageStore(t_zps, ivec3(output_y4, 0, 0), vec4(zps_out));
   }
 
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl
@@ -46,7 +46,7 @@ ${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=Fa
 ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INT8_INPUT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")}
-${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")}
+${layout_declare_tensor(B, "r", "t_int8_input_zps", "float", "texture3d")}
 ${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_scales_zps_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_scales_zps_load.glslh
@@ -20,7 +20,8 @@ void load_int8_input_scales_and_zps(
   [[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) {
     scales.data[m4] =
         VEC4_T(texelFetch(t_int8_input_scales, ivec3(m4_start + m4, 0, 0), 0));
-    zps.data[m4] = texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0);
+    zps.data[m4] =
+        ivec4(texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0));
   }
 }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl
@@ -40,7 +40,7 @@ $if DYNAMIC_QUANT_VARIANT:
   ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INPUT_STORAGE, is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_int_input_sums", "int", "buffer", is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_input_scale", DTYPE, "texture3d")}
-  ${layout_declare_tensor(B, "r", "t_input_zp", "int", "texture3d")}
+  ${layout_declare_tensor(B, "r", "t_input_zp", "float", "texture3d")}
   ${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.glsl
@@ -33,7 +33,7 @@ ${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is
 ${layout_declare_tensor(B, "w", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")}
-${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")}
+${layout_declare_tensor(B, "r", "t_int8_input_zps", "float", "texture3d")}
 
 ${layout_declare_ubo(B, "ivec4", "input_sizes")}
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp b/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp
@@ -78,6 +78,13 @@ utils::uvec3 pick_choose_qparams_per_row_local_wg_size(
   return {workers_per_output, outputs_per_wg, 1u};
 }
 
+// The per-token zero-point tensor is fp32-typed (matching torchao's serialized
+// asymmetric per-token zero_point_dtype=fp32), even though its values are
+// integer-valued in [-128, 127]. The shaders read it as a float texel and
+// convert to int for the integer dequant-correction. Declaring the shader
+// binding fp32 to match the tensor's allocation avoids the
+// float-image-read-through-an-integer-binding format mismatch that corrupted
+// negative zero-points on Mali.
 void add_choose_qparams_per_row_node(
     ComputeGraph& graph,
     const ValueRef& input,
diff --git a/backends/vulkan/test/custom_ops/test_choose_qparams_per_row.cpp b/backends/vulkan/test/custom_ops/test_choose_qparams_per_row.cpp
@@ -44,7 +44,7 @@ TestCase create_test_case_from_config(
         config.channel_size < kRefDimSizeLimit);
   std::string prefix = is_perf ? "PERF" : "ACCU";
   std::string in_dtype = dtype_short(input_dtype);
-  std::string out_dtype = "f32,i8"; // pair: (scale, zero_point)
+  std::string out_dtype = "f32,f32"; // pair: (scale, zero_point)
   std::string shape_str = "[" + std::to_string(config.num_channels) + "," +
       std::to_string(config.channel_size) + "]";
   std::string storage_str = repr_str(storage_type, utils::kWidthPacked);
@@ -81,10 +81,10 @@ TestCase create_test_case_from_config(
       utils::kWidthPacked,
       DataGenType::ZEROS);
 
-  // Output zero_point tensor (int8) - [num_channels]
+  // Output zero_point tensor (float) - [num_channels]
   ValueSpec zero_point_out(
       {config.num_channels},
-      vkapi::kChar, // int8 for quantized zero point
+      vkapi::kFloat,
       utils::kTexture3D, // Always buffer as per requirement
       utils::kWidthPacked,
       DataGenType::ZEROS);
@@ -289,7 +289,7 @@ void choose_qparams_per_channel_reference_impl(TestCase& test_case) {
 
   // Prepare output data
   auto& scale_ref_data = scale_out_spec.get_ref_float_data();
-  auto& zero_point_ref_data = zero_point_out_spec.get_ref_int8_data();
+  auto& zero_point_ref_data = zero_point_out_spec.get_ref_float_data();
   scale_ref_data.resize(num_channels);
   zero_point_ref_data.resize(num_channels);
 
@@ -312,9 +312,9 @@ void choose_qparams_per_channel_reference_impl(TestCase& test_case) {
     calculate_scale_and_zero_point_reference(
         min_val, max_val, quant_min, quant_max, scale, zero_point);
 
-    // Store results (cast zero_point to int8)
+    // Store results
     scale_ref_data[channel] = scale;
-    zero_point_ref_data[channel] = static_cast<int8_t>(zero_point);
+    zero_point_ref_data[channel] = static_cast<float>(zero_point);
   }
 }
 
diff --git a/backends/vulkan/test/custom_ops/test_q4gsw_linear.cpp b/backends/vulkan/test/custom_ops/test_q4gsw_linear.cpp
@@ -97,7 +97,7 @@ TestCase create_test_case_from_config(
 
   ValueSpec input_zero_point(
       {1, config.M}, // Per-input channel tensor
-      vkapi::kChar,
+      vkapi::kFloat,
       storage_type,
       utils::kWidthPacked,
       DataGenType::RANDINT);
@@ -428,7 +428,7 @@ void linear_dq8ca_q4gsw_reference_impl(TestCase& test_case) {
   auto& input_scale_data =
       input_scale_spec.get_float_data(); // Per-input channel tensor
   auto& input_zero_point_data =
-      input_zeros_spec.get_int8_data(); // Per-input channel tensor
+      input_zeros_spec.get_float_data(); // Per-input channel tensor
 
   auto& weight_data = weight_spec.get_uint8_data();
   auto& weight_sums_data = weight_sums_spec.get_int32_data();
@@ -462,8 +462,8 @@ void linear_dq8ca_q4gsw_reference_impl(TestCase& test_case) {
 
         // Use per-input channel scale and zero point - index by batch dimension
         float input_scale = input_scale_data[b]; // {1, M} -> index by batch
-        int8_t input_zero_point =
-            input_zero_point_data[b]; // {1, M} -> index by batch
+        int8_t input_zero_point = static_cast<int8_t>(
+            input_zero_point_data[b]); // {1, M} -> index by batch
 
         float quant_input_f =
             std::round(input_data[input_idx] / input_scale) + input_zero_point;

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ layout(std430) buffer;`
`30`	`30`	`#include "common.glslh"`
`31`	`31`
`32`	`32`	`${layout_declare_tensor(B, "w", "t_scales", DTYPE, "texture3d")}`
`33`		`-${layout_declare_tensor(B, "w", "t_zps", "int8", "texture3d")}`
	`33`	`+${layout_declare_tensor(B, "w", "t_zps", "float", "texture3d")}`
`34`	`34`	`${layout_declare_tensor(B, "r", "t_input", DTYPE, STORAGE, is_scalar_array=False)}`
`35`	`35`
`36`	`36`	`${layout_declare_ubo(B, "ivec4", "input_sizes")}`
`@@ -196,7 +196,7 @@ void main() {`
`196`	`196`
`197`	`197`	`if (worker_id == 0) {`
`198`	`198`	`imageStore(t_scales, ivec3(output_y4, 0, 0), scales_out);`
`199`		`- imageStore(t_zps, ivec3(output_y4, 0, 0), zps_out);`
	`199`	`+ imageStore(t_zps, ivec3(output_y4, 0, 0), vec4(zps_out));`
`200`	`200`	`}`
`201`	`201`
`202`	`202`	`}`
Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,8 @@ void load_int8_input_scales_and_zps(`
`20`	`20`	`[[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) {`
`21`	`21`	`scales.data[m4] =`
`22`	`22`	`VEC4_T(texelFetch(t_int8_input_scales, ivec3(m4_start + m4, 0, 0), 0));`
`23`		`- zps.data[m4] = texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0);`
	`23`	`+ zps.data[m4] =`
	`24`	`+ ivec4(texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0));`
`24`	`25`	`}`
`25`	`26`	`}`
`26`	`27`