pytorch · SS-JIA · Jun 24, 2026 · Jun 24, 2026
@@ -30,7 +30,7 @@ layout(std430) buffer;
 #include "common.glslh"
 
 ${layout_declare_tensor(B, "w", "t_scales", DTYPE, "texture3d")}
-${layout_declare_tensor(B, "w", "t_zps", "int8", "texture3d")}
+${layout_declare_tensor(B, "w", "t_zps", DTYPE, "texture3d")}
 ${layout_declare_tensor(B, "r", "t_input", DTYPE, STORAGE, is_scalar_array=False)}
 
 ${layout_declare_ubo(B, "ivec4", "input_sizes")}
@@ -196,7 +196,7 @@ void main() {
 
   if (worker_id == 0) {
     imageStore(t_scales, ivec3(output_y4, 0, 0), scales_out);
-    imageStore(t_zps, ivec3(output_y4, 0, 0), zps_out);
+    imageStore(t_zps, ivec3(output_y4, 0, 0), VEC4_T(zps_out));
   }
 
 }
@@ -46,7 +46,7 @@ ${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=Fa
 ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INT8_INPUT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")}
-${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")}
+${layout_declare_tensor(B, "r", "t_int8_input_zps", DTYPE, "texture3d")}
 ${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}

@@ -20,7 +20,8 @@ void load_int8_input_scales_and_zps(
   [[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) {
     scales.data[m4] =
         VEC4_T(texelFetch(t_int8_input_scales, ivec3(m4_start + m4, 0, 0), 0));
-    zps.data[m4] = texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0);
+    zps.data[m4] =
+        ivec4(texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0));
   }
 }
 

@@ -40,7 +40,7 @@ $if DYNAMIC_QUANT_VARIANT:
   ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INPUT_STORAGE, is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_int_input_sums", "int", "buffer", is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_input_scale", DTYPE, "texture3d")}
-  ${layout_declare_tensor(B, "r", "t_input_zp", "int", "texture3d")}
+  ${layout_declare_tensor(B, "r", "t_input_zp", DTYPE, "texture3d")}
   ${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}

@@ -33,7 +33,7 @@ ${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is
 ${layout_declare_tensor(B, "w", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")}
-${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")}
+${layout_declare_tensor(B, "r", "t_int8_input_zps", DTYPE, "texture3d")}
 
 ${layout_declare_ubo(B, "ivec4", "input_sizes")}
 

@@ -78,6 +78,13 @@ utils::uvec3 pick_choose_qparams_per_row_local_wg_size(
   return {workers_per_output, outputs_per_wg, 1u};
 }
 
+// The per-token zero-point tensor is fp32-typed (matching torchao's serialized
+// asymmetric per-token zero_point_dtype=fp32), even though its values are
+// integer-valued in [-128, 127]. The shaders read it as a float texel and
+// convert to int for the integer dequant-correction. Declaring the shader
+// binding fp32 to match the tensor's allocation avoids the
+// float-image-read-through-an-integer-binding format mismatch that corrupted
+// negative zero-points on Mali.
 void add_choose_qparams_per_row_node(
     ComputeGraph& graph,
     const ValueRef& input,

@@ -44,7 +44,7 @@ TestCase create_test_case_from_config(
         config.channel_size < kRefDimSizeLimit);
   std::string prefix = is_perf ? "PERF" : "ACCU";
   std::string in_dtype = dtype_short(input_dtype);
-  std::string out_dtype = "f32,i8"; // pair: (scale, zero_point)
+  std::string out_dtype = "f32,f32"; // pair: (scale, zero_point)
   std::string shape_str = "[" + std::to_string(config.num_channels) + "," +
       std::to_string(config.channel_size) + "]";
   std::string storage_str = repr_str(storage_type, utils::kWidthPacked);
@@ -81,10 +81,10 @@ TestCase create_test_case_from_config(
       utils::kWidthPacked,
       DataGenType::ZEROS);
 
-  // Output zero_point tensor (int8) - [num_channels]
+  // Output zero_point tensor (float) - [num_channels]
   ValueSpec zero_point_out(
       {config.num_channels},
-      vkapi::kChar, // int8 for quantized zero point
+      vkapi::kFloat,
       utils::kTexture3D, // Always a 3D texture as per requirement
       utils::kWidthPacked,
       DataGenType::ZEROS);
@@ -289,7 +289,7 @@ void choose_qparams_per_channel_reference_impl(TestCase& test_case) {
 
   // Prepare output data
   auto& scale_ref_data = scale_out_spec.get_ref_float_data();
-  auto& zero_point_ref_data = zero_point_out_spec.get_ref_int8_data();
+  auto& zero_point_ref_data = zero_point_out_spec.get_ref_float_data();
   scale_ref_data.resize(num_channels);
   zero_point_ref_data.resize(num_channels);
 
@@ -312,9 +312,9 @@ void choose_qparams_per_channel_reference_impl(TestCase& test_case) {
     calculate_scale_and_zero_point_reference(
         min_val, max_val, quant_min, quant_max, scale, zero_point);
 
-    // Store results (cast zero_point to int8)
+    // Store results
     scale_ref_data[channel] = scale;
-    zero_point_ref_data[channel] = static_cast<int8_t>(zero_point);
+    zero_point_ref_data[channel] = static_cast<float>(zero_point);
   }
 }
 

@@ -97,7 +97,7 @@ TestCase create_test_case_from_config(
 
   ValueSpec input_zero_point(
       {1, config.M}, // Per-input channel tensor
-      vkapi::kChar,
+      vkapi::kFloat,
       storage_type,
       utils::kWidthPacked,
       DataGenType::RANDINT);
@@ -428,7 +428,7 @@ void linear_dq8ca_q4gsw_reference_impl(TestCase& test_case) {
   auto& input_scale_data =
       input_scale_spec.get_float_data(); // Per-input channel tensor
   auto& input_zero_point_data =
-      input_zeros_spec.get_int8_data(); // Per-input channel tensor
+      input_zeros_spec.get_float_data(); // Per-input channel tensor
 
   auto& weight_data = weight_spec.get_uint8_data();
   auto& weight_sums_data = weight_sums_spec.get_int32_data();
@@ -462,8 +462,8 @@ void linear_dq8ca_q4gsw_reference_impl(TestCase& test_case) {
 
         // Use per-input channel scale and zero point - index by batch dimension
         float input_scale = input_scale_data[b]; // {1, M} -> index by batch
-        int8_t input_zero_point =
-            input_zero_point_data[b]; // {1, M} -> index by batch
+        int8_t input_zero_point = static_cast<int8_t>(
+            input_zero_point_data[b]); // {1, M} -> index by batch
 
         float quant_input_f =
             std::round(input_data[input_idx] / input_scale) + input_zero_point;