Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ layout(std430) buffer;
#include "common.glslh"

${layout_declare_tensor(B, "w", "t_scales", DTYPE, "texture3d")}
${layout_declare_tensor(B, "w", "t_zps", "int8", "texture3d")}
${layout_declare_tensor(B, "w", "t_zps", DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_input", DTYPE, STORAGE, is_scalar_array=False)}

${layout_declare_ubo(B, "ivec4", "input_sizes")}
Expand Down Expand Up @@ -196,7 +196,7 @@ void main() {

if (worker_id == 0) {
imageStore(t_scales, ivec3(output_y4, 0, 0), scales_out);
imageStore(t_zps, ivec3(output_y4, 0, 0), zps_out);
imageStore(t_zps, ivec3(output_y4, 0, 0), VEC4_T(zps_out));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ ${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=Fa
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INT8_INPUT_STORAGE, is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")}
${layout_declare_tensor(B, "r", "t_int8_input_zps", DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ void load_int8_input_scales_and_zps(
[[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) {
scales.data[m4] =
VEC4_T(texelFetch(t_int8_input_scales, ivec3(m4_start + m4, 0, 0), 0));
zps.data[m4] = texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0);
zps.data[m4] =
ivec4(texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ $if DYNAMIC_QUANT_VARIANT:
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INPUT_STORAGE, is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_int_input_sums", "int", "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_input_scale", DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_input_zp", "int", "texture3d")}
${layout_declare_tensor(B, "r", "t_input_zp", DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ ${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is
${layout_declare_tensor(B, "w", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")}
${layout_declare_tensor(B, "r", "t_int8_input_zps", DTYPE, "texture3d")}

${layout_declare_ubo(B, "ivec4", "input_sizes")}

Expand Down
7 changes: 7 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ utils::uvec3 pick_choose_qparams_per_row_local_wg_size(
return {workers_per_output, outputs_per_wg, 1u};
}

// The per-token zero-point tensor is fp32-typed (matching torchao's serialized
// asymmetric per-token zero_point_dtype=fp32), even though it holds integer
// values in [-128, 127]. The shaders read it as a float texel and convert it
// to int for the integer dequant correction. Declaring the shader binding as
// fp32, to match the tensor's allocation, avoids reading a float image through
// an integer binding -- a format mismatch that corrupted negative zero-points
// on Mali.
void add_choose_qparams_per_row_node(
ComputeGraph& graph,
const ValueRef& input,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ TestCase create_test_case_from_config(
config.channel_size < kRefDimSizeLimit);
std::string prefix = is_perf ? "PERF" : "ACCU";
std::string in_dtype = dtype_short(input_dtype);
std::string out_dtype = "f32,i8"; // pair: (scale, zero_point)
std::string out_dtype = "f32,f32"; // pair: (scale, zero_point)
std::string shape_str = "[" + std::to_string(config.num_channels) + "," +
std::to_string(config.channel_size) + "]";
std::string storage_str = repr_str(storage_type, utils::kWidthPacked);
Expand Down Expand Up @@ -81,10 +81,10 @@ TestCase create_test_case_from_config(
utils::kWidthPacked,
DataGenType::ZEROS);

// Output zero_point tensor (int8) - [num_channels]
// Output zero_point tensor (float) - [num_channels]
ValueSpec zero_point_out(
{config.num_channels},
vkapi::kChar, // int8 for quantized zero point
vkapi::kFloat,
utils::kTexture3D, // Always texture3d as per requirement
utils::kWidthPacked,
DataGenType::ZEROS);
Expand Down Expand Up @@ -289,7 +289,7 @@ void choose_qparams_per_channel_reference_impl(TestCase& test_case) {

// Prepare output data
auto& scale_ref_data = scale_out_spec.get_ref_float_data();
auto& zero_point_ref_data = zero_point_out_spec.get_ref_int8_data();
auto& zero_point_ref_data = zero_point_out_spec.get_ref_float_data();
scale_ref_data.resize(num_channels);
zero_point_ref_data.resize(num_channels);

Expand All @@ -312,9 +312,9 @@ void choose_qparams_per_channel_reference_impl(TestCase& test_case) {
calculate_scale_and_zero_point_reference(
min_val, max_val, quant_min, quant_max, scale, zero_point);

// Store results (cast zero_point to int8)
// Store results
scale_ref_data[channel] = scale;
zero_point_ref_data[channel] = static_cast<int8_t>(zero_point);
zero_point_ref_data[channel] = static_cast<float>(zero_point);
}
}

Expand Down
8 changes: 4 additions & 4 deletions backends/vulkan/test/custom_ops/test_q4gsw_linear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ TestCase create_test_case_from_config(

ValueSpec input_zero_point(
{1, config.M}, // Per-input channel tensor
vkapi::kChar,
vkapi::kFloat,
storage_type,
utils::kWidthPacked,
DataGenType::RANDINT);
Expand Down Expand Up @@ -428,7 +428,7 @@ void linear_dq8ca_q4gsw_reference_impl(TestCase& test_case) {
auto& input_scale_data =
input_scale_spec.get_float_data(); // Per-input channel tensor
auto& input_zero_point_data =
input_zeros_spec.get_int8_data(); // Per-input channel tensor
input_zeros_spec.get_float_data(); // Per-input channel tensor

auto& weight_data = weight_spec.get_uint8_data();
auto& weight_sums_data = weight_sums_spec.get_int32_data();
Expand Down Expand Up @@ -462,8 +462,8 @@ void linear_dq8ca_q4gsw_reference_impl(TestCase& test_case) {

// Use per-input channel scale and zero point - index by batch dimension
float input_scale = input_scale_data[b]; // {1, M} -> index by batch
int8_t input_zero_point =
input_zero_point_data[b]; // {1, M} -> index by batch
int8_t input_zero_point = static_cast<int8_t>(
input_zero_point_data[b]); // {1, M} -> index by batch

float quant_input_f =
std::round(input_data[input_idx] / input_scale) + input_zero_point;
Expand Down
Loading