1 change: 0 additions & 1 deletion backends/vulkan/op_registry.py
@@ -510,7 +510,6 @@ def register_q8ta_add():
return OpFeatures(
inputs_storage=utils.PACKED_INT8_BUFFER,
supports_resize=False,
supports_prepacking=True,
)


61 changes: 41 additions & 20 deletions backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
@@ -334,45 +334,66 @@ TensorIndex linear_idx_to_tensor_idx(
/*
* Convert a linear texel index to a TensorIndex4D.
*
* This function is used for texel-based dispatch where each thread handles
* one packed texel (4 elements along the packed dimension). The texel index
* is decomposed using the dim_order and strides from the tensor's layout.
* This is the inverse of tensor4d_idx_to_texel_idx. It handles both
* single-packed layouts (outer_block_size == 1) and block-packed layouts
* (e.g., 4W4C where outer_block_size > 1).
*
* The strides in BufferMetadata should already be in texel space (with packed
* dimension size divided by 4).
The approach mirrors tensor4d_idx_to_texel_idx and decomposes the problem
into the following steps:
* 1. Decompose texel_idx into block_idx and intra-block texel offset
* 2. Decompose block_idx into block-space tensor coordinates using strides
* 3. Convert block-space coordinates to element-space by multiplying by
* block sizes
* 4. Add the intra-block outer-dimension offset
*
* For single-packed layouts (outer_block_size == 1, inner_dim == outer_dim),
* texels_per_block == 1, so block_idx == texel_idx and intra_block_texel == 0.
* The only effective multiplication is tidx[inner_dim] *= inner_block_size
* (i.e., *= 4), matching the previous single-packed behavior.
*
* Parameters:
* meta: BufferMetadata with tensor sizes and texel-space strides
* meta: BufferMetadata with block-space strides
* texel_idx: Linear index into packed texels (0 to num_texels-1)
* hashed_layout: Packed layout info containing dim_order and packed_dim
*
* Returns: TensorIndex4D with logical tensor coordinates (packed dim is base of 4-element block)
* Returns: TensorIndex4D with logical tensor coordinates (packed dims are
* base of their respective blocks)
*/
TensorIndex4D texel_idx_to_tensor4d_idx(
const BufferMetadata meta,
uint texel_idx,
const int hashed_layout) {
TensorIndex4D tidx;

const int packed_dim = get_packed_dim(hashed_layout);
const int inner_dim = get_packed_dim(hashed_layout);
const int outer_dim = get_outer_packed_dim(hashed_layout);
const int inner_block_size = get_packed_dim_block_size(hashed_layout);
const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout);

// Decompose texel_idx using dim_order from hashed_layout and strides from meta
// Iterate from slowest-varying dimension (d=3) to fastest (d=0)
// This follows the pattern of linear_idx_to_tensor_idx in indexing.glslh
// Number of texels per block: each block has inner_block_size *
// outer_block_size elements, and each texel holds 4 elements
const int texels_per_block = (inner_block_size * outer_block_size) / 4;

// Decompose texel_idx into block_idx and intra-block texel offset
const uint block_idx = texel_idx / texels_per_block;
const int intra_block_texel = int(texel_idx % texels_per_block);

// Decompose block_idx into block-space tensor coordinates using dim_order
// and strides. Iterate from slowest-varying (d=3) to fastest (d=0).
uint remaining = block_idx;
[[unroll]] for (int d = 3; d >= 0; d--) {
// Get dim index from hashed_layout's dim_order (bits 0-15)
int dim_idx = extract_4b(hashed_layout, d);

// Get stride for this dimension from BufferMetadata
uint dim_stride = meta.strides[0][dim_idx];

// Compute coordinate for this dimension
tidx.data[dim_idx] = int(texel_idx / dim_stride);
texel_idx = texel_idx % dim_stride;
tidx.data[dim_idx] = int(remaining / dim_stride);
remaining = remaining % dim_stride;
}

// Convert packed dimension from texel index to element index
tidx.data[packed_dim] *= 4;
// Convert block-space coordinates to element-space
tidx.data[inner_dim] *= inner_block_size;
tidx.data[outer_dim] *= outer_block_size;

// Add intra-block outer-dimension offset
tidx.data[outer_dim] += intra_block_texel;

return tidx;
}
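
As a reviewing aid, here is a minimal Python sketch of the decomposition described in the comment above; the layout, sizes, and strides are hypothetical (a 4W4C block-packed WHCN tensor), not values taken from this change.

def texel_idx_to_tensor4d_idx(texel_idx, dim_order, block_strides,
                              inner_dim, outer_dim,
                              inner_block_size, outer_block_size):
    texels_per_block = (inner_block_size * outer_block_size) // 4
    block_idx, intra_block_texel = divmod(texel_idx, texels_per_block)

    # Decompose block_idx into block-space coordinates, slowest-varying dim first.
    tidx = [0, 0, 0, 0]
    remaining = block_idx
    for d in range(3, -1, -1):
        dim = dim_order[d]
        tidx[dim], remaining = divmod(remaining, block_strides[dim])

    # Block-space -> element-space, then add the intra-block outer-dim offset.
    tidx[inner_dim] *= inner_block_size
    tidx[outer_dim] *= outer_block_size
    tidx[outer_dim] += intra_block_texel
    return tidx

# Hypothetical 4W4C example: W=8, H=2, C=8, N=1 in WHCN order, so block-space
# sizes are (2, 2, 2, 1) and block-space strides are (1, 2, 4, 8).
print(texel_idx_to_tensor4d_idx(
    13, dim_order=[0, 1, 2, 3], block_strides=[1, 2, 4, 8],
    inner_dim=0, outer_dim=2, inner_block_size=4, outer_block_size=4))
# -> [4, 1, 1, 0], i.e. W=4, H=1, C=1, N=0: the base of this texel's 4-element
# run along W. With outer_block_size == 1, texels_per_block == 1 and the
# intra-block offset is always 0, matching the single-packed case described above.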
@@ -0,0 +1,80 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

${define_active_storage_type("buffer")}

layout(std430) buffer;

#include "indexing.glslh"

// Output buffer: packed int8x4 values (each int32 contains 4 packed int8)
${layout_declare_tensor(B, "w", "t_outp", "int", "buffer")}
// Input staging buffer: raw int8 data interpreted as int32 for device compat
${layout_declare_tensor(B, "r", "nchw_in", "int", "buffer")}

// Metadata for output tensor
${layout_declare_ubo(B, "BufferMetadata", "outp")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}

void main() {
const uint texel_idx = gl_GlobalInvocationID.x;
const uint num_texels = numel(outp) / 4;
if (texel_idx >= num_texels) {
return;
}

const int inner_dim = get_packed_dim(outp_layout);
const int outer_dim = get_outer_packed_dim(outp_layout);

const TensorIndex4D tidx =
texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout);

// Bounds check on outer dimension
if (tidx.data[outer_dim] >= int(outp.sizes[0][outer_dim])) {
return;
}

// Tensor sizes in WHCN order for NCHW contiguous index computation
const uint W = outp.sizes[0][0];
const uint H = outp.sizes[0][1];
const uint C = outp.sizes[0][2];

// Pack 4 int8 values along inner dimension into one int32
int packed = 0;
[[unroll]] for (int i = 0; i < 4; ++i) {
const int elem_inner = tidx.data[inner_dim] + i;
if (elem_inner >= int(outp.sizes[0][inner_dim])) {
break;
}

// Build element coordinates
ivec4 elem = tidx.data;
elem[inner_dim] = elem_inner;

// Compute NCHW contiguous index: w + h*W + c*H*W + n*C*H*W
const uint nchw_idx = uint(elem[0]) + uint(elem[1]) * W +
uint(elem[2]) * H * W + uint(elem[3]) * C * H * W;

// Read int8 from staging buffer (each int32 contains 4 bytes)
const uint int_idx = nchw_idx >> 2;
const uint byte_pos = nchw_idx & 3;
const int staging_val = nchw_in[int_idx];
const int byte_val = (staging_val >> (byte_pos * 8)) & 0xFF;

packed |= (byte_val << (i * 8));
}

t_outp[texel_idx] = packed;
}
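
To make the packing concrete, a rough NumPy sketch of what this shader computes in the simplest case, a contiguous width-packed layout with no padding; the helper name and test sizes are illustrative assumptions, not code from this change.

import numpy as np

def pack_nchw_int8_width(x):
    # x: int8 tensor of shape (N, C, H, W) with W % 4 == 0; each group of 4
    # consecutive elements along W becomes one packed int32 texel.
    flat = x.reshape(-1, 4).astype(np.uint32) & 0xFF
    packed = (flat[:, 0]
              | (flat[:, 1] << 8)
              | (flat[:, 2] << 16)
              | (flat[:, 3] << 24)).view(np.int32)
    return packed

rng = np.random.default_rng(0)
x = rng.integers(-128, 128, size=(2, 3, 4, 8), dtype=np.int8)
texels = pack_nchw_int8_width(x)

# Mirrors the shader's byte extraction: byte 2 of texel 5 is element 5*4 + 2
# of the flattened NCHW buffer.
assert (int(texels[5]) >> 16) & 0xFF == int(x.reshape(-1)[5 * 4 + 2]) & 0xFF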
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

nchw_to_int8x4_buffer:
parameter_names_with_default_values:
DTYPE: int
shader_variants:
- NAME: nchw_to_int8x4_buffer
49 changes: 49 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
@@ -0,0 +1,49 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

void add_staging_to_int8x4_buffer_node(
ComputeGraph& graph,
const ValueRef tensor_data,
const ValueRef tensor) {
VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4);

std::string kernel_name = "nchw_to_int8x4_buffer";

vkapi::ParamsBindList param_buffers;
param_buffers.append(graph.buffer_meta_ubo(tensor));

// One thread per texel (each texel = one int32 = 4 packed int8).
// Use padded_numel to account for dimension padding in packed int8 layouts
// (e.g., kPackedInt8_4C with C=3 pads to C=4).
uint32_t num_texels =
utils::safe_downcast<uint32_t>(graph.padded_numel_of(tensor) / 4);
utils::uvec3 global_wg_size = {num_texels, 1, 1};
utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);

graph.prepack_nodes().emplace_back(new PrepackNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
global_wg_size,
local_wg_size,
// Input and Output
tensor_data,
tensor,
// Parameter Buffers
param_buffers,
// Specialization Constants
{graph.hashed_layout_of(tensor)}));
}

} // namespace vkcompute
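
The padded_numel comment in add_staging_to_int8x4_buffer_node can be illustrated with hypothetical sizes (not taken from this change):

N, C, H, W = 1, 3, 16, 16
padded_C = ((C + 3) // 4) * 4            # kPackedInt8_4C pads C=3 up to 4
numel = N * C * H * W                    # 768
padded_numel = N * padded_C * H * W      # 1024
num_texels = padded_numel // 4           # 256 threads, one int32 texel each
# Dispatching numel // 4 == 192 threads would leave the padded texels unwritten.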
20 changes: 20 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h
@@ -0,0 +1,20 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

namespace vkcompute {

void add_staging_to_int8x4_buffer_node(
ComputeGraph& graph,
const ValueRef tensor_data,
const ValueRef tensor);

} // namespace vkcompute
4 changes: 4 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -12,6 +12,7 @@

#include <executorch/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>

@@ -327,6 +328,9 @@ ValueRef prepack_int4_linear_weight_transposed_interleaved(
}

void prepack_op(ComputeGraph& graph, const std::vector<ValueRef>& args) {
if (graph.dtype_of(args[1]) == vkapi::kInt8x4) {
return add_staging_to_int8x4_buffer_node(graph, args[0], args[1]);
}
return add_prepack_standard_node(graph, args[0], args[1]);
}

3 changes: 3 additions & 0 deletions backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
@@ -64,6 +64,9 @@ void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype) {
case vkapi::kUInt64:
kernel_name += "_uint64";
break;
case vkapi::kInt8x4:
kernel_name += "_int32";
break;
default:
break;
}
30 changes: 19 additions & 11 deletions backends/vulkan/test/custom_ops/impl/TestQ8taBinary.cpp
@@ -10,13 +10,14 @@

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taQuantizeDequantize.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>

namespace vkcompute {

void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
int32_t idx = 0;
const ValueRef fp_input_a = args.at(idx++);
const ValueRef fp_input_b = args.at(idx++);
ValueRef fp_input_a = args.at(idx++);
ValueRef input_b = args.at(idx++);
const ValueRef input_a_scale = args.at(idx++);
const ValueRef input_a_zp = args.at(idx++);
const ValueRef input_b_scale = args.at(idx++);
@@ -32,6 +33,10 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
utils::GPUMemoryLayout quant_layout =
static_cast<utils::GPUMemoryLayout>(layout_value);

// Check if input_b is a pre-quantized int8 TensorRef
bool input_b_is_int8 =
graph.val_is_tref(input_b) && graph.dtype_of(input_b) == vkapi::kChar;

// Create temporary tensors for quantized data with the specified layout
TmpTensor packed_int8_input_a(
&graph,
@@ -40,12 +45,8 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
utils::kBuffer,
quant_layout);

TmpTensor packed_int8_input_b(
&graph,
graph.sizes_of(fp_input_b),
vkapi::kInt8x4,
utils::kBuffer,
quant_layout);
ValueRef packed_int8_input_b = graph.add_tensor(
graph.sizes_of(input_b), vkapi::kInt8x4, utils::kBuffer, quant_layout);

TmpTensor packed_int8_output(
&graph,
@@ -54,12 +55,19 @@ void q8ta_add_test(ComputeGraph& graph, const std::vector<ValueRef>& args) {
utils::kBuffer,
quant_layout);

// Quantize: FP -> int8x4 with specified layout
// Quantize input A: FP -> int8x4
add_q8ta_quantize_node(
graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a);

add_q8ta_quantize_node(
graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b);
if (input_b_is_int8) {
// Input B is a pre-quantized int8 TensorRef; prepack directly into packed
// int8x4 format
add_staging_to_int8x4_buffer_node(graph, input_b, packed_int8_input_b);
} else {
// Input B is a float tensor; quantize at runtime
add_q8ta_quantize_node(
graph, input_b, input_b_scale, input_b_zp, packed_int8_input_b);
}

// Binary add: int8x4 -> int8x4 (same layout for all tensors)
add_q8ta_binary_node(