Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/plugins/intel_gpu/src/graph/non_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ GPU_DEFINE_PRIMITIVE_TYPE_ID(count_nonzero)
layout count_nonzero_inst::calc_output_layout(count_nonzero_node const& node, kernel_impl_params const& impl_param) {
    assert(static_cast<bool>(node.get_primitive()->output_data_types[0]) == false &&
           "Output data type forcing is not supported for count_nonzero_node!");
    // Element 0 holds the total nonzero count; the remaining slots carry the
    // per-work-item inclusive prefix sums consumed by gather_nonzero.  The 1024
    // size must stay in sync with the work-group cap in the kernel dispatch code
    // (see the FixMe in CountNonzeroKernelRef::SetDefault) — TODO: centralize the
    // constant so the layout and dispatch size cannot drift apart.
    return layout{cldnn::data_types::i32, cldnn::format::bfyx, tensor{1, 1, 1, 1024}};
}

template<typename ShapeType>
std::vector<layout> count_nonzero_inst::calc_output_layouts(count_nonzero_node const& /*node*/, kernel_impl_params const& impl_param) {
    assert(static_cast<bool>(impl_param.desc->output_data_types[0]) == false &&
           "Output data type forcing is not supported for count_nonzero_node!");
    // Shape {1024} mirrors calc_output_layout: slot 0 is the nonzero count and
    // the rest are per-work-item prefix sums; keep in sync with the 1024
    // work-item cap applied in CountNonzeroKernelRef::SetDefault.
    return {layout{ov::PartialShape{1024}, cldnn::data_types::i32, cldnn::format::bfyx}};
}
Comment on lines 19 to 30
Copy link

Copilot AI Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] count_nonzero now advertises a fixed 1024-element output buffer. This hard-coded magic size is duplicated across graph layout and kernel dispatch (and will increase host-side reads like read_vector(...) that only need element 0). Consider centralizing the constant (or deriving the output layout from the chosen dispatch size) to avoid drift and to keep shape-inference overhead minimal.

Copilot uses AI. Check for mistakes.

std::string count_nonzero_inst::to_string(count_nonzero_node const& node) {
Expand All @@ -38,7 +38,7 @@ std::string count_nonzero_inst::to_string(count_nonzero_node const& node) {

json_composite count_nonzero_info;
count_nonzero_info.add("input id", input.id());
count_nonzero_info.add("output shape", tensor{1, 1, 1, 4});
count_nonzero_info.add("output shape", tensor{1, 1, 1, 1024});

node_info->add("count_nonzero info", count_nonzero_info);
node_info->dump(primitive_description);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@ KERNEL (count_nonzero_ref)(
workitem_nonzero_count += count;
}

sub_group_barrier(CLK_LOCAL_MEM_FENCE);
uint subgroup_nonzero_count = sub_group_reduce_add(workitem_nonzero_count);
barrier(CLK_LOCAL_MEM_FENCE);
uint workitem_offset = work_group_scan_inclusive_add(workitem_nonzero_count);

barrier(CLK_GLOBAL_MEM_FENCE);
if (get_sub_group_local_id() == 0)
atomic_add(&(output[0]), subgroup_nonzero_count);
output[local_idx == num_work_items - 1 ? 0 : local_idx + 1] = workitem_offset;
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,21 @@ KERNEL (gather_nonzero_ref)(
volatile __global INPUT1_TYPE* output_shape,
__global OUTPUT_TYPE* output)
{
int local_offset = 0;
const uint local_idx = get_local_id(0);
const uint num_work_items = get_global_size(0);
const uint data_size = TOTAL_DATA_SIZE;
const uint items_num = data_size / num_work_items;
const uint leftovers = data_size - (items_num * num_work_items);

uint actual_items_num = items_num;
uint input_idx = (actual_items_num * local_idx) + leftovers;
if (local_idx < leftovers) {
actual_items_num = items_num + 1;
input_idx = actual_items_num * local_idx;
}
uint final_item = input_idx + actual_items_num;

int local_offset = local_idx == 0 ? 0 : output_shape[local_idx];
const int result_size = OV_INPUT_RANK * OUTPUT_FEATURE_NUM; // output shape: [ov_rank, count_nonzero]

OUTPUT_TYPE* out_mem;
Expand Down Expand Up @@ -115,10 +129,9 @@ KERNEL (gather_nonzero_ref)(
out_mem[x_pos] = x; \
local_offset++;
#endif
int input_idx = 0;
int global_output_offset = 0;

// load to local mem
for (; input_idx + VSIZE <= TOTAL_DATA_SIZE; input_idx += VSIZE) {
for (; input_idx + VSIZE <= final_item; input_idx += VSIZE) {
MAKE_VECTOR_TYPE(INPUT0_TYPE, VSIZE) inputs = VLOAD(0, input + input_idx);
for (int v = 0; v < VSIZE; ++v) {
int input_idx_v = input_idx + v;
Expand All @@ -129,15 +142,17 @@ KERNEL (gather_nonzero_ref)(
}

// leftovers
for (;input_idx < TOTAL_DATA_SIZE; ++input_idx) {
for (;input_idx < final_item; ++input_idx) {
int input_idx_v = input_idx;
int v = 0;
if (input[input_idx] != INPUT0_VAL_ZERO) {
ADD_IDXS;
}
}

if (use_local_mem) {
barrier(CLK_LOCAL_MEM_FENCE);
int global_output_offset = 0;
if (use_local_mem && local_idx == 0) {
// write back to global mem
int local_out_iter = 0;
for (; local_out_iter + VSIZE < result_size; local_out_iter += VSIZE) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ CountNonzeroKernelRef::DispatchData CountNonzeroKernelRef::SetDefault(const coun
dispatchData.dataSize = input.LogicalSize();
size_t max_dim_size = (dispatchData.dataSize > params.engineInfo.maxWorkGroupSize) ?
params.engineInfo.maxWorkGroupSize : dispatchData.dataSize;
// FixMe: This limit is created by the presence of a defined API between count_nonzero
// and gather_nonzero. Ideally, both need to be refactored into a single multikernel
// implementation
max_dim_size = std::min(max_dim_size, (size_t)1024);
dispatchData.lws = dispatchData.gws = { max_dim_size, 1, 1};

return dispatchData;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,15 @@ JitConstants GatherNonzeroKernelRef::GetJitConstants(const gather_nonzero_params

CommonDispatchData GatherNonzeroKernelRef::SetDefault(const gather_nonzero_params& params) const {
CommonDispatchData dispatchData;
const auto& input = params.inputs[0];

dispatchData.gws = {1, 1, 1};
dispatchData.lws = {1, 1, 1};
// Set a single work-group to avoid synchronization issues when summing the nonzero counts.
size_t max_dim_size = (input.LogicalSize() > params.engineInfo.maxWorkGroupSize) ? params.engineInfo.maxWorkGroupSize : input.LogicalSize();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: std::min(params.engineInfo.maxWorkGroupSize, input.LogicalSize())

// FixMe: This limit is created by the presence of a defined API between count_nonzero
// and gather_nonzero. Ideally, both need to be refactored into a single multikernel
// implementation
max_dim_size = std::min(max_dim_size, (size_t)1024);
dispatchData.lws = dispatchData.gws = {max_dim_size, 1, 1};

return dispatchData;
}
Expand Down
146 changes: 0 additions & 146 deletions src/plugins/intel_gpu/tests/unit/test_cases/non_zero_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,118 +159,6 @@ TEST(test_count_non_zero, dynamic_2d_f32_bfyx) {
}
}

template<typename T>
void test_gather_non_zero(layout in_layout, std::vector<T> in_data) {
    // Runs the gather_nonzero primitive on `in_data` and checks the produced
    // index list against the OpenVINO reference implementation.
    auto& engine = get_test_engine();
    auto input_mem = engine.allocate_memory(in_layout);
    auto count_non_zero = ov::reference::non_zero_get_count<T>(in_data.data(), in_layout.get_shape());
    auto in_rank = in_layout.get_shape().size();
    // Reference output is laid out as [rank, count] in row-major order.
    std::vector<int32_t> expected_results(count_non_zero * in_rank);
    ov::reference::non_zero<T, int32_t>(in_data.data(), expected_results.data(), in_layout.get_shape());

    auto output_shape_layout = layout{ov::PartialShape{1}, data_types::i32, format::bfyx};
    auto output_shape_mem = engine.allocate_memory(output_shape_layout);
    set_values(input_mem, in_data);

    // gather_nonzero consumes the precomputed nonzero count via its second input.
    std::vector<int32_t> output_shape_data = {(int32_t)count_non_zero};
    set_values(output_shape_mem, output_shape_data);

    topology topology;
    topology.add(input_layout("InputData", in_layout));
    topology.add(data("OutputShape", output_shape_mem));
    topology.add(
        gather_nonzero("gather_nonzero", input_info("InputData"), input_info("OutputShape"))
    );

    network network(engine, topology, get_test_default_config(engine));

    network.set_input_data("InputData", input_mem);
    auto outputs = network.execute();
    auto output = outputs.at("gather_nonzero").get_memory();
    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
    // NOTE: the unused mem_lock on output_shape_mem was removed — nothing read it.

    for (size_t i = 0; i < expected_results.size(); ++i) {
        ASSERT_EQ(expected_results[i], output_ptr[i]);
    }
}

TEST(test_gather_non_zero, 4d_fp32_1_3_3_1) {
    // 1x3x3x1 fp32 input: 9 values, two of them zero.
    std::vector<float> values = {
        0.1f, 0.2f, 0.3f,
        0.0f, 0.0f, 0.4f,
        0.1f, 0.9f, 0.10f,
    };
    auto in_layout = layout{ov::PartialShape{1, 3, 3, 1}, data_types::f32, format::bfyx};
    test_gather_non_zero<float>(in_layout, values);
}

TEST(test_gather_non_zero, 4d_fp32_2_4_3_2) {
    // 2x4x3x2 fp32 input: 48 values mixing zeros and nonzeros.
    std::vector<float> values = {
        0.1f, 0.2f, 0.3f, 0.0f, 12.0f, 2.0f,
        0.4f, 0.1f, 1.9f, 0.10f, 1.0f, 0.0f,
        0.1f, 0.2f, 0.0f, 100.0f, 0.0001f, 0.0f,
        2.9f, 0.2f, 4.0f, 0.0f, 9.1f, 0.9f,
        100.0f, 0.4f, 0.1f, 0.3f, 0.0f, 24.2f,
        1.23f, 0.0f, 4.0f, 0.0f, 3.1f, 0.9f,
        0.10f, 49.2f, 0.0f, 0.3f, 100.0f, 0.4f,
        0.1f, 0.9f, 0.1f, 33.12f, 12.1f, 0.0001f,
    };
    auto in_layout = layout{ov::PartialShape{2, 4, 3, 2}, data_types::f32, format::bfyx};
    test_gather_non_zero<float>(in_layout, values);
}
TEST(test_gather_non_zero, 4d_fp16_2_4_3_2) {
    // Same 2x4x3x2 data as the fp32 case, exercised through ov::float16.
    std::vector<ov::float16> values = {
        0.1f, 0.2f, 0.3f, 0.0f, 12.0f, 2.0f,
        0.4f, 0.1f, 1.9f, 0.10f, 1.0f, 0.0f,
        0.1f, 0.2f, 0.0f, 100.0f, 0.0001f, 0.0f,
        2.9f, 0.2f, 4.0f, 0.0f, 9.1f, 0.9f,
        100.0f, 0.4f, 0.1f, 0.3f, 0.0f, 24.2f,
        1.23f, 0.0f, 4.0f, 0.0f, 3.1f, 0.9f,
        0.10f, 49.2f, 0.0f, 0.3f, 100.0f, 0.4f,
        0.1f, 0.9f, 0.1f, 33.12f, 12.1f, 0.0001f,
    };
    auto in_layout = layout{ov::PartialShape{2, 4, 3, 2}, data_types::f16, format::bfyx};
    test_gather_non_zero<ov::float16>(in_layout, values);
}

// Renamed from 5d_fp32_1_3_3_2_2: the name claimed shape {1,3,3,2,2} (36 elements)
// but the test actually uses {1,3,4,2,2} (48 elements, matching the data below).
TEST(test_gather_non_zero, 5d_fp32_1_3_4_2_2) {
    std::vector<float> in_data = {
        0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
        0.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f,
        8.0f, 3.0f, 0.1f, 0.00001f, 0.10f, 0.001f,
        0.1f, -0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
        0.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f,
        0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
        8.0f, 3.0f, 0.1f, 0.00001f, 0.10f, 0.001f,
        0.1f, -0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
    };
    test_gather_non_zero<float>(layout{ov::PartialShape{1, 3, 4, 2, 2}, data_types::f32, format::bfzyx}, in_data);
}

// Renamed from 6d_fp16_2_3_1_3_2_4: the name said fp16 but the test uses
// std::vector<float> with data_types::f32 throughout.
TEST(test_gather_non_zero, 6d_fp32_2_3_1_3_2_4) {
    std::vector<float> in_data = {
        0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
        1.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f,
        0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
        19.0f, 0.0f, 0.1f, 0.9f, 0.10f, -0.001f,
        0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
        8.0f, 3.0f, 0.1f, 0.00001f, 0.10f, 0.001f,
        0.1f, -0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
        13.0f, 1.0f, 0.1f, 0.9f, 0.10f, 0.001f,
        11.1f, 0.2f, 0.3f, 66.0f, 12.1f, 11.1f,
        0.0f, 0.0001f, 0.1f, 0.9f, 0.10f, 0.001f,
        0.1f, 0.2f, 0.3f, 2.0f, 12.1f, 11.1f,
        0.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f,
        0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
        -13.0f, 1.0f, 0.1f, 0.9f, 0.10f, 0.001f,
        0.1f, 0.2f, 0.3f, 66.0f, 12.1f, 11.1f,
        0.0f, 0.001f, 0.1f, 0.9f, 0.10f, 0.001f,
        0.1f, 0.2f, 0.3f, 2.0f, 12.1f, 11.1f,
        0.1f, 1.2f, 0.3f, 99.0f, 12.1f, 11.1f,
        100.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f,
        0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f,
        13.0f, 1.0f, 0.1f, 0.9f, -0.10f, 0.001f,
        0.1f, 0.2f, 0.3f, 66.0f, 12.1f, 11.1f,
        0.0f, 0.0001f, 0.1f, 0.9f, 0.10f, 0.001f,
        0.1f, 0.2f, 0.3f, 2.0f, 12.1f, 11.1f,
    };
    test_gather_non_zero<float>(layout{ov::PartialShape{2, 3, 1, 3, 2, 4}, data_types::f32, format::bfwzyx}, in_data);
}

TEST(non_zero_gpu, dynamic) {
auto& engine = get_test_engine();
Comment on lines 159 to 163
Copy link

Copilot AI Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[HIGH] The dedicated gather_nonzero unit tests were removed, but this PR changes the internal count→gather contract and introduces a new work partitioning/prefix-sum scheme (including the 1024 work-item cap). Existing non_zero tests here are relatively small and don’t exercise the >1024-element path or the non-local-memory path that used to be covered. Please add a regression test that validates NonZero correctness for an input size >1024 (and ideally another case that forces USE_LOCAL_MEM off) to prevent silent wrong-index regressions.

Copilot generated this review using guidance from repository custom instructions.
ov::Shape in_shape = { 3, 3 };
Expand Down Expand Up @@ -452,40 +340,6 @@ TEST(test_non_zero, 6d_fp16_2_2_2_1_5_1) {
test_non_zero<int32_t>(layout{ov::PartialShape{2, 2, 2, 1, 5, 1}, data_types::i32, format::bfwzyx}, in_data);
}

TEST(test_gather_non_zero, not_use_local_mem) {
    // Size the input to the device's max local memory so the kernel cannot keep
    // the result in local memory and must take the global-memory path.
    auto& engine = get_test_engine();
    const auto elem_count = engine.get_device_info().max_local_mem_size;

    auto in_layout = layout{ov::PartialShape{ov::Dimension(elem_count)}, data_types::f32, format::bfyx};
    auto in_mem = engine.allocate_memory(in_layout);
    auto ones = std::vector<float>(elem_count, 1.f);
    set_values(in_mem, ones);

    auto shape_layout = layout{ov::PartialShape{1}, data_types::i32, format::bfyx};
    auto shape_mem = engine.allocate_memory(shape_layout);
    set_values(shape_mem, {static_cast<int32_t>(elem_count)});

    topology topology;
    topology.add(input_layout("input", in_layout));
    topology.add(data("output_shape", shape_mem));
    topology.add(gather_nonzero("gather_nonzero", input_info("input"), input_info("output_shape")));

    network network(engine, topology, get_test_default_config(engine));
    network.set_input_data("input", in_mem);

    auto outputs = network.execute();
    auto out_mem = outputs.at("gather_nonzero").get_memory();
    cldnn::mem_lock<int32_t> out_ptr(out_mem, get_test_stream());

    // Every element is nonzero, so the reference produces indices 0..N-1.
    std::vector<int32_t> expected(elem_count);
    ov::reference::non_zero<float, int32_t>(ones.data(), expected.data(), in_layout.get_shape());

    for (size_t i = 0; i < expected.size(); ++i) {
        ASSERT_EQ(expected[i], out_ptr[i]);
    }
}

TEST(non_zero_gpu, const_input) {
auto& engine = get_test_engine();
ov::Shape in_shape = { 3, 3 };
Expand Down
Loading