-
Notifications
You must be signed in to change notification settings - Fork 3.2k
[GPU] Alter reference NonZero implementation to parallelize counting #35157
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
472c3b2
ba5be7a
90f4a3d
a62e10d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -60,9 +60,15 @@ JitConstants GatherNonzeroKernelRef::GetJitConstants(const gather_nonzero_params | |
|
|
||
| CommonDispatchData GatherNonzeroKernelRef::SetDefault(const gather_nonzero_params& params) const { | ||
| CommonDispatchData dispatchData; | ||
| const auto& input = params.inputs[0]; | ||
|
|
||
| dispatchData.gws = {1, 1, 1}; | ||
| dispatchData.lws = {1, 1, 1}; | ||
| // Set 1 work group to avoid synchronization issue for summation of nonzero counting. | ||
| size_t max_dim_size = (input.LogicalSize() > params.engineInfo.maxWorkGroupSize) ? params.engineInfo.maxWorkGroupSize : input.LogicalSize(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: |
||
| // FixMe: This limit is created by the presence of a defined API between count_nonzero | ||
| // and gather_nonzero. Ideally, both need to be refactored into a single multikernel | ||
| // implementation | ||
| max_dim_size = std::min(max_dim_size, (size_t)1024); | ||
| dispatchData.lws = dispatchData.gws = {max_dim_size, 1, 1}; | ||
|
|
||
| return dispatchData; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -159,118 +159,6 @@ TEST(test_count_non_zero, dynamic_2d_f32_bfyx) { | |
| } | ||
| } | ||
|
|
||
| template<typename T> | ||
| void test_gather_non_zero(layout in_layout, std::vector<T> in_data) { | ||
| auto& engine = get_test_engine(); | ||
| auto input_mem = engine.allocate_memory(in_layout); | ||
| auto count_non_zero = ov::reference::non_zero_get_count<T>(in_data.data(), in_layout.get_shape()); | ||
| auto in_rank = in_layout.get_shape().size(); | ||
| std::vector<int32_t> expected_results(count_non_zero * in_rank); | ||
| ov::reference::non_zero<T, int32_t>(in_data.data(), expected_results.data(), in_layout.get_shape()); | ||
|
|
||
| auto output_shape_layout = layout{ov::PartialShape{1}, data_types::i32, format::bfyx}; | ||
| auto output_shape_mem = engine.allocate_memory(output_shape_layout); | ||
| set_values(input_mem, in_data); | ||
|
|
||
| std::vector<int32_t> output_shape_data = {(int32_t)count_non_zero}; | ||
|
|
||
| set_values(output_shape_mem, output_shape_data); | ||
|
|
||
| topology topology; | ||
| topology.add(input_layout("InputData", in_layout)); | ||
| topology.add(data("OutputShape", output_shape_mem)); | ||
| topology.add( | ||
| gather_nonzero("gather_nonzero", input_info("InputData"), input_info("OutputShape")) | ||
| ); | ||
|
|
||
| network network(engine, topology, get_test_default_config(engine)); | ||
|
|
||
| network.set_input_data("InputData", input_mem); | ||
| auto outputs = network.execute(); | ||
| auto output = outputs.at("gather_nonzero").get_memory(); | ||
| cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream()); | ||
| cldnn::mem_lock<int32_t> shape_ptr(output_shape_mem, get_test_stream()); | ||
|
|
||
| for (size_t i = 0; i < expected_results.size(); ++i) { | ||
| ASSERT_EQ(expected_results[i], output_ptr[i]); | ||
| } | ||
| } | ||
|
|
||
| TEST(test_gather_non_zero, 4d_fp32_1_3_3_1) { | ||
| std::vector<float> in_data = { | ||
| 0.1f, 0.2f, 0.3f, 0.0f, | ||
| 0.0f, 0.4f, 0.1f, 0.9f, 0.10f | ||
| }; | ||
| test_gather_non_zero<float>(layout{ov::PartialShape{1, 3, 3, 1}, data_types::f32, format::bfyx}, in_data); | ||
| } | ||
|
|
||
| TEST(test_gather_non_zero, 4d_fp32_2_4_3_2) { | ||
| std::vector<float> in_data = { | ||
| 0.1f, 0.2f, 0.3f, 0.0f, 12.0f, 2.0f, 0.4f, 0.1f, | ||
| 1.9f, 0.10f, 1.0f, 0.0f, 0.1f, 0.2f, 0.0f, 100.0f, | ||
| 0.0001f, 0.0f, 2.9f, 0.2f, 4.0f, 0.0f, 9.1f, 0.9f, | ||
| 100.0f, 0.4f, 0.1f, 0.3f, 0.0f, 24.2f, 1.23f, 0.0f, | ||
| 4.0f, 0.0f, 3.1f, 0.9f, 0.10f, 49.2f, 0.0f, 0.3f, | ||
| 100.0f, 0.4f, 0.1f, 0.9f, 0.1f, 33.12f, 12.1f, 0.0001f | ||
| }; | ||
| test_gather_non_zero<float>(layout{ov::PartialShape{2, 4, 3, 2}, data_types::f32, format::bfyx}, in_data); | ||
| } | ||
| TEST(test_gather_non_zero, 4d_fp16_2_4_3_2) { | ||
| std::vector<ov::float16> in_data = { | ||
| 0.1f, 0.2f, 0.3f, 0.0f, 12.0f, 2.0f, 0.4f, 0.1f, | ||
| 1.9f, 0.10f, 1.0f, 0.0f, 0.1f, 0.2f, 0.0f, 100.0f, | ||
| 0.0001f, 0.0f, 2.9f, 0.2f, 4.0f, 0.0f, 9.1f, 0.9f, | ||
| 100.0f, 0.4f, 0.1f, 0.3f, 0.0f, 24.2f, 1.23f, 0.0f, | ||
| 4.0f, 0.0f, 3.1f, 0.9f, 0.10f, 49.2f, 0.0f, 0.3f, | ||
| 100.0f, 0.4f, 0.1f, 0.9f, 0.1f, 33.12f, 12.1f, 0.0001f | ||
| }; | ||
| test_gather_non_zero<ov::float16>(layout{ov::PartialShape{2, 4, 3, 2}, data_types::f16, format::bfyx}, in_data); | ||
| } | ||
|
|
||
| TEST(test_gather_non_zero, 5d_fp32_1_3_3_2_2) { | ||
| std::vector<float> in_data = { | ||
| 0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| 0.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 8.0f, 3.0f, 0.1f, 0.00001f, 0.10f, 0.001f, | ||
| 0.1f, -0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| 0.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| 8.0f, 3.0f, 0.1f, 0.00001f, 0.10f, 0.001f, | ||
| 0.1f, -0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| }; | ||
| test_gather_non_zero<float>(layout{ov::PartialShape{1, 3, 4, 2, 2}, data_types::f32, format::bfzyx}, in_data); | ||
| } | ||
|
|
||
| TEST(test_gather_non_zero, 6d_fp16_2_3_1_3_2_4) { | ||
| std::vector<float> in_data = { | ||
| 0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| 1.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| 19.0f, 0.0f, 0.1f, 0.9f, 0.10f, -0.001f, | ||
| 0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| 8.0f, 3.0f, 0.1f, 0.00001f, 0.10f, 0.001f, | ||
| 0.1f, -0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| 13.0f, 1.0f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 11.1f, 0.2f, 0.3f, 66.0f, 12.1f, 11.1f, | ||
| 0.0f, 0.0001f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 0.1f, 0.2f, 0.3f, 2.0f, 12.1f, 11.1f, | ||
| 0.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| -13.0f, 1.0f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 0.1f, 0.2f, 0.3f, 66.0f, 12.1f, 11.1f, | ||
| 0.0f, 0.001f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 0.1f, 0.2f, 0.3f, 2.0f, 12.1f, 11.1f, | ||
| 0.1f, 1.2f, 0.3f, 99.0f, 12.1f, 11.1f, | ||
| 100.0f, 0.0f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 0.1f, 0.2f, 0.3f, 0.0f, 12.1f, 11.1f, | ||
| 13.0f, 1.0f, 0.1f, 0.9f, -0.10f, 0.001f, | ||
| 0.1f, 0.2f, 0.3f, 66.0f, 12.1f, 11.1f, | ||
| 0.0f, 0.0001f, 0.1f, 0.9f, 0.10f, 0.001f, | ||
| 0.1f, 0.2f, 0.3f, 2.0f, 12.1f, 11.1f, | ||
| }; | ||
| test_gather_non_zero<float>(layout{ov::PartialShape{2, 3, 1, 3, 2, 4}, data_types::f32, format::bfwzyx}, in_data); | ||
| } | ||
|
|
||
| TEST(non_zero_gpu, dynamic) { | ||
| auto& engine = get_test_engine(); | ||
|
Comment on lines
159
to
163
|
||
| ov::Shape in_shape = { 3, 3 }; | ||
|
|
@@ -452,40 +340,6 @@ TEST(test_non_zero, 6d_fp16_2_2_2_1_5_1) { | |
| test_non_zero<int32_t>(layout{ov::PartialShape{2, 2, 2, 1, 5, 1}, data_types::i32, format::bfwzyx}, in_data); | ||
| } | ||
|
|
||
| TEST(test_gather_non_zero, not_use_local_mem) { | ||
| auto& engine = get_test_engine(); | ||
| auto max_local_mem_size = engine.get_device_info().max_local_mem_size; | ||
|
|
||
| auto in_layout = layout{ov::PartialShape{ov::Dimension(max_local_mem_size)}, data_types::f32, format::bfyx}; | ||
| auto input_mem = engine.allocate_memory(in_layout); | ||
| auto in_data = std::vector<float>(max_local_mem_size, 1.f); | ||
| set_values(input_mem, in_data); | ||
|
|
||
| auto output_shape_layout = layout{ov::PartialShape{1}, data_types::i32, format::bfyx}; | ||
| auto output_shape_mem = engine.allocate_memory(output_shape_layout); | ||
| set_values(output_shape_mem, {static_cast<int32_t>(max_local_mem_size)}); | ||
|
|
||
| topology topology; | ||
| topology.add(input_layout("input", in_layout)); | ||
| topology.add(data("output_shape", output_shape_mem)); | ||
| topology.add(gather_nonzero("gather_nonzero", input_info("input"), input_info("output_shape"))); | ||
|
|
||
| network network(engine, topology, get_test_default_config(engine)); | ||
|
|
||
| network.set_input_data("input", input_mem); | ||
|
|
||
| auto outputs = network.execute(); | ||
| auto output = outputs.at("gather_nonzero").get_memory(); | ||
| cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream()); | ||
|
|
||
| std::vector<int32_t> expected_results(max_local_mem_size); | ||
| ov::reference::non_zero<float, int32_t>(in_data.data(), expected_results.data(), in_layout.get_shape()); | ||
|
|
||
| for (size_t i = 0; i < expected_results.size(); ++i) { | ||
| ASSERT_EQ(expected_results[i], output_ptr[i]); | ||
| } | ||
| } | ||
|
|
||
| TEST(non_zero_gpu, const_input) { | ||
| auto& engine = get_test_engine(); | ||
| ov::Shape in_shape = { 3, 3 }; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[MEDIUM]
count_nonzero now advertises a fixed 1024-element output buffer. This hard-coded magic size is duplicated across the graph layout and the kernel dispatch (and will increase host-side reads like read_vector(...) that only need element 0). Consider centralizing the constant (or deriving the output layout from the chosen dispatch size) to avoid drift and to keep shape-inference overhead minimal.