Skip to content

Commit 3ccb15e

Browse files
[CK Profiler] Initialize tensors on GPU in CK profiler (#3550)
* Initialize tensors on GPU in CK profiler
* Kick CI
1 parent 717ed0b commit 3ccb15e

3 files changed

Lines changed: 115 additions & 54 deletions

File tree

profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -58,37 +58,63 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
5858
const auto in_g_n_c_wis_desc =
5959
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
6060

61-
Tensor<OutDataType> out(out_g_n_k_wos_desc);
62-
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
63-
Tensor<InDataType> in_host(in_g_n_c_wis_desc);
64-
Tensor<InDataType> in_device(in_g_n_c_wis_desc);
61+
std::cout << "out: " << out_g_n_k_wos_desc << std::endl;
62+
std::cout << "wei: " << wei_g_k_c_xs_desc << std::endl;
63+
std::cout << "in: " << in_g_n_c_wis_desc << std::endl;
6564

66-
std::cout << "out: " << out.mDesc << std::endl;
67-
std::cout << "wei: " << wei.mDesc << std::endl;
68-
std::cout << "in: " << in_host.mDesc << std::endl;
65+
// Get element space sizes
66+
const auto out_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize();
67+
const auto wei_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
68+
const auto in_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize();
6969

70+
// Allocate GPU buffers
71+
DeviceMem out_device_buf(sizeof(OutDataType) * out_element_space_size);
72+
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size);
73+
DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size);
74+
75+
// Generate data directly on GPU using DeviceMem methods
7076
switch(init_method)
7177
{
72-
case 0: break;
78+
case 0:
79+
// Zero initialization
80+
out_device_buf.SetZero();
81+
wei_device_buf.SetZero();
82+
break;
7383
case 1:
74-
out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
75-
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
84+
// Discrete integer values in range [-5, 5]
85+
out_device_buf.FillUniformRandInteger<OutDataType>(-5, 5);
86+
wei_device_buf.FillUniformRandInteger<WeiDataType>(-5, 5);
7687
break;
7788
case 2:
78-
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
79-
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
89+
// Continuous float values
90+
out_device_buf.FillUniformRandFp<OutDataType>(0.0f, 1.0f);
91+
wei_device_buf.FillUniformRandFp<WeiDataType>(-0.5f, 0.5f);
8092
break;
8193
default:
82-
out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
83-
wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
94+
// Constant value 1
95+
out_device_buf.SetValue<OutDataType>(ck::type_convert<OutDataType>(1));
96+
wei_device_buf.SetValue<WeiDataType>(ck::type_convert<WeiDataType>(1));
8497
}
8598

86-
DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
87-
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
88-
DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
99+
// Create host tensors (needed only for verification)
100+
Tensor<OutDataType> out(out_g_n_k_wos_desc);
101+
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
102+
Tensor<InDataType> in_host(in_g_n_c_wis_desc);
103+
Tensor<InDataType> in_device(in_g_n_c_wis_desc);
89104

90-
out_device_buf.ToDevice(out.mData.data());
91-
wei_device_buf.ToDevice(wei.mData.data());
105+
// Copy GPU→CPU only if verification is enabled
106+
if(do_verification == 1 || do_verification == 2)
107+
{
108+
out_device_buf.FromDevice(out.mData.data());
109+
wei_device_buf.FromDevice(wei.mData.data());
110+
}
111+
112+
// Copy to host only if CPU verification is needed
113+
if(do_verification == 1)
114+
{
115+
out_device_buf.FromDevice(out.mData.data());
116+
wei_device_buf.FromDevice(wei.mData.data());
117+
}
92118

93119
// Allocate GPU reference buffer (used only if do_verification == 2)
94120
DeviceMem gpu_ref_in_buf(

profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,34 +63,51 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
6363
const auto out_g_n_k_wos_desc =
6464
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
6565

66-
Tensor<InDataType> input(in_g_n_c_wis_desc);
67-
Tensor<WeiDataType> weight_host_result(wei_g_k_c_xs_desc);
68-
Tensor<WeiDataType> weight_device_result(wei_g_k_c_xs_desc);
69-
Tensor<OutDataType> output(out_g_n_k_wos_desc);
66+
std::cout << "input: " << in_g_n_c_wis_desc << std::endl;
67+
std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl;
68+
std::cout << "output: " << out_g_n_k_wos_desc << std::endl;
69+
70+
// Get element space sizes
71+
const auto input_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize();
72+
const auto weight_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
73+
const auto output_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize();
7074

71-
std::cout << "input: " << input.mDesc << std::endl;
72-
std::cout << "weight: " << weight_host_result.mDesc << std::endl;
73-
std::cout << "output: " << output.mDesc << std::endl;
75+
// Allocate GPU buffers
76+
DeviceMem in_device_buf(sizeof(InDataType) * input_element_space_size);
77+
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_element_space_size);
78+
DeviceMem out_device_buf(sizeof(OutDataType) * output_element_space_size);
7479

80+
// Generate data directly on GPU using DeviceMem methods
7581
switch(init_method)
7682
{
77-
case 0: break;
83+
case 0:
84+
// Zero initialization
85+
in_device_buf.SetZero();
86+
out_device_buf.SetZero();
87+
break;
7888
case 1:
79-
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
80-
output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
89+
// Discrete integer values in range [-5, 5]
90+
in_device_buf.FillUniformRandInteger<InDataType>(-5, 5);
91+
out_device_buf.FillUniformRandInteger<OutDataType>(-5, 5);
8192
break;
8293
default:
83-
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
84-
output.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
94+
// Continuous float values
95+
in_device_buf.FillUniformRandFp<InDataType>(0.0f, 1.0f);
96+
out_device_buf.FillUniformRandFp<OutDataType>(-0.5f, 0.5f);
8597
}
8698

87-
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
88-
DeviceMem wei_device_buf(sizeof(WeiDataType) *
89-
weight_device_result.mDesc.GetElementSpaceSize());
90-
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
99+
// Create host tensors (needed only for verification)
100+
Tensor<InDataType> input(in_g_n_c_wis_desc);
101+
Tensor<WeiDataType> weight_host_result(wei_g_k_c_xs_desc);
102+
Tensor<WeiDataType> weight_device_result(wei_g_k_c_xs_desc);
103+
Tensor<OutDataType> output(out_g_n_k_wos_desc);
91104

92-
in_device_buf.ToDevice(input.mData.data());
93-
out_device_buf.ToDevice(output.mData.data());
105+
// Copy to host only if CPU verification is needed
106+
if(do_verification == 1)
107+
{
108+
in_device_buf.FromDevice(input.mData.data());
109+
out_device_buf.FromDevice(output.mData.data());
110+
}
94111

95112
// Allocate GPU reference buffer (used only if do_verification == 2)
96113
DeviceMem gpu_ref_wei_buf(

profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -86,33 +86,51 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
8686
copy(conv_param.input_left_pads_, input_left_pads);
8787
copy(conv_param.input_right_pads_, input_right_pads);
8888

89-
Tensor<InDataType> input(in_g_n_c_wis_desc);
90-
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
91-
Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
92-
Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
89+
// Get element space sizes for GPU allocation
90+
const auto input_size = in_g_n_c_wis_desc.GetElementSpaceSize();
91+
const auto weight_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
92+
const auto output_size = out_g_n_k_wos_desc.GetElementSpaceSize();
93+
94+
std::cout << "input: " << in_g_n_c_wis_desc << std::endl;
95+
std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl;
96+
std::cout << "output: " << out_g_n_k_wos_desc << std::endl;
9397

94-
std::cout << "input: " << input.mDesc << std::endl;
95-
std::cout << "weight: " << weight.mDesc << std::endl;
96-
std::cout << "output: " << host_output.mDesc << std::endl;
98+
// Allocate GPU memory first (GPU-first workflow)
99+
DeviceMem in_device_buf(sizeof(InDataType) * input_size);
100+
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_size);
101+
DeviceMem out_device_buf(sizeof(OutDataType) * output_size);
97102

103+
// Generate data directly on GPU using DeviceMem methods
98104
switch(init_method)
99105
{
100-
case 0: break;
106+
case 0:
107+
// Zero initialization
108+
in_device_buf.SetZero();
109+
wei_device_buf.SetZero();
110+
break;
101111
case 1:
102-
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
103-
weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
112+
// Discrete integer generation: {-5, -4, -3, ..., 3, 4}
113+
in_device_buf.FillUniformRandInteger<InDataType>(-5, 5);
114+
wei_device_buf.FillUniformRandInteger<WeiDataType>(-5, 5);
104115
break;
105116
default:
106-
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
107-
weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
117+
// Continuous float generation
118+
in_device_buf.FillUniformRandFp<InDataType>(0.0f, 1.0f);
119+
wei_device_buf.FillUniformRandFp<WeiDataType>(-0.5f, 0.5f);
108120
}
109121

110-
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
111-
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
112-
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
122+
// Create host tensors (for verification if needed)
123+
Tensor<InDataType> input(in_g_n_c_wis_desc);
124+
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
125+
Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
126+
Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
113127

114-
in_device_buf.ToDevice(input.mData.data());
115-
wei_device_buf.ToDevice(weight.mData.data());
128+
// Copy to host only if CPU verification is needed
129+
if(do_verification == 1)
130+
{
131+
in_device_buf.FromDevice(input.mData.data());
132+
wei_device_buf.FromDevice(weight.mData.data());
133+
}
116134

117135
// Allocate GPU reference buffer (used only if do_verification == 2)
118136
DeviceMem gpu_ref_out_buf(

0 commit comments

Comments (0)