@@ -58,37 +58,63 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
     const auto in_g_n_c_wis_desc =
         ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
 
-    Tensor<OutDataType> out(out_g_n_k_wos_desc);
-    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
-    Tensor<InDataType> in_host(in_g_n_c_wis_desc);
-    Tensor<InDataType> in_device(in_g_n_c_wis_desc);
+    std::cout << " out: " << out_g_n_k_wos_desc << std::endl;
+    std::cout << " wei: " << wei_g_k_c_xs_desc << std::endl;
+    std::cout << " in: " << in_g_n_c_wis_desc << std::endl;
 
-    std::cout << " out: " << out.mDesc << std::endl;
-    std::cout << " wei: " << wei.mDesc << std::endl;
-    std::cout << " in: " << in_host.mDesc << std::endl;
+    // Get element space sizes
+    const auto out_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize();
+    const auto wei_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
+    const auto in_element_space_size  = in_g_n_c_wis_desc.GetElementSpaceSize();
 
+    // Allocate GPU buffers
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_element_space_size);
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size);
+    DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size);
+
+    // Generate data directly on GPU using DeviceMem methods
     switch(init_method)
     {
-    case 0: break;
+    case 0:
+        // Zero initialization
+        out_device_buf.SetZero();
+        wei_device_buf.SetZero();
+        break;
     case 1:
-        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        // Discrete integer values in range [-5, 5]
+        out_device_buf.FillUniformRandInteger<OutDataType>(-5, 5);
+        wei_device_buf.FillUniformRandInteger<WeiDataType>(-5, 5);
         break;
     case 2:
-        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
-        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        // Continuous float values
+        out_device_buf.FillUniformRandFp<OutDataType>(0.0f, 1.0f);
+        wei_device_buf.FillUniformRandFp<WeiDataType>(-0.5f, 0.5f);
         break;
     default:
-        out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
-        wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
+        // Constant value 1
+        out_device_buf.SetValue<OutDataType>(ck::type_convert<OutDataType>(1));
+        wei_device_buf.SetValue<WeiDataType>(ck::type_convert<WeiDataType>(1));
     }
 
-    DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
-    DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
+    // Create host tensors (needed only for verification)
+    Tensor<OutDataType> out(out_g_n_k_wos_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<InDataType> in_host(in_g_n_c_wis_desc);
+    Tensor<InDataType> in_device(in_g_n_c_wis_desc);
 
-    out_device_buf.ToDevice(out.mData.data());
-    wei_device_buf.ToDevice(wei.mData.data());
+    // Copy GPU-generated data back to the host tensors only if verification
+    // is enabled (the reference paths read the host copies)
+    if(do_verification == 1 || do_verification == 2)
+    {
+        out_device_buf.FromDevice(out.mData.data());
+        wei_device_buf.FromDevice(wei.mData.data());
+    }
 
     // Allocate GPU reference buffer (used only if do_verification == 2)
     DeviceMem gpu_ref_in_buf(