diff --git a/projects/miopen/common_utils/include/common_utils/tensor_utils.hpp b/projects/miopen/common_utils/include/common_utils/tensor_utils.hpp
new file mode 100644
index 000000000000..29fc0d55c5e4
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/tensor_utils.hpp
@@ -0,0 +1,165 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#ifndef GUARD_COMMON_UTILS_TENSOR_UTILS_HPP
+#define GUARD_COMMON_UTILS_TENSOR_UTILS_HPP
+
+#include <miopen/miopen.h>
+
+#include <cassert>
+#include <cstddef>
+#include <ostream>
+#include <tuple>
+#include <vector>
+
+// Convenience wrappers around the public MIOpen tensor descriptor API.
+// These replace direct use of miopen::deref(tensorDesc).Method() with
+// clean single-call functions that use only the public C API.
+
+namespace tensor_utils {
+
+inline int GetNumDims(miopenTensorDescriptor_t desc)
+{
+    int size = 0;
+    miopenGetTensorDescriptorSize(desc, &size);
+    return size;
+}
+
+inline miopenDataType_t GetType(miopenTensorDescriptor_t desc)
+{
+    // Zero-init so a failed C call cannot yield an indeterminate value.
+    miopenDataType_t dt{};
+    miopenGetTensorDescriptor(desc, &dt, nullptr, nullptr);
+    return dt;
+}
+
+// NOTE(review): element type assumed size_t to match the V2 descriptor
+// API and the miopen::TensorDescriptor::GetLengths() it replaces — confirm.
+inline std::vector<size_t> GetLengths(miopenTensorDescriptor_t desc)
+{
+    int ndim = GetNumDims(desc);
+    std::vector<size_t> lens(ndim);
+    miopenGetTensorDescriptorV2(desc, nullptr, lens.data(), nullptr);
+    return lens;
+}
+
+inline std::vector<size_t> GetStrides(miopenTensorDescriptor_t desc)
+{
+    int ndim = GetNumDims(desc);
+    std::vector<size_t> strides(ndim);
+    miopenGetTensorDescriptorV2(desc, nullptr, nullptr, strides.data());
+    return strides;
+}
+
+inline size_t GetNumBytes(miopenTensorDescriptor_t desc)
+{
+    size_t numBytes = 0;
+    miopenGetTensorNumBytes(desc, &numBytes);
+    return numBytes;
+}
+
+inline size_t GetElementSize(miopenTensorDescriptor_t desc)
+{
+    auto lens = GetLengths(desc);
+    size_t n = 1;
+    for(auto l : lens)
+        n *= l;
+    return n;
+}
+
+inline miopenTensorLayout_t GetLayout(miopenTensorDescriptor_t desc)
+{
+    miopenTensorLayout_t layout{};
+    miopenGetTensorLayout(desc, &layout);
+    return layout;
+}
+
+inline size_t GetElementSpace(miopenTensorDescriptor_t desc)
+{
+    size_t space = 0;
+    miopenGetTensorElementSpace(desc, &space);
+    return space;
+}
+
+inline bool IsPacked(miopenTensorDescriptor_t desc)
+{
+    bool packed = false;
+    miopenIsTensorPacked(desc, &packed);
+    return packed;
+}
+
+inline size_t GetVectorLength(miopenTensorDescriptor_t desc)
+{
+    size_t vlen = 0;
+    miopenGetTensorVectorLength(desc, &vlen);
+    return vlen;
+}
+
+// Returns the byte size of a MIOpen data type (0 for unknown types).
+inline size_t GetTypeSize(miopenDataType_t dt)
+{
+    switch(dt)
+    {
+    case miopenHalf: return 2;
+    case miopenFloat: return 4;
+    case miopenDouble: return 8;
+    case miopenBFloat16: return 2;
+    case miopenInt8: return 1;
+    case miopenInt32: return 4;
+    case miopenInt64: return 8;
+    case miopenFloat8_fnuz: return 1;
+    case miopenBFloat8_fnuz: return 1;
+    default: return 0;
+    }
+}
+
+// Unpack a vector into a tuple of N values (N must be 2..5).
+// Replacement for miopen::tien<N>(vec).
+template <int N, class T>
+auto Tien(const std::vector<T>& v)
+{
+    assert(v.size() >= static_cast<size_t>(N));
+    if constexpr(N == 2)
+        return std::make_tuple(v[0], v[1]);
+    else if constexpr(N == 3)
+        return std::make_tuple(v[0], v[1], v[2]);
+    else if constexpr(N == 4)
+        return std::make_tuple(v[0], v[1], v[2], v[3]);
+    else if constexpr(N == 5)
+        return std::make_tuple(v[0], v[1], v[2], v[3], v[4]);
+}
+
+// Given a spatial dimension count and a vector of lengths (or strides),
+// return a tuple of (N, C, D, H, W) with D=1 for 2D tensors.
+// Replacement for miopen::GetNCDHW.
+template <class T>
+auto GetNCDHW(unsigned spatial_dims, const std::vector<T>& data)
+{
+    if(spatial_dims == 3)
+    {
+        assert(data.size() >= 5);
+        return std::make_tuple(data[0], data[1], data[2], data[3], data[4]);
+    }
+    else
+    {
+        assert(data.size() >= 4);
+        return std::make_tuple(
+            data[0], data[1], static_cast<T>(1), data[2], data[3]);
+    }
+}
+
+// Print a range with a separator, replacement for miopen::LogRange.
+template <class Container>
+std::ostream& LogRange(std::ostream& os, const Container& c, const char* sep)
+{
+    bool first = true;
+    for(const auto& v : c)
+    {
+        if(!first)
+            os << sep;
+        os << v;
+        first = false;
+    }
+    return os;
+}
+
+} // namespace tensor_utils
+
+#endif // GUARD_COMMON_UTILS_TENSOR_UTILS_HPP
diff --git a/projects/miopen/driver/CBAInferFusion_driver.hpp b/projects/miopen/driver/CBAInferFusion_driver.hpp
index 14a3d7f806ec..eed19fdd51ce 100644
--- a/projects/miopen/driver/CBAInferFusion_driver.hpp
+++ b/projects/miopen/driver/CBAInferFusion_driver.hpp
@@ -42,7 +42,7 @@
 #include
 #include
-#include
+#include <common_utils/tensor_utils.hpp>
 #include
 #include
@@ -175,7 +175,7 @@
             avgtime += time;
         }
-        miopen::deref(GetHandle()).Finish();
+        (void)hipStreamSynchronize(GetStream());
         STOP_TIME
         if(WALL_CLOCK)
@@ -672,7 +672,7 @@ int CBAInferFusionDriver<Tgpu, Tref>::SetConvDescriptorFromCmdLineArgs()
 template <typename Tgpu, typename Tref>
 std::vector<int> CBAInferFusionDriver<Tgpu, Tref>::GetOutputTensorLengths()
 {
-    int ndim = miopen::deref(inputTensor).GetNumDims();
+    int ndim = tensor_utils::GetNumDims(inputTensor);
 
     std::vector<int> out_lens(ndim);
 
@@ -1285,14 +1285,31 @@
     in_local_host.data      = in_host;
     wei_local_host.data     = wei_host;
     outhost_local_host.data.resize(outhost_local_host.desc.GetElementSpace());
-    cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(),
+
+    int cba_spatial_dim = 0;
+    miopenGetConvolutionSpatialDim(convDesc, &cba_spatial_dim);
+    std::vector<int> cba_pads(cba_spatial_dim);
+    std::vector<int> cba_strides(cba_spatial_dim);
+    std::vector<int> cba_dilations(cba_spatial_dim);
+    miopenConvolutionMode_t cba_mode_unused;
+    miopenGetConvolutionNdDescriptor(convDesc,
+                                     cba_spatial_dim,
+                                     &cba_spatial_dim,
+                                     cba_pads.data(),
+                                     cba_strides.data(),
+                                     cba_dilations.data(),
+                                     &cba_mode_unused);
+    int cba_group_count = 0;
+    miopenGetConvolutionGroupCount(convDesc, &cba_group_count);
+
+    cpu_convolution_forward(cba_spatial_dim,
                             in_local_host,
                             wei_local_host,
outhost_local_host, - miopen::deref(convDesc).GetConvPads(), - miopen::deref(convDesc).GetConvStrides(), - miopen::deref(convDesc).GetConvDilations(), - miopen::deref(convDesc).GetGroupCount()); + cba_pads, + cba_strides, + cba_dilations, + cba_group_count); if constexpr(!std::is_same_v) { @@ -1305,8 +1322,8 @@ void CBAInferFusionDriver::runCPUConvFwdInference() if(bias_mode) { - tensor bias_local_host(miopen::deref(biasTensor).GetLengths(), - miopen::deref(biasTensor).GetStrides()); + tensor bias_local_host(tensor_utils::GetLengths(biasTensor), + tensor_utils::GetStrides(biasTensor)); bias_local_host.data = bias_host; cpu_bias_forward(outhost_local_host, bias_local_host); } diff --git a/projects/miopen/driver/activ_driver.hpp b/projects/miopen/driver/activ_driver.hpp index 0231ae6d0393..189fe8a9b415 100644 --- a/projects/miopen/driver/activ_driver.hpp +++ b/projects/miopen/driver/activ_driver.hpp @@ -34,8 +34,8 @@ #include "timer.hpp" #include "util_driver.hpp" +#include #include -#include #include #include @@ -329,7 +329,7 @@ int ActivationDriver::RunForwardGPU() ExecuteKernel(); - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); STOP_TIME if(WALL_CLOCK) { @@ -373,9 +373,9 @@ int ActivationDriver::RunForwardGPU() avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(inputTensor)); size_t dataSz = - in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + in_n * in_c * in_h * in_w * tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor)); // layer, readbytes, writebytes, BG/s, timeMS printf("stats: name, bytesRead, bytesWritten, GB/s, timeMs\n"); @@ -432,7 +432,7 @@ int ActivationDriver::RunBackwardGPU() ExecuteKernel(); - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); STOP_TIME 
if(WALL_CLOCK) { @@ -476,9 +476,9 @@ int ActivationDriver::RunBackwardGPU() avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(inputTensor)); size_t dataSz = - in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + in_n * in_c * in_h * in_w * tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor)); // layer, readbytes, writebytes, BG/s, timeMS printf("stats: name, bytesRead, bytesWritten, GB/s, timeMs\n"); diff --git a/projects/miopen/driver/adam_driver.hpp b/projects/miopen/driver/adam_driver.hpp index b93e398a9a09..c2b2d5b84981 100644 --- a/projects/miopen/driver/adam_driver.hpp +++ b/projects/miopen/driver/adam_driver.hpp @@ -36,8 +36,8 @@ #include #include +#include #include -#include #include #include @@ -73,7 +73,7 @@ void mloAdamRunHost(miopenTensorDescriptor_t paramDesc, if(is_amp && found_inf) return; - const size_t numel = miopen::deref(paramDesc).GetElementSize(); + const size_t numel = tensor_utils::GetElementSize(paramDesc); const float bias_correction1 = 1.0 - pow(beta1, step); const float bias_correction2 = 1.0 - pow(beta2, step); diff --git a/projects/miopen/driver/addlayernorm_driver.hpp b/projects/miopen/driver/addlayernorm_driver.hpp index 190a9ec9da1e..f01845d4f1cf 100644 --- a/projects/miopen/driver/addlayernorm_driver.hpp +++ b/projects/miopen/driver/addlayernorm_driver.hpp @@ -38,7 +38,6 @@ #include #include #include -#include #include #include @@ -56,7 +55,7 @@ int32_t mloAddLayerNormForwardRunHost(miopenTensorDescriptor_t inputDesc, miopenNormMode_t mode, bool use_multithread) { - auto dims = miopen::deref(inputDesc).GetLengths(); + auto dims = tensor_utils::GetLengths(inputDesc); size_t outer_size = 1; size_t inner_size = 1; size_t norm_dim = static_cast(normalized_dim); diff --git 
a/projects/miopen/driver/bn_driver.hpp b/projects/miopen/driver/bn_driver.hpp index 7b28802c3f03..1b84f3a75ddd 100644 --- a/projects/miopen/driver/bn_driver.hpp +++ b/projects/miopen/driver/bn_driver.hpp @@ -40,9 +40,8 @@ #include #include -#include +#include #include -#include #include "miopen/batch_norm.hpp" #include @@ -1308,7 +1307,7 @@ int BatchNormDriver::RunForwardGPU() iteration = i; // Modifies the captured reference for the case of not using HIP graph ExecuteKernel(); - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); STOP_TIME if(WALL_CLOCK) { @@ -1354,9 +1353,9 @@ int BatchNormDriver::RunForwardGPU() avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(in.GetTensor().desc.GetLengths()); size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); + size_t dataSz = (M + 2 * in_c) * tensor_utils::GetTypeSize(in.GetTensor().desc.GetType()); float rdCnt = -1.0; float wrCnt = 1.0; if(forw == 1) @@ -1692,7 +1691,7 @@ int BatchNormDriver::RunBackwardGPU() ExecuteKernel(); - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); STOP_TIME if(WALL_CLOCK) { @@ -1718,9 +1717,9 @@ int BatchNormDriver::RunBackwardGPU() avgtime += time; int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(in.GetTensor().desc.GetLengths()); size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); + size_t dataSz = (M + 2 * in_c) * tensor_utils::GetTypeSize(in.GetTensor().desc.GetType()); float rdCnt = 2.0; float wrCnt = 1.0; // layer, flopCnt, reads, writes, GFLOPS, GB/s, timeMs diff --git 
a/projects/miopen/driver/cat_driver.hpp b/projects/miopen/driver/cat_driver.hpp index 8611f87b0123..bf9af4e2cea1 100644 --- a/projects/miopen/driver/cat_driver.hpp +++ b/projects/miopen/driver/cat_driver.hpp @@ -13,9 +13,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -33,7 +33,7 @@ int32_t mloCatForwardRunHost(const std::vector& inputD uint32_t dim, bool multi_threaded) { - const auto& shape = miopen::deref(outputDesc).GetLengths(); + const auto& shape = tensor_utils::GetLengths(outputDesc); const size_t output_dim_size = shape[dim]; size_t outer_size = 1; size_t inner_size = 1; @@ -61,12 +61,12 @@ int32_t mloCatForwardRunHost(const std::vector& inputD copy_sizes.reserve(n); output_start_offsets.reserve(n); - copy_sizes.emplace_back(inner_size * miopen::deref(inputDescs[0]).GetLengths()[dim]); + copy_sizes.emplace_back(inner_size * tensor_utils::GetLengths(inputDescs[0])[dim]); output_start_offsets.emplace_back(0); for(size_t i{1}; i < n; ++i) { - const size_t dim_size = miopen::deref(inputDescs[i]).GetLengths()[dim]; + const size_t dim_size = tensor_utils::GetLengths(inputDescs[i])[dim]; output_start_offsets.emplace_back(output_start_offsets.back() + copy_sizes.back()); copy_sizes.emplace_back(inner_size * dim_size); diff --git a/projects/miopen/driver/conv_driver.hpp b/projects/miopen/driver/conv_driver.hpp index 795a8b325987..1e4bb0c5c709 100644 --- a/projects/miopen/driver/conv_driver.hpp +++ b/projects/miopen/driver/conv_driver.hpp @@ -18,6 +18,7 @@ #include "util_file.hpp" #include +#include #include #include #include @@ -25,7 +26,6 @@ #include #include #include -#include #include #include @@ -57,12 +57,12 @@ struct AutoMiopenWarmupMode { AutoMiopenWarmupMode() { - debug_logging_quiet_prev = miopen::debug::LoggingQuiet; - debug_find_enforce_disable_prev = miopen::debug::FindEnforceDisable; - debug_is_warmup_ongoing_prev = miopen::debug::IsWarmupOngoing; - miopen::debug::LoggingQuiet = true; - 
miopen::debug::FindEnforceDisable = true; - miopen::debug::IsWarmupOngoing = true; + miopenGetDebugFlag(miopenDebugLoggingQuiet, &debug_logging_quiet_prev); + miopenGetDebugFlag(miopenDebugFindEnforceDisable, &debug_find_enforce_disable_prev); + miopenGetDebugFlag(miopenDebugIsWarmupOngoing, &debug_is_warmup_ongoing_prev); + miopenSetDebugFlag(miopenDebugLoggingQuiet, true); + miopenSetDebugFlag(miopenDebugFindEnforceDisable, true); + miopenSetDebugFlag(miopenDebugIsWarmupOngoing, true); } AutoMiopenWarmupMode(const AutoMiopenWarmupMode&) = delete; AutoMiopenWarmupMode(AutoMiopenWarmupMode&&) = delete; @@ -70,9 +70,9 @@ struct AutoMiopenWarmupMode AutoMiopenWarmupMode& operator=(AutoMiopenWarmupMode&&) = delete; ~AutoMiopenWarmupMode() { - miopen::debug::LoggingQuiet = debug_logging_quiet_prev; - miopen::debug::FindEnforceDisable = debug_find_enforce_disable_prev; - miopen::debug::IsWarmupOngoing = debug_is_warmup_ongoing_prev; + miopenSetDebugFlag(miopenDebugLoggingQuiet, debug_logging_quiet_prev); + miopenSetDebugFlag(miopenDebugFindEnforceDisable, debug_find_enforce_disable_prev); + miopenSetDebugFlag(miopenDebugIsWarmupOngoing, debug_is_warmup_ongoing_prev); } private: @@ -85,10 +85,10 @@ struct AutoPrepareForGpuReference { AutoPrepareForGpuReference() { - quiet_prev = miopen::debug::LoggingQuiet; - naive_prev = miopen::debug::AlwaysEnableConvDirectNaive; - miopen::debug::AlwaysEnableConvDirectNaive = true; - miopen::debug::LoggingQuiet = true; + miopenGetDebugFlag(miopenDebugLoggingQuiet, &quiet_prev); + miopenGetDebugFlag(miopenDebugAlwaysEnableConvDirectNaive, &naive_prev); + miopenSetDebugFlag(miopenDebugAlwaysEnableConvDirectNaive, true); + miopenSetDebugFlag(miopenDebugLoggingQuiet, true); } AutoPrepareForGpuReference(const AutoPrepareForGpuReference&) = delete; AutoPrepareForGpuReference(AutoPrepareForGpuReference&&) = delete; @@ -96,8 +96,8 @@ struct AutoPrepareForGpuReference AutoPrepareForGpuReference& operator=(AutoPrepareForGpuReference&&) = delete; 
~AutoPrepareForGpuReference() { - miopen::debug::LoggingQuiet = quiet_prev; - miopen::debug::AlwaysEnableConvDirectNaive = naive_prev; + miopenSetDebugFlag(miopenDebugLoggingQuiet, quiet_prev); + miopenSetDebugFlag(miopenDebugAlwaysEnableConvDirectNaive, naive_prev); } private: @@ -791,15 +791,15 @@ int ConvDriver::GetandSetData() SetConvDescriptorFromCmdLineArgs(); std::vector out_len = GetOutputTensorLengths(); - if(miopen::deref(inputTensor).GetLayoutEnum() == miopenTensorNCHWc4 || - miopen::deref(inputTensor).GetLayoutEnum() == miopenTensorNCHWc8) + if(tensor_utils::GetLayout(inputTensor) == miopenTensorNCHWc4 || + tensor_utils::GetLayout(inputTensor) == miopenTensorNCHWc8) { - out_len[1] *= miopen::deref(inputTensor).GetVectorLength(); + out_len[1] *= tensor_utils::GetVectorLength(inputTensor); } - if(miopen::deref(inputTensor).GetLayoutEnum() == miopenTensorCHWNc4 || - miopen::deref(inputTensor).GetLayoutEnum() == miopenTensorCHWNc8) + if(tensor_utils::GetLayout(inputTensor) == miopenTensorCHWNc4 || + tensor_utils::GetLayout(inputTensor) == miopenTensorCHWNc8) { - out_len[0] *= miopen::deref(inputTensor).GetVectorLength(); + out_len[0] *= tensor_utils::GetVectorLength(inputTensor); } SetTensorNd(outputTensor, out_len, inflags.GetValueStr("out_layout"), data_type); if(inflags.GetValueStr("out_cast_type") != "-1") @@ -841,7 +841,7 @@ int ConvDriver::GetandSetData() miopenSetConvolutionAttribute( warmupConvDesc, MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, inflags.GetValueInt("math_type")); - int warmup_out_len_size = miopen::deref(warmupInputTensor).GetNumDims(); + int warmup_out_len_size = tensor_utils::GetNumDims(warmupInputTensor); std::vector warmup_out_len(warmup_out_len_size); miopenGetConvolutionNdForwardOutputDim(warmupConvDesc, warmupInputTensor, @@ -1215,7 +1215,7 @@ int ConvDriver::SetConvDescriptorFromCmdLineArgs() template std::vector ConvDriver::GetOutputTensorLengths() { - int ndim = miopen::deref(inputTensor).GetNumDims(); + int ndim = 
tensor_utils::GetNumDims(inputTensor); std::vector out_lens(ndim); @@ -1439,15 +1439,15 @@ int ConvDriver::AllocateBuffersAndCopy() new GPUMem(ctx, GetTensorSize(weightTensor_vect4), sizeof(Tgpu), buffer_check)); } - outhost = tensor(miopen::deref(outputTensor).GetLayout_t(), - miopen::deref(outputTensor).GetLengths(), - miopen::deref(outputTensor).GetStrides()); - din_host = tensor(miopen::deref(inputTensor).GetLayout_t(), - miopen::deref(inputTensor).GetLengths(), - miopen::deref(inputTensor).GetStrides()); - dwei_host = tensor(miopen::deref(weightTensor).GetLayout_t(), - miopen::deref(weightTensor).GetLengths(), - miopen::deref(weightTensor).GetStrides()); + outhost = tensor(tensor_utils::GetLayout(outputTensor), + tensor_utils::GetLengths(outputTensor), + tensor_utils::GetStrides(outputTensor)); + din_host = tensor(tensor_utils::GetLayout(inputTensor), + tensor_utils::GetLengths(inputTensor), + tensor_utils::GetStrides(inputTensor)); + dwei_host = tensor(tensor_utils::GetLayout(weightTensor), + tensor_utils::GetLengths(weightTensor), + tensor_utils::GetStrides(weightTensor)); std::string inFileName = inflags.GetValueStr("in_data"); std::string weiFileName = inflags.GetValueStr("weights"); @@ -1514,7 +1514,8 @@ int ConvDriver::AllocateBuffersAndCopy() b.AllocOnHost(biasTensor); db.AllocOnHost(b_sz); - db_host = tensor(miopen::deref(biasTensor)); + db_host = tensor(tensor_utils::GetLengths(biasTensor), + tensor_utils::GetStrides(biasTensor)); // Init tensor on host bool b_read = false; @@ -1674,7 +1675,7 @@ void ConvDriver::PrintForwardTime(const float kernel_total_time, float kernel_average_time = ComputeAverageTime(kernel_total_time, kernel_first_time); printf("GPU Kernel Time Forward Conv. 
Elapsed: %f ms (average)\n", kernel_average_time); - const auto num_dim = miopen::deref(inputTensor).GetNumDims() - 2; + const auto num_dim = tensor_utils::GetNumDims(inputTensor) - 2; if(num_dim != 2 && num_dim != 3) { printf("stats: for conv%ud\n", num_dim); @@ -1683,27 +1684,41 @@ void ConvDriver::PrintForwardTime(const float kernel_total_time, int group_count = std::max(inflags.GetValueInt("group_count"), 1); + int conv_spatial_dim = 0; + miopenGetConvolutionSpatialDim(convDesc, &conv_spatial_dim); + std::vector conv_pads(conv_spatial_dim); + std::vector conv_strides_v(conv_spatial_dim); + std::vector conv_dilations_v(conv_spatial_dim); + miopenConvolutionMode_t conv_mode_unused; + miopenGetConvolutionNdDescriptor(convDesc, + conv_spatial_dim, + &conv_spatial_dim, + conv_pads.data(), + conv_strides_v.data(), + conv_dilations_v.data(), + &conv_mode_unused); + if(num_dim == 2) { int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(inputTensor)); int wei_c, wei_n, wei_h, wei_w; std::tie(wei_c, wei_n, wei_h, wei_w) = - miopen::tien<4>(miopen::deref(weightTensor).GetLengths()); + tensor_utils::Tien<4>(tensor_utils::GetLengths(weightTensor)); int out_n, out_c, out_h, out_w; std::tie(out_n, out_c, out_h, out_w) = - miopen::tien<4>(miopen::deref(outputTensor).GetLengths()); + tensor_utils::Tien<4>(tensor_utils::GetLengths(outputTensor)); size_t flopCnt = static_cast(2) * in_n * in_c * wei_h * wei_w * out_c * out_h * out_w / group_count; size_t inputBytes = - in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + in_n * in_c * in_h * in_w * tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor)); size_t weightBytes = wei_n * wei_c * wei_h * wei_w * - miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); + tensor_utils::GetTypeSize(tensor_utils::GetType(weightTensor)); 
size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_h * out_w * - miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); + tensor_utils::GetTypeSize(tensor_utils::GetType(outputTensor)); printf("stats: name, n, c, ho, wo, y, x, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); @@ -1711,7 +1726,7 @@ void ConvDriver::PrintForwardTime(const float kernel_total_time, "fwd-conv", wei_h, wei_w, - miopen::deref(convDesc).GetConvStrides()[0], + conv_strides_v[0], in_n, in_c, out_h, @@ -1730,24 +1745,24 @@ void ConvDriver::PrintForwardTime(const float kernel_total_time, { // 3d int in_n, in_c, in_d, in_h, in_w; std::tie(in_n, in_c, in_d, in_h, in_w) = - miopen::tien<5>(miopen::deref(inputTensor).GetLengths()); + tensor_utils::Tien<5>(tensor_utils::GetLengths(inputTensor)); int wei_c, wei_n, wei_d, wei_h, wei_w; std::tie(wei_c, wei_n, wei_d, wei_h, wei_w) = - miopen::tien<5>(miopen::deref(weightTensor).GetLengths()); + tensor_utils::Tien<5>(tensor_utils::GetLengths(weightTensor)); int out_n, out_c, out_d, out_h, out_w; std::tie(out_n, out_c, out_d, out_h, out_w) = - miopen::tien<5>(miopen::deref(outputTensor).GetLengths()); + tensor_utils::Tien<5>(tensor_utils::GetLengths(outputTensor)); size_t flopCnt = static_cast(2) * in_n * in_c * wei_h * wei_w * wei_d * out_c * out_d * out_h * out_w / group_count; size_t inputBytes = in_n * in_c * in_d * in_h * in_w * - miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor)); size_t weightBytes = wei_n * wei_c * wei_d * wei_h * wei_w * - miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); + tensor_utils::GetTypeSize(tensor_utils::GetType(weightTensor)); size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_d * out_h * out_w * - miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); + tensor_utils::GetTypeSize(tensor_utils::GetType(outputTensor)); printf("stats: 
name, n, c, do, ho, wo, z, y, x, k, flopCnt, bytesRead, bytesWritten, " "GFLOPs, " @@ -1758,7 +1773,7 @@ void ConvDriver::PrintForwardTime(const float kernel_total_time, wei_d, wei_h, wei_w, - miopen::deref(convDesc).GetConvStrides()[0], + conv_strides_v[0], in_n, in_c, out_d, @@ -2324,16 +2339,32 @@ int ConvDriver::RunForwardGpuImmed(const bool is_transform) template int ConvDriver::RunForwardCPU() { + int fwd_spatial_dim = 0; + miopenGetConvolutionSpatialDim(convDesc, &fwd_spatial_dim); + std::vector fwd_pads(fwd_spatial_dim); + std::vector fwd_strides(fwd_spatial_dim); + std::vector fwd_dilations(fwd_spatial_dim); + miopenConvolutionMode_t fwd_mode_unused; + miopenGetConvolutionNdDescriptor(convDesc, + fwd_spatial_dim, + &fwd_spatial_dim, + fwd_pads.data(), + fwd_strides.data(), + fwd_dilations.data(), + &fwd_mode_unused); + int fwd_group_count = 0; + miopenGetConvolutionGroupCount(convDesc, &fwd_group_count); + if(mode == miopenTranspose) { - cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), + cpu_convolution_backward_data(fwd_spatial_dim, outhost, wei.GetTensor(), in.GetTensor(), - miopen::deref(convDesc).GetConvPads(), - miopen::deref(convDesc).GetConvStrides(), - miopen::deref(convDesc).GetConvDilations(), - miopen::deref(convDesc).GetGroupCount()); + fwd_pads, + fwd_strides, + fwd_dilations, + fwd_group_count); if(inflags.GetValueInt("bias") != 0) { @@ -2342,14 +2373,14 @@ int ConvDriver::RunForwardCPU() } else { - cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), + cpu_convolution_forward(fwd_spatial_dim, in.GetTensor(), wei.GetTensor(), outhost, - miopen::deref(convDesc).GetConvPads(), - miopen::deref(convDesc).GetConvStrides(), - miopen::deref(convDesc).GetConvDilations(), - miopen::deref(convDesc).GetGroupCount()); + fwd_pads, + fwd_strides, + fwd_dilations, + fwd_group_count); if(inflags.GetValueInt("bias") != 0) { @@ -2413,7 +2444,8 @@ int ConvDriver::RunForwardGPUReference() { if(!is_gpualloc) { - auto 
out_tmp = tensor(miopen::deref(outputTensor)); + auto out_tmp = tensor(tensor_utils::GetLengths(outputTensor), + tensor_utils::GetStrides(outputTensor)); out.CopyFromDeviceToHost(GetStream(), out_tmp); for(size_t i = 0; i < out_tmp.data.size(); ++i) { @@ -2701,7 +2733,7 @@ void ConvDriver::PrintBackwardDataTime(float kernel_total_time, floa float kernel_average_time = ComputeAverageTime(kernel_total_time, kernel_first_time); printf("GPU Kernel Time Backward Data Conv. Elapsed: %f ms (average)\n", kernel_average_time); - const auto num_dim = miopen::deref(inputTensor).GetNumDims() - 2; + const auto num_dim = tensor_utils::GetNumDims(inputTensor) - 2; if(num_dim != 2 && num_dim != 3) { printf("stats: for conv%ud\n", num_dim); @@ -2710,27 +2742,42 @@ void ConvDriver::PrintBackwardDataTime(float kernel_total_time, floa int group_count = std::max(inflags.GetValueInt("group_count"), 1); + int bwd_spatial_dim = 0; + miopenGetConvolutionSpatialDim(convDesc, &bwd_spatial_dim); + std::vector bwd_strides_v(bwd_spatial_dim); + { + std::vector tmp_pads(bwd_spatial_dim), tmp_dils(bwd_spatial_dim); + miopenConvolutionMode_t tmp_mode; + miopenGetConvolutionNdDescriptor(convDesc, + bwd_spatial_dim, + &bwd_spatial_dim, + tmp_pads.data(), + bwd_strides_v.data(), + tmp_dils.data(), + &tmp_mode); + } + if(num_dim == 2) { int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(inputTensor)); int wei_c, wei_n, wei_h, wei_w; std::tie(wei_c, wei_n, wei_h, wei_w) = - miopen::tien<4>(miopen::deref(weightTensor).GetLengths()); + tensor_utils::Tien<4>(tensor_utils::GetLengths(weightTensor)); int out_n, out_c, out_h, out_w; std::tie(out_n, out_c, out_h, out_w) = - miopen::tien<4>(miopen::deref(outputTensor).GetLengths()); + tensor_utils::Tien<4>(tensor_utils::GetLengths(outputTensor)); size_t flopCnt = static_cast(2) * in_n * in_c * wei_h * wei_w 
* out_c * out_h * out_w / group_count; size_t weightBytes = wei_n * wei_c * wei_h * wei_w * - miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); + tensor_utils::GetTypeSize(tensor_utils::GetType(weightTensor)); size_t inputBytes = - in_n * in_c * out_c * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + in_n * in_c * out_c * tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor)); size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_h * out_w * - miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); + tensor_utils::GetTypeSize(tensor_utils::GetType(outputTensor)); printf("stats: name, n, c, ho, wo, y, x, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); @@ -2738,7 +2785,7 @@ void ConvDriver::PrintBackwardDataTime(float kernel_total_time, floa "bwdd-conv", wei_h, wei_w, - miopen::deref(convDesc).GetConvStrides()[0], + bwd_strides_v[0], in_n, in_c, out_h, @@ -2757,24 +2804,24 @@ void ConvDriver::PrintBackwardDataTime(float kernel_total_time, floa { // 3d int in_n, in_c, in_d, in_h, in_w; std::tie(in_n, in_c, in_d, in_h, in_w) = - miopen::tien<5>(miopen::deref(inputTensor).GetLengths()); + tensor_utils::Tien<5>(tensor_utils::GetLengths(inputTensor)); int wei_c, wei_n, wei_d, wei_h, wei_w; std::tie(wei_c, wei_n, wei_d, wei_h, wei_w) = - miopen::tien<5>(miopen::deref(weightTensor).GetLengths()); + tensor_utils::Tien<5>(tensor_utils::GetLengths(weightTensor)); int out_n, out_c, out_d, out_h, out_w; std::tie(out_n, out_c, out_d, out_h, out_w) = - miopen::tien<5>(miopen::deref(outputTensor).GetLengths()); + tensor_utils::Tien<5>(tensor_utils::GetLengths(outputTensor)); size_t flopCnt = static_cast(2) * in_n * in_c * wei_d * wei_h * wei_w * out_c * out_d * out_h * out_w / group_count; size_t weightBytes = wei_n * wei_c * wei_d * wei_h * wei_w * - miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); + tensor_utils::GetTypeSize(tensor_utils::GetType(weightTensor)); size_t inputBytes 
= - in_n * in_c * out_c * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + in_n * in_c * out_c * tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor)); size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_d * out_h * out_w * - miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); + tensor_utils::GetTypeSize(tensor_utils::GetType(outputTensor)); printf( "stats: name, n, c, do, ho, wo, z, y, x, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " @@ -2785,7 +2832,7 @@ void ConvDriver::PrintBackwardDataTime(float kernel_total_time, floa wei_d, wei_h, wei_w, - miopen::deref(convDesc).GetConvStrides()[0], + bwd_strides_v[0], in_n, in_c, out_d, @@ -2929,7 +2976,7 @@ void ConvDriver::PrintBackwardWrwTime(float kernel_total_time, float printf("GPU Kernel Time Backward Weights Conv. Elapsed: %f ms (average)\n", kernel_average_time); - const auto num_dim = miopen::deref(inputTensor).GetNumDims() - 2; + const auto num_dim = tensor_utils::GetNumDims(inputTensor) - 2; if(num_dim != 2 && num_dim != 3) { printf("stats: for conv%ud\n", num_dim); @@ -2938,16 +2985,31 @@ void ConvDriver::PrintBackwardWrwTime(float kernel_total_time, float int group_count = std::max(inflags.GetValueInt("group_count"), 1); + int wrw_spatial_dim = 0; + miopenGetConvolutionSpatialDim(convDesc, &wrw_spatial_dim); + std::vector wrw_strides_v(wrw_spatial_dim); + { + std::vector tmp_pads(wrw_spatial_dim), tmp_dils(wrw_spatial_dim); + miopenConvolutionMode_t tmp_mode; + miopenGetConvolutionNdDescriptor(convDesc, + wrw_spatial_dim, + &wrw_spatial_dim, + tmp_pads.data(), + wrw_strides_v.data(), + tmp_dils.data(), + &tmp_mode); + } + if(num_dim == 2) { int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(inputTensor)); int wei_c, wei_n, wei_h, wei_w; std::tie(wei_c, wei_n, wei_h, wei_w) = - 
miopen::tien<4>(miopen::deref(weightTensor).GetLengths()); + tensor_utils::Tien<4>(tensor_utils::GetLengths(weightTensor)); int out_n, out_c, out_h, out_w; std::tie(out_n, out_c, out_h, out_w) = - miopen::tien<4>(miopen::deref(outputTensor).GetLengths()); + tensor_utils::Tien<4>(tensor_utils::GetLengths(outputTensor)); size_t flopCnt = static_cast(2) * in_n * in_c * wei_h * wei_w * out_c * out_h * out_w / group_count; @@ -2960,7 +3022,7 @@ void ConvDriver::PrintBackwardWrwTime(float kernel_total_time, float "bwdw-conv", wei_h, wei_w, - miopen::deref(convDesc).GetConvStrides()[0], + wrw_strides_v[0], in_n, in_c, out_h, @@ -2979,13 +3041,13 @@ void ConvDriver::PrintBackwardWrwTime(float kernel_total_time, float { // 3d int in_n, in_c, in_d, in_h, in_w; std::tie(in_n, in_c, in_d, in_h, in_w) = - miopen::tien<5>(miopen::deref(inputTensor).GetLengths()); + tensor_utils::Tien<5>(tensor_utils::GetLengths(inputTensor)); int wei_c, wei_n, wei_d, wei_h, wei_w; std::tie(wei_c, wei_n, wei_d, wei_h, wei_w) = - miopen::tien<5>(miopen::deref(weightTensor).GetLengths()); + tensor_utils::Tien<5>(tensor_utils::GetLengths(weightTensor)); int out_n, out_c, out_d, out_h, out_w; std::tie(out_n, out_c, out_d, out_h, out_w) = - miopen::tien<5>(miopen::deref(outputTensor).GetLengths()); + tensor_utils::Tien<5>(tensor_utils::GetLengths(outputTensor)); size_t flopCnt = static_cast(2) * in_n * in_c * wei_d * wei_h * wei_w * out_c * out_d * out_h * out_w / group_count; @@ -3001,7 +3063,7 @@ void ConvDriver::PrintBackwardWrwTime(float kernel_total_time, float wei_d, wei_h, wei_w, - miopen::deref(convDesc).GetConvStrides()[0], + wrw_strides_v[0], in_n, in_c, out_d, @@ -3323,27 +3385,43 @@ int ConvDriver::RunBackwardWrwGpuImmed() template int ConvDriver::RunBackwardWeightsCPU() { + int wrw_cpu_spatial_dim = 0; + miopenGetConvolutionSpatialDim(convDesc, &wrw_cpu_spatial_dim); + std::vector wrw_cpu_pads(wrw_cpu_spatial_dim); + std::vector wrw_cpu_strides(wrw_cpu_spatial_dim); + std::vector 
wrw_cpu_dilations(wrw_cpu_spatial_dim); + miopenConvolutionMode_t wrw_cpu_mode_unused; + miopenGetConvolutionNdDescriptor(convDesc, + wrw_cpu_spatial_dim, + &wrw_cpu_spatial_dim, + wrw_cpu_pads.data(), + wrw_cpu_strides.data(), + wrw_cpu_dilations.data(), + &wrw_cpu_mode_unused); + int wrw_cpu_group_count = 0; + miopenGetConvolutionGroupCount(convDesc, &wrw_cpu_group_count); + if(mode == miopenTranspose) { - cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), + cpu_convolution_backward_weight(wrw_cpu_spatial_dim, dout.GetTensor(), dwei_host, in.GetTensor(), - miopen::deref(convDesc).GetConvPads(), - miopen::deref(convDesc).GetConvStrides(), - miopen::deref(convDesc).GetConvDilations(), - miopen::deref(convDesc).GetGroupCount()); + wrw_cpu_pads, + wrw_cpu_strides, + wrw_cpu_dilations, + wrw_cpu_group_count); } else { - cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), + cpu_convolution_backward_weight(wrw_cpu_spatial_dim, in.GetTensor(), dwei_host, dout.GetTensor(), - miopen::deref(convDesc).GetConvPads(), - miopen::deref(convDesc).GetConvStrides(), - miopen::deref(convDesc).GetConvDilations(), - miopen::deref(convDesc).GetGroupCount()); + wrw_cpu_pads, + wrw_cpu_strides, + wrw_cpu_dilations, + wrw_cpu_group_count); } if(inflags.GetValueInt("dump_output")) @@ -3359,27 +3437,43 @@ int ConvDriver::RunBackwardWeightsCPU() template int ConvDriver::RunBackwardDataCPU() { + int bwd_cpu_spatial_dim = 0; + miopenGetConvolutionSpatialDim(convDesc, &bwd_cpu_spatial_dim); + std::vector bwd_cpu_pads(bwd_cpu_spatial_dim); + std::vector bwd_cpu_strides(bwd_cpu_spatial_dim); + std::vector bwd_cpu_dilations(bwd_cpu_spatial_dim); + miopenConvolutionMode_t bwd_cpu_mode_unused; + miopenGetConvolutionNdDescriptor(convDesc, + bwd_cpu_spatial_dim, + &bwd_cpu_spatial_dim, + bwd_cpu_pads.data(), + bwd_cpu_strides.data(), + bwd_cpu_dilations.data(), + &bwd_cpu_mode_unused); + int bwd_cpu_group_count = 0; + 
miopenGetConvolutionGroupCount(convDesc, &bwd_cpu_group_count); + if(mode == miopenTranspose) { - cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), + cpu_convolution_forward(bwd_cpu_spatial_dim, dout.GetTensor(), wei.GetTensor(), din_host, - miopen::deref(convDesc).GetConvPads(), - miopen::deref(convDesc).GetConvStrides(), - miopen::deref(convDesc).GetConvDilations(), - miopen::deref(convDesc).GetGroupCount()); + bwd_cpu_pads, + bwd_cpu_strides, + bwd_cpu_dilations, + bwd_cpu_group_count); } else { - cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), + cpu_convolution_backward_data(bwd_cpu_spatial_dim, din_host, wei.GetTensor(), dout.GetTensor(), - miopen::deref(convDesc).GetConvPads(), - miopen::deref(convDesc).GetConvStrides(), - miopen::deref(convDesc).GetConvDilations(), - miopen::deref(convDesc).GetGroupCount()); + bwd_cpu_pads, + bwd_cpu_strides, + bwd_cpu_dilations, + bwd_cpu_group_count); } if(inflags.GetValueInt("dump_output")) @@ -3440,7 +3534,8 @@ int ConvDriver::RunBackwardWeightsGPUReference() { if(!is_gpualloc) { - auto dwei_tmp = tensor(miopen::deref(weightTensor)); + auto dwei_tmp = tensor(tensor_utils::GetLengths(weightTensor), + tensor_utils::GetStrides(weightTensor)); dwei.CopyFromDeviceToHost(GetStream(), dwei_tmp); for(size_t i = 0; i < dwei_tmp.data.size(); ++i) { @@ -3496,7 +3591,8 @@ int ConvDriver::RunBackwardDataGPUReference() { if(!is_gpualloc) { - auto din_tmp = tensor(miopen::deref(inputTensor)); + auto din_tmp = tensor(tensor_utils::GetLengths(inputTensor), + tensor_utils::GetStrides(inputTensor)); din.CopyFromDeviceToHost(GetStream(), din_tmp); for(size_t i = 0; i < din_tmp.data.size(); ++i) { @@ -3583,14 +3679,22 @@ std::string ConvDriver::GetVerificationCacheFileName( ss << get_basename_string(); ss << "_" << mode; ss << "_" << spatial_dim; - ss << "_" << miopen::deref(convDesc).paddingMode; - ss << "_" << miopen::deref(convDesc).GetGroupCount(); - miopen::LogRange(ss << "_", 
miopen::deref(inputTensor).GetLengths(), "x"); - miopen::LogRange(ss << "_", miopen::deref(weightTensor).GetLengths(), "x"); - miopen::LogRange(ss << "_", pads, "x"); - miopen::LogRange(ss << "_", conv_strides, "x"); - miopen::LogRange(ss << "_", conv_dilations, "x"); - miopen::LogRange(ss << "_", trans_output_pads, "x"); + { + miopenPaddingMode_t pm; + miopenGetConvolutionPaddingMode(convDesc, &pm); + ss << "_" << pm; + } + { + int gc = 0; + miopenGetConvolutionGroupCount(convDesc, &gc); + ss << "_" << gc; + } + tensor_utils::LogRange(ss << "_", tensor_utils::GetLengths(inputTensor), "x"); + tensor_utils::LogRange(ss << "_", tensor_utils::GetLengths(weightTensor), "x"); + tensor_utils::LogRange(ss << "_", pads, "x"); + tensor_utils::LogRange(ss << "_", conv_strides, "x"); + tensor_utils::LogRange(ss << "_", conv_dilations, "x"); + tensor_utils::LogRange(ss << "_", trans_output_pads, "x"); ss << "_" << inflags.GetValueInt("pad_val"); ss << "_" << inflags.GetValueInt("bias"); ss << "_" << "GPU" << get_datatype_string(Tgpu{}); diff --git a/projects/miopen/driver/ctc_driver.hpp b/projects/miopen/driver/ctc_driver.hpp index 85aecb3264d3..e3fe7117292d 100644 --- a/projects/miopen/driver/ctc_driver.hpp +++ b/projects/miopen/driver/ctc_driver.hpp @@ -33,6 +33,7 @@ #include "util_driver.hpp" #include "util_file.hpp" +#include #include #include @@ -279,8 +280,8 @@ int CTCDriver::AllocateBuffersAndCopy() ctcLossDesc, &workSpaceSize); - GetCTCLossWorkspaceSizeCPU(miopen::deref(probsDesc).GetLengths(), - miopen::deref(gradientsDesc).GetLengths(), + GetCTCLossWorkspaceSizeCPU(tensor_utils::GetLengths(probsDesc), + tensor_utils::GetLengths(gradientsDesc), labels.data(), labelLengths.data(), inputLengths.data(), @@ -399,10 +400,10 @@ template int CTCDriver::RunCTCLossCPU() { RunCTCLossCPUVerify(num_class, - miopen::deref(probsDesc).GetLengths(), - miopen::deref(probsDesc).GetStrides(), - miopen::deref(gradientsDesc).GetLengths(), - miopen::deref(gradientsDesc).GetStrides(), + 
tensor_utils::GetLengths(probsDesc), + tensor_utils::GetStrides(probsDesc), + tensor_utils::GetLengths(gradientsDesc), + tensor_utils::GetStrides(gradientsDesc), probs, labels, labelLengths, diff --git a/projects/miopen/driver/driver.hpp b/projects/miopen/driver/driver.hpp index b08d27efc214..879a9a7e0749 100644 --- a/projects/miopen/driver/driver.hpp +++ b/projects/miopen/driver/driver.hpp @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include "util_driver.hpp" #include "rocrand_wrapper.hpp" @@ -227,7 +227,8 @@ class GpumemTensor void AllocOnHost(miopenTensorDescriptor_t t) { - host = tensor(miopen::deref(t)); + host = tensor(tensor_utils::GetLengths(t), + tensor_utils::GetStrides(t)); if(is_gpualloc) // We do not need host data. { host.data.clear(); diff --git a/projects/miopen/driver/dropout_driver.hpp b/projects/miopen/driver/dropout_driver.hpp index 0016340fd60e..982cc4076217 100644 --- a/projects/miopen/driver/dropout_driver.hpp +++ b/projects/miopen/driver/dropout_driver.hpp @@ -36,7 +36,7 @@ #include -#include +#include #include #include @@ -254,19 +254,19 @@ int DropoutDriver::AllocateBuffersAndCopy() reservespace_dev = std::unique_ptr(new GPUMem(ctx, reserveSpaceSize, sizeof(unsigned char))); - in = tensor(miopen::deref(inputTensor).GetLengths(), - miopen::deref(inputTensor).GetStrides()); - din = tensor(miopen::deref(inputTensor).GetLengths(), - miopen::deref(inputTensor).GetStrides()); - out = tensor(miopen::deref(outputTensor).GetLengths(), - miopen::deref(outputTensor).GetStrides()); - dout = tensor(miopen::deref(outputTensor).GetLengths(), - miopen::deref(outputTensor).GetStrides()); + in = tensor(tensor_utils::GetLengths(inputTensor), + tensor_utils::GetStrides(inputTensor)); + din = tensor(tensor_utils::GetLengths(inputTensor), + tensor_utils::GetStrides(inputTensor)); + out = tensor(tensor_utils::GetLengths(outputTensor), + tensor_utils::GetStrides(outputTensor)); + dout = tensor(tensor_utils::GetLengths(outputTensor), 
+ tensor_utils::GetStrides(outputTensor)); - outhost = tensor(miopen::deref(outputTensor).GetLengths(), - miopen::deref(outputTensor).GetStrides()); - din_host = tensor(miopen::deref(inputTensor).GetLengths(), - miopen::deref(inputTensor).GetStrides()); + outhost = tensor(tensor_utils::GetLengths(outputTensor), + tensor_utils::GetStrides(outputTensor)); + din_host = tensor(tensor_utils::GetLengths(inputTensor), + tensor_utils::GetStrides(inputTensor)); reservespace = std::vector(reserveSpaceSize, static_cast(1)); reservespace_host = std::vector(reserveSpaceSize, static_cast(1)); @@ -362,7 +362,7 @@ int DropoutDriver::RunForwardGPU() template int DropoutDriver::RunForwardCPU() { - InitKernelStateEmulator(states_host, DropoutDesc); + InitKernelStateEmulator(states_host, DropoutDesc, GetHandle()); RunDropoutForwardEmulator(GetHandle(), DropoutDesc, @@ -436,6 +436,7 @@ int DropoutDriver::RunBackwardCPU() if(multithread) { RunDropoutBackwardEmulatorMT(DropoutDesc, + GetHandle(), outputTensor, dout.data, inputTensor, @@ -445,6 +446,7 @@ int DropoutDriver::RunBackwardCPU() else { RunDropoutBackwardEmulator(DropoutDesc, + GetHandle(), outputTensor, dout.data, inputTensor, diff --git a/projects/miopen/driver/dropout_gpu_emulator.hpp b/projects/miopen/driver/dropout_gpu_emulator.hpp index a66e28f076cf..ae9dfb847c7f 100644 --- a/projects/miopen/driver/dropout_gpu_emulator.hpp +++ b/projects/miopen/driver/dropout_gpu_emulator.hpp @@ -26,15 +26,21 @@ #ifndef GUARD_MIOPEN_DROPOUT_GPU_EMULATOR_HPP #define GUARD_MIOPEN_DROPOUT_GPU_EMULATOR_HPP -#include #include #include +#include +#include #include #include #include #include +// Maximum PRNG states for dropout emulation (matches kernel definition). 
+#ifndef MAX_PRNG_STATE +#define MAX_PRNG_STATE (256 * 64) +#endif + // disable __device__ qualifiers #ifdef FQUALIFIERS #error rocrand FQUALIFIERS defined externally, probably one of rocrand device header included prior to this @@ -43,9 +49,21 @@ #include "../src/kernels/miopen_rocrand.hpp" static void InitKernelStateEmulator(std::vector& states, - const miopenDropoutDescriptor_t dropoutDesc) + const miopenDropoutDescriptor_t dropoutDesc, + miopenHandle_t handle) { - size_t states_num = miopen::deref(dropoutDesc).stateSizeInBytes / sizeof(rocrand_state_xorwow); + size_t stateSizeInBytes = 0; + miopenDropoutGetStatesSize(handle, &stateSizeInBytes); + float dropout_val = 0; + void* states_ptr = nullptr; + unsigned long long seed_val = 0; + bool use_mask_val = false; + bool state_evo = false; + miopenRNGType_t rng_mode; + miopenGetDropoutDescriptor( + dropoutDesc, handle, &dropout_val, &states_ptr, &seed_val, &use_mask_val, &state_evo, &rng_mode); + + size_t states_num = stateSizeInBytes / sizeof(rocrand_state_xorwow); size_t wk_grp_num = std::min(size_t(MAX_PRNG_STATE / 256), (states_num + 255) / 256); size_t glb_sz = wk_grp_num * 256; @@ -55,7 +73,7 @@ static void InitKernelStateEmulator(std::vector& states, { size_t gid = i + j * glb_sz; rocrand_state_xorwow state_gid; - rocrand_init(miopen::deref(dropoutDesc).seed, gid, 0ULL, &state_gid); + rocrand_init(seed_val, gid, 0ULL, &state_gid); states[gid] = state_gid; } } @@ -119,8 +137,8 @@ void RunDropoutForwardEmulator(miopenHandle_t handle, size_t rsvsp_offset = 0) { (void)noise_shape; - auto in_dim = miopen::deref(inputTensor).GetNumDims(); - auto out_dim = miopen::deref(outputTensor).GetNumDims(); + auto in_dim = tensor_utils::GetNumDims(inputTensor); + auto out_dim = tensor_utils::GetNumDims(outputTensor); if(in_dim != out_dim) { printf("CPU verification: Input/Output dimension does not match\n"); @@ -132,13 +150,20 @@ void RunDropoutForwardEmulator(miopenHandle_t handle, printf("CPU verification: Only support 1D 
to 5D tensors\n"); } - if(miopen::deref(inputTensor).GetElementSize() != miopen::deref(outputTensor).GetElementSize()) + if(tensor_utils::GetElementSize(inputTensor) != tensor_utils::GetElementSize(outputTensor)) { printf("CPU verification: Input/Output element size does not match\n"); } - const auto use_mask = miopen::deref(dropoutDesc).use_mask; - const auto dropout_rate = miopen::deref(dropoutDesc).dropout; + float dropout_rate_f = 0; + void* states_ptr_unused = nullptr; + unsigned long long seed_uu = 0; + bool use_mask = false; + bool state_evo_unused = false; + miopenRNGType_t rng_unused; + miopenGetDropoutDescriptor( + dropoutDesc, handle, &dropout_rate_f, &states_ptr_unused, &seed_uu, &use_mask, &state_evo_unused, &rng_unused); + const auto dropout_rate = dropout_rate_f; if(dropout_rate < 0.0 || dropout_rate > 1.0) { printf("CPU verification: Invalid dropout rate\n"); @@ -150,18 +175,20 @@ void RunDropoutForwardEmulator(miopenHandle_t handle, std::vector out_len(5, 1); std::vector out_str(5, 1); - ExpandTensorDim(miopen::deref(inputTensor).GetLengths(), - miopen::deref(inputTensor).GetStrides(), - miopen::deref(outputTensor).GetLengths(), - miopen::deref(outputTensor).GetStrides(), + ExpandTensorDim(tensor_utils::GetLengths(inputTensor), + tensor_utils::GetStrides(inputTensor), + tensor_utils::GetLengths(outputTensor), + tensor_utils::GetStrides(outputTensor), in_len, in_str, out_len, out_str); + int maxGridDimX = 0; + hipDeviceGetAttribute(&maxGridDimX, hipDeviceAttributeMaxGridDimX, 0); const size_t glb_sz = std::min( - size_t(std::min(size_t(MAX_PRNG_STATE), miopen::deref(handle).GetImage3dMaxWidth()) / + size_t(std::min(size_t(MAX_PRNG_STATE), static_cast(maxGridDimX)) / 256), ((in_len[4] * in_len[3] * in_len[2] * in_len[1] * in_len[0] + 255) / 256)) * 256; @@ -251,6 +278,7 @@ struct Indexer template void RunDropoutBackwardEmulator(const miopenDropoutDescriptor_t dropoutDesc, + miopenHandle_t handle, const miopenTensorDescriptor_t outputTensor, 
std::vector& dout, const miopenTensorDescriptor_t inputTensor, @@ -260,8 +288,8 @@ void RunDropoutBackwardEmulator(const miopenDropoutDescriptor_t dropoutDesc, size_t out_offset = 0, size_t rsvsp_offset = 0) { - auto in_dim = miopen::deref(inputTensor).GetNumDims(); - auto out_dim = miopen::deref(outputTensor).GetNumDims(); + auto in_dim = tensor_utils::GetNumDims(inputTensor); + auto out_dim = tensor_utils::GetNumDims(outputTensor); if(in_dim != out_dim) { printf("CPU verification: Input/Output dimension does not match\n"); @@ -273,12 +301,14 @@ void RunDropoutBackwardEmulator(const miopenDropoutDescriptor_t dropoutDesc, printf("CPU verification: Only support 1D to 5D tensors\n"); } - if(miopen::deref(inputTensor).GetElementSize() != miopen::deref(outputTensor).GetElementSize()) + if(tensor_utils::GetElementSize(inputTensor) != tensor_utils::GetElementSize(outputTensor)) { printf("CPU verification: Input/Output element size does not match\n"); } - const auto dropout_rate = miopen::deref(dropoutDesc).dropout; + float bwd_dropout_f = 0; + miopenGetDropoutDescriptor(dropoutDesc, handle, &bwd_dropout_f, nullptr, nullptr, nullptr, nullptr, nullptr); + const auto dropout_rate = bwd_dropout_f; if(dropout_rate < 0.0 || dropout_rate > 1.0) { printf("CPU verification: Invalid dropout rate\n"); @@ -290,10 +320,10 @@ void RunDropoutBackwardEmulator(const miopenDropoutDescriptor_t dropoutDesc, std::vector out_len(5, 1); std::vector out_str(5, 1); - ExpandTensorDim(miopen::deref(inputTensor).GetLengths(), - miopen::deref(inputTensor).GetStrides(), - miopen::deref(outputTensor).GetLengths(), - miopen::deref(outputTensor).GetStrides(), + ExpandTensorDim(tensor_utils::GetLengths(inputTensor), + tensor_utils::GetStrides(inputTensor), + tensor_utils::GetLengths(outputTensor), + tensor_utils::GetStrides(outputTensor), in_len, in_str, out_len, @@ -323,6 +353,7 @@ void RunDropoutBackwardEmulator(const miopenDropoutDescriptor_t dropoutDesc, template void 
RunDropoutBackwardEmulatorMT(const miopenDropoutDescriptor_t dropoutDesc, + miopenHandle_t handle, const miopenTensorDescriptor_t outputTensor, std::vector& dout, const miopenTensorDescriptor_t inputTensor, @@ -332,8 +363,8 @@ void RunDropoutBackwardEmulatorMT(const miopenDropoutDescriptor_t dropoutDesc, size_t out_offset = 0, size_t rsvsp_offset = 0) { - auto in_dim = miopen::deref(inputTensor).GetNumDims(); - auto out_dim = miopen::deref(outputTensor).GetNumDims(); + auto in_dim = tensor_utils::GetNumDims(inputTensor); + auto out_dim = tensor_utils::GetNumDims(outputTensor); if(in_dim != out_dim) { printf("CPU verification: Input/Output dimension does not match\n"); @@ -345,12 +376,14 @@ void RunDropoutBackwardEmulatorMT(const miopenDropoutDescriptor_t dropoutDesc, printf("CPU verification: Only support 1D to 5D tensors\n"); } - if(miopen::deref(inputTensor).GetElementSize() != miopen::deref(outputTensor).GetElementSize()) + if(tensor_utils::GetElementSize(inputTensor) != tensor_utils::GetElementSize(outputTensor)) { printf("CPU verification: Input/Output element size does not match\n"); } - const auto dropout_rate = miopen::deref(dropoutDesc).dropout; + float mt_dropout_f = 0; + miopenGetDropoutDescriptor(dropoutDesc, handle, &mt_dropout_f, nullptr, nullptr, nullptr, nullptr, nullptr); + const auto dropout_rate = mt_dropout_f; if(dropout_rate < 0.0 || dropout_rate > 1.0) { printf("CPU verification: Invalid dropout rate\n"); @@ -362,10 +395,10 @@ void RunDropoutBackwardEmulatorMT(const miopenDropoutDescriptor_t dropoutDesc, std::vector out_len(5, 1); std::vector out_str(5, 1); - ExpandTensorDim(miopen::deref(inputTensor).GetLengths(), - miopen::deref(inputTensor).GetStrides(), - miopen::deref(outputTensor).GetLengths(), - miopen::deref(outputTensor).GetStrides(), + ExpandTensorDim(tensor_utils::GetLengths(inputTensor), + tensor_utils::GetStrides(inputTensor), + tensor_utils::GetLengths(outputTensor), + tensor_utils::GetStrides(outputTensor), in_len, in_str, 
out_len, diff --git a/projects/miopen/driver/getitem_driver.hpp b/projects/miopen/driver/getitem_driver.hpp index d27fdd6af1a9..ce18754e0954 100644 --- a/projects/miopen/driver/getitem_driver.hpp +++ b/projects/miopen/driver/getitem_driver.hpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -60,10 +61,10 @@ int32_t mloGetitemBackwardRunHost(miopenTensorDescriptor_t dyDesc, int32_t* slices, uint32_t /*offset*/) { - auto dy_dims = miopen::deref(dyDesc).GetLengths(); + auto dy_dims = tensor_utils::GetLengths(dyDesc); auto dy_numel = std::accumulate(dy_dims.begin(), dy_dims.end(), 1L, std::multiplies()); - auto dx_dims = miopen::deref(dxDesc).GetLengths(); - auto index_dims = miopen::deref(indexDescs[0]).GetLengths(); + auto dx_dims = tensor_utils::GetLengths(dxDesc); + auto index_dims = tensor_utils::GetLengths(indexDescs[0]); auto index_numel = std::accumulate(index_dims.begin(), index_dims.end(), 1L, std::multiplies()); auto element_index = std::vector(indexCount * index_numel + indexCount); diff --git a/projects/miopen/driver/glu_driver.hpp b/projects/miopen/driver/glu_driver.hpp index 42a7018891f1..83838be0467f 100644 --- a/projects/miopen/driver/glu_driver.hpp +++ b/projects/miopen/driver/glu_driver.hpp @@ -41,6 +41,7 @@ #include #include +#include #include template @@ -54,7 +55,7 @@ int mloGLUForwardContiguousDim0RunHost(const Tgpu* input, miopenTensorDescriptor_t outputDesc, Tcheck* outputHost) { - auto output_numel = miopen::deref(outputDesc).GetElementSize(); + auto output_numel = tensor_utils::GetElementSize(outputDesc); auto inputFirstHalf = input; auto inputSecondHalf = input + output_numel; @@ -79,7 +80,7 @@ int mloGLUBackwardCongiguousDim0RunHost(const Tgpu* input, { int ret = 0; - auto outputGrad_numel = miopen::deref(outputGradDesc).GetElementSize(); + auto outputGrad_numel = tensor_utils::GetElementSize(outputGradDesc); auto inputFirstHalf = input; auto inputSecondHalf = input + outputGrad_numel; auto 
inputFistHalf_grad = inputGradHost; diff --git a/projects/miopen/driver/gru_verify_gemm.hpp b/projects/miopen/driver/gru_verify_gemm.hpp index 237d311b1c29..56941bea3fe6 100644 --- a/projects/miopen/driver/gru_verify_gemm.hpp +++ b/projects/miopen/driver/gru_verify_gemm.hpp @@ -136,7 +136,7 @@ void RunGRUForwardGEMMCPUVerify(miopenHandle_t handle, miopenDropoutGetStatesSize(handle, &statesSizeInBytes); size_t states_size = statesSizeInBytes / sizeof(rocrand_state_xorwow); dropout_states_host = std::vector(states_size); - InitKernelStateEmulator(dropout_states_host, dropoutDesc); + InitKernelStateEmulator(dropout_states_host, dropoutDesc, handle); std::array drop_in_len = {{batch_n, hy_h * bi}}; std::array drop_in_str = {{hy_stride, 1}}; @@ -889,6 +889,7 @@ void RunGRUBackwardDataGEMMCPUVerify(std::vector& din_host, std::vector& wkspace_host, bool use_dropout, miopenDropoutDescriptor_t dropoutDesc, + miopenHandle_t handle, bool hx_is_null = false, bool dhy_is_null = false) { @@ -1031,6 +1032,7 @@ void RunGRUBackwardDataGEMMCPUVerify(std::vector& din_host, if(use_dropout) { RunDropoutBackwardEmulator(dropoutDesc, + handle, dropout_inputTensor, dh_state, dropout_inputTensor, diff --git a/projects/miopen/driver/kthvalue_driver.hpp b/projects/miopen/driver/kthvalue_driver.hpp index 087593cf0f48..9693b6dde172 100644 --- a/projects/miopen/driver/kthvalue_driver.hpp +++ b/projects/miopen/driver/kthvalue_driver.hpp @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -241,9 +242,9 @@ int KthvalueDriver::AddCmdLineArgs() template int KthvalueDriver::AllocateBuffersAndCopy() { - size_t in_sz = miopen::deref(inputDesc).GetElementSize(); - size_t idx_sz = miopen::deref(indicesDesc).GetElementSize(); - size_t out_sz = miopen::deref(outputDesc).GetElementSize(); + size_t in_sz = tensor_utils::GetElementSize(inputDesc); + size_t idx_sz = tensor_utils::GetElementSize(indicesDesc); + size_t out_sz = tensor_utils::GetElementSize(outputDesc); uint32_t ctx = 0; 
diff --git a/projects/miopen/driver/layernorm_driver.hpp b/projects/miopen/driver/layernorm_driver.hpp index 7ed02f5ead23..2a82fdc25c17 100644 --- a/projects/miopen/driver/layernorm_driver.hpp +++ b/projects/miopen/driver/layernorm_driver.hpp @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include template @@ -308,22 +308,22 @@ int LayerNormDriver::AllocateBuffersAndCopy() db_dev = std::unique_ptr(new GPUMem(ctx, db_sz, sizeof(T))); workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); - in = tensor(miopen::deref(inputDesc)).generate(genT0val); - weight = tensor(miopen::deref(weightDesc)).generate(genT0val); - bias = tensor(miopen::deref(biasDesc)).generate(genT0val); - out = tensor(miopen::deref(outputDesc)).generate(genT0val); - mean = tensor(miopen::deref(meanDesc)).generate(genT0val); - rstd = tensor(miopen::deref(rstdDesc)).generate(genT0val); - dy = tensor(miopen::deref(dyDesc)).generate(genT0val); - dx = tensor(miopen::deref(dxDesc)).generate(genT0val); - dw = tensor(miopen::deref(dwDesc)).generate(genT0val); - db = tensor(miopen::deref(dbDesc)).generate(genT0val); - outhost = tensor(miopen::deref(outputDesc)).generate(genT0val); - meanhost = tensor(miopen::deref(meanDesc)).generate(genT0val); - rstdhost = tensor(miopen::deref(rstdDesc)).generate(genT0val); - dxhost = tensor(miopen::deref(dxDesc)).generate(genT0val); - dwhost = tensor(miopen::deref(dwDesc)).generate(genT0val); - dbhost = tensor(miopen::deref(dbDesc)).generate(genT0val); + in = tensor(tensor_utils::GetLengths(inputDesc), tensor_utils::GetStrides(inputDesc)).generate(genT0val); + weight = tensor(tensor_utils::GetLengths(weightDesc), tensor_utils::GetStrides(weightDesc)).generate(genT0val); + bias = tensor(tensor_utils::GetLengths(biasDesc), tensor_utils::GetStrides(biasDesc)).generate(genT0val); + out = tensor(tensor_utils::GetLengths(outputDesc), tensor_utils::GetStrides(outputDesc)).generate(genT0val); + mean = 
tensor(tensor_utils::GetLengths(meanDesc), tensor_utils::GetStrides(meanDesc)).generate(genT0val); + rstd = tensor(tensor_utils::GetLengths(rstdDesc), tensor_utils::GetStrides(rstdDesc)).generate(genT0val); + dy = tensor(tensor_utils::GetLengths(dyDesc), tensor_utils::GetStrides(dyDesc)).generate(genT0val); + dx = tensor(tensor_utils::GetLengths(dxDesc), tensor_utils::GetStrides(dxDesc)).generate(genT0val); + dw = tensor(tensor_utils::GetLengths(dwDesc), tensor_utils::GetStrides(dwDesc)).generate(genT0val); + db = tensor(tensor_utils::GetLengths(dbDesc), tensor_utils::GetStrides(dbDesc)).generate(genT0val); + outhost = tensor(tensor_utils::GetLengths(outputDesc), tensor_utils::GetStrides(outputDesc)).generate(genT0val); + meanhost = tensor(tensor_utils::GetLengths(meanDesc), tensor_utils::GetStrides(meanDesc)).generate(genT0val); + rstdhost = tensor(tensor_utils::GetLengths(rstdDesc), tensor_utils::GetStrides(rstdDesc)).generate(genT0val); + dxhost = tensor(tensor_utils::GetLengths(dxDesc), tensor_utils::GetStrides(dxDesc)).generate(genT0val); + dwhost = tensor(tensor_utils::GetLengths(dwDesc), tensor_utils::GetStrides(dwDesc)).generate(genT0val); + dbhost = tensor(tensor_utils::GetLengths(dbDesc), tensor_utils::GetStrides(dbDesc)).generate(genT0val); for(int i = 0; i < in_sz; i++) { diff --git a/projects/miopen/driver/lstm_verify_gemm.hpp b/projects/miopen/driver/lstm_verify_gemm.hpp index a761779738f4..16ec68f2c128 100644 --- a/projects/miopen/driver/lstm_verify_gemm.hpp +++ b/projects/miopen/driver/lstm_verify_gemm.hpp @@ -142,7 +142,7 @@ void RunLSTMForwardGEMMCPUVerify(miopenHandle_t handle, miopenDropoutGetStatesSize(handle, &statesSizeInBytes); size_t states_size = statesSizeInBytes / sizeof(rocrand_state_xorwow); dropout_states_host = std::vector(states_size); - InitKernelStateEmulator(dropout_states_host, dropoutDesc); + InitKernelStateEmulator(dropout_states_host, dropoutDesc, handle); std::array drop_in_len = {{batch_n, hy_h * bi}}; std::array 
drop_in_str = {{hy_stride, 1}}; @@ -727,6 +727,7 @@ void RunLSTMBackwardDataGEMMCPUVerify( std::vector& wkspace_host, bool use_dropout, miopenDropoutDescriptor_t dropoutDesc, + miopenHandle_t handle, bool cx_is_null = false, bool dhy_is_null = false, bool dcy_is_null = false) @@ -875,6 +876,7 @@ void RunLSTMBackwardDataGEMMCPUVerify( if(use_dropout) { RunDropoutBackwardEmulator(dropoutDesc, + handle, dropout_inputTensor, dh_state, dropout_inputTensor, diff --git a/projects/miopen/driver/miopen_ConvBatchNormActivHost.hpp b/projects/miopen/driver/miopen_ConvBatchNormActivHost.hpp index e6e6dad19f88..78d43691445a 100644 --- a/projects/miopen/driver/miopen_ConvBatchNormActivHost.hpp +++ b/projects/miopen/driver/miopen_ConvBatchNormActivHost.hpp @@ -29,9 +29,9 @@ #include "mloNeuronHost.hpp" +#include #include #include -#include #include #include @@ -49,7 +49,7 @@ int miopenBNSpatialFwdInferHost(miopenTensorDescriptor_t& inputTensor, { int nIn, cIn, hIn, wIn; miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - const auto tensorLayout = miopen::deref(inputTensor).GetLayout_t(); + const auto tensorLayout = tensor_utils::GetLayout(inputTensor); int n_batchs = nIn; int channels = cIn; @@ -106,7 +106,7 @@ int miopenBNPerActivFwdInferHost(miopenTensorDescriptor_t& inputTensor, int nIn, cIn, hIn, wIn; miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - const auto tensorLayout = miopen::deref(inputTensor).GetLayout_t(); + const auto tensorLayout = tensor_utils::GetLayout(inputTensor); int n_batchs = nIn; int channels = cIn; @@ -299,7 +299,8 @@ int ConvForwardCPU(const std::vector& in, int stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w; miopenConvolutionMode_t mode; - miopenPaddingMode_t pmode = miopen::deref(convDesc).paddingMode; + miopenPaddingMode_t pmode; + miopenGetConvolutionPaddingMode(convDesc, &pmode); miopenGetConvolutionDescriptor( convDesc, &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilation_h, &dilation_w); 
diff --git a/projects/miopen/driver/mloConvHost.hpp b/projects/miopen/driver/mloConvHost.hpp index ff7074f2c76f..38aa92229969 100644 --- a/projects/miopen/driver/mloConvHost.hpp +++ b/projects/miopen/driver/mloConvHost.hpp @@ -28,6 +28,7 @@ #define MLO_CONVHOST_H_ #include +#include #include #include @@ -908,11 +909,11 @@ bool mloVerify(const miopenTensorDescriptor_t& cpu_, size_t g_batch_stride, g_channel_stride, g_depth_stride, g_height_stride, g_width_stride; std::tie(n_batchs, n_channels, depth, height, width) = - miopen::GetNCDHW(spatial_dim, cpu.GetLengths()); + tensor_utils::GetNCDHW(spatial_dim, cpu.GetLengths()); std::tie(c_batch_stride, c_channel_stride, c_depth_stride, c_height_stride, c_width_stride) = - miopen::GetNCDHW(spatial_dim, cpu.GetStrides()); + tensor_utils::GetNCDHW(spatial_dim, cpu.GetStrides()); std::tie(g_batch_stride, g_channel_stride, g_depth_stride, g_height_stride, g_width_stride) = - miopen::GetNCDHW(spatial_dim, gpu.GetStrides()); + tensor_utils::GetNCDHW(spatial_dim, gpu.GetStrides()); bool match = true; double rms_accum = 0.0; @@ -1060,11 +1061,11 @@ bool mloVerify_mt(const miopenTensorDescriptor_t& cpu_, size_t g_batch_stride, g_channel_stride, g_depth_stride, g_height_stride, g_width_stride; std::tie(n_batchs, n_channels, depth, height, width) = - miopen::GetNCDHW(spatial_dim, cpu.GetLengths()); + tensor_utils::GetNCDHW(spatial_dim, cpu.GetLengths()); std::tie(c_batch_stride, c_channel_stride, c_depth_stride, c_height_stride, c_width_stride) = - miopen::GetNCDHW(spatial_dim, cpu.GetStrides()); + tensor_utils::GetNCDHW(spatial_dim, cpu.GetStrides()); std::tie(g_batch_stride, g_channel_stride, g_depth_stride, g_height_stride, g_width_stride) = - miopen::GetNCDHW(spatial_dim, gpu.GetStrides()); + tensor_utils::GetNCDHW(spatial_dim, gpu.GetStrides()); double rms_accum = 0.0; Tcheck_ worst_c_val = static_cast(0); diff --git a/projects/miopen/driver/mloGroupNormHost.hpp b/projects/miopen/driver/mloGroupNormHost.hpp index 
5a0db0239946..ce79248e2f1f 100644 --- a/projects/miopen/driver/mloGroupNormHost.hpp +++ b/projects/miopen/driver/mloGroupNormHost.hpp @@ -26,7 +26,6 @@ #ifndef MLO_GROUPNORMHOST_H_ #define MLO_GROUPNORMHOST_H_ -#include //////////////////////////////////////////////////////////// // @@ -44,9 +43,9 @@ int32_t mloGroupNormForwardRunHost(miopenTensorDescriptor_t inputDesc, miopenNormMode_t mode, bool use_multithread) { - auto dims = miopen::deref(inputDesc).GetLengths(); + auto dims = tensor_utils::GetLengths(inputDesc); - size_t numel = miopen::deref(inputDesc).GetElementSize(); + size_t numel = tensor_utils::GetElementSize(inputDesc); size_t numel_per_channel = numel / dims[0] / dims[1]; size_t num_channels = dims[1]; diff --git a/projects/miopen/driver/mloPReLUHost.hpp b/projects/miopen/driver/mloPReLUHost.hpp index 8bc5b6769f43..ccbf390538a3 100644 --- a/projects/miopen/driver/mloPReLUHost.hpp +++ b/projects/miopen/driver/mloPReLUHost.hpp @@ -46,8 +46,8 @@ int32_t mloPReLUBackwardRunHost(const miopenTensorDescriptor_t inputDesc, auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); - auto input_sz = miopen::deref(inputDesc).GetElementSize(); - auto weight_sz = miopen::deref(weightDesc).GetElementSize(); + auto input_sz = tensor_utils::GetElementSize(inputDesc); + auto weight_sz = tensor_utils::GetElementSize(weightDesc); auto weight_grad_collector = std::vector(input_sz); miopen::par_ford(input_sz)([&](int gid) { diff --git a/projects/miopen/driver/mloPoolingHost.hpp b/projects/miopen/driver/mloPoolingHost.hpp index 480e242d6e8f..5bbd28083bb2 100644 --- a/projects/miopen/driver/mloPoolingHost.hpp +++ b/projects/miopen/driver/mloPoolingHost.hpp @@ -33,6 +33,7 @@ #endif #include +#include #include #include #include @@ -115,10 +116,10 @@ struct TensorDimsStrides TensorDimsStrides(int spatial_dim, const miopen::TensorDescriptor& desc) { std::tie(n_batchs, n_outputs, 
depth, height, width) = - miopen::GetNCDHW(spatial_dim, desc.GetLengths()); + tensor_utils::GetNCDHW(spatial_dim, desc.GetLengths()); std::tie(n_stride, c_stride, d_stride, h_stride, w_stride) = - miopen::GetNCDHW(spatial_dim, desc.GetStrides()); + tensor_utils::GetNCDHW(spatial_dim, desc.GetStrides()); } }; diff --git a/projects/miopen/driver/multimarginloss_driver.hpp b/projects/miopen/driver/multimarginloss_driver.hpp index 69d2857187cf..70fa7df3a0ca 100644 --- a/projects/miopen/driver/multimarginloss_driver.hpp +++ b/projects/miopen/driver/multimarginloss_driver.hpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -294,7 +295,7 @@ int MultiMarginLossDriver::AllocateBuffersAndCopy() { I[i] = prng::gen_A_to_B(static_cast(-1), static_cast(1)); } - int C = miopen::deref(iDesc).GetLengths()[1]; + int C = tensor_utils::GetLengths(iDesc)[1]; // 0 to C - 1 for(int i = 0; i < t_sz; i++) { diff --git a/projects/miopen/driver/pool_driver.hpp b/projects/miopen/driver/pool_driver.hpp index 6df8f8d5e0c8..a65d4cfcc09a 100644 --- a/projects/miopen/driver/pool_driver.hpp +++ b/projects/miopen/driver/pool_driver.hpp @@ -37,9 +37,8 @@ #include "util_file.hpp" #include +#include #include -#include -#include #include #include @@ -190,7 +189,8 @@ int PoolDriver_impl::GetandSetData() int nOutStride, cOutStride, dOutStride, hOutStride, wOutStride; int nOut, cOut, dOut, hOut, wOut; miopenPoolingMode_t mode = miopenPoolingMax; - miopenPaddingMode_t pmode = miopen::deref(poolDesc).pmode; + miopenPaddingMode_t pmode; + miopenGetPoolingPaddingMode(poolDesc, &pmode); int windowDepth, windowHeight, windowWidth; int pad_d, pad_h, pad_w; int stride_d, stride_h, stride_w; @@ -229,9 +229,9 @@ int PoolDriver_impl::GetandSetData() miopenGetNdPoolingDescriptor( poolDesc, spatial_dim, &mode, nullptr, winV.data(), padV.data(), strV.data()); - std::tie(windowDepth, windowHeight, windowWidth) = miopen::tien<3>(winV); - std::tie(pad_d, pad_h, pad_w) = 
miopen::tien<3>(padV); - std::tie(stride_d, stride_h, stride_w) = miopen::tien<3>(strV); + std::tie(windowDepth, windowHeight, windowWidth) = tensor_utils::Tien<3>(winV); + std::tie(pad_d, pad_h, pad_w) = tensor_utils::Tien<3>(padV); + std::tie(stride_d, stride_h, stride_w) = tensor_utils::Tien<3>(strV); } else { @@ -400,17 +400,19 @@ int PoolDriver_impl::SetPoolDescriptorFromCmdLineArgs() out_filename = inflags.GetValueStr("out_data"); dump_root = inflags.GetValueStr("dump_root"); - std::initializer_list lens = {win_d, win_h, win_w}; - std::initializer_list pads = {pad_d, pad_h, pad_w}; - std::initializer_list strides = {stride_d, stride_h, stride_w}; - miopen::deref(poolDesc) = miopen::PoolingDescriptor(mode, - pmode, - lens.begin() + 3 - spatial_dim, - pads.begin() + 3 - spatial_dim, - strides.begin() + 3 - spatial_dim, - spatial_dim); - - miopen::deref(poolDesc).SetIndexType(index_type); + std::vector lens_v = {win_d, win_h, win_w}; + std::vector pads_v = {pad_d, pad_h, pad_w}; + std::vector strides_v = {stride_d, stride_h, stride_w}; + // Trim to spatial_dim elements from the end (skip leading 3d-only values for 2d) + int offset = 3 - spatial_dim; + miopenSetNdPoolingDescriptor(poolDesc, + mode, + spatial_dim, + lens_v.data() + offset, + pads_v.data() + offset, + strides_v.data() + offset); + + miopenSetPoolingIndexType(poolDesc, index_type); miopenSetPoolingWorkSpaceIndexMode( poolDesc, diff --git a/projects/miopen/driver/reducecalculation_driver.hpp b/projects/miopen/driver/reducecalculation_driver.hpp index f5ae0c4fe339..272d2b5cc069 100644 --- a/projects/miopen/driver/reducecalculation_driver.hpp +++ b/projects/miopen/driver/reducecalculation_driver.hpp @@ -37,8 +37,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -56,8 +56,8 @@ int32_t mloReduceCalculationForwardRunHost(miopenTensorDescriptor_t inputDesc, int32_t dim, miopenReduceCalculationNanPropagation_t nanPropagation) { - auto input_dims = 
miopen::deref(inputDesc).GetLengths(); - auto output_dims = miopen::deref(outputDesc).GetLengths(); + auto input_dims = tensor_utils::GetLengths(inputDesc); + auto output_dims = tensor_utils::GetLengths(outputDesc); auto reduce_size = input_dims[dim]; auto output_numel = diff --git a/projects/miopen/driver/reduceextreme_driver.hpp b/projects/miopen/driver/reduceextreme_driver.hpp index 1fc96e7c57a4..5cf31c5a60b3 100644 --- a/projects/miopen/driver/reduceextreme_driver.hpp +++ b/projects/miopen/driver/reduceextreme_driver.hpp @@ -36,8 +36,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -59,12 +59,12 @@ int32_t mloReduceExtremeForwardRunHost(miopenTensorDescriptor_t xDesc, int32_t* indicehost, int32_t dim) { - auto x_dims = miopen::deref(xDesc).GetLengths(); + auto x_dims = tensor_utils::GetLengths(xDesc); std::vector indice_dims; if(yhost) - indice_dims = miopen::deref(yDesc).GetLengths(); + indice_dims = tensor_utils::GetLengths(yDesc); else - indice_dims = miopen::deref(indiceDesc).GetLengths(); + indice_dims = tensor_utils::GetLengths(indiceDesc); int32_t reduce_size = static_cast(x_dims[dim]); auto indice_numel = diff --git a/projects/miopen/driver/rnn_driver.hpp b/projects/miopen/driver/rnn_driver.hpp index 99f2f3aa7ba4..6f278dfe6f03 100644 --- a/projects/miopen/driver/rnn_driver.hpp +++ b/projects/miopen/driver/rnn_driver.hpp @@ -858,7 +858,7 @@ int RNNDriver::RunForwardGPU() time_logger.StopAndPush(); } - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); if(inflags.GetValueInt("time") == 1) { printf("Forward RNN time results:\n"); @@ -1134,7 +1134,7 @@ int RNNDriver::RunBackwardGPU() time_logger.StopAndPush(); } - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); if(inflags.GetValueInt("time") == 1) { printf("Backward Data RNN time results:\n"); @@ -1174,7 +1174,7 @@ int RNNDriver::RunBackwardGPU() reservespace_dev->GetSize()); time_logger.StopAndPush(); } - 
miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); if(inflags.GetValueInt("time") == 1) { @@ -1433,7 +1433,8 @@ int RNNDriver::RunBackwardDataCPU() reservespace_host, workspace_host, bool(inflags.GetValueInt("use_dropout")), - DropoutDesc); + DropoutDesc, + GetHandle()); } else if(mode == miopenLSTM) { @@ -1461,7 +1462,8 @@ int RNNDriver::RunBackwardDataCPU() reservespace_host, workspace_host, bool(inflags.GetValueInt("use_dropout")), - DropoutDesc); + DropoutDesc, + GetHandle()); } else if(mode == miopenGRU) { @@ -1486,7 +1488,8 @@ int RNNDriver::RunBackwardDataCPU() reservespace_host, workspace_host, bool(inflags.GetValueInt("use_dropout")), - DropoutDesc); + DropoutDesc, + GetHandle()); } else { diff --git a/projects/miopen/driver/rnn_seq_driver.hpp b/projects/miopen/driver/rnn_seq_driver.hpp index 22c87592a368..67469b0a825f 100644 --- a/projects/miopen/driver/rnn_seq_driver.hpp +++ b/projects/miopen/driver/rnn_seq_driver.hpp @@ -39,9 +39,9 @@ #include #include +#include #include #include -#include #include #include @@ -760,7 +760,7 @@ int RNNSeqDriver::SetRNNDescriptorFromCmdLineArgs() // GetTensorSize broken So this is WA inline size_t Get3DNoVECTensorSize(miopenTensorDescriptor_t& tensor) { - assert(miopen::deref(tensor).IsPacked() && + assert(tensor_utils::IsPacked(tensor) && "GetTensorSize should not be used on an unpacked tensor."); const auto len = GetTensorLengths(tensor); size_t sz = std::accumulate(len.begin(), len.end(), 1ULL, std::multiplies()); @@ -1109,7 +1109,7 @@ int RNNSeqDriver::RunForwardGPU() time_logger.StopAndPush(); } - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); if(inflags.GetValueInt("time") == 1) { printf("Forward RNN time results:\n"); @@ -1207,7 +1207,7 @@ int RNNSeqDriver::RunBackwardGPU() time_logger.StopAndPush(); } - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); if(inflags.GetValueInt("time") == 1) { printf("Backward Data RNN time 
results:\n"); @@ -1279,7 +1279,7 @@ int RNNSeqDriver::RunBackwardGPU() time_logger.StopAndPush(); } - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); if(inflags.GetValueInt("time") == 1) { @@ -1644,7 +1644,8 @@ int RNNSeqDriver::RunBackwardDataCPU() reservespace_host, workspace_host, bool(inflags.GetValueInt("use_dropout")), - DropoutDesc); + DropoutDesc, + GetHandle()); } else if(mode == miopenLSTM) { @@ -1672,7 +1673,8 @@ int RNNSeqDriver::RunBackwardDataCPU() reservespace_host, workspace_host, bool(inflags.GetValueInt("use_dropout")), - DropoutDesc); + DropoutDesc, + GetHandle()); } else if(mode == miopenGRU) { @@ -1697,7 +1699,8 @@ int RNNSeqDriver::RunBackwardDataCPU() reservespace_host, workspace_host, bool(inflags.GetValueInt("use_dropout")), - DropoutDesc); + DropoutDesc, + GetHandle()); } else { diff --git a/projects/miopen/driver/rnn_verify_gemm.hpp b/projects/miopen/driver/rnn_verify_gemm.hpp index 04b73111513d..7494b0bedd82 100644 --- a/projects/miopen/driver/rnn_verify_gemm.hpp +++ b/projects/miopen/driver/rnn_verify_gemm.hpp @@ -140,7 +140,7 @@ void RunRNNForwardGEMMCPUVerify(miopenHandle_t handle, miopenDropoutGetStatesSize(handle, &statesSizeInBytes); size_t states_size = statesSizeInBytes / sizeof(rocrand_state_xorwow); dropout_states_host = std::vector(states_size); - InitKernelStateEmulator(dropout_states_host, dropoutDesc); + InitKernelStateEmulator(dropout_states_host, dropoutDesc, handle); std::array drop_in_len = {{batch_n, hy_h * bi}}; std::array drop_in_str = {{hy_stride, 1}}; @@ -600,6 +600,7 @@ void RunRNNBackwardDataGEMMCPUVerify(std::vector& din_host, std::vector& wkspace_host, bool use_dropout, miopenDropoutDescriptor_t dropoutDesc, + miopenHandle_t handle, bool dhy_is_null = false) { /* @@ -738,6 +739,7 @@ void RunRNNBackwardDataGEMMCPUVerify(std::vector& din_host, if(use_dropout) { RunDropoutBackwardEmulator(dropoutDesc, + handle, dropout_inputTensor, dh_state, dropout_inputTensor, diff --git 
a/projects/miopen/driver/rope_driver.hpp b/projects/miopen/driver/rope_driver.hpp index d44c7f7e411d..fb4c8bf3dfd4 100644 --- a/projects/miopen/driver/rope_driver.hpp +++ b/projects/miopen/driver/rope_driver.hpp @@ -36,8 +36,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -53,8 +53,8 @@ int32_t mloRoPEForwardRunHost(miopenTensorDescriptor_t xDesc, Tgpu* sin, Tcheck* yhost) { - auto x_dims = miopen::deref(xDesc).GetLengths(); - auto cos_dims = miopen::deref(cosDesc).GetLengths(); + auto x_dims = tensor_utils::GetLengths(xDesc); + auto cos_dims = tensor_utils::GetLengths(cosDesc); auto input_numel = std::accumulate(x_dims.begin(), x_dims.end(), 1LL, std::multiplies()); auto rotary_numel = @@ -90,10 +90,10 @@ int32_t mloRoPEBackwardRunHost(miopenTensorDescriptor_t dyDesc, Tgpu* sin, Tcheck* dxhost) { - auto dy_dims = miopen::deref(dyDesc).GetLengths(); + auto dy_dims = tensor_utils::GetLengths(dyDesc); auto input_numel = std::accumulate(dy_dims.begin(), dy_dims.end(), 1LL, std::multiplies()); - auto cos_dims = miopen::deref(cosDesc).GetLengths(); + auto cos_dims = tensor_utils::GetLengths(cosDesc); auto rotary_numel = std::accumulate(cos_dims.begin(), cos_dims.end(), 1LL, std::multiplies()); diff --git a/projects/miopen/driver/softmarginloss_driver.hpp b/projects/miopen/driver/softmarginloss_driver.hpp index 9367ecdf27da..0793c9aef42e 100644 --- a/projects/miopen/driver/softmarginloss_driver.hpp +++ b/projects/miopen/driver/softmarginloss_driver.hpp @@ -33,6 +33,7 @@ #include "random.hpp" #include #include +#include #include #include #include @@ -49,7 +50,7 @@ int32_t mloSoftMarginLossForwardRunHost(miopenTensorDescriptor_t inputDesc, Tcheck* outputhost, miopenLossReductionMode_t reduction_mode) { - auto input_numel = miopen::deref(inputDesc).GetElementSize(); + auto input_numel = tensor_utils::GetElementSize(inputDesc); auto i_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); auto t_tv = 
miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); auto o_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); @@ -87,7 +88,7 @@ int32_t mloSoftMarginLossBackwardRunHost(miopenTensorDescriptor_t inputDesc, Tcheck* dIhost, miopenLossReductionMode_t reduction_mode) { - auto input_numel = miopen::deref(inputDesc).GetElementSize(); + auto input_numel = tensor_utils::GetElementSize(inputDesc); auto i_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); auto t_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); auto dO_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dODesc)); diff --git a/projects/miopen/driver/t5layernorm_driver.hpp b/projects/miopen/driver/t5layernorm_driver.hpp index 71ff04637929..862ffbc57aa0 100644 --- a/projects/miopen/driver/t5layernorm_driver.hpp +++ b/projects/miopen/driver/t5layernorm_driver.hpp @@ -37,7 +37,6 @@ #include #include #include -#include #include template @@ -50,7 +49,7 @@ int32_t mloT5LayerNormForwardRunHost(miopenTensorDescriptor_t xDesc, miopenNormMode_t mode, bool use_multithread) { - auto dims = miopen::deref(xDesc).GetLengths(); + auto dims = tensor_utils::GetLengths(xDesc); size_t outer_size = 1; size_t inner_size = dims[dims.size() - 1]; @@ -97,7 +96,7 @@ int32_t mloT5LayerNormBackwardRunHost(miopenTensorDescriptor_t dyDesc, miopenNormMode_t mode, bool use_multithread) { - auto dims = miopen::deref(dyDesc).GetLengths(); + auto dims = tensor_utils::GetLengths(dyDesc); size_t outer_size = 1; size_t inner_size = dims[dims.size() - 1]; @@ -148,7 +147,7 @@ int32_t mloT5LayerNormBackwardweightRunHost(miopenTensorDescriptor_t dyDesc, Tcheck* dwhost, bool use_multithread) { - auto dims = miopen::deref(dyDesc).GetLengths(); + auto dims = tensor_utils::GetLengths(dyDesc); size_t outer_size = 1; size_t inner_size = dims[dims.size() - 1]; diff --git a/projects/miopen/driver/tensor_driver.hpp b/projects/miopen/driver/tensor_driver.hpp index 0d1dd224f960..b26c49a30404 100644 --- 
a/projects/miopen/driver/tensor_driver.hpp +++ b/projects/miopen/driver/tensor_driver.hpp @@ -29,10 +29,10 @@ #define UNPACK_VEC4(v) (v[0]), (v[1]), (v[2]), (v[3]) #include +#include #include #include #include -#include #include #include #include @@ -119,7 +119,7 @@ inline std::vector GetTensorLengths(const miopenTensorDescriptor_t& tensor) } std::vector tensor_len; - tensor_len.resize(miopen::deref(tensor).GetNumDims()); + tensor_len.resize(tensor_utils::GetNumDims(tensor)); miopenGetTensorDescriptor(tensor, nullptr, tensor_len.data(), nullptr); return tensor_len; @@ -149,7 +149,7 @@ inline std::vector GetTensorStrides(const miopenTensorDescriptor_t& tensor) } std::vector tensor_strides; - tensor_strides.resize(miopen::deref(tensor).GetNumDims()); + tensor_strides.resize(tensor_utils::GetNumDims(tensor)); miopenGetTensorDescriptor(tensor, nullptr, nullptr, tensor_strides.data()); @@ -255,7 +255,7 @@ inline int SetTensorNd(miopenTensorDescriptor_t t, // The implementation is a copy-paste from miopen::TensorDescriptor. inline size_t GetTensorSize(const miopenTensorDescriptor_t& tensor) { - assert(miopen::deref(tensor).IsPacked() && + assert(tensor_utils::IsPacked(tensor) && "GetTensorSize should not be used on an unpacked tensor."); const auto len = GetTensorLengths(tensor); const size_t vectorLength = GetTensorVectorLength(tensor); @@ -269,7 +269,7 @@ inline size_t GetTensorSize(const miopenTensorDescriptor_t& tensor) // GetTensorSize. Unless, of course, there is absolutely zero chance to receive an unpacked tensor. 
inline size_t GetTensorSpace(const miopenTensorDescriptor_t& tensor) { - return miopen::deref(tensor).GetElementSpace(); + return tensor_utils::GetElementSpace(tensor); } #endif // GUARD_MIOPEN_TENSOR_DRIVER_HPP diff --git a/projects/miopen/driver/tensorop_driver.hpp b/projects/miopen/driver/tensorop_driver.hpp index 4ed36b72ab6b..fa5a99a77bb1 100644 --- a/projects/miopen/driver/tensorop_driver.hpp +++ b/projects/miopen/driver/tensorop_driver.hpp @@ -36,7 +36,7 @@ #include #include -#include +#include template class TensorOpDriver : public Driver @@ -280,7 +280,7 @@ int TensorOpDriver::RunForwardGPU() else if(is_scale) miopenScaleTensor(GetHandle(), aTensor, a_dev->GetMem(), &ftensor_val); - miopen::deref(GetHandle()).Finish(); + (void)hipStreamSynchronize(GetStream()); STOP_TIME if(WALL_CLOCK) @@ -312,9 +312,9 @@ int TensorOpDriver::RunForwardGPU() avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(aTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(aTensor)); size_t dataSz = - in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(aTensor).GetType()); + in_n * in_c * in_h * in_w * tensor_utils::GetTypeSize(tensor_utils::GetType(aTensor)); printf("stats: name, bytesRead, bytesWritten, GB/s, timeMs\n"); printf("stats: tensor op, %zu, %zu, %f, %f\n", diff --git a/projects/miopen/driver/timer.hpp b/projects/miopen/driver/timer.hpp index 4af4c9a2ab9f..fed24e92cebb 100644 --- a/projects/miopen/driver/timer.hpp +++ b/projects/miopen/driver/timer.hpp @@ -28,7 +28,29 @@ #include #include -#include +#include +#include + +namespace driver_timer_detail { + +struct HipEventDeleter +{ + void operator()(hipEvent_t e) const + { + if(e != nullptr) + (void)hipEventDestroy(e); + } +}; +using HipEventPtr = std::unique_ptr; + +inline HipEventPtr make_hip_event() +{ + hipEvent_t e = nullptr; + (void)hipEventCreateWithFlags(&e, hipEventDefault); 
+ return HipEventPtr{e}; +} + +} // namespace driver_timer_detail #define WALL_CLOCK inflags.GetValueInt("wall") @@ -154,8 +176,8 @@ class RNNCombTimeLogger endEvent.reserve(size); for(auto i = size; i > 0; --i) { - startEvent.push_back(miopen::make_hip_event()); - endEvent.push_back(miopen::make_hip_event()); + startEvent.push_back(driver_timer_detail::make_hip_event()); + endEvent.push_back(driver_timer_detail::make_hip_event()); } } } @@ -276,8 +298,8 @@ class RNNCombTimeLogger private: std::vector hostTimePerLaunch; - std::vector startEvent; - std::vector endEvent; + std::vector startEvent; + std::vector endEvent; hipStream_t stream; std::chrono::time_point st; diff --git a/projects/miopen/driver/transformers_adam_w_driver.hpp b/projects/miopen/driver/transformers_adam_w_driver.hpp index a1cd81f2eb53..09208ead083a 100644 --- a/projects/miopen/driver/transformers_adam_w_driver.hpp +++ b/projects/miopen/driver/transformers_adam_w_driver.hpp @@ -34,8 +34,8 @@ #include +#include #include -#include #include #include @@ -405,7 +405,7 @@ int TransformersAdamWDriver::RunForwardCPU() const auto exp_avg_sqs = exp_avg_sq_host.data(); const auto step = iter; - const size_t numel = miopen::deref(paramDesc).GetElementSize(); + const size_t numel = tensor_utils::GetElementSize(paramDesc); const float bias_correction1 = 1.0 - pow(beta1, step); const float bias_correction2 = 1.0 - pow(beta2, step); diff --git a/projects/miopen/include/miopen/miopen.h b/projects/miopen/include/miopen/miopen.h index ac8a7a7360f8..2f1eaba6887d 100644 --- a/projects/miopen/include/miopen/miopen.h +++ b/projects/miopen/include/miopen/miopen.h @@ -825,6 +825,25 @@ MIOPEN_EXPORT miopenStatus_t miopenGetTensorDescriptor(miopenTensorDescriptor_t int* dimsA, int* stridesA); +#ifdef MIOPEN_BETA_API +/*! @brief Get the details of the tensor descriptor + * + * Returns the same information as miopenGetTensorDescriptor() but uses size_t + * arrays, matching miopenSetTensorDescriptorV2(). 
This avoids truncation for + * tensors whose strides exceed INT_MAX. + * + * @param tensorDesc Tensor descriptor (input) + * @param dataType MIOpen datatype (output) + * @param dimsA Array containing the size of dimensions (output) + * @param stridesA Array containing the size of stride (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenGetTensorDescriptorV2(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t* dataType, + size_t* dimsA, + size_t* stridesA); +#endif + /*! @brief Destroys the tensor descriptor * * @param tensorDesc Tensor descriptor (input) @@ -919,6 +938,48 @@ MIOPEN_EXPORT miopenStatus_t miopenScaleTensor(miopenHandle_t handle, MIOPEN_EXPORT miopenStatus_t miopenGetTensorNumBytes(miopenTensorDescriptor_t tensorDesc, size_t* numBytes); +/*! @brief Get the layout of a tensor descriptor + * + * @param tensorDesc Tensor descriptor (input) + * @param layout Pointer to the tensor layout enum value (output). + * Set to miopenTensorNCHW if no explicit layout was specified. + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenGetTensorLayout(miopenTensorDescriptor_t tensorDesc, + miopenTensorLayout_t* layout); + +/*! @brief Get the total element space of a tensor descriptor + * + * Returns the total number of elements including padding/stride gaps. + * This is the minimum number of elements that must be allocated to hold the tensor. + * + * @param tensorDesc Tensor descriptor (input) + * @param elementSpace Pointer to the element space (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenGetTensorElementSpace(miopenTensorDescriptor_t tensorDesc, + size_t* elementSpace); + +/*! 
@brief Query whether a tensor is packed (contiguous in memory with no stride gaps) + * + * @param tensorDesc Tensor descriptor (input) + * @param isPacked Pointer to boolean result: true if packed (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenIsTensorPacked(miopenTensorDescriptor_t tensorDesc, + bool* isPacked); + +/*! @brief Get the vector length of a vectorized tensor descriptor + * + * For non-vectorized tensor layouts this returns 1. + * + * @param tensorDesc Tensor descriptor (input) + * @param vectorLength Pointer to the vector length (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenGetTensorVectorLength(miopenTensorDescriptor_t tensorDesc, + size_t* vectorLength); + /*! @brief Copies one tensor to another tensor with a different layout/scale. * * This function implements: @@ -1064,6 +1125,16 @@ miopenGetConvolutionNdDescriptor(miopenConvolutionDescriptor_t convDesc, MIOPEN_EXPORT miopenStatus_t miopenGetConvolutionGroupCount(miopenConvolutionDescriptor_t convDesc, int* groupCount); +/*! @brief Get the padding mode of the convolution descriptor + * + * @param convDesc Convolution layer descriptor (input) + * @param paddingMode Pointer to the padding mode (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetConvolutionPaddingMode(miopenConvolutionDescriptor_t convDesc, + miopenPaddingMode_t* paddingMode); + /*! @brief Set the number of groups to be used in Group/Depthwise convolution * * Must be called before all computational APIs of group/depthwise convolution, it is preferable to @@ -2231,6 +2302,16 @@ MIOPEN_EXPORT miopenStatus_t miopenSetPoolingIndexType(miopenPoolingDescriptor_t MIOPEN_EXPORT miopenStatus_t miopenGetPoolingIndexType(miopenPoolingDescriptor_t poolDesc, miopenIndexType_t* index_type); +/*! 
@brief Get the padding mode of the pooling descriptor + * + * @param poolDesc Pooling layer descriptor (input) + * @param paddingMode Pointer to the padding mode (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetPoolingPaddingMode(miopenPoolingDescriptor_t poolDesc, + miopenPaddingMode_t* paddingMode); + /*! @brief Set workspace index mode for pooling layer. The default mode is * miopenPoolingWorkSpaceIndexMask. * @@ -8048,6 +8129,33 @@ MIOPEN_EXPORT miopenStatus_t miopenSetTuningPolicy(miopenHandle_t handle, MIOPEN_EXPORT miopenStatus_t miopenGetTuningPolicy(miopenHandle_t handle, miopenTuningPolicy_t* value); +/*! @enum miopenDebugFlag_t + * Debug flags that control internal library behavior for testing and debugging. + */ +typedef enum +{ + miopenDebugLoggingQuiet = 0, /*!< Suppress all logging output */ + miopenDebugFindEnforceDisable = 1, /*!< Disable MIOPEN_FIND_ENFORCE */ + miopenDebugIsWarmupOngoing = 2, /*!< Signal that a warmup pass is in progress */ + miopenDebugAlwaysEnableConvDirectNaive = 3, /*!< Force-enable the naive direct conv solver */ +} miopenDebugFlag_t; + +/*! @brief Set a debug flag + * + * @param flag The debug flag to set (input) + * @param value The value to set (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenSetDebugFlag(miopenDebugFlag_t flag, bool value); + +/*! 
@brief Get the current value of a debug flag + * + * @param flag The debug flag to query (input) + * @param value Pointer to the output value (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenGetDebugFlag(miopenDebugFlag_t flag, bool* value); + #ifdef __cplusplus } #endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp index f762f80f280c..a1104b17f060 100644 --- a/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp +++ b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp @@ -180,6 +180,15 @@ struct tensor template tensor(miopenTensorLayout_t layout, const std::vector& dims, const std::vector& strides) + : tensor(layout, + std::vector(dims.begin(), dims.end()), + std::vector(strides.begin(), strides.end())) + { + } + + tensor(miopenTensorLayout_t layout, + const std::vector& dims, + const std::vector& strides) : desc(miopen_type{}, layout, dims, strides), data(desc.GetElementSpace()) { assert(dims.size() == strides.size()); diff --git a/projects/miopen/src/convolution_api.cpp b/projects/miopen/src/convolution_api.cpp index 18d5c6496e95..12a19ac94044 100644 --- a/projects/miopen/src/convolution_api.cpp +++ b/projects/miopen/src/convolution_api.cpp @@ -181,6 +181,15 @@ extern "C" miopenStatus_t miopenGetConvolutionGroupCount(miopenConvolutionDescri return miopen::try_([&] { miopen::deref(groupCount) = miopen::deref(convDesc).group_count; }); } +MIOPEN_EXPORT extern "C" miopenStatus_t +miopenGetConvolutionPaddingMode(miopenConvolutionDescriptor_t convDesc, + miopenPaddingMode_t* paddingMode) +{ + MIOPEN_LOG_FUNCTION(convDesc); + return miopen::try_( + [&] { miopen::deref(paddingMode) = miopen::deref(convDesc).paddingMode; }); +} + MIOPEN_EXPORT extern "C" miopenStatus_t miopenSetConvolutionGroupCount(miopenConvolutionDescriptor_t convDesc, int groupCount) diff --git a/projects/miopen/src/handle_api.cpp 
b/projects/miopen/src/handle_api.cpp index abcab18ca76d..e0e87df4420e 100644 --- a/projects/miopen/src/handle_api.cpp +++ b/projects/miopen/src/handle_api.cpp @@ -25,8 +25,12 @@ *******************************************************************************/ #include #include +#include #include +#include +#include #include +#include extern "C" const char* miopenGetErrorString(miopenStatus_t error) { @@ -120,3 +124,36 @@ extern "C" miopenStatus_t miopenEnableProfiling(miopenHandle_t handle, bool enab { return miopen::try_([&] { miopen::deref(handle).EnableProfiling(enable); }); } + +static bool* GetDebugFlagPtr(miopenDebugFlag_t flag) +{ + switch(flag) + { + case miopenDebugLoggingQuiet: return &miopen::debug::LoggingQuiet; + case miopenDebugFindEnforceDisable: return &miopen::debug::FindEnforceDisable; + case miopenDebugIsWarmupOngoing: return &miopen::debug::IsWarmupOngoing; + case miopenDebugAlwaysEnableConvDirectNaive: + return &miopen::debug::AlwaysEnableConvDirectNaive; + } + return nullptr; +} + +extern "C" miopenStatus_t miopenSetDebugFlag(miopenDebugFlag_t flag, bool value) +{ + return miopen::try_([&] { + bool* ptr = GetDebugFlagPtr(flag); + if(ptr == nullptr) + MIOPEN_THROW(miopenStatusBadParm, "Invalid debug flag"); + *ptr = value; + }); +} + +extern "C" miopenStatus_t miopenGetDebugFlag(miopenDebugFlag_t flag, bool* value) +{ + return miopen::try_([&] { + bool* ptr = GetDebugFlagPtr(flag); + if(ptr == nullptr) + MIOPEN_THROW(miopenStatusBadParm, "Invalid debug flag"); + miopen::deref(value) = *ptr; + }); +} diff --git a/projects/miopen/src/include/miopen/tensor_layout.hpp b/projects/miopen/src/include/miopen/tensor_layout.hpp index f5659d7dd3ef..416957618b57 100644 --- a/projects/miopen/src/include/miopen/tensor_layout.hpp +++ b/projects/miopen/src/include/miopen/tensor_layout.hpp @@ -27,11 +27,12 @@ #define GUARD_TENSOR_LAYOUT_HPP #include -#include #include -#include -#include #include +#include +#include +#include +#include namespace miopen { diff 
--git a/projects/miopen/src/pooling_api.cpp b/projects/miopen/src/pooling_api.cpp index 2ee1a1aeb617..a05c5ba7e1d3 100644 --- a/projects/miopen/src/pooling_api.cpp +++ b/projects/miopen/src/pooling_api.cpp @@ -145,6 +145,14 @@ extern "C" miopenStatus_t miopenGetPoolingIndexType(miopenPoolingDescriptor_t po return miopen::try_([&] { *index_type = miopen::deref(poolDesc).GetIndexType(); }); } +extern "C" miopenStatus_t +miopenGetPoolingPaddingMode(miopenPoolingDescriptor_t poolDesc, + miopenPaddingMode_t* paddingMode) +{ + MIOPEN_LOG_FUNCTION(poolDesc); + return miopen::try_([&] { miopen::deref(paddingMode) = miopen::deref(poolDesc).pmode; }); +} + extern "C" miopenStatus_t miopenSetPoolingWorkSpaceIndexMode(miopenPoolingDescriptor_t poolDesc, miopenPoolingWorkspaceIndexMode_t workspace_index) diff --git a/projects/miopen/src/tensor_api.cpp b/projects/miopen/src/tensor_api.cpp index ba52f5bc8e28..b07c4703df59 100644 --- a/projects/miopen/src/tensor_api.cpp +++ b/projects/miopen/src/tensor_api.cpp @@ -271,6 +271,35 @@ extern "C" miopenStatus_t miopenGetTensorNumBytes(miopenTensorDescriptor_t tenso return miopen::try_([&] { miopen::deref(numBytes) = miopen::deref(tensorDesc).GetNumBytes(); }); } +extern "C" miopenStatus_t miopenGetTensorLayout(miopenTensorDescriptor_t tensorDesc, + miopenTensorLayout_t* layout) +{ + MIOPEN_LOG_FUNCTION(tensorDesc); + return miopen::try_([&] { miopen::deref(layout) = miopen::deref(tensorDesc).GetLayout_t(); }); +} + +extern "C" miopenStatus_t miopenGetTensorElementSpace(miopenTensorDescriptor_t tensorDesc, + size_t* elementSpace) +{ + MIOPEN_LOG_FUNCTION(tensorDesc); + return miopen::try_( + [&] { miopen::deref(elementSpace) = miopen::deref(tensorDesc).GetElementSpace(); }); +} + +extern "C" miopenStatus_t miopenIsTensorPacked(miopenTensorDescriptor_t tensorDesc, bool* isPacked) +{ + MIOPEN_LOG_FUNCTION(tensorDesc); + return miopen::try_([&] { miopen::deref(isPacked) = miopen::deref(tensorDesc).IsPacked(); }); +} + +extern "C" 
miopenStatus_t miopenGetTensorVectorLength(miopenTensorDescriptor_t tensorDesc, + size_t* vectorLength) +{ + MIOPEN_LOG_FUNCTION(tensorDesc); + return miopen::try_( + [&] { miopen::deref(vectorLength) = miopen::deref(tensorDesc).GetVectorLength(); }); +} + // Internal API int miopenGetTensorDescriptorElementSize(miopenTensorDescriptor_t tensorDesc) { @@ -311,6 +340,32 @@ extern "C" miopenStatus_t miopenGetTensorDescriptor(miopenTensorDescriptor_t ten }); } +extern "C" miopenStatus_t miopenGetTensorDescriptorV2(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t* dataType, + size_t* dimsA, + size_t* stridesA) +{ + MIOPEN_LOG_FUNCTION(tensorDesc); + return miopen::try_([&] { + if(dataType != nullptr) + { + *dataType = miopen::deref(tensorDesc).GetType(); + } + if(dimsA != nullptr) + { + std::copy(miopen::deref(tensorDesc).GetLengths().begin(), + miopen::deref(tensorDesc).GetLengths().end(), + dimsA); + } + if(stridesA != nullptr) + { + std::copy(miopen::deref(tensorDesc).GetStrides().begin(), + miopen::deref(tensorDesc).GetStrides().end(), + stridesA); + } + }); +} + extern "C" miopenStatus_t miopenDestroyTensorDescriptor(miopenTensorDescriptor_t tensorDesc) { MIOPEN_LOG_FUNCTION(tensorDesc);