Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 165 additions & 0 deletions projects/miopen/common_utils/include/common_utils/tensor_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#ifndef GUARD_COMMON_UTILS_TENSOR_UTILS_HPP
#define GUARD_COMMON_UTILS_TENSOR_UTILS_HPP

#include <miopen/miopen.h>

#include <cassert>
#include <cstddef>
#include <numeric>
#include <ostream>
#include <tuple>
#include <utility>
#include <vector>

// Convenience wrappers around the public MIOpen tensor descriptor API.
// These replace direct use of miopen::deref(tensorDesc).Method() with
// clean single-call functions that use only the public C API.

namespace tensor_utils {

// Query how many dimensions the descriptor describes.
inline int GetNumDims(miopenTensorDescriptor_t desc)
{
    int ndims = 0;
    miopenGetTensorDescriptorSize(desc, &ndims);
    return ndims;
}

// Element data type stored in the descriptor.
// The out-parameter is value-initialized so a failing API call cannot
// leave us returning an indeterminate (uninitialized) enum value.
inline miopenDataType_t GetType(miopenTensorDescriptor_t desc)
{
    miopenDataType_t dt{};
    miopenGetTensorDescriptor(desc, &dt, nullptr, nullptr);
    return dt;
}

// Per-dimension lengths of the tensor, sized by GetNumDims.
inline std::vector<size_t> GetLengths(miopenTensorDescriptor_t desc)
{
    std::vector<size_t> lens(static_cast<size_t>(GetNumDims(desc)));
    miopenGetTensorDescriptorV2(desc, nullptr, lens.data(), nullptr);
    return lens;
}

// Per-dimension strides of the tensor, sized by GetNumDims.
inline std::vector<size_t> GetStrides(miopenTensorDescriptor_t desc)
{
    std::vector<size_t> strides(static_cast<size_t>(GetNumDims(desc)));
    miopenGetTensorDescriptorV2(desc, nullptr, nullptr, strides.data());
    return strides;
}

// Total byte footprint of the tensor as reported by MIOpen.
inline size_t GetNumBytes(miopenTensorDescriptor_t desc)
{
    size_t bytes = 0;
    miopenGetTensorNumBytes(desc, &bytes);
    return bytes;
}

// Total number of elements: the product of all dimension lengths.
// Uses std::accumulate instead of a hand-rolled loop; the identity is
// size_t{1} so the multiplication is carried out in size_t width.
inline size_t GetElementSize(miopenTensorDescriptor_t desc)
{
    const auto lens = GetLengths(desc);
    return std::accumulate(
        lens.begin(), lens.end(), size_t{1}, [](size_t a, size_t b) { return a * b; });
}

// Memory layout tag (e.g. NCHW/NHWC) of the tensor.
// The out-parameter is value-initialized so a failing API call cannot
// leave us returning an indeterminate (uninitialized) enum value.
inline miopenTensorLayout_t GetLayout(miopenTensorDescriptor_t desc)
{
    miopenTensorLayout_t layout{};
    miopenGetTensorLayout(desc, &layout);
    return layout;
}

// Number of elements spanned by the tensor including stride padding.
inline size_t GetElementSpace(miopenTensorDescriptor_t desc)
{
    size_t elementSpace = 0;
    miopenGetTensorElementSpace(desc, &elementSpace);
    return elementSpace;
}

// True when the tensor is densely packed (no gaps between elements).
inline bool IsPacked(miopenTensorDescriptor_t desc)
{
    bool isPacked = false;
    miopenIsTensorPacked(desc, &isPacked);
    return isPacked;
}

// Vectorization factor of the tensor (1 for non-vectorized layouts).
inline size_t GetVectorLength(miopenTensorDescriptor_t desc)
{
    size_t vectorLength = 0;
    miopenGetTensorVectorLength(desc, &vectorLength);
    return vectorLength;
}

// Returns the byte size of a MIOpen data type, or 0 for an
// unrecognized enumerator. Cases are grouped by element width.
inline size_t GetTypeSize(miopenDataType_t dt)
{
    switch(dt)
    {
    case miopenInt8:
    case miopenFloat8_fnuz:
    case miopenBFloat8_fnuz: return 1;
    case miopenHalf:
    case miopenBFloat16: return 2;
    case miopenFloat:
    case miopenInt32: return 4;
    case miopenDouble:
    case miopenInt64: return 8;
    default: return 0;
    }
}

namespace detail {
// Expands v[0..N-1] into a tuple via an index pack.
template <typename T, std::size_t... I>
auto TienImpl(const std::vector<T>& v, std::index_sequence<I...>)
{
    return std::make_tuple(v[I]...);
}
} // namespace detail

// Unpack the first N elements of a vector into a tuple of N values.
// Replacement for miopen::tien<N>(vec). Unlike the previous
// if-constexpr chain (which silently returned void for N outside 2..5),
// this works for any N; callers must supply at least N elements.
template <std::size_t N, typename T>
auto Tien(const std::vector<T>& v)
{
    assert(v.size() >= N);
    return detail::TienImpl(v, std::make_index_sequence<N>{});
}

// Given a spatial dimension count and a vector of lengths (or strides),
// return a tuple of (N, C, D, H, W) with D=1 for 2D tensors.
// Replacement for miopen::GetNCDHW.
template <class TElement>
auto GetNCDHW(unsigned spatial_dims, const std::vector<TElement>& data)
{
    const bool is_3d = (spatial_dims == 3);
    assert(data.size() >= (is_3d ? 5u : 4u));
    if(is_3d)
        return std::make_tuple(data[0], data[1], data[2], data[3], data[4]);
    // 2D case: synthesize a unit depth between C and H.
    return std::make_tuple(data[0], data[1], static_cast<TElement>(1), data[2], data[3]);
}

// Print a range with a separator, replacement for miopen::LogRange.
// Emits nothing for an empty range; the separator appears only between
// elements, never before the first or after the last.
template <typename Container>
std::ostream& LogRange(std::ostream& os, const Container& c, const char* sep)
{
    const char* delim = "";
    for(const auto& item : c)
    {
        os << delim << item;
        delim = sep;
    }
    return os;
}

} // namespace tensor_utils

#endif // GUARD_COMMON_UTILS_TENSOR_UTILS_HPP
37 changes: 27 additions & 10 deletions projects/miopen/driver/CBAInferFusion_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
#include <miopen_utils/cpu_bias.hpp>

#include <common_utils/errors.hpp>
#include <miopen/handle.hpp>
#include <common_utils/tensor_utils.hpp>
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>

Expand Down Expand Up @@ -175,7 +175,7 @@ class CBAInferFusionDriver : public Driver
avgtime += time;
}

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME

if(WALL_CLOCK)
Expand Down Expand Up @@ -672,7 +672,7 @@ int CBAInferFusionDriver<Tgpu, Tref>::SetConvDescriptorFromCmdLineArgs()
template <typename Tgpu, typename Tref>
std::vector<int> CBAInferFusionDriver<Tgpu, Tref>::GetOutputTensorLengths()
{
int ndim = miopen::deref(inputTensor).GetNumDims();
int ndim = tensor_utils::GetNumDims(inputTensor);

std::vector<int> out_lens(ndim);

Expand Down Expand Up @@ -1285,14 +1285,31 @@ void CBAInferFusionDriver<Tgpu, Tref>::runCPUConvFwdInference()
in_local_host.data = in_host;
wei_local_host.data = wei_host;
outhost_local_host.data.resize(outhost_local_host.desc.GetElementSpace());
cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(),

int cba_spatial_dim = 0;
miopenGetConvolutionSpatialDim(convDesc, &cba_spatial_dim);
std::vector<int> cba_pads(cba_spatial_dim);
std::vector<int> cba_strides(cba_spatial_dim);
std::vector<int> cba_dilations(cba_spatial_dim);
miopenConvolutionMode_t cba_mode_unused;
miopenGetConvolutionNdDescriptor(convDesc,
cba_spatial_dim,
&cba_spatial_dim,
cba_pads.data(),
cba_strides.data(),
cba_dilations.data(),
&cba_mode_unused);
int cba_group_count = 0;
miopenGetConvolutionGroupCount(convDesc, &cba_group_count);

cpu_convolution_forward(cba_spatial_dim,
in_local_host,
wei_local_host,
outhost_local_host,
miopen::deref(convDesc).GetConvPads(),
miopen::deref(convDesc).GetConvStrides(),
miopen::deref(convDesc).GetConvDilations(),
miopen::deref(convDesc).GetGroupCount());
cba_pads,
cba_strides,
cba_dilations,
cba_group_count);

if constexpr(!std::is_same_v<Tgpu, Tref>)
{
Expand All @@ -1305,8 +1322,8 @@ void CBAInferFusionDriver<Tgpu, Tref>::runCPUConvFwdInference()

if(bias_mode)
{
tensor<Tref> bias_local_host(miopen::deref(biasTensor).GetLengths(),
miopen::deref(biasTensor).GetStrides());
tensor<Tref> bias_local_host(tensor_utils::GetLengths(biasTensor),
tensor_utils::GetStrides(biasTensor));
bias_local_host.data = bias_host;
cpu_bias_forward(outhost_local_host, bias_local_host);
}
Expand Down
14 changes: 7 additions & 7 deletions projects/miopen/driver/activ_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
#include "timer.hpp"
#include "util_driver.hpp"

#include <common_utils/tensor_utils.hpp>
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>

#include <algorithm>
#include <cfloat>
Expand Down Expand Up @@ -329,7 +329,7 @@ int ActivationDriver<Tgpu, Tref>::RunForwardGPU()

ExecuteKernel();

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME
if(WALL_CLOCK)
{
Expand Down Expand Up @@ -373,9 +373,9 @@ int ActivationDriver<Tgpu, Tref>::RunForwardGPU()
avgtime / (iters - 1),
iters - 1);
int in_n, in_c, in_h, in_w;
std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths());
std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(inputTensor));
size_t dataSz =
in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType());
in_n * in_c * in_h * in_w * tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor));

// layer, readbytes, writebytes, BG/s, timeMS
printf("stats: name, bytesRead, bytesWritten, GB/s, timeMs\n");
Expand Down Expand Up @@ -432,7 +432,7 @@ int ActivationDriver<Tgpu, Tref>::RunBackwardGPU()

ExecuteKernel();

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME
if(WALL_CLOCK)
{
Expand Down Expand Up @@ -476,9 +476,9 @@ int ActivationDriver<Tgpu, Tref>::RunBackwardGPU()
avgtime / (iters - 1),
iters - 1);
int in_n, in_c, in_h, in_w;
std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths());
std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(inputTensor));
size_t dataSz =
in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType());
in_n * in_c * in_h * in_w * tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor));

// layer, readbytes, writebytes, BG/s, timeMS
printf("stats: name, bytesRead, bytesWritten, GB/s, timeMs\n");
Expand Down
4 changes: 2 additions & 2 deletions projects/miopen/driver/adam_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@

#include <common_utils/float_equal.hpp>
#include <common_utils/ford.hpp>
#include <common_utils/tensor_utils.hpp>
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>

#include <algorithm>
#include <cfloat>
Expand Down Expand Up @@ -73,7 +73,7 @@ void mloAdamRunHost(miopenTensorDescriptor_t paramDesc,
if(is_amp && found_inf)
return;

const size_t numel = miopen::deref(paramDesc).GetElementSize();
const size_t numel = tensor_utils::GetElementSize(paramDesc);

const float bias_correction1 = 1.0 - pow(beta1, step);
const float bias_correction2 = 1.0 - pow(beta2, step);
Expand Down
3 changes: 1 addition & 2 deletions projects/miopen/driver/addlayernorm_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
#include <cfloat>
#include <cstdlib>
#include <memory>
#include <miopen/tensor.hpp>
#include <numeric>
#include <vector>

Expand All @@ -56,7 +55,7 @@ int32_t mloAddLayerNormForwardRunHost(miopenTensorDescriptor_t inputDesc,
miopenNormMode_t mode,
bool use_multithread)
{
auto dims = miopen::deref(inputDesc).GetLengths();
auto dims = tensor_utils::GetLengths(inputDesc);
size_t outer_size = 1;
size_t inner_size = 1;
size_t norm_dim = static_cast<size_t>(normalized_dim);
Expand Down
15 changes: 7 additions & 8 deletions projects/miopen/driver/bn_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,8 @@
#include <miopen_utils/fusionHost.hpp>

#include <common_utils/errors.hpp>
#include <miopen/handle.hpp>
#include <common_utils/tensor_utils.hpp>
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>
#include "miopen/batch_norm.hpp"

#include <algorithm>
Expand Down Expand Up @@ -1308,7 +1307,7 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::RunForwardGPU()
iteration = i; // Modifies the captured reference for the case of not using HIP graph
ExecuteKernel();

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME
if(WALL_CLOCK)
{
Expand Down Expand Up @@ -1354,9 +1353,9 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::RunForwardGPU()
avgtime / (iters - 1),
iters - 1);
int in_n, in_c, in_h, in_w;
std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths());
std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(in.GetTensor().desc.GetLengths());
size_t M = in_n * in_c * in_h * in_w;
size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType());
size_t dataSz = (M + 2 * in_c) * tensor_utils::GetTypeSize(in.GetTensor().desc.GetType());
float rdCnt = -1.0;
float wrCnt = 1.0;
if(forw == 1)
Expand Down Expand Up @@ -1692,7 +1691,7 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::RunBackwardGPU()

ExecuteKernel();

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME
if(WALL_CLOCK)
{
Expand All @@ -1718,9 +1717,9 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::RunBackwardGPU()
avgtime += time;

int in_n, in_c, in_h, in_w;
std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths());
std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(in.GetTensor().desc.GetLengths());
size_t M = in_n * in_c * in_h * in_w;
size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType());
size_t dataSz = (M + 2 * in_c) * tensor_utils::GetTypeSize(in.GetTensor().desc.GetType());
float rdCnt = 2.0;
float wrCnt = 1.0;
// layer, flopCnt, reads, writes, GFLOPS, GB/s, timeMs
Expand Down
Loading
Loading