Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 165 additions & 0 deletions projects/miopen/common_utils/include/common_utils/tensor_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#ifndef GUARD_COMMON_UTILS_TENSOR_UTILS_HPP
#define GUARD_COMMON_UTILS_TENSOR_UTILS_HPP

#include <miopen/miopen.h>

#include <cassert>
#include <cstddef>
#include <numeric>
#include <ostream>
#include <tuple>
#include <utility>
#include <vector>

// Convenience wrappers around the public MIOpen tensor descriptor API.
// These replace direct use of miopen::deref(tensorDesc).Method() with
// clean single-call functions that use only the public C API.

namespace tensor_utils {

// Query how many dimensions the descriptor describes.
inline int GetNumDims(miopenTensorDescriptor_t desc)
{
    int ndims = 0;
    miopenGetTensorDescriptorSize(desc, &ndims);
    return ndims;
}

// Element data type stored in the descriptor.
// The out-parameter is value-initialized so a failing API call cannot
// leave us returning an indeterminate (uninitialized) enum value.
inline miopenDataType_t GetType(miopenTensorDescriptor_t desc)
{
    miopenDataType_t dt{};
    miopenGetTensorDescriptor(desc, &dt, nullptr, nullptr);
    return dt;
}

// Per-dimension lengths of the tensor, sized by GetNumDims.
inline std::vector<size_t> GetLengths(miopenTensorDescriptor_t desc)
{
    std::vector<size_t> lens(static_cast<size_t>(GetNumDims(desc)));
    miopenGetTensorDescriptorV2(desc, nullptr, lens.data(), nullptr);
    return lens;
}

// Per-dimension strides of the tensor, sized by GetNumDims.
inline std::vector<size_t> GetStrides(miopenTensorDescriptor_t desc)
{
    std::vector<size_t> strides(static_cast<size_t>(GetNumDims(desc)));
    miopenGetTensorDescriptorV2(desc, nullptr, nullptr, strides.data());
    return strides;
}

// Total byte footprint of the tensor as reported by MIOpen.
inline size_t GetNumBytes(miopenTensorDescriptor_t desc)
{
    size_t bytes = 0;
    miopenGetTensorNumBytes(desc, &bytes);
    return bytes;
}

// Total number of elements: the product of all dimension lengths.
// Uses std::accumulate instead of a hand-rolled loop; the identity is
// size_t{1} so the multiplication is carried out in size_t width.
inline size_t GetElementSize(miopenTensorDescriptor_t desc)
{
    const auto lens = GetLengths(desc);
    return std::accumulate(
        lens.begin(), lens.end(), size_t{1}, [](size_t a, size_t b) { return a * b; });
}

// Memory layout tag (e.g. NCHW/NHWC) of the tensor.
// The out-parameter is value-initialized so a failing API call cannot
// leave us returning an indeterminate (uninitialized) enum value.
inline miopenTensorLayout_t GetLayout(miopenTensorDescriptor_t desc)
{
    miopenTensorLayout_t layout{};
    miopenGetTensorLayout(desc, &layout);
    return layout;
}

// Number of elements spanned by the tensor including stride padding.
inline size_t GetElementSpace(miopenTensorDescriptor_t desc)
{
    size_t elementSpace = 0;
    miopenGetTensorElementSpace(desc, &elementSpace);
    return elementSpace;
}

// True when the tensor is densely packed (no gaps between elements).
inline bool IsPacked(miopenTensorDescriptor_t desc)
{
    bool isPacked = false;
    miopenIsTensorPacked(desc, &isPacked);
    return isPacked;
}

// Vectorization factor of the tensor (1 for non-vectorized layouts).
inline size_t GetVectorLength(miopenTensorDescriptor_t desc)
{
    size_t vectorLength = 0;
    miopenGetTensorVectorLength(desc, &vectorLength);
    return vectorLength;
}

// Returns the byte size of a MIOpen data type, or 0 for an
// unrecognized enumerator. Cases are grouped by element width.
inline size_t GetTypeSize(miopenDataType_t dt)
{
    switch(dt)
    {
    case miopenInt8:
    case miopenFloat8_fnuz:
    case miopenBFloat8_fnuz: return 1;
    case miopenHalf:
    case miopenBFloat16: return 2;
    case miopenFloat:
    case miopenInt32: return 4;
    case miopenDouble:
    case miopenInt64: return 8;
    default: return 0;
    }
}

namespace detail {
// Expands v[0..N-1] into a tuple via an index pack.
template <typename T, std::size_t... I>
auto TienImpl(const std::vector<T>& v, std::index_sequence<I...>)
{
    return std::make_tuple(v[I]...);
}
} // namespace detail

// Unpack the first N elements of a vector into a tuple of N values.
// Replacement for miopen::tien<N>(vec). Unlike the previous
// if-constexpr chain (which silently returned void for N outside 2..5),
// this works for any N; callers must supply at least N elements.
template <std::size_t N, typename T>
auto Tien(const std::vector<T>& v)
{
    assert(v.size() >= N);
    return detail::TienImpl(v, std::make_index_sequence<N>{});
}

// Given a spatial dimension count and a vector of lengths (or strides),
// return a tuple of (N, C, D, H, W) with D=1 for 2D tensors.
// Replacement for miopen::GetNCDHW.
template <class TElement>
auto GetNCDHW(unsigned spatial_dims, const std::vector<TElement>& data)
{
    const bool is_3d = (spatial_dims == 3);
    assert(data.size() >= (is_3d ? 5u : 4u));
    if(is_3d)
        return std::make_tuple(data[0], data[1], data[2], data[3], data[4]);
    // 2D case: synthesize a unit depth between C and H.
    return std::make_tuple(data[0], data[1], static_cast<TElement>(1), data[2], data[3]);
}

// Print a range with a separator, replacement for miopen::LogRange.
// Emits nothing for an empty range; the separator appears only between
// elements, never before the first or after the last.
template <typename Container>
std::ostream& LogRange(std::ostream& os, const Container& c, const char* sep)
{
    const char* delim = "";
    for(const auto& item : c)
    {
        os << delim << item;
        delim = sep;
    }
    return os;
}

} // namespace tensor_utils

#endif // GUARD_COMMON_UTILS_TENSOR_UTILS_HPP
37 changes: 27 additions & 10 deletions projects/miopen/driver/CBAInferFusion_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
#include <miopen_utils/cpu_bias.hpp>

#include <common_utils/errors.hpp>
#include <miopen/handle.hpp>
#include <common_utils/tensor_utils.hpp>
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>

Expand Down Expand Up @@ -175,7 +175,7 @@ class CBAInferFusionDriver : public Driver
avgtime += time;
}

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME

if(WALL_CLOCK)
Expand Down Expand Up @@ -672,7 +672,7 @@ int CBAInferFusionDriver<Tgpu, Tref>::SetConvDescriptorFromCmdLineArgs()
template <typename Tgpu, typename Tref>
std::vector<int> CBAInferFusionDriver<Tgpu, Tref>::GetOutputTensorLengths()
{
int ndim = miopen::deref(inputTensor).GetNumDims();
int ndim = tensor_utils::GetNumDims(inputTensor);

std::vector<int> out_lens(ndim);

Expand Down Expand Up @@ -1285,14 +1285,31 @@ void CBAInferFusionDriver<Tgpu, Tref>::runCPUConvFwdInference()
in_local_host.data = in_host;
wei_local_host.data = wei_host;
outhost_local_host.data.resize(outhost_local_host.desc.GetElementSpace());
cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(),

int cba_spatial_dim = 0;
miopenGetConvolutionSpatialDim(convDesc, &cba_spatial_dim);
std::vector<int> cba_pads(cba_spatial_dim);
std::vector<int> cba_strides(cba_spatial_dim);
std::vector<int> cba_dilations(cba_spatial_dim);
miopenConvolutionMode_t cba_mode_unused;
miopenGetConvolutionNdDescriptor(convDesc,
cba_spatial_dim,
&cba_spatial_dim,
cba_pads.data(),
cba_strides.data(),
cba_dilations.data(),
&cba_mode_unused);
int cba_group_count = 0;
miopenGetConvolutionGroupCount(convDesc, &cba_group_count);

cpu_convolution_forward(cba_spatial_dim,
in_local_host,
wei_local_host,
outhost_local_host,
miopen::deref(convDesc).GetConvPads(),
miopen::deref(convDesc).GetConvStrides(),
miopen::deref(convDesc).GetConvDilations(),
miopen::deref(convDesc).GetGroupCount());
cba_pads,
cba_strides,
cba_dilations,
cba_group_count);

if constexpr(!std::is_same_v<Tgpu, Tref>)
{
Expand All @@ -1305,8 +1322,8 @@ void CBAInferFusionDriver<Tgpu, Tref>::runCPUConvFwdInference()

if(bias_mode)
{
tensor<Tref> bias_local_host(miopen::deref(biasTensor).GetLengths(),
miopen::deref(biasTensor).GetStrides());
tensor<Tref> bias_local_host(tensor_utils::GetLengths(biasTensor),
tensor_utils::GetStrides(biasTensor));
bias_local_host.data = bias_host;
cpu_bias_forward(outhost_local_host, bias_local_host);
}
Expand Down
14 changes: 7 additions & 7 deletions projects/miopen/driver/activ_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
#include "timer.hpp"
#include "util_driver.hpp"

#include <common_utils/tensor_utils.hpp>
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>

#include <algorithm>
#include <cfloat>
Expand Down Expand Up @@ -329,7 +329,7 @@ int ActivationDriver<Tgpu, Tref>::RunForwardGPU()

ExecuteKernel();

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME
if(WALL_CLOCK)
{
Expand Down Expand Up @@ -373,9 +373,9 @@ int ActivationDriver<Tgpu, Tref>::RunForwardGPU()
avgtime / (iters - 1),
iters - 1);
int in_n, in_c, in_h, in_w;
std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths());
std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(inputTensor));
size_t dataSz =
in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType());
in_n * in_c * in_h * in_w * tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor));

// layer, readbytes, writebytes, BG/s, timeMS
printf("stats: name, bytesRead, bytesWritten, GB/s, timeMs\n");
Expand Down Expand Up @@ -432,7 +432,7 @@ int ActivationDriver<Tgpu, Tref>::RunBackwardGPU()

ExecuteKernel();

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME
if(WALL_CLOCK)
{
Expand Down Expand Up @@ -476,9 +476,9 @@ int ActivationDriver<Tgpu, Tref>::RunBackwardGPU()
avgtime / (iters - 1),
iters - 1);
int in_n, in_c, in_h, in_w;
std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths());
std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(tensor_utils::GetLengths(inputTensor));
size_t dataSz =
in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType());
in_n * in_c * in_h * in_w * tensor_utils::GetTypeSize(tensor_utils::GetType(inputTensor));

// layer, readbytes, writebytes, BG/s, timeMS
printf("stats: name, bytesRead, bytesWritten, GB/s, timeMs\n");
Expand Down
4 changes: 2 additions & 2 deletions projects/miopen/driver/adam_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@

#include <common_utils/float_equal.hpp>
#include <common_utils/ford.hpp>
#include <common_utils/tensor_utils.hpp>
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>

#include <algorithm>
#include <cfloat>
Expand Down Expand Up @@ -73,7 +73,7 @@ void mloAdamRunHost(miopenTensorDescriptor_t paramDesc,
if(is_amp && found_inf)
return;

const size_t numel = miopen::deref(paramDesc).GetElementSize();
const size_t numel = tensor_utils::GetElementSize(paramDesc);

const float bias_correction1 = 1.0 - pow(beta1, step);
const float bias_correction2 = 1.0 - pow(beta2, step);
Expand Down
3 changes: 1 addition & 2 deletions projects/miopen/driver/addlayernorm_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
#include <cfloat>
#include <cstdlib>
#include <memory>
#include <miopen/tensor.hpp>
#include <numeric>
#include <vector>

Expand All @@ -56,7 +55,7 @@ int32_t mloAddLayerNormForwardRunHost(miopenTensorDescriptor_t inputDesc,
miopenNormMode_t mode,
bool use_multithread)
{
auto dims = miopen::deref(inputDesc).GetLengths();
auto dims = tensor_utils::GetLengths(inputDesc);
size_t outer_size = 1;
size_t inner_size = 1;
size_t norm_dim = static_cast<size_t>(normalized_dim);
Expand Down
15 changes: 7 additions & 8 deletions projects/miopen/driver/bn_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,8 @@
#include <miopen_utils/fusionHost.hpp>

#include <common_utils/errors.hpp>
#include <miopen/handle.hpp>
#include <common_utils/tensor_utils.hpp>
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>
#include "miopen/batch_norm.hpp"

#include <algorithm>
Expand Down Expand Up @@ -1308,7 +1307,7 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::RunForwardGPU()
iteration = i; // Modifies the captured reference for the case of not using HIP graph
ExecuteKernel();

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME
if(WALL_CLOCK)
{
Expand Down Expand Up @@ -1354,9 +1353,9 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::RunForwardGPU()
avgtime / (iters - 1),
iters - 1);
int in_n, in_c, in_h, in_w;
std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths());
std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(in.GetTensor().desc.GetLengths());
size_t M = in_n * in_c * in_h * in_w;
size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType());
size_t dataSz = (M + 2 * in_c) * tensor_utils::GetTypeSize(in.GetTensor().desc.GetType());
float rdCnt = -1.0;
float wrCnt = 1.0;
if(forw == 1)
Expand Down Expand Up @@ -1692,7 +1691,7 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::RunBackwardGPU()

ExecuteKernel();

miopen::deref(GetHandle()).Finish();
(void)hipStreamSynchronize(GetStream());
STOP_TIME
if(WALL_CLOCK)
{
Expand All @@ -1718,9 +1717,9 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::RunBackwardGPU()
avgtime += time;

int in_n, in_c, in_h, in_w;
std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths());
std::tie(in_n, in_c, in_h, in_w) = tensor_utils::Tien<4>(in.GetTensor().desc.GetLengths());
size_t M = in_n * in_c * in_h * in_w;
size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType());
size_t dataSz = (M + 2 * in_c) * tensor_utils::GetTypeSize(in.GetTensor().desc.GetType());
float rdCnt = 2.0;
float wrCnt = 1.0;
// layer, flopCnt, reads, writes, GFLOPS, GB/s, timeMs
Expand Down
Loading
Loading