diff --git a/projects/miopen/CMakeLists.txt b/projects/miopen/CMakeLists.txt
index af87cd1c7e16..57089253f3e1 100644
--- a/projects/miopen/CMakeLists.txt
+++ b/projects/miopen/CMakeLists.txt
@@ -110,6 +110,13 @@ if(MIOPEN_INCBIN)
     enable_language(ASM)
 endif()
 
+# Truncation rounding or (default) rounding to nearest even (RNE) is enabled.
+# This switch controls two related but different aspects of MIOpen behavior:
+# 1. How host code performs conversions of float to bfloat16 (important for testing).
+# 2. How BF16 kernels perform the final conversion (and rounding) of FP32 to BF16 results
+#    (affects the main functionality of the library).
+option(MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON)
+
 # Strip symbols for release
 if(MIOPEN_STRIP_SYMBOLS AND NOT WIN32 AND NOT APPLE)
     set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s")
@@ -894,8 +901,10 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
 if(NOT MIOPEN_USE_SQLITE_PERFDB)
     add_subdirectory(tools/sqlite2txt)
 endif()
+add_subdirectory(common_utils)
 add_subdirectory(addkernels)
 add_subdirectory(src)
+add_subdirectory(miopen_utils)
 if(MIOPEN_BUILD_DRIVER)
     add_subdirectory(driver)
 endif()
diff --git a/projects/miopen/common_utils/CMakeLists.txt b/projects/miopen/common_utils/CMakeLists.txt
new file mode 100644
index 000000000000..d538ef6ef258
--- /dev/null
+++ b/projects/miopen/common_utils/CMakeLists.txt
@@ -0,0 +1,46 @@
+################################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+################################################################################
+
+# INTERNAL BUILD-ONLY library — not installed, not exported, not part of the public MIOpen API.
+# Header-only pure C++ utilities shared by MIOpen, MIOpenDriver, and tests.
+# Contains NO MIOpen or GPU dependencies.
+# Do NOT add install(TARGETS miopen_common_utils ...) — headers live in the build tree only.
+
+add_library(miopen_common_utils INTERFACE)
+set_target_properties(miopen_common_utils PROPERTIES EXCLUDE_FROM_ALL TRUE)
+
+target_include_directories(miopen_common_utils INTERFACE
+    # BUILD_INTERFACE only — no install interface; these headers are not installed.
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+)
+
+# bfloat16.hpp needs MIOPEN_USE_RNE_BFLOAT16 at compile time.
+# The option is declared in the top-level CMakeLists.txt.
+if(MIOPEN_USE_RNE_BFLOAT16)
+    target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=1)
+else()
+    target_compile_definitions(miopen_common_utils INTERFACE MIOPEN_USE_RNE_BFLOAT16=0)
+endif()
diff --git a/projects/miopen/common_utils/include/common_utils/algorithm.hpp b/projects/miopen/common_utils/include/common_utils/algorithm.hpp
new file mode 100644
index 000000000000..d1098a066077
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/algorithm.hpp
@@ -0,0 +1,47 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MLOPEN_ALGORITHM_HPP
+#define GUARD_MLOPEN_ALGORITHM_HPP
+
+#include <algorithm>
+
+namespace miopen {
+
+// Range-based convenience wrapper over std::any_of.
+template <class Range, class Predicate>
+bool any_of(const Range& r, Predicate p)
+{
+    return std::any_of(r.begin(), r.end(), p);
+}
+
+// Range-based convenience wrapper over std::all_of.
+template <class Range, class Predicate>
+bool all_of(const Range& r, Predicate p)
+{
+    return std::all_of(r.begin(), r.end(), p);
+}
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/bfloat16.hpp b/projects/miopen/common_utils/include/common_utils/bfloat16.hpp
new file mode 100644
index 000000000000..71fe70bbd3c7
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/bfloat16.hpp
@@ -0,0 +1,179 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#ifndef BFLOAT16_H_
+#define BFLOAT16_H_
+
+#include <cstdint>
+#include <functional>
+#include <limits>
+// MIOPEN_USE_RNE_BFLOAT16 is provided via CMake compile definitions.
+
+// Host-side bfloat16 emulation: 1 sign bit, 8 exponent bits, 7 mantissa bits.
+// Stored as the upper 16 bits of the equivalent IEEE-754 binary32 value.
+class bfloat16
+{
+public:
+    bfloat16() : data_{0} {}
+    explicit bfloat16(float rhs)
+    {
+        union
+        {
+            float float_st;
+            std::uint32_t bf16_st;
+        } bits_st = {rhs};
+
+        // BF16 round and NaN preservation code matches
+        // https://github.com/ROCm/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h
+        if((~bits_st.bf16_st & 0x7f800000) == 0) // Inf or NaN
+        {
+            // When all of the exponent bits are 1, the value is Inf or NaN.
+            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+            // bit being 1. Signaling NaN is indicated by the most significant
+            // mantissa bit being 0 but some other bit(s) being 1. If any of the
+            // lower 16 bits of the mantissa are 1, we set the least significant bit
+            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+            // the bfloat16's mantissa bits are all 0.
+            if((bits_st.bf16_st & 0xffff) != 0)
+            {
+                bits_st.bf16_st |= 0x10000; // Preserve signaling NaN
+            }
+        }
+        else
+        {
+#if MIOPEN_USE_RNE_BFLOAT16 == 1
+            // When the exponent bits are not all 1s, then the value is zero, normal,
+            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+            // least significant bits of the float mantissa are greater than 0x8000,
+            // or if they are equal to 0x8000 and the least significant bit of the
+            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+            // has the value 0x7f, then incrementing it causes it to become 0x00 and
+            // the exponent is incremented by one, which is the next higher FP value
+            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
+            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+            // incrementing it causes it to become an exponent of 0xFF and a mantissa
+            // of 0x00, which is Inf, the next higher value to the unrounded value.
+            bits_st.bf16_st +=
+                (0x7fff + ((bits_st.bf16_st >> 16) & 1)); // Round to nearest, round to even
+#else  // truncation
+// do nothing
+#endif
+        }
+        data_ = bits_st.bf16_st >> 16;
+    }
+    operator float() const
+    {
+        union
+        {
+            std::uint32_t bf16_st;
+            float float_st;
+        } bits_st = {data_};
+
+        bits_st.bf16_st = bits_st.bf16_st << 16;
+        return bits_st.float_st;
+    }
+
+    bfloat16 operator-() const { return bfloat16(-static_cast<float>(*this)); }
+    bfloat16 operator+() const { return *this; }
+
+    bfloat16& operator=(const float rhs)
+    {
+        *this = bfloat16(rhs);
+        return *this;
+    }
+    bfloat16& operator+=(bfloat16 rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) + static_cast<float>(rhs));
+        return *this;
+    }
+
+    bfloat16& operator+=(float rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) + rhs);
+        return *this;
+    }
+
+    bfloat16& operator-=(bfloat16 rhs)
+    {
+        *this += -rhs;
+        return *this;
+    }
+    bfloat16& operator*=(bfloat16 rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) * static_cast<float>(rhs));
+        return *this;
+    }
+    bfloat16& operator*=(float rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) * rhs);
+        return *this;
+    }
+
+    bfloat16& operator/=(bfloat16 rhs)
+    {
+        *this = bfloat16(static_cast<float>(*this) / static_cast<float>(rhs));
+        return *this;
+    }
+    bool operator<(bfloat16 rhs) const
+    {
+        return static_cast<float>(*this) < static_cast<float>(rhs);
+    }
+    // Exact bit-for-bit comparison via float conversion; std::equal_to avoids
+    // a -Wfloat-equal warning at the comparison site.
+    bool operator==(bfloat16 rhs) const { return std::equal_to<float>()(*this, rhs); }
+
+    // Build a bfloat16 directly from its 16-bit pattern (for numeric_limits).
+    static constexpr bfloat16 generate(uint16_t val) { return bfloat16{val, true}; }
+
+private:
+    constexpr bfloat16(std::uint16_t val, bool) : data_{val} {}
+
+    std::uint16_t data_;
+};
+
+inline bfloat16 operator+(bfloat16 a, const bfloat16& b)
+{
+    a += b;
+    return a;
+}
+
+inline bfloat16 operator-(bfloat16 a, const bfloat16& b)
+{
+    a -= b;
+    return a;
+}
+
+inline bfloat16 operator*(bfloat16 a, const bfloat16& b)
+{
+    a *= b;
+    return a;
+}
+
+inline bfloat16 operator/(bfloat16 a, const bfloat16& b)
+{
+    a /= b;
+    return a;
+}
+
+namespace std {
+template <>
+class numeric_limits<bfloat16>
+{
+public:
+    static constexpr bool is_specialized = true;
+    static constexpr bfloat16 min() noexcept { return bfloat16::generate(0x0080); } // 0x1.00p-126
+    static constexpr bfloat16 max() noexcept { return bfloat16::generate(0x7F7F); }
+    static constexpr bfloat16 lowest() noexcept { return bfloat16::generate(0xFF7F); }
+    static constexpr bfloat16 epsilon() noexcept { return bfloat16::generate(0x3C00); }
+    static constexpr bfloat16 infinity() noexcept { return bfloat16::generate(0x7F80); }
+    static constexpr bfloat16 quiet_NaN() noexcept { return bfloat16::generate(0x7FC0); } // qnan(0)
+    static constexpr bfloat16 signaling_NaN() noexcept
+    {
+        return bfloat16::generate(0x7F81); // snan(1)
+    }
+    static constexpr bfloat16 denorm_min() noexcept
+    {
+        return bfloat16::generate(0x0001); // 0x0.02p-126
+    }
+};
+} // namespace std
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/each_args.hpp b/projects/miopen/common_utils/include/common_utils/each_args.hpp
new file mode 100644
index 000000000000..e078153dc998
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/each_args.hpp
@@ -0,0 +1,79 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MIOPEN_EACH_ARGS_HPP
+#define GUARD_MIOPEN_EACH_ARGS_HPP
+
+#include <initializer_list>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace miopen {
+namespace detail {
+
+// Invoke f(index_constant, arg) for each argument, in order.
+template <class F, std::size_t... Ns, class... Ts>
+void each_args_i_impl(F f, std::index_sequence<Ns...>, Ts&&... xs)
+{
+    (void)std::initializer_list<int>{
+        (f(std::integral_constant<std::size_t, Ns>{}, std::forward<Ts>(xs)), 0)...};
+}
+
+// Apply f to the elements of tuple-like x.
+template <class F, std::size_t... Ns, class T>
+auto unpack_impl(F f, std::index_sequence<Ns...>, T&& x)
+{
+    return f(std::get<Ns>(x)...);
+}
+
+} // namespace detail
+
+// Call f(i, x) for each x, where i is a compile-time index constant.
+template <class F, class... Ts>
+void each_args_i(F f, Ts&&... xs)
+{
+    detail::each_args_i_impl(
+        f, std::make_index_sequence<sizeof...(Ts)>(), std::forward<Ts>(xs)...);
+}
+
+// Call f(x) for each argument, in order.
+template <class F, class... Ts>
+void each_args(F f, Ts&&... xs)
+{
+    (void)std::initializer_list<int>{(f(std::forward<Ts>(xs)), 0)...};
+}
+
+// Workaround for gcc warnings
+template <class F>
+void each_args(F)
+{
+}
+
+// Apply f to the elements of tuple-like x (like std::apply).
+template <class F, class T>
+auto unpack(F f, T&& x)
+{
+    using type = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+    return detail::unpack_impl(
+        f, std::make_index_sequence<std::tuple_size<type>::value>(), std::forward<T>(x));
+}
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/float_equal.hpp b/projects/miopen/common_utils/include/common_utils/float_equal.hpp
new file mode 100644
index 000000000000..24bbdc55ad11
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/float_equal.hpp
@@ -0,0 +1,89 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ +#ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP +#define GUARD_MLOPEN_FLOAT_EQUAL_HPP + +#include +#include +#include +#include + +namespace miopen { + +template +using common_type = typename std::common_type::type; + +struct float_equal_fn +{ + template + static bool apply(T x, T y) + { + // The standard library from MSVC does not implement std::isfinite() for integer + // types - no additional overloads are provided. According to the documentation, + // integer types should be treaded as doubles. + // Refer to https://en.cppreference.com/w/cpp/numeric/math/isfinite for more information. + return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and + std::nextafter(x, std::numeric_limits::lowest()) <= y and + std::nextafter(x, std::numeric_limits::max()) >= y; + } + + template + bool operator()(T x, U y) const + { + return float_equal_fn::apply>(x, y); + } +}; + +static constexpr float_equal_fn float_equal{}; + +/// Special case for comparing with a sentinel value +struct float_equal_sentinel_fn +{ + template + static bool apply(T x, T y) + { +// In this case we have to ignore this warning, because we intend to compare with the exact value +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" + bool equals_sentinel = x == y; +#pragma clang diagnostic pop + + return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and + equals_sentinel; + } + + template + bool operator()(T x, U y) const + { + return float_equal_sentinel_fn::apply>(x, y); + } +}; + +static constexpr float_equal_sentinel_fn float_equal_sentinel{}; + +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/ford.hpp b/projects/miopen/common_utils/include/common_utils/ford.hpp new file mode 100644 index 000000000000..4ff4ddfa32e2 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/ford.hpp @@ -0,0 +1,122 @@ 
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_FORD_HPP
+#define GUARD_FORD_HPP
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <functional>
+#include <future>
+#include <memory>
+#include <numeric>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+// NOTE(review): MIOPEN_RETURNS is used below; presumably provided by a
+// common_utils returns header — confirm the header name against the project.
+#include <common_utils/returns.hpp>
+
+#include <common_utils/each_args.hpp>
+#include <common_utils/par_for.hpp>
+
+namespace miopen {
+
+// An improved async, that doesn't block
+template <class Function>
+std::future<typename std::invoke_result<Function>::type> detach_async(Function&& f)
+{
+    using result_type = typename std::invoke_result<Function>::type;
+    std::packaged_task<result_type()> task(std::forward<Function>(f));
+    auto fut = task.get_future();
+    std::thread(std::move(task)).detach();
+    return fut;
+}
+
+// Chain a continuation w onto future f (evaluated lazily on get()).
+template <class T, class Work>
+auto then(std::future<T> f, Work w) -> std::future<decltype(w(f.get()))>
+{
+    return std::async(std::launch::deferred,
+                      [=, f_ = std::move(f)]() mutable { return w(f_.get()); });
+}
+
+template <class T>
+struct ford_wrapper
+{
+    template <class... Ts>
+    auto operator()(Ts... xs) const MIOPEN_RETURNS(std::bind(T{}, std::placeholders::_1, xs...));
+};
+
+// Multidimensional for loop
+struct ford_impl
+{
+    template <class F>
+    void operator()(F f) const
+    {
+        f();
+    }
+
+    template <class F, class T, class... Ts>
+    void operator()(F f, T x, Ts... xs) const
+    {
+        // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55914
+        for(T i = 0; i < x; i++)
+        {
+            (*this)([&](Ts... is) { f(i, is...); }, xs...);
+        }
+    }
+};
+
+static constexpr ford_wrapper<ford_impl> ford{};
+
+// Parallel multidimensional for loop: flattens the index space and
+// dispatches the flat range through par_for.
+struct par_ford_impl
+{
+    template <class F, class... Ts>
+    void operator()(F f, Ts... xs) const
+    {
+        using array_type = std::array<std::size_t, sizeof...(Ts)>;
+        array_type lens = {{static_cast<std::size_t>(xs)...}};
+        array_type strides;
+        strides.fill(1);
+        std::partial_sum(
+            lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1, std::multiplies<std::size_t>());
+        auto size = std::accumulate(
+            lens.begin(), lens.end(), static_cast<std::size_t>(1), std::multiplies<std::size_t>());
+        par_for(size, [&](std::size_t i) {
+            array_type indices;
+            std::transform(strides.begin(),
+                           strides.end(),
+                           lens.begin(),
+                           indices.begin(),
+                           [&](size_t stride, size_t len) { return (i / stride) % len; });
+            unpack(f, indices);
+        });
+    }
+};
+
+static constexpr ford_wrapper<par_ford_impl> par_ford{};
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/functional.hpp b/projects/miopen/common_utils/include/common_utils/functional.hpp
new file mode 100644
index 000000000000..19dde2bd28dc
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/functional.hpp
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MLOPEN_FUNCTIONAL_HPP
+#define GUARD_MLOPEN_FUNCTIONAL_HPP
+
+#include <type_traits>
+#include <utility>
+
+#include <common_utils/each_args.hpp>
+// NOTE(review): MIOPEN_RETURNS is used below; presumably provided by a
+// common_utils returns header — confirm the header name against the project.
+#include <common_utils/returns.hpp>
+
+namespace miopen {
+namespace detail {
+
+// Invoke f with a pack of compile-time index constants 0..N-1.
+template <class F, std::size_t... Ns>
+auto each_i_impl(F f, std::index_sequence<Ns...>)
+    MIOPEN_RETURNS(f(std::integral_constant<std::size_t, Ns>{}...));
+} // namespace detail
+
+// by(f, p)(xs...) == f(p(xs)...)
+template <class F, class P>
+struct by_t
+{
+    F f;
+    P p;
+    template <class... Ts>
+    auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(p(std::forward<Ts>(xs))...))
+};
+
+template <class F, class P>
+by_t<F, P> by(F f, P p)
+{
+    return {std::move(f), std::move(p)};
+}
+
+// compose(f, g)(xs...) == f(g(xs...))
+template <class F, class G>
+struct compose_t
+{
+    F f;
+    G g;
+    template <class... Ts>
+    auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(g(std::forward<Ts>(xs)...)))
+};
+
+template <class F, class G>
+compose_t<F, G> compose(F f, G g)
+{
+    return {std::move(f), std::move(g)};
+}
+
+// flip(f)(x, y) == f(y, x)
+template <class F>
+struct flip_t
+{
+    F f;
+    template <class T, class U>
+    auto operator()(T&& x, U&& y) const
+        MIOPEN_RETURNS(f(std::forward<U>(y), std::forward<T>(x)))
+};
+
+template <class F>
+flip_t<F> flip(F f)
+{
+    return {std::move(f)};
+}
+
+// sequence(f)(N_c) == f(0_c, 1_c, ..., (N-1)_c)
+template <class F>
+struct sequence_t
+{
+    F f;
+    template <class IntegralConstant>
+    auto operator()(IntegralConstant) const
+        MIOPEN_RETURNS(detail::each_i_impl(
+            f, std::make_index_sequence<IntegralConstant::value>()));
+};
+
+template <class F>
+sequence_t<F> sequence(F f)
+{
+    return {std::move(f)};
+}
+
+// Call f N times, passing the compile-time index each time.
+template <class F, std::size_t N>
+void repeat_n(F f, std::integral_constant<std::size_t, N>)
+{
+    auto fs = [&f](auto... is) { return each_args(f, is...); };
+    sequence(fs)(std::integral_constant<std::size_t, N>{});
+}
+
+template <class T>
+struct cast_to
+{
+    template <class X>
+    T operator()(X&& x) const
+    {
+        return static_cast<T>(std::forward<X>(x));
+    }
+};
+
+// unpacker(f)(tuple) == f(tuple elements...)
+template <class F>
+auto unpacker(F f)
+{
+    return [=](auto xs) { return miopen::unpack(f, xs); };
+}
+
+// prepender(f, xs...)(ys...) == f(xs..., ys...)
+template <class F, class... Xs>
+auto prepender(F f, Xs... xs)
+{
+    return [=](auto... ys) { return f(xs..., ys...); };
+}
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/par_for.hpp b/projects/miopen/common_utils/include/common_utils/par_for.hpp
new file mode 100644
index 000000000000..1272dcf6ac9b
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/par_for.hpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP
+#define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <functional>
+#include <thread>
+
+#include <vector>
+
+namespace miopen {
+
+// RAII thread: joins on destruction so it can live safely in containers.
+struct joinable_thread : std::thread
+{
+    template <class... Xs>
+    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
+    {
+    }
+
+    joinable_thread& operator=(joinable_thread&& other) = default;
+    joinable_thread(joinable_thread&& other)            = default;
+
+    ~joinable_thread()
+    {
+        if(this->joinable())
+            this->join();
+    }
+};
+
+// Spawns a thread handling [work, min(n, work+grainsize)) and advances work.
+struct thread_factory
+{
+    template <class F>
+    joinable_thread operator()(std::size_t& work, std::size_t n, std::size_t grainsize, F f) const
+    {
+        auto result = joinable_thread([=] {
+            std::size_t start = work;
+            std::size_t last  = std::min(n, work + grainsize);
+            for(std::size_t i = start; i < last; i++)
+            {
+                f(i);
+            }
+        });
+        work += grainsize;
+        return result;
+    }
+};
+
+template <class F>
+void par_for_impl(std::size_t n, std::size_t threadsize, F f)
+{
+    if(threadsize <= 1)
+    {
+        for(std::size_t i = 0; i < n; i++)
+            f(i);
+    }
+    else
+    {
+        std::vector<joinable_thread> threads(threadsize);
+        const std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
+
+        std::size_t work = 0;
+        std::generate(threads.begin(),
+                      threads.end(),
+                      std::bind(thread_factory{}, std::ref(work), n, grainsize, f));
+        assert(work >= n);
+    }
+}
+
+// Explicit std::size_t instantiation: hardware_concurrency() is unsigned int.
+template <class F>
+void par_for(std::size_t n, std::size_t min_grain, F f)
+{
+    const auto threadsize =
+        std::min<std::size_t>(std::thread::hardware_concurrency(), n / min_grain);
+    par_for_impl(n, threadsize, f);
+}
+
+struct min_grain
+{
+    std::size_t n = 0;
+};
+
+template <class F>
+void par_for(std::size_t n, min_grain mg, F f)
+{
+    const auto threadsize =
+        std::min<std::size_t>(std::thread::hardware_concurrency(), n / mg.n);
+    par_for_impl(n, threadsize, f);
+}
+
+template <class F>
+void par_for(std::size_t n, F f)
+{
+    par_for(n, min_grain{8}, f);
+}
+
+struct max_threads
+{
+    std::size_t n = 0;
+};
+
+template <class F>
+void par_for(std::size_t n, max_threads mt, F f)
+{
+    const auto threadsize = std::min<std::size_t>(std::thread::hardware_concurrency(), mt.n);
+    par_for_impl(n, std::min(threadsize, n), f);
+}
+
+// Strided partition: thread t handles indices t, t+T, t+2T, ...
+template <class F>
+void par_for_strided(std::size_t n, max_threads mt, F f)
+{
+    auto threadsize = std::min<std::size_t>(std::thread::hardware_concurrency(), mt.n);
+    par_for_impl(threadsize, threadsize, [&](auto start) {
+        for(std::size_t i = start; i < n; i += threadsize)
+        {
+            f(i);
+        }
+    });
+}
+
+} // namespace miopen
+
+#endif
diff --git a/projects/miopen/common_utils/include/common_utils/random.hpp b/projects/miopen/common_utils/include/common_utils/random.hpp
new file mode 100644
index 000000000000..f6f8d85c4ce4
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/random.hpp
@@ -0,0 +1,159 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_RANDOM_GEN_
+#define GUARD_RANDOM_GEN_
+
+// NOTE(review): MIOPEN_DECLARE_ENV_VAR_UINT64 / miopen::env presumably come
+// from the MIOpen env header — confirm the include path against the project.
+#include <miopen/env.hpp>
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <type_traits>
+
+MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_DRIVER_PRNG_SEED, 12345678)
+
+namespace env = miopen::env;
+
+namespace prng {
+namespace details {
+// Parameters of the classic glibc rand() linear congruential generator.
+using glibc_gen = std::linear_congruential_engine<std::uint32_t, 1103515245, 12345, 2147483648>;
+
+inline std::random_device::result_type get_default_seed()
+{
+    static std::random_device::result_type seed{[] {
+        auto external_seed = env::value(MIOPEN_DEBUG_DRIVER_PRNG_SEED);
+
+        auto seed_ = external_seed == 0
+                         ? std::random_device{}()
+                         : static_cast<std::random_device::result_type>(external_seed);
+        std::cout << "PRNG seed: " << seed_ << "\n";
+        return seed_;
+    }()};
+    return seed;
+}
+
+inline glibc_gen& get_prng()
+{
+    static thread_local glibc_gen gen{get_default_seed()};
+    return gen;
+}
+
+// Detects types that expose std::numeric_limits<T>::digits.
+template <typename T, typename = void>
+struct has_digits : std::false_type
+{
+};
+
+template <typename T>
+struct has_digits<T, std::void_t<decltype(std::numeric_limits<T>::digits)>> : std::true_type
+{
+};
+
+} // namespace details
+
+inline void reset_seed(std::random_device::result_type seed = 0)
+{
+    details::get_prng().seed(seed + details::get_default_seed());
+}
+
+// similar to std::generate_canonical, but simpler and faster
+template <typename T>
+inline T gen_canonical()
+{
+    if constexpr(std::is_floating_point_v<T>) // native fp
+    {
+        static constexpr T range =
+            static_cast<T>(1) /
+            static_cast<T>(details::glibc_gen::max() - details::glibc_gen::min() + 1);
+        return range * static_cast<T>(details::get_prng()() - details::glibc_gen::min());
+    }
+    else if constexpr(std::is_integral_v<T>)
+    {
+        auto val = details::get_prng()();
+        return static_cast<T>(((val >> 4) + (val >> 16)) & 0x1);
+    }
+    else
+    {
+        return static_cast<T>(gen_canonical<float>());
+    }
+}
+
+template <typename T>
+inline T gen_0_to_B(T B)
+{
+    if constexpr(std::is_floating_point_v<T>) // native fp
+    {
+        return gen_canonical<T>() * B;
+    }
+    else if constexpr(std::is_integral_v<T>)
+    {
+        // can only generate 27bit range, so it may not be suitable
+        // for huge 64 bit ranges, but we do not expect such ranges
+        return static_cast<T>((details::get_prng()() >> 4) % B);
+    }
+    else // half/bfloat/etc
+    {
+        return static_cast<T>(gen_0_to_B(static_cast<float>(B)));
+    }
+}
+
+template <typename T>
+inline T gen_A_to_B(T A, T B)
+{
+    assert(B > A);
+    return gen_0_to_B(B - A) + A;
+}
+
+template <typename T>
+inline T gen_off_range(T offset, T range)
+{
+    static_assert(std::is_integral_v<T>);
+    return prng::gen_0_to_B(range) + offset;
+}
+
+// Random subnormal value for FP types; returns 0 for unsupported types.
+template <typename T, bool Signed = true>
+inline T gen_subnorm()
+{
+    T denorm_val = static_cast<T>(0);
+    if constexpr(!std::is_integral_v<T> && !std::is_same_v<T, bool> &&
+                 std::is_trivially_copyable<T>::value && details::has_digits<T>::value)
+    {
+        using BitType = std::conditional_t<sizeof(T) == 2, uint16_t, uint32_t>;
+        static_assert(sizeof(T) == sizeof(BitType));
+
+        // -1 because ::digits counts the first implicit digit
+        static constexpr auto mantissa_bits = std::numeric_limits<T>::digits - 1;
+
+        BitType denorm_bits = static_cast<BitType>(gen_0_to_B(1 << mantissa_bits));
+        denorm_bits |= Signed ? (gen_canonical<BitType>() << (sizeof(T) * 8 - 1)) : 0;
+
+        // the proper way to do a type punning
+        std::memcpy(&denorm_val, &denorm_bits, sizeof(T));
+    }
+    return denorm_val;
+}
+} // namespace prng
+#endif // GUARD_RANDOM_GEN_
diff --git a/projects/miopen/common_utils/include/common_utils/rank.hpp b/projects/miopen/common_utils/include/common_utils/rank.hpp
new file mode 100644
index 000000000000..013ec6e7f7f4
--- /dev/null
+++ b/projects/miopen/common_utils/include/common_utils/rank.hpp
@@ -0,0 +1,42 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2017 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_RANK_HPP +#define GUARD_MIOPEN_RANK_HPP + +namespace miopen { + +template +struct rank : rank +{ +}; + +template <> +struct rank<0> +{ +}; +} // namespace miopen + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/reduce_common.hpp b/projects/miopen/common_utils/include/common_utils/reduce_common.hpp new file mode 100644 index 000000000000..74ce541f694b --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/reduce_common.hpp @@ -0,0 +1,66 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP +#define GUARD_MIOPEN_REDUCE_COMMON_HPP + +#include +#include + +namespace reduce { + +template +static inline Tdst convert_type(Tsrc x) +{ + return static_cast(x); +} + +template <> +inline float convert_type(half_float::half x) +{ + return half_float::half_cast(x); +}; + +template <> +inline half_float::half convert_type(float x) +{ + return half_float::half_cast(x); +}; + +template <> +inline float convert_type(bfloat16 x) +{ + return float(x); +}; + +template <> +inline bfloat16 convert_type(float x) +{ + return bfloat16(x); +}; + +}; // end of namespace reduce + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/returns.hpp b/projects/miopen/common_utils/include/common_utils/returns.hpp new file mode 100644 index 000000000000..4fdb1db18b87 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/returns.hpp @@ -0,0 +1,38 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef GUARD_MIOPEN_RETURNS_HPP +#define GUARD_MIOPEN_RETURNS_HPP + +#define MIOPEN_RETURNS(...) \ + ->decltype(__VA_ARGS__) { return __VA_ARGS__; } + +#define MIOPEN_BODY_RETURNS(...) \ + { \ + return __VA_ARGS__; \ + } + +#endif diff --git a/projects/miopen/common_utils/include/common_utils/stringutils.hpp b/projects/miopen/common_utils/include/common_utils/stringutils.hpp new file mode 100644 index 000000000000..19d579014c73 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/stringutils.hpp @@ -0,0 +1,165 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_STRINGUTILS_HPP +#define GUARD_MIOPEN_STRINGUTILS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MIOPEN_STRINGIZE_1(...) #__VA_ARGS__ +#define MIOPEN_STRINGIZE(...) MIOPEN_STRINGIZE_1(__VA_ARGS__) + +namespace miopen { + +inline std::string +ReplaceString(const std::string& in, const std::string& search, const std::string& replace) +{ + size_t pos = 0; + std::string subject(in); + while((pos = subject.find(search, pos)) != std::string::npos) + { + subject.replace(pos, search.length(), replace); + pos += replace.length(); + } + return subject; +} + +inline bool EndsWith(const std::string& value, const std::string& suffix) +{ + if(suffix.size() > value.size()) + return false; + else + return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin()); +} + +template +inline std::string JoinStrings(Strings strings, std::string delim) +{ + auto it = strings.begin(); + if(it == strings.end()) + return ""; + + auto nit = std::next(it); + return std::accumulate( + nit, strings.end(), *it, [&](std::string x, std::string y) { return x + delim + y; }); +} + +template +static inline std::string TransformString(std::string s, F f) +{ + std::transform(s.begin(), s.end(), s.begin(), f); + return s; +} + +inline std::string ToUpper(std::string s) { return TransformString(std::move(s), ::toupper); } + +inline bool StartsWith(const std::string& value, const std::string& prefix) +{ + if(prefix.size() > value.size()) + return false; + else + return std::equal(prefix.begin(), prefix.end(), value.begin()); +} + +inline std::string RemovePrefix(std::string s, 
std::string prefix) +{ + if(StartsWith(s, prefix)) + return s.substr(prefix.length()); + else + return s; +} + +inline std::vector SplitSpaceSeparated(const std::string& in) +{ + std::istringstream ss(in); + const std::istream_iterator begin(ss), end; + return {begin, end}; +} + +inline std::vector SplitSpaceSeparated(const std::vector& in) +{ + std::vector rv; + for(const auto& item : in) + { + if(item.find(' ') != std::string::npos) + { + const auto splitted = SplitSpaceSeparated(item); + std::copy(splitted.begin(), splitted.end(), std::back_inserter(rv)); + } + else + { + rv.emplace_back(item); + } + } + return rv; +} + +inline std::vector SplitSpaceSeparated(const std::string& in, + const std::vector& dontSplitAfter) +{ + std::vector rv; + std::istringstream ss(in); + std::string s; + while(ss >> s) + { + if(any_of(dontSplitAfter, [&](const auto& dont) { return dont == s; })) + { + std::string s2; + if(ss >> s2) + { + s += std::string(" ").append(s2); // Exactly one space is important. + rv.push_back(s); + continue; + } + throw std::runtime_error("Error parsing string: '" + in + '\''); + } + rv.push_back(s); + } + return rv; +} + +inline std::vector SplitDelim(const std::string& in, const char delim) +{ + std::vector rv; + std::string token; + std::istringstream ss(in); + + while(std::getline(ss, token, delim)) + { + rv.push_back(token); + } + return rv; +} + +} // namespace miopen + +#endif // GUARD_MIOPEN_STRINGUTILS_HPP diff --git a/projects/miopen/common_utils/include/common_utils/type_name.hpp b/projects/miopen/common_utils/include/common_utils/type_name.hpp new file mode 100644 index 000000000000..ac7fd2ff6017 --- /dev/null +++ b/projects/miopen/common_utils/include/common_utils/type_name.hpp @@ -0,0 +1,139 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef GUARD_TYPE_NAME_HPP +#define GUARD_TYPE_NAME_HPP + +#include +#include +#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) +#include +#endif + +namespace miopen { + +template +constexpr std::string_view type_name() +{ +#if defined(__clang__) || defined(__GNUC__) + // clang or gcc + constexpr auto full_name = std::string_view{__PRETTY_FUNCTION__}; +#elif defined(_MSC_VER) + // msvc + constexpr auto full_name = std::string_view{__FUNCSIG__}; +#endif + + // The substring with the data type name is located within the original string, between the + // prefix and the suffix, with the prefix always not at the beginning of the string and the + // suffix always at the end of the string. 
+#if defined(__clang__) + // clang + constexpr auto prefix = std::string_view{"[T = "}; + constexpr auto suffix = std::string_view{"]"}; +#elif defined(__GNUC__) + // gcc + constexpr auto prefix = std::string_view{"[with T = "}; + constexpr auto suffix = std::string_view{"; std::string_view = std::basic_string_view]"}; +#elif defined(_MSC_VER) + // msvc + constexpr auto prefix = std::string_view{"type_name<"}; + constexpr auto suffix = std::string_view{">(void)"}; +#endif + + constexpr auto prefix_pos = full_name.find(prefix); + static_assert(prefix_pos != std::string_view::npos); + + constexpr auto suffix_pos = full_name.rfind(suffix); + static_assert(suffix_pos != std::string_view::npos); + static_assert(suffix_pos == full_name.size() - suffix.size()); + + constexpr auto pos = prefix_pos + prefix.size(); + static_assert(pos < suffix_pos); + constexpr auto count = suffix_pos - pos; + + constexpr auto name = full_name.substr(pos, count); + +#if defined(__clang__) || defined(__GNUC__) + // clang or gcc + return name; +#elif defined(_MSC_VER) + // msvc + if constexpr(std::is_compound_v) + { + // For compound data types, the string contains the keyword 'class/struct/union/enum' before + // the data type name, separated by a space. 
+ constexpr auto sep = std::string_view{" "}; + constexpr auto sep_pos = name.find(sep); + static_assert(sep_pos != std::string_view::npos); + static_assert(sep_pos != 0); // must not be at the 0 position + + constexpr auto name_pos = sep_pos + sep.size(); + constexpr auto tname = name.substr(name_pos); + static_assert(tname.size() > 0); + + return tname; + } + else + { + return name; + } +#endif +} + +template +constexpr std::string_view type_name_bare() +{ + constexpr auto name = type_name(); + constexpr auto pos = name.rfind(':'); + if constexpr(pos == std::string_view::npos) + { + constexpr auto result = name; + return result; + } + else + { + constexpr auto bare_name = name.substr(pos + 1); + static_assert(bare_name.size() > 0); + return bare_name; + } +} + +template +const std::string& get_type_name() +{ + static const auto ret = std::string(type_name()); + return ret; +} + +template +const std::string& get_type_name(const T&) +{ + return miopen::get_type_name(); +} + +} // namespace miopen + +#endif diff --git a/projects/miopen/driver/CBAInferFusion_driver.hpp b/projects/miopen/driver/CBAInferFusion_driver.hpp index 0b63f8fe5af6..8bc25e1ffc58 100644 --- a/projects/miopen/driver/CBAInferFusion_driver.hpp +++ b/projects/miopen/driver/CBAInferFusion_driver.hpp @@ -36,9 +36,9 @@ #include "util_driver.hpp" #include "conv_common.hpp" -#include "../test/verify.hpp" -#include "../test/cpu_conv.hpp" -#include "../test/cpu_bias.hpp" +#include +#include +#include #include #include diff --git a/projects/miopen/driver/CMakeLists.txt b/projects/miopen/driver/CMakeLists.txt index 4aac2358c432..835d6437b650 100644 --- a/projects/miopen/driver/CMakeLists.txt +++ b/projects/miopen/driver/CMakeLists.txt @@ -74,7 +74,7 @@ endif() add_dependencies(MIOpenDriver generate_kernels) target_include_directories(MIOpenDriver PRIVATE ../src/kernels) # MIOpen_with_plugins ensures CK plugin .so's are built alongside MIOpenDriver -target_link_libraries(MIOpenDriver PRIVATE 
MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json ) +target_link_libraries(MIOpenDriver PRIVATE MIOpen_with_plugins Threads::Threads roc::rocrand nlohmann_json::nlohmann_json miopen_common_utils miopen_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(MIOpenDriver PRIVATE $ ) endif() diff --git a/projects/miopen/driver/adam_driver.hpp b/projects/miopen/driver/adam_driver.hpp index f0c0258c8241..6c1984c44e87 100644 --- a/projects/miopen/driver/adam_driver.hpp +++ b/projects/miopen/driver/adam_driver.hpp @@ -32,7 +32,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/addlayernorm_driver.hpp b/projects/miopen/driver/addlayernorm_driver.hpp index effdc90c6127..a1bac6125dfc 100644 --- a/projects/miopen/driver/addlayernorm_driver.hpp +++ b/projects/miopen/driver/addlayernorm_driver.hpp @@ -26,8 +26,8 @@ #ifndef GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_ADDLAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "random.hpp" diff --git a/projects/miopen/driver/bn_driver.hpp b/projects/miopen/driver/bn_driver.hpp index 29cdfd970356..82802f8bd965 100644 --- a/projects/miopen/driver/bn_driver.hpp +++ b/projects/miopen/driver/bn_driver.hpp @@ -35,9 +35,9 @@ #include "util_driver.hpp" #include "rocrand_wrapper.hpp" -#include "../test/verify.hpp" -#include "../test/random.hpp" -#include "../test/fusionHost.hpp" +#include +#include +#include #include #include diff --git a/projects/miopen/driver/cat_driver.hpp b/projects/miopen/driver/cat_driver.hpp index f9a675440c15..a4e6804f9aad 100644 --- a/projects/miopen/driver/cat_driver.hpp +++ b/projects/miopen/driver/cat_driver.hpp @@ -18,8 +18,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include 
#ifndef MLO_CATHOST_H_ diff --git a/projects/miopen/driver/conv_driver.hpp b/projects/miopen/driver/conv_driver.hpp index fcdbdbbd2ea6..77010d71e87a 100644 --- a/projects/miopen/driver/conv_driver.hpp +++ b/projects/miopen/driver/conv_driver.hpp @@ -28,10 +28,10 @@ #include #include -#include <../test/cpu_bias.hpp> -#include <../test/cpu_conv.hpp> -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include +#include +#include #include #include diff --git a/projects/miopen/driver/conv_verify.hpp b/projects/miopen/driver/conv_verify.hpp index ae315843f01e..31d611bce134 100644 --- a/projects/miopen/driver/conv_verify.hpp +++ b/projects/miopen/driver/conv_verify.hpp @@ -27,7 +27,7 @@ #define GUARD_MIOPEN_CONV_VERIFY_HPP #include -#include "../test/gemm.hpp" +#include template diff --git a/projects/miopen/driver/ctc_driver.hpp b/projects/miopen/driver/ctc_driver.hpp index 2b8e64a8f79a..85aecb3264d3 100644 --- a/projects/miopen/driver/ctc_driver.hpp +++ b/projects/miopen/driver/ctc_driver.hpp @@ -35,7 +35,7 @@ #include -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/driver.hpp b/projects/miopen/driver/driver.hpp index 5bb698554566..2ebbcc2a4000 100644 --- a/projects/miopen/driver/driver.hpp +++ b/projects/miopen/driver/driver.hpp @@ -39,7 +39,7 @@ #include #include #include -#include <../test/tensor_holder.hpp> +#include #include "util_driver.hpp" #include "rocrand_wrapper.hpp" using half = half_float::half; diff --git a/projects/miopen/driver/dropout_driver.hpp b/projects/miopen/driver/dropout_driver.hpp index 84d942155a08..0016340fd60e 100644 --- a/projects/miopen/driver/dropout_driver.hpp +++ b/projects/miopen/driver/dropout_driver.hpp @@ -34,7 +34,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/gemm_driver.hpp b/projects/miopen/driver/gemm_driver.hpp index d89a09a56644..8383b01ec22f 100644 
--- a/projects/miopen/driver/gemm_driver.hpp +++ b/projects/miopen/driver/gemm_driver.hpp @@ -34,7 +34,7 @@ #include "random.hpp" #include "util_driver.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/getitem_driver.hpp b/projects/miopen/driver/getitem_driver.hpp index 52a5bc262f82..55b0dfcd296c 100644 --- a/projects/miopen/driver/getitem_driver.hpp +++ b/projects/miopen/driver/getitem_driver.hpp @@ -40,8 +40,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include template int32_t mloGetitemBackwardRunHost(miopenTensorDescriptor_t dyDesc, diff --git a/projects/miopen/driver/glu_driver.hpp b/projects/miopen/driver/glu_driver.hpp index 38deb2d69e78..63bf7188db4d 100644 --- a/projects/miopen/driver/glu_driver.hpp +++ b/projects/miopen/driver/glu_driver.hpp @@ -38,7 +38,7 @@ #include #include -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/groupnorm_driver.hpp b/projects/miopen/driver/groupnorm_driver.hpp index 3773654c842d..97553dd3c13e 100644 --- a/projects/miopen/driver/groupnorm_driver.hpp +++ b/projects/miopen/driver/groupnorm_driver.hpp @@ -32,7 +32,7 @@ #include "mloGroupNormHost.hpp" #include "tensor_driver.hpp" #include "timer.hpp" -#include <../test/verify.hpp> +#include #include #include #include @@ -40,7 +40,7 @@ #include #include #include -#include <../test/tensor_holder.hpp> +#include #include "random.hpp" template diff --git a/projects/miopen/driver/gru_verify_gemm.hpp b/projects/miopen/driver/gru_verify_gemm.hpp index e07d6eab0bff..237d311b1c29 100644 --- a/projects/miopen/driver/gru_verify_gemm.hpp +++ b/projects/miopen/driver/gru_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/kthvalue_driver.hpp b/projects/miopen/driver/kthvalue_driver.hpp index 75f7e5b535b2..8cbfa302bf14 
100644 --- a/projects/miopen/driver/kthvalue_driver.hpp +++ b/projects/miopen/driver/kthvalue_driver.hpp @@ -30,8 +30,8 @@ #include "timer.hpp" #include "random.hpp" -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include #include diff --git a/projects/miopen/driver/layernorm_driver.hpp b/projects/miopen/driver/layernorm_driver.hpp index 6f6662f202f6..042e8a7164ea 100644 --- a/projects/miopen/driver/layernorm_driver.hpp +++ b/projects/miopen/driver/layernorm_driver.hpp @@ -26,9 +26,9 @@ #ifndef GUARD_MIOPEN_LAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_LAYERNORM_DRIVER_HPP -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> -#include <../test/cpu_layernorm.hpp> +#include +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "miopen/miopen.h" diff --git a/projects/miopen/driver/lrn_driver.hpp b/projects/miopen/driver/lrn_driver.hpp index c1645621acd4..2f164aad38b1 100644 --- a/projects/miopen/driver/lrn_driver.hpp +++ b/projects/miopen/driver/lrn_driver.hpp @@ -12,7 +12,7 @@ #include "timer.hpp" #include "util_driver.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/lstm_verify_gemm.hpp b/projects/miopen/driver/lstm_verify_gemm.hpp index fb98d5616ad5..a761779738f4 100644 --- a/projects/miopen/driver/lstm_verify_gemm.hpp +++ b/projects/miopen/driver/lstm_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/miopen_Reduction.hpp b/projects/miopen/driver/miopen_Reduction.hpp index 3aee4e375c97..0fc05603bf2e 100644 --- a/projects/miopen/driver/miopen_Reduction.hpp +++ b/projects/miopen/driver/miopen_Reduction.hpp @@ -31,7 +31,7 @@ #include #include -#include "../test/cpu_reduce_util.hpp" +#include #include "tensor_driver.hpp" diff --git a/projects/miopen/driver/mloSoftmaxHost.hpp b/projects/miopen/driver/mloSoftmaxHost.hpp index 
fd0a1768e6a6..928eb6f63490 100644 --- a/projects/miopen/driver/mloSoftmaxHost.hpp +++ b/projects/miopen/driver/mloSoftmaxHost.hpp @@ -1,350 +1,2 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#ifndef MLO_SOFTMAXHOST_H_ -#define MLO_SOFTMAXHOST_H_ - -#include -#include - -//////////////////////////////////////////////////////////// -// -/////////////////////////////////////////////////////////// - -#define NEGATIVE_INF_FP32 (-1e20) -#define NEGATIVE_INF_FP16 (-1e5) - -template -T logaddexp(T x, T y, T neg_inf) -{ - T a = std::max(x, y); - T b = std::min(x, y); - T c = b - a; - - return c <= neg_inf ? std::max(a, neg_inf) : std::max(T(a + log(T(1) + exp(b - a))), neg_inf); -} - -template -int mloSoftmaxForwardRunHost(miopenTensorDescriptor_t inputTensor, - miopenTensorDescriptor_t outputTensor, - Tgpu* in, - Tcheck* outhost, - float alpha, - float beta, - miopenSoftmaxAlgorithm_t algo, - miopenSoftmaxMode_t mode) -{ - int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; - int out_nstr, out_cstr, out_hstr, out_wstr; - miopenGet4dTensorDescriptorLengths(inputTensor, &n, &c, &h, &w); - miopenGet4dTensorDescriptorStrides(inputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); - miopenGet4dTensorDescriptorStrides(outputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); - - Tcheck max_val = (sizeof(Tgpu) == 4) ? 3.402823466e+38f : 65504.; - std::vector channel_max((mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? 
n : n * h * w), - static_cast(-max_val)); - std::vector results(n * c * h * w, static_cast(0.0)); - - int ret = 0; - - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - { - for(int i = 0; i < n; i++) - { - if(algo == MIOPEN_SOFTMAX_FAST) - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); - } - } - else - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - channel_max[i] = std::max( - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), - channel_max[i]); - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - - channel_max[i]; - } - } - - if(algo == MIOPEN_SOFTMAX_LOG) - { - Tcheck neg_inf = static_cast( - miopen::deref(inputTensor).GetType() == miopenHalf ? 
NEGATIVE_INF_FP16 - : NEGATIVE_INF_FP32); - channel_max[i] = neg_inf; - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - channel_max[i] = logaddexp(results[(i * c + j) * h * w + s0 * w + s1], - channel_max[i], - neg_inf); - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * - (results[(i * c + j) * h * w + s0 * w + s1] - channel_max[i]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - else - { - channel_max[i] = 0.0; - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - exp(results[(i * c + j) * h * w + s0 * w + s1]); - channel_max[i] += results[(i * c + j) * h * w + s0 * w + s1]; - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * - (results[(i * c + j) * h * w + s0 * w + s1] / channel_max[i]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - } - } - else - { - for(int i = 0; i < n; i++) - { - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_FAST) - { - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); - } - } - else - { - for(int j = 0; j < c; j++) - { - channel_max[i * h * w + s0 * w + s1] = std::max( - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), - channel_max[i * h * w + s0 * w + s1]); - } - - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast( - in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - - channel_max[i * h * w + s0 * w + s1]; - } - } - 
- if(algo == MIOPEN_SOFTMAX_LOG) - { - Tcheck neg_inf = static_cast( - miopen::deref(inputTensor).GetType() == miopenHalf ? NEGATIVE_INF_FP16 - : NEGATIVE_INF_FP32); - channel_max[i * h * w + s0 * w + s1] = results[i * c * h * w + s0 * w + s1]; - for(int j = 1; j < c; j++) - { - channel_max[i * h * w + s0 * w + s1] = - logaddexp(results[(i * c + j) * h * w + s0 * w + s1], - channel_max[i * h * w + s0 * w + s1], - neg_inf); - } - - for(int j = 0; j < c; j++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * (results[(i * c + j) * h * w + s0 * w + s1] - - channel_max[i * h * w + s0 * w + s1]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - else - { - channel_max[i * h * w + s0 * w + s1] = 0.0; - for(int j = 0; j < c; j++) - { - results[(i * c + j) * h * w + s0 * w + s1] = - exp(results[(i * c + j) * h * w + s0 * w + s1]); - channel_max[i * h * w + s0 * w + s1] += - results[(i * c + j) * h * w + s0 * w + s1]; - } - - for(int j = 0; j < c; j++) - { - outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = - alpha * (results[(i * c + j) * h * w + s0 * w + s1] / - channel_max[i * h * w + s0 * w + s1]) + - beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]; - } - } - } - } - } - - return ret; -} - -template -int mloSoftmaxBackwardRunHost(miopenTensorDescriptor_t dInputTensor, - miopenTensorDescriptor_t dOutputTensor, - Tgpu* out, - Tgpu* dout, - Tcheck* dinhost, - float alpha, - float beta, - miopenSoftmaxAlgorithm_t algo, - miopenSoftmaxMode_t mode) -{ - int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; - int out_nstr, out_cstr, out_hstr, out_wstr; - miopenGet4dTensorDescriptorLengths(dOutputTensor, &n, &c, &h, &w); - miopenGet4dTensorDescriptorStrides(dInputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); - miopenGet4dTensorDescriptorStrides(dOutputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); - - std::vector channel_dot((mode == 
MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), - static_cast(0.0)); - std::vector results(n * c * h * w, static_cast(0.0)); - - int ret = 0; - - for(int i = 0; i < n; i++) - { - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - { - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - channel_dot[i] += static_cast( - dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - else - { - channel_dot[i] += - static_cast(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) * - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - } - - for(int j = 0; j < c; j++) - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i] * std::exp(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - else - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i]; - - results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( - out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = - alpha * results[(i * c + j) * h * w + s0 * w + s1] + - beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; - } - } - else - { - for(int s0 = 0; s0 < h; s0++) - for(int s1 = 0; s1 < w; s1++) - { - for(int j = 0; j < c; j++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - channel_dot[i * h * w + s0 * w + s1] += static_cast( - dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - else - { - channel_dot[i * h * w + s0 * w + s1] += - static_cast(out[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) * - static_cast(dout[i * out_nstr + j * 
out_cstr + - s0 * out_hstr + s1 * out_wstr]); - } - } - - for(int j = 0; j < c; j++) - { - if(algo == MIOPEN_SOFTMAX_LOG) - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i * h * w + s0 * w + s1] * - std::exp(out[i * out_nstr + j * out_cstr + s0 * out_hstr + - s1 * out_wstr]); - } - else - { - results[(i * c + j) * h * w + s0 * w + s1] = - static_cast(dout[i * out_nstr + j * out_cstr + - s0 * out_hstr + s1 * out_wstr]) - - channel_dot[i * h * w + s0 * w + s1]; - - results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( - out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); - } - dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = - alpha * results[(i * c + j) * h * w + s0 * w + s1] + - beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; - } - } - } - } - - return ret; -} - -#endif +// Forwarding header — implementation moved to miopen_utils. 
+#include diff --git a/projects/miopen/driver/multimarginloss_driver.hpp b/projects/miopen/driver/multimarginloss_driver.hpp index dab040ef3ef3..5d2a60db4507 100644 --- a/projects/miopen/driver/multimarginloss_driver.hpp +++ b/projects/miopen/driver/multimarginloss_driver.hpp @@ -36,8 +36,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include template diff --git a/projects/miopen/driver/prelu_driver.hpp b/projects/miopen/driver/prelu_driver.hpp index 761f97cc64eb..cab2eb811885 100644 --- a/projects/miopen/driver/prelu_driver.hpp +++ b/projects/miopen/driver/prelu_driver.hpp @@ -31,7 +31,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include <../test/verify.hpp> +#include #include diff --git a/projects/miopen/driver/random.hpp b/projects/miopen/driver/random.hpp index f6f8d85c4ce4..30be9387d99c 100644 --- a/projects/miopen/driver/random.hpp +++ b/projects/miopen/driver/random.hpp @@ -1,159 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2025 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_RANDOM_GEN_ -#define GUARD_RANDOM_GEN_ - -#include - -#include -#include -#include - -MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_DRIVER_PRNG_SEED, 12345678) - -namespace env = miopen::env; - -namespace prng { -namespace details { -using glibc_gen = std::linear_congruential_engine; - -inline std::random_device::result_type get_default_seed() -{ - static std::random_device::result_type seed{[] { - auto external_seed = env::value(MIOPEN_DEBUG_DRIVER_PRNG_SEED); - - auto seed_ = external_seed == 0 - ? std::random_device{}() - : static_cast(external_seed); - std::cout << "PRNG seed: " << seed_ << "\n"; - return seed_; - }()}; - return seed; -} - -inline glibc_gen& get_prng() -{ - static thread_local glibc_gen gen{get_default_seed()}; - return gen; -} - -template -struct has_digits : std::false_type -{ -}; - -template -struct has_digits::digits)>> : std::true_type -{ -}; - -} // namespace details - -inline void reset_seed(std::random_device::result_type seed = 0) -{ - details::get_prng().seed(seed + details::get_default_seed()); -} - -// similar to std::generate_canonical, but simpler and faster -template -inline T gen_canonical() -{ - if constexpr(std::is_floating_point_v) // native fp - { - static constexpr T range = - static_cast(1) / - static_cast(details::glibc_gen::max() - details::glibc_gen::min() + 1); - return range * static_cast(details::get_prng()() - details::glibc_gen::min()); - } - else if constexpr(std::is_integral_v) - { - auto val = details::get_prng()(); - return static_cast(((val >> 4) + (val >> 16)) & 0x1); - } - else - { - return static_cast(gen_canonical()); - } -} - -template 
-inline T gen_0_to_B(T B) -{ - if constexpr(std::is_floating_point_v) // native fp - { - return gen_canonical() * B; - } - else if constexpr(std::is_integral_v) - { - // can only generate 27bit range, so it may not be suitable - // for huge 64 bit ranges, but we do not expect such ranges - return static_cast((details::get_prng()() >> 4) % B); - } - else // half/bfloat/etc - { - return static_cast(gen_0_to_B(static_cast(B))); - } -} - -template -inline T gen_A_to_B(T A, T B) -{ - assert(B > A); - return gen_0_to_B(B - A) + A; -} - -template -inline T gen_off_range(T offset, T range) -{ - static_assert(std::is_integral_v); - return prng::gen_0_to_B(range) + offset; -} - -template -inline T gen_subnorm() -{ - T denorm_val = static_cast(0); - if constexpr(!std::is_integral_v && !std::is_same_v && - std::is_trivially_copyable::value && details::has_digits::value) - { - using BitType = std::conditional_t>; - static_assert(sizeof(T) == sizeof(BitType)); - - // -1 because ::digits counts the first implicit digit - static constexpr auto mantissa_bits = std::numeric_limits::digits - 1; - - BitType denorm_bits = static_cast(gen_0_to_B(1 << mantissa_bits)); - denorm_bits |= Signed ? (gen_canonical() << (sizeof(T) * 8 - 1)) : 0; - - // the proper way to do a type punning - std::memcpy(&denorm_val, &denorm_bits, sizeof(T)); - } - return denorm_val; -} -} // namespace prng -#endif // GUARD_RANDOM_GEN_ +// Forwarding header — implementation moved to common_utils. 
+#include diff --git a/projects/miopen/driver/reduce_driver.hpp b/projects/miopen/driver/reduce_driver.hpp index ab1c50e806f1..6300fa32a690 100644 --- a/projects/miopen/driver/reduce_driver.hpp +++ b/projects/miopen/driver/reduce_driver.hpp @@ -35,7 +35,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/driver/reducecalculation_driver.hpp b/projects/miopen/driver/reducecalculation_driver.hpp index 200196950997..738fb6032f3c 100644 --- a/projects/miopen/driver/reducecalculation_driver.hpp +++ b/projects/miopen/driver/reducecalculation_driver.hpp @@ -40,8 +40,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "../src/kernels/MIOpenReduceCalculation.hpp" #ifndef MLO_REDUCE_CALCULATIONMHOST_H_ diff --git a/projects/miopen/driver/reduceextreme_driver.hpp b/projects/miopen/driver/reduceextreme_driver.hpp index a06f5288a164..b2caf5dda398 100644 --- a/projects/miopen/driver/reduceextreme_driver.hpp +++ b/projects/miopen/driver/reduceextreme_driver.hpp @@ -39,8 +39,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "../src/kernels/MIOpenReduceExtreme.hpp" template diff --git a/projects/miopen/driver/rnn_driver.hpp b/projects/miopen/driver/rnn_driver.hpp index 4cd47739f5ea..7f35be320155 100644 --- a/projects/miopen/driver/rnn_driver.hpp +++ b/projects/miopen/driver/rnn_driver.hpp @@ -36,7 +36,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/rnn_seq_driver.hpp b/projects/miopen/driver/rnn_seq_driver.hpp index 1ac9b23c0b4c..7babcfd00273 100644 --- a/projects/miopen/driver/rnn_seq_driver.hpp +++ b/projects/miopen/driver/rnn_seq_driver.hpp @@ -36,7 +36,7 @@ #include "util_driver.hpp" #include "util_file.hpp" -#include 
<../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/rnn_verify_gemm.hpp b/projects/miopen/driver/rnn_verify_gemm.hpp index b1fa42c3503b..04b73111513d 100644 --- a/projects/miopen/driver/rnn_verify_gemm.hpp +++ b/projects/miopen/driver/rnn_verify_gemm.hpp @@ -28,7 +28,7 @@ #include "dropout_gpu_emulator.hpp" -#include <../test/rnn_util.hpp> +#include #include #include diff --git a/projects/miopen/driver/rope_driver.hpp b/projects/miopen/driver/rope_driver.hpp index bbad2370bf4e..27f0a03126ac 100644 --- a/projects/miopen/driver/rope_driver.hpp +++ b/projects/miopen/driver/rope_driver.hpp @@ -39,8 +39,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include template int32_t mloRoPEForwardRunHost(miopenTensorDescriptor_t xDesc, diff --git a/projects/miopen/driver/softmarginloss_driver.hpp b/projects/miopen/driver/softmarginloss_driver.hpp index 3a6b095eaa0e..6589abd88db9 100644 --- a/projects/miopen/driver/softmarginloss_driver.hpp +++ b/projects/miopen/driver/softmarginloss_driver.hpp @@ -35,8 +35,8 @@ #include #include #include -#include <../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include template diff --git a/projects/miopen/driver/softmax_driver.hpp b/projects/miopen/driver/softmax_driver.hpp index e147191b2deb..52f42fdfd5f8 100644 --- a/projects/miopen/driver/softmax_driver.hpp +++ b/projects/miopen/driver/softmax_driver.hpp @@ -11,7 +11,7 @@ #include "timer.hpp" #include "util_driver.hpp" -#include <../test/verify.hpp> +#include #include #include diff --git a/projects/miopen/driver/t5layernorm_driver.hpp b/projects/miopen/driver/t5layernorm_driver.hpp index c8517ad525d8..b57fe456403f 100644 --- a/projects/miopen/driver/t5layernorm_driver.hpp +++ b/projects/miopen/driver/t5layernorm_driver.hpp @@ -26,8 +26,8 @@ #ifndef GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP #define GUARD_MIOPEN_T5LAYERNORM_DRIVER_HPP -#include 
<../test/tensor_holder.hpp> -#include <../test/verify.hpp> +#include +#include #include "InputFlags.hpp" #include "driver.hpp" #include "random.hpp" diff --git a/projects/miopen/driver/transformers_adam_w_driver.hpp b/projects/miopen/driver/transformers_adam_w_driver.hpp index dfd82a3284c6..a1cd81f2eb53 100644 --- a/projects/miopen/driver/transformers_adam_w_driver.hpp +++ b/projects/miopen/driver/transformers_adam_w_driver.hpp @@ -32,7 +32,7 @@ #include "tensor_driver.hpp" #include "timer.hpp" -#include "../test/verify.hpp" +#include #include #include diff --git a/projects/miopen/miopen_utils/CMakeLists.txt b/projects/miopen/miopen_utils/CMakeLists.txt new file mode 100644 index 000000000000..e93a717d0a0e --- /dev/null +++ b/projects/miopen/miopen_utils/CMakeLists.txt @@ -0,0 +1,40 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +# INTERNAL BUILD-ONLY library — not installed, not exported, not part of the public MIOpen API. +# Shared verification/test utilities for MIOpenDriver and tests. +# Depends on common_utils and the MIOpen public API (miopen.h). +# Do NOT add install(TARGETS miopen_utils ...) — headers live in the build tree only. + +add_library(miopen_utils INTERFACE) +set_target_properties(miopen_utils PROPERTIES EXCLUDE_FROM_ALL TRUE) + +target_include_directories(miopen_utils INTERFACE + # BUILD_INTERFACE only — no install interface; these headers are not installed. + $ +) + +target_link_libraries(miopen_utils INTERFACE miopen_common_utils) diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp new file mode 100644 index 000000000000..0125ca37d298 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_bias.hpp @@ -0,0 +1,140 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_BIAS_HPP +#define GUARD_CPU_BIAS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +template +void cpu_bias_forward_impl(tensor& out, const tensor& bias) +{ + assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); + assert( + bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[1] && + std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { + return v == 1; + })); + + out.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { + out(out_n_id, out_k_id, out_spatial_id_pack...) 
= + double(out(out_n_id, out_k_id, out_spatial_id_pack...)) + double(bias.data[out_k_id]); + }); +} + +template +void cpu_bias_backward_data_impl(const tensor& out, tensor& bias) +{ + assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); + assert( + bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[0] && + std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { + return v == 1; + })); + + std::size_t out_n_len = out.desc.GetLengths()[0]; + std::size_t out_k_len = out.desc.GetLengths()[1]; + + std::array out_spatial_len{}; + std::copy_n(out.desc.GetLengths().begin() + 2, NSpatialDim, out_spatial_len.begin()); + + miopen::par_ford(out_k_len)([&](auto out_k_id) { + auto ford_out_n_spatial = + miopen::unpacker(miopen::prepender(miopen::ford, out_n_len))(out_spatial_len); + + double acc = 0; + ford_out_n_spatial([&](auto out_n_id, auto... out_spatial_id_pack) { + acc += double(out(out_n_id, out_k_id, out_spatial_id_pack...)); + }); + + bias.data[out_k_id] = acc; + }); +} + +template +void cpu_bias_forward(tensor& out, const tensor& bias) +{ + switch(out.desc.GetNumDims()) + { + case 3: { + cpu_bias_forward_impl<1>(out, bias); + break; + } + case 4: { + cpu_bias_forward_impl<2>(out, bias); + break; + } + case 5: { + cpu_bias_forward_impl<3>(out, bias); + break; + } + case 6: { + cpu_bias_forward_impl<4>(out, bias); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template +void cpu_bias_backward_data(const tensor& out, tensor& bias) +{ + switch(out.desc.GetNumDims()) + { + case 3: { + cpu_bias_backward_data_impl<1>(out, bias); + break; + } + case 4: { + cpu_bias_backward_data_impl<2>(out, bias); + break; + } + case 5: { + cpu_bias_backward_data_impl<3>(out, bias); + break; + } + case 6: { + cpu_bias_backward_data_impl<4>(out, bias); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} +#endif diff --git 
a/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp new file mode 100644 index 000000000000..2ef2c5b31236 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_conv.hpp @@ -0,0 +1,514 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_CPU_CONV_HPP +#define GUARD_CPU_CONV_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +template +static constexpr auto make_array(T x, Ts... 
xs) +{ + return std::array{{x, xs...}}; +} + +template +struct PassThru +{ + T operator()(T t) { return t; } +}; + +template +struct cpu_convolution_acc_type +{ + using type = double; // default using double as accumulator +}; + +template <> +struct cpu_convolution_acc_type +{ + using type = int32_t; +}; + +template <> +struct cpu_convolution_acc_type +{ + using type = double; +}; + +template +void cpu_convolution_forward_impl(const tensor& in, + const tensor& wei, + tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FW fw = {}) +{ + static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + std::size_t out_n_len = out.desc.GetLengths()[0]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t vector_len = in.desc.GetVectorLength(); + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + if(wei.desc.GetLayout_str() == "CHWNc") + { + wei_c_len = wei.desc.GetLengths()[0]; + std::copy_n(wei.desc.GetLengths().begin() + 1, ConvDim, wei_spatial_len.begin()); + wei_k_len = wei.desc.GetLengths()[3]; + } + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + // f(x0, x1, xs...) + // f1(xs...) = f(x0, x1, xs...) + // f2(xs_array) = f1(xs...) 
+ auto par_ford_out_nk_spatial = miopen::unpacker( + miopen::prepender(miopen::par_ford, out_n_len, wei_k_len))(out_spatial_len); + + par_ford_out_nk_spatial([&](std::size_t out_n_id, + std::size_t out_k_id, + auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::size_t group_id = out_k_id / wei_k_len_per_group; + Tacc acc = 0; + + miopen::ford(wei_c_len)([&](std::size_t wei_c_id) { + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + + auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); + + ford_wei_spatial([&](auto... wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::array in_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = + out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; + } + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or + (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); + } + if(!out_of_bound) + { + if(vector_len > 1) + { + std::array in_id{}; + in_id[1] = out_n_id; + in_id[2] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 3); + for(std::size_t i = 0; i < vector_len; i++) + { + in_id[0] = i; + acc += Tacc(in(in_id)) * + Tacc(wei(i, out_k_id, wei_c_id, wei_spatial_id_pack...)); + } + } + else + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = static_cast(fi(in(in_id))); + Tacc tmp2 = + static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); + acc += tmp1 * tmp2; + } + } + }); + }); + if(vector_len > 1) + { + out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = + static_cast(acc); + } + else + { + out(out_n_id, out_k_id, out_spatial_id_pack...) 
= static_cast(acc); + } + }); +} + +template +void cpu_convolution_backward_data_impl(tensor& in, + const tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FW fw = {}, + FO fo = {}) +{ + static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + + std::size_t in_n_len = in.desc.GetLengths()[0]; + std::size_t in_c_len = in.desc.GetLengths()[1]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + auto par_ford_in_nc_spatial = + miopen::unpacker(miopen::prepender(miopen::par_ford, in_n_len, in_c_len))(in_spatial_len); + + par_ford_in_nc_spatial( + [&](std::size_t in_n_id, std::size_t in_c_id, auto... in_spatial_id_pack) { + auto in_spatial_id = make_array(in_spatial_id_pack...); + + std::size_t group_id = in_c_id / wei_c_len; + + Tacc acc = 0; + + miopen::ford(wei_k_len_per_group)([&](std::size_t wei_k_id_inside_group) { + auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); + + ford_wei_spatial([&](auto... 
wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::array out_spatial_id_{}; + std::array out_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_spatial_id_[i] = + pads[i] + in_spatial_id[i] - wei_spatial_id[i] * dilations[i]; + out_spatial_id[i] = out_spatial_id_[i] / strides[i]; + } + + bool use = true; + for(std::size_t i = 0; i < ConvDim; ++i) + { + use &= out_spatial_id_[i] % strides[i] == 0 and out_spatial_id[i] >= 0 and + out_spatial_id[i] < out_spatial_len[i]; + } + + if(use) + { + std::size_t out_k_id = + group_id * wei_k_len_per_group + wei_k_id_inside_group; + std::size_t wei_c_id = in_c_id % wei_c_len; + + std::array out_id{}; + out_id[0] = in_n_id; + out_id[1] = out_k_id; + std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); + Tacc tmp1 = fo(out(out_id)); + Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); + }); + // TODO: Why do we need a no-lint here ? + in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT + }); +} + +template +void cpu_convolution_backward_weight_impl(const tensor& in, + tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi, + FO fo) +{ + static_assert(ConvDim > 0, "wrong! 
convolution dim should be larger than 0"); + assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and + out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and + strides.size() == ConvDim and dilations.size() == ConvDim); + + std::size_t out_n_len = out.desc.GetLengths()[0]; + + std::size_t wei_k_len = wei.desc.GetLengths()[0]; + std::size_t wei_c_len = wei.desc.GetLengths()[1]; + + std::size_t wei_k_len_per_group = wei_k_len / group_count; + + std::array in_spatial_len{}; + std::array wei_spatial_len{}; + std::array out_spatial_len{}; + + std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); + std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); + std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); + + auto par_ford_wei_kc_spatial = miopen::unpacker( + miopen::prepender(miopen::par_ford, wei_k_len, wei_c_len))(wei_spatial_len); + + par_ford_wei_kc_spatial( + [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); + + std::size_t group_id = wei_k_id / wei_k_len_per_group; + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + + Tacc acc = 0; + + miopen::ford(out_n_len)([&](std::size_t out_n_id) { + auto ford_out_spatial = miopen::unpacker(miopen::ford)(out_spatial_len); + + ford_out_spatial([&](auto... 
out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); + + std::array in_spatial_id{}; + + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = out_spatial_id[i] * strides[i] + + wei_spatial_id[i] * dilations[i] - pads[i]; + } + + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or + in_spatial_id[i] >= in_spatial_len[i]); + } + + if(!out_of_bound) + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = fi(in(in_id)); + Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); + + wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); + }); + }); +} + +template , + typename FW = PassThru> +void cpu_convolution_forward(std::size_t spatial_dim, + const tensor& in, + const tensor& wei, + tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FW fw = {}) +{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_forward_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 2: { + cpu_convolution_forward_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 3: { + cpu_convolution_forward_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + case 4: { + cpu_convolution_forward_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template , + typename FO = PassThru> +void cpu_convolution_backward_data(std::size_t spatial_dim, + tensor& in, + const tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FW fw = {}, + FO fo = {}) 
+{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_backward_data_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 2: { + cpu_convolution_backward_data_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 3: { + cpu_convolution_backward_data_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + case 4: { + cpu_convolution_backward_data_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} + +template , + typename FO = PassThru> +void cpu_convolution_backward_weight(std::size_t spatial_dim, + const tensor& in, + tensor& wei, + const tensor& out, + const Range& pads, + const Range& strides, + const Range& dilations, + std::size_t group_count, + FI fi = {}, + FO fo = {}) +{ + switch(spatial_dim) + { + case 1: { + cpu_convolution_backward_weight_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 2: { + cpu_convolution_backward_weight_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 3: { + cpu_convolution_backward_weight_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + case 4: { + cpu_convolution_backward_weight_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); + break; + } + default: { + MIOPEN_THROW("not belong to any case"); + } + } +} +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp new file mode 100644 index 000000000000..0a6ab5556865 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_layernorm.hpp @@ -0,0 +1,216 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT
+#ifndef GUARD_CPU_LAYERNORM_HPP
+#define GUARD_CPU_LAYERNORM_HPP
+
+#include
+
+template
+void cpu_layernorm_forward(tensor input,
+                           tensor weight,
+                           tensor bias,
+                           tensor& ref_output,
+                           tensor& ref_mean,
+                           tensor& ref_rstd,
+                           float eps,
+                           int32_t dim,
+                           miopenNormMode_t mode,
+                           bool use_multithread = false)
+{
+    auto layout   = input.desc.GetLayoutEnum();
+    size_t stride = 1;
+    if(dim > 1 && layout.has_value() &&
+       (layout.value() == miopenTensorNHWC || layout.value() == miopenTensorNDHWC))
+    {
+        stride = input.desc.GetLengths()[1]; // stride = C
+    }
+
+    auto dims         = input.desc.GetLengths();
+    size_t outer_size = 1;
+    size_t inner_size = 1;
+    for(size_t i = 0; i < dims.size(); ++i)
+    {
+        if(i < dim)
+        {
+            if(!(stride > 1 && i == 1))
+            {
+                outer_size *= dims[i];
+            }
+        }
+        else
+        {
+            inner_size *= dims[i];
+        }
+    }
+
+    size_t min_grain = use_multithread ? 8 : outer_size;
+    miopen::par_for(outer_size, min_grain, [&](int32_t o) {
+        miopen::ford(stride)([&](int32_t s) {
+            double mean_v = 0.0;
+            double var_v  = 0.0;
+
+            miopen::ford(inner_size)([&](int32_t i) {
+                double tmp = static_cast(input[o * inner_size * stride + i * stride + s]);
+                mean_v += tmp;
+                var_v += tmp * tmp;
+            });
+
+            mean_v        = mean_v / inner_size;
+            var_v         = var_v / inner_size - mean_v * mean_v;
+            double rstd_v = 1.0 / sqrt(var_v + eps);
+
+            ref_mean[o * stride + s] = static_cast(mean_v);
+            ref_rstd[o * stride + s] = static_cast(rstd_v);
+
+            miopen::ford(inner_size)([&](int32_t i) {
+                double weight_v =
+                    (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]);
+                double bias_v =
+                    (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
0.0 : static_cast(bias[i]); + + ref_output[o * inner_size * stride + i * stride + s] = static_cast( + (static_cast(input[o * inner_size * stride + i * stride + s]) - + mean_v) * + rstd_v * weight_v + + bias_v); + }); + }); + }); +} + +template +void cpu_layernorm_backward(tensor dy, + tensor x, + tensor weight, + tensor mean, + tensor rstd, + tensor& ref_dx, + int32_t dim, + miopenNormMode_t mode, + bool use_multithread = false) +{ + auto layout = dy.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) + { + stride = dy.desc.GetLengths()[1]; // stride = C + } + + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : outer_size; + miopen::par_for(outer_size, min_grain, [&](int32_t o) { + miopen::ford(stride)([&](int32_t s) { + double sum_dy_weight = 0.0; + double sum_dy_weight_x = 0.0; + + miopen::ford(inner_size)([&](int32_t i) { + double pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double pdy = (dy.GetSize() != 0) + ? static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0.0; + double px = static_cast(x[o * inner_size * stride + i * stride + s]); + + sum_dy_weight += pdy * pweight; + sum_dy_weight_x += pdy * px * pweight; + }); + + double scale = 1.0 / static_cast(inner_size); + double prstd = static_cast(rstd[o * stride + s]); + double pmean = static_cast(mean[o * stride + s]); + double a = prstd * prstd * prstd * scale * (sum_dy_weight_x - sum_dy_weight * pmean); + double b = prstd * sum_dy_weight * scale - a * pmean; + + miopen::ford(inner_size)([&](int32_t i) { + double pweight = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); + double pdy = (dy.GetSize() != 0) + ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0.0; + double val = prstd * pdy * pweight - + a * static_cast(x[o * inner_size * stride + i * stride + s]) - + b; + + ref_dx[o * inner_size * stride + i * stride + s] = static_cast(val); + }); + }); + }); +} + +template +void cpu_layernorm_backward_weight_bias(tensor dy, + tensor x, + tensor mean, + tensor rstd, + tensor& ref_dw, + tensor& ref_db, + int32_t dim, + bool use_multithread = false) +{ + auto layout = dy.desc.GetLayoutEnum(); + size_t stride = 1; + if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) + { + stride = dy.desc.GetLengths()[1]; // stride = C + } + + auto dims = dy.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + for(size_t i = 0; i < dims.size(); ++i) + { + if(i < dim) + { + if(!(stride > 1 && i == 1)) + { + outer_size *= dims[i]; + } + } + else + { + inner_size *= dims[i]; + } + } + + size_t min_grain = use_multithread ? 8 : inner_size; + miopen::par_for(inner_size, min_grain, [&](int32_t i) { + double sum_dw = 0.0; + double sum_db = 0.0; + + miopen::ford(stride)([&](int32_t s) { + miopen::ford(outer_size)([&](int32_t o) { + double prstd = static_cast(rstd[o * stride + s]); + double pmean = static_cast(mean[o * stride + s]); + double pdy = (dy.GetSize() != 0) + ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) + : 0; + double px = static_cast(x[o * inner_size * stride + i * stride + s]); + + sum_dw += pdy * (px - pmean) * prstd; + sum_db += pdy; + }); + }); + + ref_dw[i] = sum_dw; + ref_db[i] = sum_db; + }); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp b/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp new file mode 100644 index 000000000000..e5f7d50f9d0b --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/cpu_reduce_util.hpp @@ -0,0 +1,649 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_REDUCE_UTIL_HPP +#define GUARD_CPU_REDUCE_UTIL_HPP + +#include "miopen/reducetensor.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace reduce { + +template +static inline bool float_equal_one(T); + +static inline bool float_equal_one(float x) { return x == 1.0f; }; + +static inline bool float_equal_one(double x) { return x == 1.0; }; + +static inline bool float_equal_one(half_float::half x) +{ + return x == convert_type(1.0f); +}; + +template +static inline bool float_equal_zero(T x); + +static inline bool float_equal_zero(float x) { return x == 0.0f; }; + +static inline bool float_equal_zero(double x) { return x == 0.0; }; + +static inline bool float_equal_zero(half_float::half x) +{ + return x == convert_type(0.0f); +}; + +template +static inline void build_radix(const std::vector& lens, std::vector& radix) +{ + const std::size_t D = lens.size(); + radix.assign(D, 1); + for(std::size_t d = D; d-- > 1;) + radix[d - 1] = radix[d] * static_cast(lens[d]); // radix[d] = Π_{k>d} lens[k] +} + +// i -> memory offset using lens-radix + actual strides +template +static inline std::size_t linear_to_offset_by_lens_strides(std::size_t i, + const std::vector& lens, + const std::vector& radix, + const std::vector& strides) +{ + std::size_t off = 0; + for(std::size_t d = 0; d < lens.size(); ++d) + { + const std::size_t idx_d = (i / radix[d]) % static_cast(lens[d]); + off += idx_d * static_cast(strides[d]); + } + return off; +} + +template +static inline std::function PreUnaryOpFn(miopenReduceTensorOp_t op_, std::size_t) +{ + using std::abs; + + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_NORM1: return ([&](compType& a_) { a_ = abs(a_); }); + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = a_ * a_; }); + case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType& a_) { a_ = abs(a_); }); + 
+ case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_MIN: + case MIOPEN_REDUCE_TENSOR_MAX: return ([&](compType&) {}); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function PosUnaryOpFn(miopenReduceTensorOp_t op_, + std::size_t divider) +{ + using std::sqrt; + + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = sqrt(a_); }); + + case MIOPEN_REDUCE_TENSOR_AVG: + return ([&, divider](compType& a_) { + a_ = a_ / convert_type(static_cast(divider)); + }); + + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_MIN: + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType&) {}); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function ReduceOpFn(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); + + case MIOPEN_REDUCE_TENSOR_MUL: return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); + + case MIOPEN_REDUCE_TENSOR_MIN: + return ([&](compType& a_, compType b_) { + if(a_ > b_) + a_ = b_; + }); + + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: + return ([&](compType& a_, compType b_) { + if(a_ < b_) + a_ = b_; + }); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline std::function +ReduceOpFn2(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_MIN: + return ([&](compType& a_, compType b_, bool& changed) { + 
if(a_ > b_) + { + a_ = b_; + changed = true; + } + else + { + changed = false; + } + }); + + case MIOPEN_REDUCE_TENSOR_MAX: + case MIOPEN_REDUCE_TENSOR_AMAX: + return ([&](compType& a_, compType b_, bool& changed) { + if(a_ < b_) + { + a_ = b_; + changed = true; + } + else + { + changed = false; + } + }); + + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_MUL: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return (std::function{}); + }; + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline compType ReduceOpZeroVal(miopenReduceTensorOp_t op_) +{ + switch(op_) + { + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_AVG: + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: return (convert_type(0.0f)); + + case MIOPEN_REDUCE_TENSOR_MUL: return (convert_type(1.0f)); + + case MIOPEN_REDUCE_TENSOR_MIN: return (std::numeric_limits::max()); + + case MIOPEN_REDUCE_TENSOR_MAX: return (std::numeric_limits::lowest()); + case MIOPEN_REDUCE_TENSOR_AMAX: return (convert_type(0.0f)); + } + + throw std::runtime_error(std::string(__FUNCTION__) + + ": using undefined Reduction operation is not permitted"); +}; + +template +static inline void binop_with_nan_check(miopenNanPropagation_t nanOpt, + reduceOpT&& opReduce, + compType& accuVal, + compType currVal) +{ + using std::isnan; + + if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) + { + opReduce(accuVal, currVal); + } + else + { + if(isnan(currVal)) + accuVal = currVal; + else + opReduce(accuVal, currVal); + }; +}; + +template +static inline void binop_with_nan_check2(miopenNanPropagation_t nanOpt, + reduceOpT&& opReduce, + compType& accuVal, + compType currVal, + int& accuIndex, + int currIndex) +{ + using std::isnan; + + if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = 
currIndex; + } + else + { + if(isnan(currVal)) + { + accuVal = currVal; + accuIndex = currIndex; + } + else + { + bool changed; + + opReduce(accuVal, currVal, changed); + + if(changed) + accuIndex = currIndex; + }; + }; +}; + +}; // end of namespace reduce + +template +std::vector> get_all_indexes(const std::vector& lens) +{ + const std::size_t D = lens.size(); + assert(D > 0); + + std::size_t N = 1; + for(const auto L : lens) + N *= static_cast(L); + + std::vector> out; + out.resize(N); + for(auto& row : out) + row.resize(D); + + std::vector stride(D, 1); + for(std::size_t d = D; d-- > 1;) + stride[d - 1] = stride[d] * static_cast(lens[d]); + + for(std::size_t r = 0; r < N; ++r) + { + for(std::size_t d = 0; d < D; ++d) + out[r][d] = static_cast((r / stride[d]) % static_cast(lens[d])); + } + + return out; +} + +template +static inline T +linear_to_offset(size_t li, const std::vector& lens, const std::vector& strides) +{ + T off = 0; + for(int d = int(lens.size()) - 1; d >= 0; --d) + { + const T idx = li % lens[d]; + li /= lens[d]; + off += idx * strides[d]; + } + return off; +} + +template +T get_offset_from_index(const std::vector& strides, const std::vector& index) +{ + T offset = 0; + + assert(strides.size() == index.size()); + + for(int i = 0; i < index.size(); i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +T get_flatten_offset(const std::vector& lengths, const std::vector& index) +{ + T offset = 0; + + assert(lengths.size() == index.size() && !lengths.empty()); + + int len = lengths.size(); + T stride = 1; + + // for len==1, the loop is not executed + for(int i = len - 1; i > 0; i--) + { + offset += stride * index[i]; + + stride *= lengths[i]; + }; + + offset += stride * index[0]; + + return (offset); +}; + +template +struct Reducer +{ + compType acc; + bool withIdx; + int idx; // meaningful only when WithIdx==true + miopenNanPropagation_t nanOpt; + // functors for reduction + 
decltype(reduce::ReduceOpFn(MIOPEN_REDUCE_TENSOR_ADD)) opNoIdx; + decltype(reduce::ReduceOpFn2(MIOPEN_REDUCE_TENSOR_ADD)) opWithIdx; + + Reducer(miopenNanPropagation_t n, miopenReduceTensorOp_t rop, compType zero, bool useIdx) + : acc(zero), + withIdx(useIdx), + idx(0), + nanOpt(n), + opNoIdx(reduce::ReduceOpFn(rop)), + opWithIdx(reduce::ReduceOpFn2(rop)) + { + } + + inline void step(compType v, int flat_i) + { + if(withIdx) + reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, v, idx, flat_i); + else + reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, v); + } + + inline void combine(const Reducer& other) + { + if(withIdx) + reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, other.acc, idx, other.idx); + else + reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, other.acc); + } +}; + +template +std::tuple, tensor> reduce_cpu_common(const miopenReduceTensorOp_t& reduceOp, + const miopenNanPropagation_t& nanOpt, + const std::vector& inLengths, + const std::vector& outLengths, + const std::vector& input, + const std::vector& inStrides, + const std::vector& output, + const std::vector& outStrides, + float alpha, + float beta, + bool parallel, + bool withIdx) +{ + using reduce::convert_type; + using reduce::ReduceOpZeroVal; + + // Partition dims + std::vector invariantDims, toReduceDims; + std::vector invLens, redLens, invStrides_v, redStrides_v; + + for(int i = 0; i < static_cast(inLengths.size()); ++i) + { + if(inLengths[i] == outLengths[i]) + { + invariantDims.push_back(i); + invLens.push_back(inLengths[i]); + invStrides_v.push_back(inStrides[i]); + } + else + { + toReduceDims.push_back(i); + redLens.push_back(inLengths[i]); + redStrides_v.push_back(inStrides[i]); + } + } + + const bool reduceAllDims = invariantDims.empty(); + + // unary ops & zero vals + const compType zeroV = ReduceOpZeroVal(reduceOp); + + // divider = Π reduced dims (or N if reduce-all) + std::size_t divider = 1; + if(reduceAllDims) + divider = std::accumulate( + inLengths.begin(), 
inLengths.end(), std::size_t{1}, std::multiplies<>()); + else + divider = + std::accumulate(redLens.begin(), redLens.end(), std::size_t{1}, std::multiplies<>()); + + auto PreUnaryOp = reduce::PreUnaryOpFn(reduceOp, divider); + auto PosUnaryOp = reduce::PosUnaryOpFn(reduceOp, divider); + + // outputs + auto res = tensor{outLengths}; + res.data = output; + auto res_indices = tensor{outLengths}; + if(withIdx) + std::fill(res_indices.begin(), res_indices.end(), 0); + + if(reduceAllDims) + { + // Flatten whole tensor + const std::size_t N = divider; // product of all dims + std::vector lens_radix; + reduce::build_radix(inLengths, lens_radix); + + // parallel chunking + std::size_t hw = + std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); + const std::size_t P = std::min(N, hw * 4ul); + const std::size_t chunk = (N + P - 1) / P; + + std::vector> partial; + partial.reserve(P); + for(std::size_t p = 0; p < P; ++p) + partial.emplace_back(nanOpt, reduceOp, zeroV, withIdx); + + auto worker = [&](int p) { + const std::size_t begin = std::size_t(p) * chunk; + const std::size_t end = std::min(begin + chunk, N); + + auto& r = partial[p]; + for(std::size_t i = begin; i < end; ++i) + { + const auto off = + reduce::linear_to_offset_by_lens_strides(i, inLengths, lens_radix, inStrides); + auto v = convert_type(input[off]); + PreUnaryOp(v); + r.step(v, static_cast(i)); // flat index across whole tensor + } + }; + + if(parallel) + { + miopen::par_for(static_cast(P), worker); + } + else + { + for(int p = 0; p < P; ++p) + { + worker(p); + } + } + + // combine + Reducer R(nanOpt, reduceOp, zeroV, withIdx); + for(std::size_t p = 0; p < P; ++p) + R.combine(partial[p]); + + // post + PosUnaryOp(R.acc); + if(alpha != 1.0f) + R.acc *= convert_type(alpha); + if(beta != 0.0f) + R.acc += convert_type(output[0]) * convert_type(beta); + + res.data[0] = convert_type(R.acc); + if(withIdx) + res_indices.data[0] = R.idx; + } + else + { + // Build radices for invariant and 
reduced subspaces + std::vector invRad, redRad; + reduce::build_radix(invLens, invRad); + reduce::build_radix(redLens, redRad); + + const std::size_t INV = + std::accumulate(invLens.begin(), invLens.end(), std::size_t{1}, std::multiplies<>()); + const std::size_t TR = divider; + + std::size_t hw = + std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); + const std::size_t Te = std::min(hw * 4ul, std::max(1, INV)); + const std::size_t chunk = (INV + Te - 1) / Te; + + auto worker = [&](int t) { + const std::size_t row0 = std::size_t(t) * chunk; + const std::size_t row1 = std::min(row0 + chunk, INV); + + for(std::size_t r = row0; r < row1; ++r) + { + // decode invariant multi-index; compute base offsets + std::size_t tmp = r; + std::size_t base_in_off = 0; + std::size_t base_out_off = 0; + for(std::size_t k = 0; k < invLens.size(); ++k) + { + const std::size_t idx = (tmp / invRad[k]) % invLens[k]; + base_in_off += idx * invStrides_v[k]; + base_out_off += idx * outStrides[invariantDims[k]]; + } + + Reducer R(nanOpt, reduceOp, zeroV, withIdx); + + // iterate reduced subspace + for(std::size_t i = 0; i < TR; ++i) + { + std::size_t tmp2 = i; + std::size_t red_off = 0; + for(std::size_t k = 0; k < redLens.size(); ++k) + { + const std::size_t idx = (tmp2 / redRad[k]) % redLens[k]; + red_off += idx * redStrides_v[k]; + } + + auto v = convert_type(input[base_in_off + red_off]); + PreUnaryOp(v); + R.step(v, static_cast(i)); // flat index inside reduced subspace + } + + PosUnaryOp(R.acc); + if(alpha != 1.0f) + R.acc *= convert_type(alpha); + if(beta != 0.0f) + R.acc += + convert_type(output[base_out_off]) * convert_type(beta); + + res.data[base_out_off] = convert_type(R.acc); + if(withIdx) + res_indices.data[base_out_off] = R.idx; + } + }; + + if(parallel) + { + miopen::par_for(static_cast(Te), worker); + } + else + { + for(int te = 0; te < Te; ++te) + { + worker(te); + } + } + } + + return {res, res_indices}; +} + +template +std::tuple, tensor> 
+reduce_cpu_common(const miopen::ReduceTensorDescriptor& reduceDesc, + const tensor& input, + const tensor& output, + float alpha, + float beta, + bool parallel, + bool withIdx) +{ + auto inLengths = input.desc.GetLengths(); + auto outLengths = output.desc.GetLengths(); + auto inStrides = input.desc.GetStrides(); + auto outStrides = output.desc.GetStrides(); + + const auto reduceOp = reduceDesc.reduceTensorOp_; + const auto nanOpt = reduceDesc.reduceTensorNanOpt_; + + return reduce_cpu_common(reduceOp, + nanOpt, + inLengths, + outLengths, + input.data, + inStrides, + output.data, + outStrides, + alpha, + beta, + parallel, + withIdx); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp b/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp new file mode 100644 index 000000000000..2d1d33cc898a --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/fusionHost.hpp @@ -0,0 +1,993 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +void convHostForward(const tensor& input, + tensor& output, + const tensor& weights, + const int bias_mode, + const tensor& bias, + const miopenConvolutionDescriptor_t convDesc) +{ + + int in_n, in_c, in_h, in_w; + int in_nstride, in_cstride, in_hstride, in_wstride; + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(input.desc.GetLengths()); + std::tie(in_nstride, in_cstride, in_hstride, in_wstride) = + miopen::tien<4>(input.desc.GetStrides()); + + int wei_n, wei_c, wei_h, wei_w; + int wei_nstride, wei_cstride, wei_hstride, wei_wstride; + std::tie(wei_n, wei_c, wei_h, wei_w) = miopen::tien<4>(weights.desc.GetLengths()); + std::tie(wei_nstride, wei_cstride, wei_hstride, wei_wstride) = + miopen::tien<4>(weights.desc.GetStrides()); + + int out_n, out_c, out_h, out_w; + int out_nstride, out_cstride, out_hstride, out_wstride; + std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(output.desc.GetLengths()); + std::tie(out_nstride, out_cstride, out_hstride, out_wstride) = + miopen::tien<4>(output.desc.GetStrides()); + + int stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w; + miopenConvolutionMode_t mode; + miopenPaddingMode_t pmode = miopen::deref(convDesc).paddingMode; + miopenGetConvolutionDescriptor( + convDesc, &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilation_h, &dilation_w); + + if(pmode == miopenPaddingSame) + { + pad_h = (in_h % stride_h == 0) ? 
(std::max((wei_h - stride_h), 0)) + : (std::max((wei_h - (in_h % stride_h)), 0)); + pad_w = (in_w % stride_w == 0) ? (std::max((wei_w - stride_w), 0)) + : (std::max((wei_w - (in_w % stride_w)), 0)); + pad_h /= 2; + pad_w /= 2; + } + else if(pmode == miopenPaddingValid) + { + pad_h = 0; + pad_w = 0; + } + + if(out_h <= 0 || out_w <= 0) + MIOPEN_THROW("Invalid Test Case: Check Output Dimension."); + + for(int o = 0; o < out_n; o++) + { // mini-batch size + for(int w = 0; w < out_c; w++) + { // out_channels (num filters) + for(int i = 0; i < out_h; i++) + { // output_height (from getforwardoutputdim()) + int in_off_h = i * stride_h; + for(int j = 0; j < out_w; j++) + { // output_width (from getforwardoutputdim()) + /*auto acc = static_cast(0.);*/ + auto acc = static_cast(0.); + int in_off_w = j * stride_w; + for(int k = 0; k < in_c; k++) + { // in_channels (RGB) + for(int x = 0; x < wei_h; x++) + { + int in_x = in_off_h - pad_h + x * dilation_h; + if(in_x >= 0 && in_x < in_h) + { + for(int y = 0; y < wei_w; y++) + { + int in_y = in_off_w - pad_w + y * dilation_w; + if(in_y >= 0 && in_y < in_w) + { + acc += double( + static_cast(input[o * in_nstride + k * in_cstride + + in_x * in_w + in_y]) * + static_cast(weights(w, k, x, y))); + } + } + } + } + } + acc = bias_mode != 0 ? 
acc + static_cast(bias[w]) : acc; + output[o * out_nstride + w * out_cstride + i * out_hstride + j] = + static_cast(acc); + } + } + } + } +} + +template +void batchNormSpatialHostInference(const tensor& input, + tensor& output, + const tensor& scale, + const tensor& bias, + double epsilon, + const tensor& estimatedMean, + const tensor& estimatedVariance, + bool useInverseVariance = false) +{ + + int n_batches, channels, height, width; + std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + miopen::par_for(channels, 1, [&](int cidx) { // via channel + V mean = estimatedMean(0, cidx, 0, 0); + V variance = estimatedVariance(0, cidx, 0, 0); + double invertVar = + useInverseVariance ? static_cast(variance) : 1.0 / sqrt(variance + epsilon); + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batches; bidx++) + { // via mini_batch + double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; + double inhat = elemStd * invertVar; + output(bidx, cidx, row, column) = + static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + } + } + } + }); +} + +template +void batchNormPerActivHostInference(const tensor& input, + tensor& output, + const tensor& scale, + const tensor& bias, + double epsilon, + const tensor& estimatedMean, + const tensor& estimatedVariance, + bool useInverseVariance = false) +{ + int n_batches, channels, height, width; + std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + miopen::par_for(channels, 1, [&](int cidx) { // via channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // apply down the n_batch dimension + double mean = estimatedMean(0, cidx, row, column); + double 
variance = estimatedVariance(0, cidx, row, column); + double elemInvVar = useInverseVariance ? variance : 1.0 / sqrt(variance + epsilon); + for(int bidx = 0; bidx < n_batches; bidx++) + { // via mini_batch + // per (x-dims) channel load a block of data into LDS + double elemStd = input(bidx, cidx, row, column) - mean; + double inhat = elemStd * elemInvVar; + output(bidx, cidx, row, column) = + scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column); + // printf("output: %f\n",output(bidx, cidx, row, column)); + } + } + } + }); +} + +template +void batchNormSpatialHostFwdTrain(const tensor& input, + tensor& out, + const tensor& scale, + const tensor& bias, + double epsilon, + double expAvgFactor, + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + const auto nhw = double(height * width * n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + double variance_accum = 0.; + double mean_accum = 0.; + double invVar = 0.; + double newRunMean = 0.; + double adjust = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + invVar = 1.0 / sqrt(variance_accum + epsilon); + + // #4 apply the normalization + // x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; 
row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #5 Gamma and Beta adjust + // y_i = gamma*x_hat + beta + elemStd = (static_cast(input(bidx, cidx, row, column)) - + mean_accum); // (x_i - mean) + out(bidx, cidx, row, column) = static_cast( + scale(0, cidx, 0, 0) * (invVar * elemStd) + bias(0, cidx, 0, 0)); + } // for (column) + } // for (row) + } // end for(n_batchs) + if(!saveMean.data.empty()) + { + saveMean(0, cidx, 0, 0) = mean_accum; + saveInvVar(0, cidx, 0, 0) = invVar; + } + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); + runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + adjust = (n_batch * height * width == 1) ? variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, cidx, 0, 0) = + (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + } + }); +} + +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + tensor& dy_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bnBias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar, + miopenActivationMode_t activ_mode, + double activ_beta, + double activ_alpha) +{ + double activ_gamma = 0.; + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + auto nhw = double(height * width * n_batch); + int in_cstride = height * width; + + if(activ_mode > 0) + { + tensor input_norm = + tensor{x_input.desc.GetLayout_t(), x_input.desc.GetLengths()}; + miopen::par_for(channels, 1, [&](int cidx) { + double mean = 0.0; + double invVar = 0.0; + double elemStd = 0.; + double mean_accum = 0.0; + double variance_accum = 0.0; + if(!savedMean.data.empty()) + { + mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements + invVar = static_cast(savedInvVar(0, cidx, 0, 0)); 
// HxW elements + } + else + { + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } + } + } + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + mean = mean_accum; + invVar = 1.0 / sqrt(variance_accum); + } + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + input_norm(bidx, cidx, row, column) = static_cast( + bnScale(0, cidx, 0, 0) * (elemStd * invVar) + bnBias(0, cidx, 0, 0)); + } + } + } + }); + + activationHostBnormBwd(activ_mode, + activ_gamma, + activ_beta, + activ_alpha, + dy_input.data, + input_norm.data, + dy_input.data); + } + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.0; + double invVar = 0.0; + double dyelem = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); + // process the batch per channel + dscale(0, cidx, 0, 0) = 0.; + dbias(0, cidx, 0, 0) = 0.; + + if(!savedMean.data.empty()) + { + + mean = savedMean(0, cidx, 0, 0); // HxW elements + invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + double inv_Var = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); 
+ mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + inv_Var = 1.0 / sqrt(variance_accum); + + mean = mean_accum; + invVar = inv_Var; + } + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * invVar; + dyelem = static_cast(dy_input(bidx, cidx, row, column)); + dbias(0, cidx, 0, 0) += dyelem; + dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; + } // end for(n_batch) + } // for (column) + } // for (row) + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + + double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = + static_cast(tmp3 * (tmp2 + tmp1)); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); // for (channel) +} + +template +void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, 
width) = miopen::tien<4>(x_input.desc.GetLengths()); + auto nhw = double(height * width * n_batch); + int in_cstride = height * width; + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements + double invVar = static_cast(savedInvVar(0, cidx, 0, 0)); // HxW elements + double dyelem = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); + // process the batch per channel + dscale(0, cidx, 0, 0) = 0.; + dbias(0, cidx, 0, 0) = 0.; + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + + // recompute forward batch norm + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * invVar; + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + dbias(0, cidx, 0, 0) += dyelem; + dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; + } // end for(n_batch) + } // for (column) + } // for (row) + + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); + 
double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); // for (channel) +} + +template +void batchNormPerActHostFwdTrain(const tensor& input, + tensor& out, + const tensor& scale, + const tensor& bias, + double epsilon, + double expAvgFactor, + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + const auto n = double(n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double mean_accum = 0.; + double variance_accum = 0.; + double elemStd = 0.; + double elemInvVar = 0.; + double inhat = 0.; + double newRunMean = 0.; + double adjust = 0.; + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + + mean_accum = 0.; + variance_accum = 0.; + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + // #1 calculate the mean :: iterating through the stack of images in the + // mini_batch + auto intval = static_cast(input(bidx, cidx, row, column)); + mean_accum += intval; + variance_accum += intval * intval; + } + mean_accum /= n; + variance_accum /= n; + variance_accum = variance_accum - (mean_accum * mean_accum); + elemInvVar = 1.0 / double(sqrt(variance_accum + epsilon)); + + // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum - + // epsilon) + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) + inhat = elemStd * elemInvVar; + // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta + out(bidx, cidx, row, column) = static_cast( + 
scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); + } // end for(n_batch) + + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); + runMean(0, cidx, row, column) = + mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + } + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + if(!runVar.data.empty()) + { + adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; + runVar(0, cidx, row, column) = + (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; + } + if(!saveMean.data.empty() || !saveInvVar.data.empty()) + { + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + } + + } // for (column) + } // for (row) + }); +} + +template +void batchNormPerActHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + int in_cstride = height * width; + auto n = double(n_batch); + + miopen::par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double elemInvVar = 0.; + double dyelem = 0.; + double dxhat = 0.; + double dxhathat = 0.; + double tmp1 = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride); + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + dxhat = 0.; + dxhathat = 0.; + + if(!savedMean.data.empty()) + { + mean = savedMean(0, cidx, row, column); // HxW elements + elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + + // process the batch per channel + for(int bidx = 
0; bidx < n_batch; bidx++) + { // via mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (n) + + mean_accum /= n; + variance_accum /= n; + variance_accum += (-mean_accum * mean_accum); + + mean = mean_accum; + elemInvVar = 1.0 / sqrt(variance_accum); + } + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * elemInvVar; + dyelem = static_cast(dy_input(bidx, cidx, row, column)); + dbias(0, cidx, row, column) += dyelem; + dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; + tmp1 = scale(0, cidx, row, column) * dyelem; + dxhat += tmp1; + dxhathat += tmp1 * xhat[xhat_index]; + + } // end for(n_batchs) + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + tmp1 = xhat[xhat_index] * dxhathat + dxhat; + double tmp2 = + n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - + tmp1; + double tmp3 = elemInvVar / (double(n)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); +} + +template +void batchNormActivPerActHostBwdTrain(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& scale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) +{ + + int height, width, n_batch, channels; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + int in_cstride = height * width; + auto n = double(n_batch); + + miopen::par_for(channels, 1, 
[&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double elemInvVar = 0.; + double dyelem = 0.; + double dxhat = 0.; + double dxhathat = 0.; + double tmp1 = 0.; + std::vector xhat(static_cast(n_batch) * in_cstride); + + // process the batch per channel + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + dxhat = 0.; + dxhathat = 0.; + + mean = savedMean(0, cidx, row, column); // HxW elements + elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + // per (x-dims) channel load a block of data into LDS + elemStd = static_cast(x_input(bidx, cidx, row, column)) - + mean; // (x_i - mean) + xhat[xhat_index] = elemStd * elemInvVar; + double bnrefowd = + scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + /*dyelem = static_cast(dy_input(bidx, cidx, row, column));*/ + dbias(0, cidx, row, column) += dyelem; + dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; + tmp1 = scale(0, cidx, row, column) * dyelem; + dxhat += tmp1; + dxhathat += tmp1 * xhat[xhat_index]; + + } // end for(n_batchs) + + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + xhat_index = in_cstride * bidx + (width * row + column); + tmp1 = xhat[xhat_index] * dxhathat + dxhat; + double bnrefowd = + scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); + activationHostBwdElement(activMode, + gamma, + beta, + alpha, + dy_input(bidx, cidx, row, column), + bnrefowd, + y_input(bidx, cidx, row, column), + dyelem); + double tmp2 = (n_batch * scale(0, cidx, row, column) * dyelem) - tmp1; + double tmp3 = elemInvVar / (double(n)); + dx_out(bidx, 
cidx, row, column) = static_cast(tmp3 * tmp2); + } // end for(n_batchs) + } // for (column) + } // for (row) + }); +} + +template +void visitActivationHostInfer( + miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) +{ + switch(activMode) + { + case miopenActivationPASTHRU: // x + f([=](double x) { return x; }); + break; + case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid + f([=](double x) { return (1. / (1. + std::exp(-x))); }); + break; + case miopenActivationTANH: // beta * tanh(alpha * x) + f([=](double x) { return (beta * std::tanh(alpha * x)); }); + break; + case miopenActivationRELU: // max(0, x) + f([=](double x) { return ((x > 0.) ? x : 0.); }); + break; + case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood + f([=](double x) { + return (x > 0.) ? (x + std::log1p(std::exp(-x))) : (std::log1p(std::exp(x))); + }); + break; + case miopenActivationABS: // abs(x) + f([=](double x) { return (std::fabs(x)); }); + break; + case miopenActivationPOWER: // (alpha + beta * x) ^ gamma + f([=](double x) { + auto v = (alpha + beta * x); + return (v <= std::numeric_limits::epsilon()) ? 0. : pow(v, gamma); + }); + break; + case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) + f([=](double x) { return (std::min(alpha, std::max(double(0.), x))); }); + break; + case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 + f([=](double x) { return ((x > 0.) ? x : x * alpha); }); + break; + case miopenActivationELU: // alpha * (exp(x)-1) | x<=0; x | x>0 + f([=](double x) { return ((x > 0.) ? 
x : alpha * std::expm1(x)); }); + break; + case miopenActivationCLAMP: // max(alpha, min(beta, x)) + f([=](double x) { return (std::max(alpha, std::min(beta, x))); }); + break; + // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; + } +} + +template +inline void activationHostInfer(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector input, + std::vector& output) +{ + visitActivationHostInfer(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(input.size(), 1, [&](int index) { + output[index] = static_cast(f(static_cast(input[index]))); + }); + }); +} + +template +void visitActivationHostBwd( + miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) +{ + switch(activMode) + { + case miopenActivationPASTHRU: // x + f([=](double dy, double, double) { return dy; }); + break; + case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid + f([=](double dy, double, double y) { return dy * y * (1 - y); }); + break; + case miopenActivationTANH: // beta * tanh(alpha * x) + f([=](double dy, double, double y) { return dy * alpha * (beta - y * y / beta); }); + break; + case miopenActivationRELU: // max(0, x) + f([=](double dy, double x, double) { return (x > 0) ? dy : 0; }); + break; + case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood + f([=](double dy, double x, double) { + static const double threshold = 50.; + double expval = std::exp(std::min(x, threshold)); + return dy * expval / (expval + 1.0); + }); + break; + case miopenActivationABS: // abs(x) + f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : -1); }); + break; + case miopenActivationPOWER: // (alpha + beta * x) ^ gamma + f([=](double, double x, double y) { + auto v = alpha + beta * x; + return v <= std::numeric_limits::epsilon() ? 
0 : gamma * beta * y / v; + }); + break; + case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) + f([=](double dy, double x, double) { return (x > 0 && x <= alpha) ? dy : 0; }); + break; + case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 + f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : alpha); }); + break; + case miopenActivationELU: // alpah * (exp(x)-1) | x<=0; x | x>0 + f([=](double dy, double x, double y) { return dy * ((x > 0) ? 1 : y + alpha); }); + break; + case miopenActivationCLAMP: // max(alpha, min(beta, x)) + f([=](double dy, double x, double) { return (x > alpha && x <= beta) ? dy : 0; }); + break; + // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; + } +} + +template +inline void activationHostBnormBwd(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector dyinput, + const std::vector xinput, + std::vector& output) +{ + double dummy; + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(dyinput.size(), 1, [&](int index) { + output[index] = static_cast( + f(static_cast(dyinput[index]), static_cast(xinput[index]), dummy)); + }); + }); +} + +template +inline void activationHostBwd(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const std::vector dyinput, + const std::vector xinput, + const std::vector yinput, + std::vector& output) +{ + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + miopen::par_for(dyinput.size(), 1, [&](int index) { + output[index] = static_cast(f(static_cast(dyinput[index]), + static_cast(xinput[index]), + static_cast(yinput[index]))); + }); + }); +} + +inline void activationHostBwdElement(miopenActivationMode_t activMode, + double gamma, + double beta, + double alpha, + const double dyinput, + const double xinput, + const double yinput, + double& output) +{ + visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { + output = 
static_cast(f(dyinput, xinput, yinput)); + }); +} + +template +tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, + const tensor& input, + const tensor& weights) +{ + return tensor{filter.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{})}; +} diff --git a/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp b/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp new file mode 100644 index 000000000000..81c38db0fdf3 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/gemm.hpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_GEMM_HPP +#define GUARD_GEMM_HPP + +#include +#include +#include + +/* + A and B rows and cols should be passed as default values (NxM, MxK), independently of + a_transponse/b_transpose flag value + C rows and cols should have correct values based on a_transponse/b_transpose values + A, B, C strides should have corret values based on a_transponse/b_transpose values +*/ +template +void gemm_cpu(const Dtype* a_ptr, + const size_t a_cols, + const size_t a_rows, + const size_t a_stride, + const bool a_transpose, + const Dtype* b_ptr, + const size_t b_cols, + const size_t b_rows, + const size_t b_stride, + const bool b_transpose, + Dtype* c_ptr, + const size_t c_cols, + const size_t c_rows, + const size_t c_stride, + double alpha = 1.0, + double beta = 1.0) +{ + if((!a_transpose && !b_transpose && + ((a_cols != b_rows) || (a_rows != c_rows) || (b_cols != c_cols))) || + (a_transpose && b_transpose && + ((a_rows != b_cols) || (a_cols != c_rows) || (b_rows != c_cols))) || + (a_transpose && !b_transpose && + ((a_rows != b_rows) || (a_cols != c_rows) || (b_cols != c_cols))) || + (!a_transpose && b_transpose && + ((a_cols != b_cols) || (a_rows != c_rows) || (b_rows != c_cols)))) + { + MIOPEN_THROW("MM_CPU_ERROR. Incompatible matrix size:\nA: " + std::to_string(a_rows) + "x" + + std::to_string(a_cols) + " transpose: " + (a_transpose ? "true" : "false") + + "\nB: " + std::to_string(b_rows) + "x" + std::to_string(b_cols) + + " transpose: " + (b_transpose ? "true" : "false") + + "\nC: " + std::to_string(c_rows) + "x" + std::to_string(c_cols) + "\n"); + } + + size_t inner_loop_limit = a_transpose ? 
a_rows : a_cols; + auto inner_loop = [&](int m, int n) { + double el = 0.0; + if(!a_transpose && !b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[m * a_stride + k]) * + static_cast(b_ptr[k * b_stride + n]); + }); + } + else if(!a_transpose && b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[m * a_stride + k]) * + static_cast(b_ptr[n * b_stride + k]); + }); + } + else if(a_transpose && !b_transpose) + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[k * a_stride + m]) * + static_cast(b_ptr[k * b_stride + n]); + }); + } + else + { + miopen::ford(inner_loop_limit)([&](int k) { + el += static_cast(a_ptr[k * a_stride + m]) * + static_cast(b_ptr[n * b_stride + k]); + }); + } + + c_ptr[m * c_stride + n] = + static_cast(beta * static_cast(c_ptr[m * c_stride + n]) + alpha * el); + }; + + constexpr size_t iter_margin = 1'048'576; // 2^20 + if(c_rows * c_cols * inner_loop_limit > iter_margin) + { + miopen::par_ford(c_rows, c_cols)(inner_loop); + } + else + { + miopen::ford(c_rows, c_cols)(inner_loop); + } +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp b/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp new file mode 100644 index 000000000000..ee1f52b3090d --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/gpu_mem.hpp @@ -0,0 +1,12 @@ +// Forwarding header — GPUMem is defined in driver/driver.hpp. +// This allows test code to include GPUMem without directly depending +// on the driver/ directory. The GPUMem class should eventually be +// extracted into a standalone header here. +#ifndef GUARD_MIOPEN_UTILS_GPU_MEM_HPP +#define GUARD_MIOPEN_UTILS_GPU_MEM_HPP + +// Phase 1: Forward to driver.hpp which defines GPUMem. +// Phase 2: Extract GPUMem into this file directly. 
+#include "../../driver/driver.hpp" + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp b/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp new file mode 100644 index 000000000000..fd0a1768e6a6 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/mloSoftmaxHost.hpp @@ -0,0 +1,350 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#ifndef MLO_SOFTMAXHOST_H_ +#define MLO_SOFTMAXHOST_H_ + +#include +#include + +//////////////////////////////////////////////////////////// +// +/////////////////////////////////////////////////////////// + +#define NEGATIVE_INF_FP32 (-1e20) +#define NEGATIVE_INF_FP16 (-1e5) + +template +T logaddexp(T x, T y, T neg_inf) +{ + T a = std::max(x, y); + T b = std::min(x, y); + T c = b - a; + + return c <= neg_inf ? std::max(a, neg_inf) : std::max(T(a + log(T(1) + exp(b - a))), neg_inf); +} + +template +int mloSoftmaxForwardRunHost(miopenTensorDescriptor_t inputTensor, + miopenTensorDescriptor_t outputTensor, + Tgpu* in, + Tcheck* outhost, + float alpha, + float beta, + miopenSoftmaxAlgorithm_t algo, + miopenSoftmaxMode_t mode) +{ + int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; + int out_nstr, out_cstr, out_hstr, out_wstr; + miopenGet4dTensorDescriptorLengths(inputTensor, &n, &c, &h, &w); + miopenGet4dTensorDescriptorStrides(inputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); + miopenGet4dTensorDescriptorStrides(outputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); + + Tcheck max_val = (sizeof(Tgpu) == 4) ? 3.402823466e+38f : 65504.; + std::vector channel_max((mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? 
n : n * h * w), + static_cast(-max_val)); + std::vector results(n * c * h * w, static_cast(0.0)); + + int ret = 0; + + if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) + { + for(int i = 0; i < n; i++) + { + if(algo == MIOPEN_SOFTMAX_FAST) + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); + } + } + else + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + channel_max[i] = std::max( + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), + channel_max[i]); + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - + channel_max[i]; + } + } + + if(algo == MIOPEN_SOFTMAX_LOG) + { + Tcheck neg_inf = static_cast( + miopen::deref(inputTensor).GetType() == miopenHalf ? 
NEGATIVE_INF_FP16 + : NEGATIVE_INF_FP32); + channel_max[i] = neg_inf; + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + channel_max[i] = logaddexp(results[(i * c + j) * h * w + s0 * w + s1], + channel_max[i], + neg_inf); + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * + (results[(i * c + j) * h * w + s0 * w + s1] - channel_max[i]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + else + { + channel_max[i] = 0.0; + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + exp(results[(i * c + j) * h * w + s0 * w + s1]); + channel_max[i] += results[(i * c + j) * h * w + s0 * w + s1]; + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * + (results[(i * c + j) * h * w + s0 * w + s1] / channel_max[i]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + } + } + else + { + for(int i = 0; i < n; i++) + { + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_FAST) + { + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]); + } + } + else + { + for(int j = 0; j < c; j++) + { + channel_max[i * h * w + s0 * w + s1] = std::max( + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]), + channel_max[i * h * w + s0 * w + s1]); + } + + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast( + in[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]) - + channel_max[i * h * w + s0 * w + s1]; + } + } + 
+ if(algo == MIOPEN_SOFTMAX_LOG) + { + Tcheck neg_inf = static_cast( + miopen::deref(inputTensor).GetType() == miopenHalf ? NEGATIVE_INF_FP16 + : NEGATIVE_INF_FP32); + channel_max[i * h * w + s0 * w + s1] = results[i * c * h * w + s0 * w + s1]; + for(int j = 1; j < c; j++) + { + channel_max[i * h * w + s0 * w + s1] = + logaddexp(results[(i * c + j) * h * w + s0 * w + s1], + channel_max[i * h * w + s0 * w + s1], + neg_inf); + } + + for(int j = 0; j < c; j++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * (results[(i * c + j) * h * w + s0 * w + s1] - + channel_max[i * h * w + s0 * w + s1]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + else + { + channel_max[i * h * w + s0 * w + s1] = 0.0; + for(int j = 0; j < c; j++) + { + results[(i * c + j) * h * w + s0 * w + s1] = + exp(results[(i * c + j) * h * w + s0 * w + s1]); + channel_max[i * h * w + s0 * w + s1] += + results[(i * c + j) * h * w + s0 * w + s1]; + } + + for(int j = 0; j < c; j++) + { + outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr] = + alpha * (results[(i * c + j) * h * w + s0 * w + s1] / + channel_max[i * h * w + s0 * w + s1]) + + beta * outhost[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]; + } + } + } + } + } + + return ret; +} + +template +int mloSoftmaxBackwardRunHost(miopenTensorDescriptor_t dInputTensor, + miopenTensorDescriptor_t dOutputTensor, + Tgpu* out, + Tgpu* dout, + Tcheck* dinhost, + float alpha, + float beta, + miopenSoftmaxAlgorithm_t algo, + miopenSoftmaxMode_t mode) +{ + int n, c, h, w, in_nstr, in_cstr, in_hstr, in_wstr; + int out_nstr, out_cstr, out_hstr, out_wstr; + miopenGet4dTensorDescriptorLengths(dOutputTensor, &n, &c, &h, &w); + miopenGet4dTensorDescriptorStrides(dInputTensor, &in_nstr, &in_cstr, &in_hstr, &in_wstr); + miopenGet4dTensorDescriptorStrides(dOutputTensor, &out_nstr, &out_cstr, &out_hstr, &out_wstr); + + std::vector channel_dot((mode == 
MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w), + static_cast(0.0)); + std::vector results(n * c * h * w, static_cast(0.0)); + + int ret = 0; + + for(int i = 0; i < n; i++) + { + if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) + { + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + channel_dot[i] += static_cast( + dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + else + { + channel_dot[i] += + static_cast(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) * + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + } + + for(int j = 0; j < c; j++) + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i] * std::exp(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + else + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i]; + + results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( + out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = + alpha * results[(i * c + j) * h * w + s0 * w + s1] + + beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; + } + } + else + { + for(int s0 = 0; s0 < h; s0++) + for(int s1 = 0; s1 < w; s1++) + { + for(int j = 0; j < c; j++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + channel_dot[i * h * w + s0 * w + s1] += static_cast( + dout[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + else + { + channel_dot[i * h * w + s0 * w + s1] += + static_cast(out[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) * + static_cast(dout[i * out_nstr + j * 
out_cstr + + s0 * out_hstr + s1 * out_wstr]); + } + } + + for(int j = 0; j < c; j++) + { + if(algo == MIOPEN_SOFTMAX_LOG) + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i * h * w + s0 * w + s1] * + std::exp(out[i * out_nstr + j * out_cstr + s0 * out_hstr + + s1 * out_wstr]); + } + else + { + results[(i * c + j) * h * w + s0 * w + s1] = + static_cast(dout[i * out_nstr + j * out_cstr + + s0 * out_hstr + s1 * out_wstr]) - + channel_dot[i * h * w + s0 * w + s1]; + + results[(i * c + j) * h * w + s0 * w + s1] *= static_cast( + out[i * out_nstr + j * out_cstr + s0 * out_hstr + s1 * out_wstr]); + } + dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr] = + alpha * results[(i * c + j) * h * w + s0 * w + s1] + + beta * dinhost[i * in_nstr + j * in_cstr + s0 * in_hstr + s1 * in_wstr]; + } + } + } + } + + return ret; +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp b/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp new file mode 100644 index 000000000000..987d4dda9929 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/network_data.hpp @@ -0,0 +1,438 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. 
/* (license continued)
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/

#ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP
#define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP

// NOTE(review): the original #include targets were lost during extraction;
// the four headers below are reconstructed from the identifiers used in this
// file -- confirm against the upstream header.
#include <cstdint>
#include <set>
#include <type_traits>
#include <vector>

#ifndef MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR
#define MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR 0
#endif

// Scales a canonical batch size x by a divisor y.
// y == 0 (the default factor) or y > x collapses the batch to 1, so the
// default configuration runs every network shape with batch size 1.
template <class T>
inline constexpr T pick_batch_size(T x, T y)
{
    return (y == 0 || y > x) ? 1 : x / y;
}

// Set to 0 to drop the very large configurations and reduce test execution
// time; 1 keeps them enabled.
#define MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS 1

// Canonical NCHW input shapes (classic CNN layers) used across the test suite.
// Returned as a std::set, so shapes that collapse to the same value after
// batch scaling are automatically de-duplicated.
template <class T>
inline std::set<std::vector<T>> get_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 1,    14,   14  },
        { pick_batch_size(100, n), 1,    8,    8   },
        { pick_batch_size(256, n), 1,    27,   27  },
#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS
        { pick_batch_size(64,  n), 19,   1024, 2048},
#endif
        { pick_batch_size(100, n), 3,    32,   32  },
        { pick_batch_size(100, n), 32,   16,   16  },
        { pick_batch_size(100, n), 32,   8,    8   },
        { pick_batch_size(128, n), 256,  12,   12  },
        { pick_batch_size(128, n), 3,    231,  231 },
        { pick_batch_size(128, n), 512,  12,   12  },
        { pick_batch_size(256, n), 256,  13,   13  },
        { pick_batch_size(256, n), 3,    227,  227 },
        { pick_batch_size(256, n), 384,  13,   13  },
        { pick_batch_size(256, n), 96,   27,   27  },
        { pick_batch_size(32,  n), 128,  28,   28  },
        { pick_batch_size(32,  n), 144,  14,   14  },
        { pick_batch_size(32,  n), 192,  28,   28  },
        { pick_batch_size(32,  n), 192,  7,    7   },
        { pick_batch_size(32,  n), 256,  28,   28  },
        { pick_batch_size(32,  n), 3,    224,  224 },
        { pick_batch_size(32,  n), 32,   28,   28  },
        { pick_batch_size(32,  n), 48,   7,    7   },
        { pick_batch_size(32,  n), 480,  128,  256 },
        { pick_batch_size(32,  n), 480,  64,   128 },
        { pick_batch_size(32,  n), 512,  4,    4   },
        { pick_batch_size(32,  n), 512,  64,   128 },
        { pick_batch_size(16,  n), 64,   56,   56  },
        { pick_batch_size(32,  n), 832,  7,    7   },
        { pick_batch_size(64,  n), 128,  56,   56  },
        { pick_batch_size(64,  n), 256,  28,   28  },
        { pick_batch_size(64,  n), 3,    224,  224 },
        { pick_batch_size(64,  n), 512,  28,   28  },
        { pick_batch_size(64,  n), 64,   112,  112 },
        { pick_batch_size(32,  n), 64,   14,   14  },
        { pick_batch_size(32,  n), 192,  14,   14  },
        { pick_batch_size(32,  n), 320,  28,   28  },
        { pick_batch_size(32,  n), 576,  14,   14  },
        { pick_batch_size(32,  n), 576,  4,    4   },
        { pick_batch_size(32,  n), 1056, 7,    7   },
        { pick_batch_size(32,  n), 2048, 11,   11  },
#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS
        { pick_batch_size(32,  n), 16,   2048, 2048 },
        { pick_batch_size(32,  n), 16,   3072, 3072 },
        { pick_batch_size(32,  n), 16,   4096, 4096 },
#endif
        { 1, 1, 1, 1 }
    };
    // clang-format on
}

// Canonical 4-D convolution filter shapes (K, C, Y, X).
template <class T>
inline std::set<std::vector<T>> get_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(1024, n), 1024, 3,  3  },
        { pick_batch_size(1024, n), 512,  3,  3  },
        { pick_batch_size(128,  n), 256,  1,  1  },
        { pick_batch_size(128,  n), 528,  1,  1  },
        { pick_batch_size(128,  n), 96,   3,  3  },
        { pick_batch_size(16,   n), 192,  1,  1  },
        { pick_batch_size(224,  n), 112,  3,  3  },
        { pick_batch_size(256,  n), 96,   5,  5  },
        { pick_batch_size(288,  n), 144,  3,  3  },
        { pick_batch_size(48,   n), 832,  1,  1  },
        { pick_batch_size(512,  n), 256,  3,  3  },
        { pick_batch_size(64,   n), 1,    2,  2  },
        { pick_batch_size(64,   n), 3,    3,  3  },
        { pick_batch_size(64,   n), 3,    7,  7  },
        { pick_batch_size(64,   n), 32,   5,  5  },
        { pick_batch_size(64,   n), 480,  1,  1  },
        { pick_batch_size(64,   n), 64,   1,  1  },
        { pick_batch_size(96,   n), 3,    11, 11 },
        { pick_batch_size(192,  n), 64,   5,  5  },
        { pick_batch_size(64,   n), 64,   3,  3  },
        { pick_batch_size(224,  n), 224,  3,  3  },
        { pick_batch_size(224,  n), 192,  3,  3  },
        { pick_batch_size(128,  n), 320,  1,  1  },
        { pick_batch_size(192,  n), 576,  1,  1  },
        { pick_batch_size(128,  n), 1056, 1,  1  },
        { pick_batch_size(128,  n), 1024, 1,  1  },
        { pick_batch_size(512,  n), 2048, 1,  1  }
    };
    // clang-format on
}

// Reduced input set for immediate-mode convolution tests.
template <class T>
inline std::set<std::vector<T>> get_immed_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 1,   14,  14  },
        { pick_batch_size(256, n), 1,   27,  27  },
        { pick_batch_size(128, n), 512, 12,  12  },
        { pick_batch_size(256, n), 256, 13,  13  },
        { pick_batch_size(256, n), 3,   227, 227 },
        { pick_batch_size(32,  n), 64,  56,  56  },
        { pick_batch_size(32,  n), 96,  14,  14  },
        { pick_batch_size(32,  n), 96,  28,  28  },
        { pick_batch_size(64,  n), 128, 56,  56  },
        { pick_batch_size(64,  n), 3,   224, 224 },
        { pick_batch_size(64,  n), 256, 14,  14  },
        { 1, 1, 1, 1 }
    };
    // clang-format on
}

// Reduced filter set for immediate-mode convolution tests.
// NOTE(review): { pick_batch_size(256, n), 256, 3, 3 } appears twice in the
// original; the std::set silently de-duplicates it.
template <class T>
inline std::set<std::vector<T>> get_immed_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(208, n), 96,  3,  3  },
        { pick_batch_size(24,  n), 512, 1,  1  },
        { pick_batch_size(256, n), 128, 3,  3  },
        { pick_batch_size(256, n), 256, 3,  3  },
        { pick_batch_size(256, n), 64,  5,  5  },
        { pick_batch_size(288, n), 144, 3,  3  },
        { pick_batch_size(96,  n), 3,   11, 11 },
        { pick_batch_size(32,  n), 128, 5,  5  },
        { pick_batch_size(32,  n), 128, 1,  1  },
        { pick_batch_size(256, n), 256, 3,  3  },
        { pick_batch_size(512, n), 512, 3,  3  },
        { pick_batch_size(160, n), 128, 3,  3  },
        { pick_batch_size(32,  n), 3,   7,  7  }
    };
    // clang-format on
}

// 5-D (NCDHW) input shapes for 3-D convolution tests.
template <class T>
inline std::set<std::vector<T>>
get_3d_conv_input_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(128, n), 1,   1, 2,   2  },
        { pick_batch_size(128, n), 64,  1, 1,   1  },
        { pick_batch_size(128, n), 64,  3, 4,   4  },
        { pick_batch_size(352, n), 32,  4, 9,   9  },
        { pick_batch_size(192, n), 512, 3, 14,  14 },
        { pick_batch_size(352, n), 512, 4, 28,  28 },
        { pick_batch_size(256, n), 512, 4, 56,  56 },
        { pick_batch_size(192, n), 3,   4, 227, 227},
        { pick_batch_size(128, n), 4,   4, 161, 700}
    };
    // clang-format on
}

// 5-D filter shapes for 3-D convolution tests.
template <class T>
inline std::set<std::vector<T>>
get_3d_conv_weight_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size( 128, n), 1,   1, 1,  1 },
        { pick_batch_size( 352, n), 128, 1, 1,  1 },
        { pick_batch_size( 256, n), 128, 1, 1,  1 },
        { pick_batch_size( 352, n), 32,  3, 3,  3 },
        { pick_batch_size( 352, n), 4,   3, 3,  3 },
        { pick_batch_size( 160, n), 4,   3, 5,  5 },
        { pick_batch_size( 128, n), 64,  5, 7,  7 },
        { pick_batch_size( 192, n), 4,   3, 11, 11},
        { pick_batch_size( 128, n), 1,   3, 1,  7 },
        { pick_batch_size( 128, n), 1,   3, 7,  1 },
        { pick_batch_size( 128, n), 1,   3, 5,  20}
    };
    // clang-format on
}

// Input shapes for per-activation batch-norm tests (ResNet-152 and
// Inception-v3 derived, per the inline comments carried over from upstream).
template <class T>
inline std::set<std::vector<T>> get_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 4,    1024,2048},  //Making this much smaller
        { pick_batch_size(100, n), 3,    32,  32  },
        { pick_batch_size(100, n), 32,   8,   8   },
        { pick_batch_size(128, n), 256,  12,  12  },
        { pick_batch_size(256, n), 3,    227, 227 },
        { pick_batch_size(64,  n), 64,   112, 112 },  //Batch-norm ResNet 152 after this line
        { pick_batch_size(256, n), 1024, 14,  14  },  // n is from the paper @ 256
        { pick_batch_size(256, n), 2048, 7,   7   },
        { pick_batch_size(256, n), 256,  56,  56  },
        { pick_batch_size(256, n), 256,  14,  14  },
        { pick_batch_size(256, n), 512,  28,  28  },
        { pick_batch_size(256, n), 512,  7,   7   },
        { pick_batch_size(256, n), 64,   112, 112 },
        { pick_batch_size(256, n), 64,   56,  56  },  //Batch-norm Inception_v3 after this
        { pick_batch_size(32,  n), 1024, 1,   1   },  // n is from the paper @ 32
        { pick_batch_size(32,  n), 128,  14,  14  },
        { pick_batch_size(32,  n), 128,  28,  28  },
        { pick_batch_size(32,  n), 128,  4,   4   },
        { pick_batch_size(32,  n), 128,  7,   7   },
        { pick_batch_size(32,  n), 160,  7,   7   },
        { pick_batch_size(32,  n), 192,  14,  14  },
        { pick_batch_size(32,  n), 192,  56,  56  },
        { pick_batch_size(32,  n), 192,  7,   7   },
        { pick_batch_size(32,  n), 224,  14,  14  },
        { pick_batch_size(32,  n), 256,  7,   7   },
        { pick_batch_size(32,  n), 256,  14,  14  },
        { pick_batch_size(32,  n), 352,  7,   7   },
        { pick_batch_size(32,  n), 64,   112, 112 },
        { pick_batch_size(32,  n), 64,   14,  14  },
        { pick_batch_size(32,  n), 64,   56,  56  },
        { pick_batch_size(32,  n), 96,   28,  28  },
        { pick_batch_size(32,  n), 32,   256, 512 },  //Killing this config. Takes way too long on the CPU
        { pick_batch_size(32,  n), 256,  28,  28  },
        { pick_batch_size(32,  n), 3,    224, 224 },
        { pick_batch_size(32,  n), 480,  128, 256 },
        { pick_batch_size(32,  n), 528,  64,  128 }
    };
    // clang-format on
}

// Input shapes for spatial batch-norm tests.
// NOTE(review): {32, 192, 256, 512} and {32, 480, 128, 256} each appear twice
// in the original list; the std::set de-duplicates them.
template <class T>
inline std::set<std::vector<T>> get_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 4,    1024,2048},  //Making this much smaller
        { pick_batch_size(32,  n), 192,  256, 512 },
        { pick_batch_size(32,  n), 480,  128, 256 },
        { pick_batch_size(256, n), 3,    227, 227 },
        { pick_batch_size(256, n), 64,   112, 112 },
        { pick_batch_size(512, n), 16,   32,  32  },
        { pick_batch_size(100, n), 32,   8,   8   },
        { pick_batch_size(128, n), 256,  12,  12  },
        { pick_batch_size(256, n), 128,  28,  28  },
        { pick_batch_size(256, n), 2048, 7,   7   },
        { pick_batch_size(256, n), 256,  56,  56  },
        { pick_batch_size(256, n), 256,  14,  14  },
        { pick_batch_size(256, n), 512,  28,  28  },
        { pick_batch_size(256, n), 512,  7,   7   },
        { pick_batch_size(256, n), 64,   56,  56  },  //Batch-norm Inception_v3 after this
        { pick_batch_size(32,  n), 1024, 1,   1   },  // n is from the paper @ 32
        { pick_batch_size(32,  n), 128,  14,  14  },
        { pick_batch_size(32,  n), 128,  4,   4   },
        { pick_batch_size(32,  n), 160,  7,   7   },
        { pick_batch_size(32,  n), 192,  14,  14  },
        { pick_batch_size(32,  n), 192,  56,  56  },
        { pick_batch_size(32,  n), 192,  7,   7   },
        { pick_batch_size(32,  n), 224,  14,  14  },
        { pick_batch_size(32,  n), 256,  7,   7   },
        { pick_batch_size(32,  n), 352,  7,   7   },
        { pick_batch_size(32,  n), 64,   14,  14  },
        { pick_batch_size(32,  n), 64,   28,  28  },
        { pick_batch_size(32,  n), 64,   56,  56  },
        { pick_batch_size(32,  n), 96,   28,  28  },
        { pick_batch_size(32,  n), 192,  256, 512 },
        { pick_batch_size(32,  n), 256,  28,  28  },
        { pick_batch_size(32,  n), 3,    224, 224 },
        { pick_batch_size(32,  n), 480,  128, 256 },
        { pick_batch_size(32,  n), 528,  64,  128 },
        { pick_batch_size(770, n), 1,    8,   8   },
        { pick_batch_size(770, n), 1024, 1,   1   },
        { pick_batch_size(152, n), 128,  80,  80  },
        { pick_batch_size(152, n), 256,  20,  20  },
        { pick_batch_size(152, n), 32,   160, 160 },
        { pick_batch_size(152, n), 512,  20,  20  },
        { pick_batch_size(152, n), 64,   160, 160 },
        { pick_batch_size(152, n), 64,   80,  80  },
        { pick_batch_size(256, n), 256,  20,  20  },
        { pick_batch_size(256, n), 512,  20,  20  }
    };
    // clang-format on
}

// 5-D input shapes for per-activation batch-norm over volumes (VoxNet,
// CVPR'15 hand-gesture nets, multi-view 3D convnets, video 3D convnets).
template <class T>
inline std::set<std::vector<T>> get_3d_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(32,  n), 1,  14,  14,  14  },
        { pick_batch_size(32,  n), 32, 14,  14,  14  },
        { pick_batch_size(32,  n), 32, 12,  12,  12  },
        { pick_batch_size(32,  n), 32, 6,   6,   6   },
        { pick_batch_size(256, n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(256, n), 32, 14,  14,  14  },
        { pick_batch_size(256, n), 32, 12,  12,  12  },
        { pick_batch_size(256, n), 32, 6,   6,   6   },
        { pick_batch_size(512, n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(512, n), 32, 14,  14,  14  },
        { pick_batch_size(512, n), 32, 12,  12,  12  },
        { pick_batch_size(512, n), 32, 6,   6,   6   },
        { pick_batch_size(32,  n), 2,  32,  57,  125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path
        { pick_batch_size(32,  n), 32, 14,  25,  59  },
        { pick_batch_size(32,  n), 32, 6,   10,  27  },
        { pick_batch_size(32,  n), 32, 4,   6,   11  },
        { pick_batch_size(32,  n), 32, 2,   2,   3   },
        { pick_batch_size(32,  n), 32, 32,  28,  62  }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path
        { pick_batch_size(32,  n), 32, 14,  12,  29  },
        { pick_batch_size(32,  n), 32, 6,   4,   12  },
        { pick_batch_size(32,  n), 32, 4,   2,   2   },
        { pick_batch_size(16,  n), 32, 6,   50,  50  }, // Multi-view 3D convnet
        { pick_batch_size(1,   n), 3,  8,   240, 320 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  240, 320 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  8,   128, 171 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  128, 171 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  8,   112, 112 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  112, 112 }  // 3D convet on video
    };
    // clang-format on
}

// 5-D input shapes for spatial batch-norm over volumes (same configurations
// as the per-activation list above).
template <class T>
inline std::set<std::vector<T>>
get_3d_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR)
{
    // clang-format off
    return
    {
        { pick_batch_size(32,  n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(32,  n), 1,  14,  14,  14  },
        { pick_batch_size(32,  n), 32, 14,  14,  14  },
        { pick_batch_size(32,  n), 32, 12,  12,  12  },
        { pick_batch_size(32,  n), 32, 6,   6,   6   },
        { pick_batch_size(256, n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(256, n), 32, 14,  14,  14  },
        { pick_batch_size(256, n), 32, 12,  12,  12  },
        { pick_batch_size(256, n), 32, 6,   6,   6   },
        { pick_batch_size(512, n), 1,  32,  32,  32  }, // 32x32x32 based on VoxNet arch
        { pick_batch_size(512, n), 32, 14,  14,  14  },
        { pick_batch_size(512, n), 32, 12,  12,  12  },
        { pick_batch_size(512, n), 32, 6,   6,   6   },
        { pick_batch_size(32,  n), 2,  32,  57,  125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path
        { pick_batch_size(32,  n), 32, 14,  25,  59  },
        { pick_batch_size(32,  n), 32, 6,   10,  27  },
        { pick_batch_size(32,  n), 32, 4,   6,   11  },
        { pick_batch_size(32,  n), 32, 2,   2,   3   },
        { pick_batch_size(32,  n), 32, 32,  28,  62  }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path
        { pick_batch_size(32,  n), 32, 14,  12,  29  },
        { pick_batch_size(32,  n), 32, 6,   4,   12  },
        { pick_batch_size(32,  n), 32, 4,   2,   2   },
        { pick_batch_size(16,  n), 32, 6,   50,  50  }, // Multi-view 3D convnet
        { pick_batch_size(1,   n), 3,  8,   240, 320 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  240, 320 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  8,   128, 171 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  128, 171 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  8,   112, 112 }, // 3D convet on video
        { pick_batch_size(1,   n), 3,  16,  112, 112 }  // 3D convet on video
    };
    // clang-format on
}

// Mixed-rank sub-tensor length lists (rank 1 through 5) for tensor-op tests.
template <class T>
inline std::vector<std::vector<T>> get_sub_tensor()
{
    return {{16, 4, 8, 1, 4},
            {2, 4, 8, 8, 4},
            {16, 4, 8, 4},
            {13, 8, 4, 8},
            {3, 8, 7},
            {16, 4, 10},
            {3, 8},
            {16, 4},
            {4}};
}

// Pairs of (src, dst) element offsets for tensor-copy tests.
template <class T>
inline std::vector<std::vector<T>> get_tensor_offsets()
{
    static_assert(std::is_signed_v<T>);
    return {{0, 0}, {0, 2}, {4, 0}, {5, 7}};
}

// Single element offsets for tensor-op tests.
template <class T>
inline std::vector<T> get_tensor_offset()
{
    static_assert(std::is_signed_v<T>);
    return {0, 1, 2, 3, 4, 5};
}

#endif
// ---- patch boundary (preserved from the original diff) ----
// diff --git a/projects/miopen/miopen_utils/include/miopen_utils/random.hpp b/projects/miopen/miopen_utils/include/miopen_utils/random.hpp
// new file mode 100644
// index 000000000000..63b69ac9875a
// --- /dev/null
// +++ b/projects/miopen/miopen_utils/include/miopen_utils/random.hpp
// @@ -0,0 +1,62 @@
// MIT License, Copyright (c) 2021 Advanced Micro Devices, Inc.
// (license text continues on the following original lines)
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_TEST_RANDOM_HPP +#define GUARD_MIOPEN_TEST_RANDOM_HPP + +#include + +namespace prng { +template +inline T gen_descreet_uniform_sign(double scale, int32_t range) +{ + return static_cast(scale * prng::gen_A_to_B(-range + 1, range)); +} + +template +inline T gen_descreet_unsigned(double scale, int32_t range) +{ + return static_cast(scale * static_cast(gen_0_to_B(range))); +} + +} // namespace prng + +// lambda factory +template +auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { + // uniform sign give balance of both negative and positive values + return prng::gen_descreet_uniform_sign(scale_arg, range_arg); + }; +} + +template +auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; +} + +#endif // GUARD_MIOPEN_TEST_RANDOM_HPP diff --git a/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp b/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp new file mode 100644 index 000000000000..a6569cebb7e6 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/rnn_util.hpp @@ -0,0 +1,305 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_RNN_UTIL_H_ +#define MIOPEN_RNN_UTIL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +// complexity O(NlogN) +inline std::vector GetReverseOrderIndex(const std::vector& base_index) +{ + std::vector reverse_index(base_index.size()); + unsigned next_rev_index = 0; + for(auto id : base_index) + reverse_index[id] = next_rev_index++; + return reverse_index; +}; + +inline std::vector GetSamplesIndexDescendingOrder(const std::vector& unsorted_seq_lens) +{ + const auto sample_count = unsorted_seq_lens.size(); + + std::vector index_v(sample_count); + std::iota(index_v.begin(), index_v.end(), 0); + + auto seq_len_cmp = [&unsorted_seq_lens](unsigned a_id, unsigned b_id) { + return unsorted_seq_lens[a_id] > unsorted_seq_lens[b_id]; + }; + + std::stable_sort(index_v.begin(), index_v.end(), seq_len_cmp); + + return index_v; +} + +template +inline void HiddenTensorReorder(const std::vector& src_array, + std::vector& dst_array, + const std::vector& batch_order, + const std::vector hid_len, + bool is_dst_direct_order) +{ + const size_t copy_size = hid_len[2]; + + const size_t batch_stride = hid_len[2]; + const size_t layer_stride = batch_stride * hid_len[1]; + + for(size_t batch_id = 0; batch_id < hid_len[1]; batch_id++) + { + const auto src_batch_off = + batch_stride * (is_dst_direct_order ? batch_order[batch_id] : batch_id); + const auto dst_batch_off = + batch_stride * (is_dst_direct_order ? 
batch_id : batch_order[batch_id]); + + for(size_t layer_id = 0; layer_id < hid_len[0]; layer_id++) + { + const auto dst_offset = dst_batch_off + layer_id * layer_stride; + const auto src_offset = src_batch_off + layer_id * layer_stride; + + std::copy(src_array.begin() + src_offset, + src_array.begin() + src_offset + copy_size, + dst_array.begin() + dst_offset); + } + } +} + +inline void createTensorDescArray(std::vector& td, + std::vector& ptd, + const std::vector bs, + const int secondDim, + miopenDataType_t dataType) +{ + + std::transform(bs.begin(), bs.end(), std::back_inserter(td), [&](int x) { + return miopen::TensorDescriptor( + dataType, {static_cast(x), static_cast(secondDim)}); + }); + std::transform(td.begin(), td.end(), std::back_inserter(ptd), [](miopen::TensorDescriptor& x) { + return &x; + }); +} + +inline std::tuple +GetTempPackedBuffersSize(std::vector batchs, int in_vec, int out_vec) +{ + size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0ULL); + + size_t in_buff_size = total_batch * in_vec; + size_t out_buff_size = total_batch * out_vec; + return {in_buff_size, out_buff_size}; +} + +inline size_t getSuperTensorSize(const std::vector& bs, + int seqLength, + int inputSize, + int hiddenSize, + int maxPaddingVal, + bool isBidirect, + bool isInput, + bool isPadded) +{ + return (isPadded // + ? static_cast(seqLength) * maxPaddingVal + : std::accumulate(bs.begin(), bs.end(), 0ULL)) // + * (isInput // + ? static_cast(inputSize) + : static_cast(hiddenSize) * (isBidirect ? 
2 : 1)); +} + +template +void ChangeDataPadding(const std::vector& src_array, + std::vector& dst_array, + const std::vector& batch_list, + int max_batch, + int sample_size, + bool is_src_packed) +{ + auto seq_len = batch_list.size(); + + auto scr_ptr = &src_array[0]; + auto dst_ptr = &dst_array[0]; + + for(int seq_id = 0; seq_id < seq_len; seq_id++) + { + auto packed_size = batch_list[seq_id] * sample_size; + + std::copy(scr_ptr, scr_ptr + packed_size, dst_ptr); + + if(is_src_packed) + { + dst_ptr += max_batch * sample_size; + scr_ptr += packed_size; + } + else + { + scr_ptr += max_batch * sample_size; + dst_ptr += packed_size; + } + } +} + +// RNN VANILLA configs +inline std::vector get_rnn_num_layers() { return {{1, 3}}; } + +inline std::vector get_rnn_batchSize() { return {{1, 17}}; } + +inline std::vector get_rnn_seq_len() { return {{1, 3, 51}}; } + +inline std::vector get_rnn_vector_len() { return {31}; } + +inline std::vector get_rnn_hidden_size() { return {127}; } + +// LSTM configs +inline std::vector get_lstm_num_layers() { return {{1, 3}}; } + +inline std::vector get_lstm_batchSize() { return {{1, 17}}; } + +inline std::vector get_lstm_seq_len() { return {{1, 25}}; } + +inline std::vector get_lstm_vector_len() { return {17}; } + +inline std::vector get_lstm_hidden_size() { return {67}; } + +// GRU configs +inline std::vector get_gru_num_layers() { return {{1, 3}}; } + +inline std::vector get_gru_batchSize() { return {{1, 17}}; } + +inline std::vector get_gru_seq_len() { return {{1, 23}}; } + +inline std::vector get_gru_vector_len() { return {13}; } + +inline std::vector get_gru_hidden_size() { return {67}; } + +inline std::vector> generate_batchSeq(const int batchSize, const int seqLength) +{ + + static constexpr int modval = 3; + + int currentval = batchSize; + std::vector batchSeq; + batchSeq.reserve(seqLength); + for(int i = 0; i < seqLength; i++) + { + if(i > 0) + { + int nvalue = currentval - prng::gen_0_to_B(modval); + currentval = (nvalue < 1) ? 
1 : nvalue; + // printf("current value: %d\n", currentval); + } + // printf("adding a value to batch sequence: %d\n", currentval); + batchSeq.push_back(currentval); + } + return {batchSeq}; +} + +inline int sumvc(const std::vector& x) { return std::accumulate(x.begin(), x.end(), 0); } + +template +inline T activfunc(T x, int actvf) +{ + T alpha = static_cast(1), beta0 = static_cast(0), beta1 = static_cast(1); + if(actvf == 0) + { + return (x > 0) ? x : x * beta0; + } + else if(actvf == 2) + { + return static_cast(1 / (1 + std::exp(-x))); + } + return static_cast(alpha * std::tanh(beta1 * x)); +} + +template +inline T dervactivfunc(T x, int actvf) +{ + if(actvf == 0) + { + return static_cast(x > 0 ? 1 : 0); + } + else if(actvf == 2) + { + return static_cast(std::exp(-x) / (1 + std::exp(-x)) / (1 + std::exp(-x))); + } + + return static_cast(1 / std::cosh(x) / std::cosh(x)); +} + +template +void RNN_mm_cpu_batched(const Dtype* a_ptr, + size_t a_cols, + size_t a_rows, + size_t lda, + size_t a_stride, + int a_flags, + const Dtype* b_ptr, + size_t b_cols, + size_t b_rows, + size_t ldb, + size_t b_stride, + int b_flags, + Dtype* c_ptr, + size_t c_cols, + size_t c_rows, + size_t ldc, + size_t c_stride, + int batchCount, + double alpha, + double beta) +{ + for(int i = 0; i < batchCount; ++i) + { + gemm_cpu(a_ptr + a_stride * i, + a_cols, + a_rows, + lda, + a_flags == 1 ? true : false, + b_ptr + b_stride * i, + b_cols, + b_rows, + ldb, + b_flags == 1 ? 
true : false, + c_ptr + c_stride * i, + c_cols, + c_rows, + ldc, + alpha, + beta); + } +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp b/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp new file mode 100644 index 000000000000..71d3133df063 --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/serialize.hpp @@ -0,0 +1,129 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP +#define MIOPEN_GUARD_TEST_SERIALIZE_HPP + +#include +#include +#include +#include +#include +#include +#include + +template +struct is_trivial_serializable : std::is_trivially_copy_constructible +{ +}; + +template <> +struct is_trivial_serializable : std::true_type +{ +}; + +template +std::enable_if_t{}> serialize(std::ostream& os, const T& x) +{ + os.write(reinterpret_cast(&x), sizeof(T)); +} + +template +auto serialize(std::ostream& os, + const T& x) -> decltype(x.begin(), x.end(), T(x.begin(), x.end()), void()) +{ + std::size_t n = std::distance(x.begin(), x.end()); + serialize(os, n); + for(auto&& y : x) + serialize(os, y); +} + +template +std::enable_if_t>{}> +serialize(std::ostream& os, const std::tuple& t) +{ + miopen::unpack( + [&](auto&&... xs) { miopen::each_args([&](auto&& x) { serialize(os, x); }, xs...); }, t); +} + +template +std::enable_if_t{}> serialize(std::istream& is, T& x) +{ + is.read(reinterpret_cast(&x), sizeof(T)); +} + +template +std::enable_if_t{}> serialize(std::istream& is, std::vector& x) +{ + std::size_t n; + serialize(is, n); + x.resize(n); + is.read(reinterpret_cast(x.data()), sizeof(T) * n); +} + +template +auto serialize(std::istream& is, + T& x) -> decltype(x.begin(), x.end(), x.assign(x.begin(), x.end()), void()) +{ + using value_type = std::decay_t; + std::size_t n; + serialize(is, n); + std::vector v; + v.reserve(n); + for(std::size_t i = 0; i < n; i++) + { + value_type y; + serialize(is, y); + v.push_back(y); + } + x.assign(v.begin(), v.end()); +} + +template +std::enable_if_t>{}> +serialize(std::istream& is, + // cppcheck-suppress constParameter + std::tuple& t) +{ + miopen::unpack( + [&](auto&&... 
xs) { miopen::each_args([&](auto&& x) { serialize(is, x); }, xs...); }, t); +} + +template +void load(std::string name, T& x) +{ + std::ifstream is{name.c_str()}; + serialize(is, x); +} + +template +void save(std::string name, const T& x) +{ + std::ofstream os{name.c_str()}; + serialize(os, x); +} + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp new file mode 100644 index 000000000000..f762f80f280c --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/tensor_holder.hpp @@ -0,0 +1,505 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_TENSOR_HOLDER_HPP +#define GUARD_TENSOR_HOLDER_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include "../../src/kernels/hip_float8.hpp" +using float8_fnuz = miopen_f8::hip_f8; +using bfloat8_fnuz = miopen_f8::hip_f8; + +#include +#include + +template +void visit_tensor_size(std::size_t n, F f) +{ + switch(n) + { + case 0: { + f(std::integral_constant{}); + break; + } + case 1: { + f(std::integral_constant{}); + break; + } + case 2: { + f(std::integral_constant{}); + break; + } + case 3: { + f(std::integral_constant{}); + break; + } + case 4: { + f(std::integral_constant{}); + break; + } + case 5: { + f(std::integral_constant{}); + break; + } + default: throw std::runtime_error("Unknown tensor size"); + } +} + +template +struct miopen_type; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + +template +struct tensor +{ + using value_type = T; + miopen::TensorDescriptor desc; + std::vector data; + +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wdeprecated-declarations" +#endif + + tensor() : desc(miopen_type{}) {} + +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic pop +#endif + + template + tensor(const std::vector& dims) : desc(miopen_type{}, dims), data(desc.GetElementSpace()) + { + } + + template + tensor(const std::vector& dims, const std::vector& strides) + : desc(miopen_type{}, dims, strides), data(desc.GetElementSpace()) + { + assert(dims.size() == strides.size()); + } + + template + tensor(miopenTensorLayout_t layout, const std::vector& dims) + : desc(miopen_type{}, layout, dims), data(desc.GetElementSpace()) + { + } + + template + tensor(miopenTensorLayout_t layout, const std::vector& dims, const std::vector& strides) + : desc(miopen_type{}, layout, dims, strides), data(desc.GetElementSpace()) + { + assert(dims.size() == strides.size()); + } + + tensor(std::size_t n, std::size_t c, std::size_t h, std::size_t w) + : desc(miopen_type{}, {n, c, h, w}), data(n * c * h * w) + { + } + + tensor(miopenTensorLayout_t layout, std::size_t n, std::size_t c, std::size_t h, std::size_t w) + : desc(miopen_type{}, layout, {n, c, h, w}), data(desc.GetElementSpace()) + { + } + + tensor(std::size_t n, std::size_t c, std::size_t d, std::size_t h, std::size_t w) + : desc(miopen_type{}, {n, c, d, h, w}), data(n * c * d * h * w) + { + } + + tensor(std::size_t n) : desc(miopen_type{}, {n}), data(n) {} + + tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) + { + assert(desc.GetType() == miopen_type{} + /// In the driver, T is input tensor type, but output tensor holders + /// are instantiatied with T as well. This leads to false assertion + /// failures when T is INT8 because output type is different. 
+ /// \todo Get rid of this hack when the driver is improved: + || (miopen_type{} == miopenInt8 && desc.GetType() == miopenInt32)); + data.resize(desc.GetElementSpace()); + } + + size_t GetDataByteSize() const { return GetSize() * sizeof(T); } + + size_t GetSize() const { return desc.GetElementSpace(); } + + template + tensor& generate(G g) & + { + if(this->desc.GetVectorLength() > 1) + this->generate_vect_impl(g); + else + this->generate_impl(g); + return *this; + } + + template + tensor&& generate(G g) && + { + if(this->desc.GetVectorLength() > 1) + this->generate_vect_impl(g); + else + this->generate_impl(g); + return std::move(*this); + } + + template + void generate_impl(G g) + { + auto seed = std::accumulate(desc.GetLengths().begin(), + desc.GetLengths().end(), + std::size_t{521288629}, + [](auto x, auto y) { + x ^= x << 1U; + return x ^ y; + }); + seed ^= data.size(); + seed ^= desc.GetLengths().size(); + prng::reset_seed(seed); + auto iterator = data.begin(); + auto assign = [&](T x) { + *iterator = x; + ++iterator; + }; + this->for_each( + miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); + } + + template + void generate_vect_impl(G g) + { + auto seed = std::accumulate(desc.GetLengths().begin(), + desc.GetLengths().end(), + std::size_t{521288629}, + [](auto x, auto y) { + x ^= x << 1U; + return x ^ y; + }); + seed ^= data.size(); + seed ^= desc.GetLengths().size(); + prng::reset_seed(seed); + auto iterator = data.begin(); + auto vectorLength = desc.GetVectorLength(); + auto assign = [&](T x) { + assert(iterator < data.end()); + // for debugging + for(auto i = 0; i < vectorLength; i++) + { + *(iterator + i) = x; + } + iterator += vectorLength; + }; + this->for_each( + miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); + } + + template + struct for_each_unpacked + { + Loop loop; + F f; + template + auto operator()(Ts... 
xs) const -> decltype(f(xs...), void()) + { + loop(xs...)(std::move(f)); + } + + struct any + { + any() {} + template + any(X) + { + } + }; + + [[noreturn]] void operator()(any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}, + any = {}) const + { + throw std::runtime_error( + "Arguments to for_each do not match tensor size or the function " + + miopen::get_type_name() + " can not be called."); + } + }; + + struct for_each_handler + { + template + void operator()(Self* self, Loop loop, F f, Size size) const + { + auto dims = miopen::tien(self->desc.GetLengths()); + miopen::unpack(for_each_unpacked{loop, std::move(f)}, dims); + } + }; + + template + void for_each(F f) const + { + visit_tensor_size( + desc.GetLengths().size(), + std::bind(for_each_handler{}, this, miopen::ford, std::move(f), std::placeholders::_1)); + } + + template + void par_for_each(F f) const + { + visit_tensor_size( + desc.GetLengths().size(), + std::bind( + for_each_handler{}, this, miopen::par_ford, std::move(f), std::placeholders::_1)); + } + + template + T& operator()(Ts... xs) + { + assert(this->desc.GetIndex(xs...) < data.size()); + return this->data[this->desc.GetIndex(xs...)]; + } + + template + const T& operator()(Ts... xs) const + { + assert(this->desc.GetIndex(xs...) < data.size()); + return this->data[this->desc.GetIndex(xs...)]; + } + + template + const T& operator()(const std::array& multi_id) const + { + auto f = [&](auto... 
is) { return this->desc.GetIndex(is...); }; + assert(miopen::unpack(f, multi_id) < data.size()); + return this->data[miopen::unpack(f, multi_id)]; + } + + T& operator[](std::size_t i) { return data.at(i); } + + const T& operator[](std::size_t i) const { return data.at(i); } + + typename std::vector::iterator begin() { return data.begin(); } + + typename std::vector::iterator end() { return data.end(); } + + typename std::vector::const_iterator begin() const { return data.begin(); } + + typename std::vector::const_iterator end() const { return data.end(); } + + friend std::ostream& operator<<(std::ostream& stream, const tensor& t) + { + return stream << t.desc; + } + + template + void dump_inner(size_t dim, std::array& coord, Stream& stream) const + { + const auto lengths = this->desc.GetLengths(); + if(lengths.size() == 0) + { + // 0D special case: Just print the one value that we have and return. + stream << (*this)(coord); + } + else if(dim + 1 == lengths.size()) + { + // 1D special case: dump everything on one line + for(size_t i = 0; i < lengths[dim]; ++i) + { + if(i != 0) + stream << ' '; + + coord[dim] = i; + stream << std::setw(4) << (*this)(coord); + } + + stream << '\n'; + } + else + { + if(dim + 2 == lengths.size()) + { + // 2D special case: Also print which 2D slice we are currently printing + // Note: this is not needed for higher dimensions, as they will also pass + // through this branch. 
+ stream << "slice ["; + for(size_t i = 0; i < dim; ++i) + { + stream << coord[i] << ", "; + } + stream << ":, :]\n"; + } + + for(size_t i = 0; i < lengths[dim]; ++i) + { + coord[dim] = i; + this->dump_inner(dim + 1, coord, stream); + } + } + } + + template + void dump(const char* name, Stream& stream = std::cout) const + { + const auto n = this->desc.GetLengths().size(); + stream << "==== " << name << ": " << *this << n << '\n'; + stream.fill(' '); + + const auto flags = stream.flags(); + + visit_tensor_size(n, [&](const auto size) { + constexpr size_t N = decltype(size)::value; + std::array coord; + this->dump_inner(0, coord, stream); + }); + + stream.flags(flags); + } +}; + +template +void serialize(std::istream& s, tensor& x) +{ + std::vector lens; + serialize(s, lens); + std::vector strides; + serialize(s, strides); + x.desc = miopen::TensorDescriptor{miopen_type{}, lens, strides}; + serialize(s, x.data); +} + +template +void serialize(std::ostream& s, const tensor& x) +{ + const auto& lens = x.desc.GetLengths(); + const auto& strides = x.desc.GetStrides(); + serialize(s, lens); + serialize(s, strides); + serialize(s, x.data); +} + +struct tensor_generate +{ + template + Tensor&& operator()(Tensor&& t, G g) const + { + return std::forward(t.generate(g)); + } +}; + +struct tensor_elem_gen_integer +{ + uint64_t max_value = 17; + + template + double operator()(Ts... 
Xs) const + { + static_assert(sizeof...(Ts) < 6, + "Dimensions in tensor_elem_gen_integer must be less than 6."); + assert(max_value > 0); + std::array left = {{Xs...}}; + std::array right = {{613, 547, 701, 877, 1049}}; + uint64_t dot = + std::inner_product(left.begin(), left.end(), right.begin(), static_cast(173)); + return static_cast(dot % max_value); + } +}; + +#endif diff --git a/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp b/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp new file mode 100644 index 000000000000..81af2afbcf2d --- /dev/null +++ b/projects/miopen/miopen_utils/include/miopen_utils/verify.hpp @@ -0,0 +1,245 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_VERIFY_HPP +#define GUARD_VERIFY_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +#include + +namespace miopen { + +// Compute the value of a range +template +using range_value = typename std::decay().begin())>::type; + +struct sum_fn +{ + template + auto operator()(T x, U y) const MIOPEN_RETURNS(x + y); +}; +static constexpr sum_fn sum{}; + +struct max_fn +{ + template + static T id(T x) + { + return x; + } + + template + auto operator()(T x, U y) const MIOPEN_RETURNS(max_fn::id(x > y ? x : y)); +}; +static constexpr max_fn max{}; + +namespace abs_diff_detail { +using std::fabs; +struct fn +{ + template + auto operator()(T x, U y) const MIOPEN_RETURNS(fabs(x - y)); +}; + +} // namespace abs_diff_detail + +static constexpr abs_diff_detail::fn abs_diff{}; + +struct not_finite_fn +{ + template ), bool>::type = false> + bool operator()(T x) const + { + return !std::isfinite(x); + } + + template ::type, half_float::half>), + bool>::type = false> + bool operator()(T x) const + { + return !half_float::isfinite(x); + } + + template ::type, bfloat16>), + bool>::type = false> + bool operator()(T x) const + { + return !std::isfinite(x); // bfloat16 has float() conversion operator + } + + template ), bool>::type = false> + bool operator()(T x) const + { + std::ignore = x; + return false; + } +}; +static constexpr not_finite_fn not_finite{}; + +template +T as(T, U x) +{ + return x; +} + +struct compare_mag_fn +{ + template + bool operator()(T x, U y) const + { + using std::fabs; + return fabs(x) < fabs(y); + } +}; +static constexpr compare_mag_fn compare_mag{}; + +struct square_diff_fn +{ + template + double operator()(T x, U y) const + { + double diff = static_cast(x - y); + return diff * diff; + } +}; +static constexpr square_diff_fn square_diff{}; + +template , 
bool> = true> +bool equal_values(T const& lhs, T const& rhs) +{ + return lhs == rhs; +} + +template , bool> = true> +bool equal_values(T const& lhs, T const& rhs) +{ + return miopen::float_equal_sentinel(lhs, rhs); +} + +template +bool range_empty(R1&& r1) +{ + return r1.begin() == r1.end(); +} + +template +auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); + +template +bool range_zero(const std::vector& r) +{ + return std::all_of(r.begin(), r.end(), [](T x) { return equal_values(x, T()); }); +} + +template +bool range_zero(const tensor& r) +{ + return range_zero(r.data); +} + +template +T range_product(R1&& r1, R2&& r2, T state, Reducer r, Product p) +{ + return std::inner_product(r1.begin(), r1.end(), r2.begin(), state, r, p); +} + +template +std::size_t mismatch_idx(R1&& r1, R2&& r2, Compare compare) +{ + auto p = std::mismatch(r1.begin(), r1.end(), r2.begin(), compare); + return std::distance(r1.begin(), p.first); +} + +template +int64_t find_idx(R1&& r1, Predicate p) +{ + auto it = std::find_if(r1.begin(), r1.end(), p); + if(it == r1.end()) + return -1; + else + return std::distance(r1.begin(), it); +} + +template +double max_diff(R1&& r1, R2&& r2) +{ + return range_product(r1, r2, 0.0, max, abs_diff); +} + +template +auto max_diff_v2(R1&& r1, R2&& r2) +{ + using T = decltype(r1[0] - r2[0]); + auto abs_diff_func = [](auto x, auto y) { return x > y ? 
x - y : y - x; }; + // BUG: deduced wrong datatype, half_float bug + if constexpr(std::is_same_v) + return range_product(r1, r2, half_float::half(), max, abs_diff_func); + else + return range_product(r1, r2, T(), max, abs_diff_func); +} + +template +std::size_t mismatch_diff(R1&& r1, R2&& r2, T diff) +{ + return mismatch_idx( + r1, + r2, + std::bind( + float_equal, diff, std::bind(abs_diff, std::placeholders::_1, std::placeholders::_2))); +} + +template +double rms_range(R1&& r1, R2&& r2) +{ + std::size_t n = range_distance(r1); + if(n == range_distance(r2)) + { + if(n == 0) + return 0; + double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); + double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); + double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); + double mag = + std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); + return std::sqrt(square_difference) / (std::sqrt(n) * mag); + } + else + return double(std::numeric_limits>::max()); +} +} // namespace miopen +#endif diff --git a/projects/miopen/speedtests/CMakeLists.txt b/projects/miopen/speedtests/CMakeLists.txt index 9aa89974cc75..826da17b59db 100644 --- a/projects/miopen/speedtests/CMakeLists.txt +++ b/projects/miopen/speedtests/CMakeLists.txt @@ -16,7 +16,7 @@ function(add_speedtest_executable TEST_NAME) endif() separate_arguments(MIOPEN_TEST_FLAGS_ARGS NATIVE_COMMAND ${MIOPEN_TEST_FLAGS}) # MIOpen_with_plugins ensures CK plugin .so's are built alongside the speedtest - target_link_libraries(${TEST_NAME} MIOpen_with_plugins) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils miopen_utils) target_include_directories(${TEST_NAME} PRIVATE ../test ../src/kernels) endfunction(add_speedtest_executable) diff --git a/projects/miopen/src/CMakeLists.txt b/projects/miopen/src/CMakeLists.txt index 9e6f401b7506..84bbd53716fb 100644 --- a/projects/miopen/src/CMakeLists.txt +++ 
b/projects/miopen/src/CMakeLists.txt @@ -8,14 +8,7 @@ if(MIOPEN_ENABLE_SQLITE) add_subdirectory(sqlite) endif() -# Truncation rounding or (default) rounding to nearest even (RNE) is enabled. -# This switch controls two related but different aspects of MIOpen behavior -# 1. How host code performs conversions of float to bfloat16, important only -# for testing. -# 2. How BF16 kernels (which are kind of mixed-precision now and expected to -# remain in the future) perform final conversion (and rounding) of FP32 -# to BF16 results. This affects the main functionality of the library. -option( MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON ) +# MIOPEN_USE_RNE_BFLOAT16 is declared in the top-level CMakeLists.txt. option( MIOPEN_FP8_IEEE_EXPONENT_BIAS "Sets the FP8 exponent bias to IEEE" OFF) option( MIOPEN_FP8_CLIPPING "Sets the FP8 clipping" ON) @@ -931,7 +924,7 @@ endif() target_include_directories(MIOpen SYSTEM PUBLIC $) # Workaround : change in rocm-cmake was causing linking error so had to add ${CMAKE_DL_LIBS} # We can remove ${CMAKE_DL_LIBS} once root cause is identified. -target_link_libraries(MIOpen PRIVATE ${CMAKE_DL_LIBS} Threads::Threads BZip2::BZip2) +target_link_libraries(MIOpen PRIVATE ${CMAKE_DL_LIBS} Threads::Threads BZip2::BZip2 miopen_common_utils) miopen_generate_export_header(MIOpen) if(WIN32) diff --git a/projects/miopen/src/ck_impl/CMakeLists.txt b/projects/miopen/src/ck_impl/CMakeLists.txt index ae380f174007..791250958533 100644 --- a/projects/miopen/src/ck_impl/CMakeLists.txt +++ b/projects/miopen/src/ck_impl/CMakeLists.txt @@ -145,7 +145,7 @@ foreach(gpu_target IN LISTS _CK_FILTERED_TARGETS) target_link_libraries(${lib_name} PRIVATE hip::device) # Link against MIOpen for shared types (ConvSolution, InvokerFactory, etc.) 
- target_link_libraries(${lib_name} PRIVATE MIOpen) + target_link_libraries(${lib_name} PRIVATE MIOpen miopen_common_utils) # Install alongside MIOpen install(TARGETS ${lib_name} diff --git a/projects/miopen/src/include/miopen/algorithm.hpp b/projects/miopen/src/include/miopen/algorithm.hpp index d1098a066077..38b87c1e38b4 100644 --- a/projects/miopen/src/include/miopen/algorithm.hpp +++ b/projects/miopen/src/include/miopen/algorithm.hpp @@ -1,47 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_MLOPEN_ALGORITHM_HPP -#define GUARD_MLOPEN_ALGORITHM_HPP - -#include - -namespace miopen { - -template -bool any_of(const Range& r, Predicate p) -{ - return std::any_of(r.begin(), r.end(), p); -} - -template -bool all_of(const Range& r, Predicate p) -{ - return std::all_of(r.begin(), r.end(), p); -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/bfloat16.hpp b/projects/miopen/src/include/miopen/bfloat16.hpp index 3e3a184a72d1..eab3c5b2c826 100644 --- a/projects/miopen/src/include/miopen/bfloat16.hpp +++ b/projects/miopen/src/include/miopen/bfloat16.hpp @@ -1,179 +1,2 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#ifndef BFLOAT16_H_ -#define BFLOAT16_H_ - -#include -#include - -class bfloat16 -{ -public: - bfloat16() : data_{0} {} - explicit bfloat16(float rhs) - { - union - { - float float_st; - std::uint32_t bf16_st; - } bits_st = {rhs}; - - // BF16 round and NaN preservation code matches - // https://github.com/ROCm/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h - if((~bits_st.bf16_st & 0x7f800000) == 0) // Inf or NaN - { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. If any of the - // lower 16 bits of the mantissa are 1, we set the least significant bit - // of the bfloat16 mantissa, in order to preserve signaling NaN in case - // the bloat16's mantissa bits are all 0. 
- if((bits_st.bf16_st & 0xffff) != 0) - { - bits_st.bf16_st |= 0x10000; // Preserve signaling NaN - } - } - else - { -#if MIOPEN_USE_RNE_BFLOAT16 == 1 - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus - // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). - // This causes the bfloat16's mantissa to be incremented by 1 if the 16 - // least significant bits of the float mantissa are greater than 0x8000, - // or if they are equal to 0x8000 and the least significant bit of the - // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already - // has the value 0x7f, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded bfloat16 value. When the bfloat16 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. 
- bits_st.bf16_st += - (0x7fff + ((bits_st.bf16_st >> 16) & 1)); // Round to nearest, round to even -#else // truncation -// do nothing -#endif - } - data_ = bits_st.bf16_st >> 16; - } - operator float() const - { - union - { - std::uint32_t bf16_st; - float float_st; - } bits_st = {data_}; - - bits_st.bf16_st = bits_st.bf16_st << 16; - return bits_st.float_st; - } - - bfloat16 operator-() const { return bfloat16(-static_cast(*this)); } - bfloat16 operator+() const { return *this; } - - bfloat16& operator=(const float rhs) - { - *this = bfloat16(rhs); - return *this; - } - bfloat16& operator+=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) + static_cast(rhs)); - return *this; - } - - bfloat16& operator+=(float rhs) - { - *this = bfloat16(static_cast(*this) + rhs); - return *this; - } - - bfloat16& operator-=(bfloat16 rhs) - { - *this += -rhs; - return *this; - } - bfloat16& operator*=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) * static_cast(rhs)); - return *this; - } - bfloat16& operator*=(float rhs) - { - *this = bfloat16(static_cast(*this) * rhs); - return *this; - } - - bfloat16& operator/=(bfloat16 rhs) - { - *this = bfloat16(static_cast(*this) / static_cast(rhs)); - return *this; - } - bool operator<(bfloat16 rhs) const - { - return static_cast(*this) < static_cast(rhs); - } - bool operator==(bfloat16 rhs) const { return std::equal_to()(*this, rhs); } - - static constexpr bfloat16 generate(uint16_t val) { return bfloat16{val, true}; } - -private: - constexpr bfloat16(std::uint16_t val, bool) : data_{val} {} - - std::uint16_t data_; -}; - -inline bfloat16 operator+(bfloat16 a, const bfloat16& b) -{ - a += b; - return a; -} - -inline bfloat16 operator-(bfloat16 a, const bfloat16& b) -{ - a -= b; - return a; -} - -inline bfloat16 operator*(bfloat16 a, const bfloat16& b) -{ - a *= b; - return a; -} - -inline bfloat16 operator/(bfloat16 a, const bfloat16& b) -{ - a /= b; - return a; -} - -namespace std { -template <> -class numeric_limits -{ 
-public: - static constexpr bool is_specialized = true; - static constexpr bfloat16 min() noexcept { return bfloat16::generate(0x0080); } // 0x1.00p-126 - static constexpr bfloat16 max() noexcept { return bfloat16::generate(0x7F7F); } - static constexpr bfloat16 lowest() noexcept { return bfloat16::generate(0xFF7F); } - static constexpr bfloat16 epsilon() noexcept { return bfloat16::generate(0x3C00); } - static constexpr bfloat16 infinity() noexcept { return bfloat16::generate(0x7F80); } - static constexpr bfloat16 quiet_NaN() noexcept { return bfloat16::generate(0x7FC0); } // qnan(0) - static constexpr bfloat16 signaling_NaN() noexcept - { - return bfloat16::generate(0x7F81); // snan(1) - } - static constexpr bfloat16 denorm_min() noexcept - { - return bfloat16::generate(0x0001); // 0x0.02p-126 - } -}; -} // namespace std -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/each_args.hpp b/projects/miopen/src/include/miopen/each_args.hpp index e078153dc998..983c7da843dd 100644 --- a/projects/miopen/src/include/miopen/each_args.hpp +++ b/projects/miopen/src/include/miopen/each_args.hpp @@ -1,79 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_EACH_ARGS_HPP -#define GUARD_MIOPEN_EACH_ARGS_HPP - -#include -#include -#include - -namespace miopen { -namespace detail { - -template -void each_args_i_impl(F f, std::index_sequence, Ts&&... xs) -{ - (void)std::initializer_list{ - (f(std::integral_constant{}, std::forward(xs)), 0)...}; -} - -template -auto unpack_impl(F f, std::index_sequence, T&& x) -{ - return f(std::get(x)...); -} - -} // namespace detail - -template -void each_args_i(F f, Ts&&... xs) -{ - detail::each_args_i_impl(f, std::make_index_sequence(), std::forward(xs)...); -} - -template -void each_args(F f, Ts&&... xs) -{ - (void)std::initializer_list{(f(std::forward(xs)), 0)...}; -} - -// Workaround for gcc warnings -template -void each_args(F) -{ -} - -template -auto unpack(F f, T&& x) -{ - using type = typename std::remove_cv::type>::type; - return detail::unpack_impl( - f, std::make_index_sequence::value>(), std::forward(x)); -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. 
+#include diff --git a/projects/miopen/src/include/miopen/float_equal.hpp b/projects/miopen/src/include/miopen/float_equal.hpp index 24bbdc55ad11..a48c2e417489 100644 --- a/projects/miopen/src/include/miopen/float_equal.hpp +++ b/projects/miopen/src/include/miopen/float_equal.hpp @@ -1,89 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_MLOPEN_FLOAT_EQUAL_HPP -#define GUARD_MLOPEN_FLOAT_EQUAL_HPP - -#include -#include -#include -#include - -namespace miopen { - -template -using common_type = typename std::common_type::type; - -struct float_equal_fn -{ - template - static bool apply(T x, T y) - { - // The standard library from MSVC does not implement std::isfinite() for integer - // types - no additional overloads are provided. According to the documentation, - // integer types should be treaded as doubles. - // Refer to https://en.cppreference.com/w/cpp/numeric/math/isfinite for more information. - return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and - std::nextafter(x, std::numeric_limits::lowest()) <= y and - std::nextafter(x, std::numeric_limits::max()) >= y; - } - - template - bool operator()(T x, U y) const - { - return float_equal_fn::apply>(x, y); - } -}; - -static constexpr float_equal_fn float_equal{}; - -/// Special case for comparing with a sentinel value -struct float_equal_sentinel_fn -{ - template - static bool apply(T x, T y) - { -// In this case we have to ignore this warning, because we intend to compare with the exact value -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wfloat-equal" - bool equals_sentinel = x == y; -#pragma clang diagnostic pop - - return std::isfinite(static_cast(x)) and std::isfinite(static_cast(y)) and - equals_sentinel; - } - - template - bool operator()(T x, U y) const - { - return float_equal_sentinel_fn::apply>(x, y); - } -}; - -static constexpr float_equal_sentinel_fn float_equal_sentinel{}; - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. 
+#include diff --git a/projects/miopen/src/include/miopen/ford.hpp b/projects/miopen/src/include/miopen/ford.hpp index f56b20de4d46..beac57e1e6e8 100644 --- a/projects/miopen/src/include/miopen/ford.hpp +++ b/projects/miopen/src/include/miopen/ford.hpp @@ -1,122 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_FORD_HPP -#define GUARD_FORD_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -namespace miopen { - -// An improved async, that doesn't block -template -std::future::type> detach_async(Function&& f) -{ - using result_type = typename std::invoke_result::type; - std::packaged_task task(std::forward(f)); - auto fut = task.get_future(); - std::thread(std::move(task)).detach(); - return fut; -} - -template -auto then(std::future f, Work w) -> std::future -{ - return std::async(std::launch::deferred, - [=, f_ = std::move(f)]() mutable { return w(f_.get()); }); -} - -template -struct ford_wrapper -{ - template - auto operator()(Ts... xs) const MIOPEN_RETURNS(std::bind(T{}, std::placeholders::_1, xs...)); -}; - -// Multidimensional for loop -struct ford_impl -{ - template - void operator()(F f) const - { - f(); - } - - template - void operator()(F f, T x, Ts... xs) const - { - // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55914 - for(T i = 0; i < x; i++) - { - (*this)([&](Ts... is) { f(i, is...); }, xs...); - } - } -}; - -static constexpr ford_wrapper ford{}; - -struct par_ford_impl -{ - template - void operator()(F f, Ts... 
xs) const - { - using array_type = std::array; - array_type lens = {{static_cast(xs)...}}; - array_type strides; - strides.fill(1); - std::partial_sum( - lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1, std::multiplies()); - auto size = std::accumulate( - lens.begin(), lens.end(), static_cast(1), std::multiplies()); - par_for(size, [&](std::size_t i) { - array_type indices; - std::transform(strides.begin(), - strides.end(), - lens.begin(), - indices.begin(), - [&](size_t stride, size_t len) { return (i / stride) % len; }); - unpack(f, indices); - }); - } -}; - -static constexpr ford_wrapper par_ford{}; - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/functional.hpp b/projects/miopen/src/include/miopen/functional.hpp index 02c6e3427e87..d0a70ae6794d 100644 --- a/projects/miopen/src/include/miopen/functional.hpp +++ b/projects/miopen/src/include/miopen/functional.hpp @@ -1,131 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MLOPEN_FUNCTIONAL_HPP -#define GUARD_MLOPEN_FUNCTIONAL_HPP - -#include -#include -#include - -namespace miopen { -namespace detail { - -template -auto each_i_impl(F f, std::index_sequence) - MIOPEN_RETURNS(f(std::integral_constant{}...)); -} // namespace detail - -template -struct by_t -{ - F f; - P p; - template - auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(p(std::forward(xs))...)) -}; - -template -by_t by(F f, P p) -{ - return {std::move(f), std::move(p)}; -} - -template -struct compose_t -{ - F f; - G g; - template - auto operator()(Ts&&... xs) const MIOPEN_RETURNS(f(g(std::forward(xs)...))) -}; - -template -compose_t compose(F f, G g) -{ - return {std::move(f), std::move(g)}; -} - -template -struct flip_t -{ - F f; - template - auto operator()(T&& x, U&& y) const MIOPEN_RETURNS(f(std::forward(y), std::forward(x))) -}; - -template -flip_t flip(F f) -{ - return {std::move(f)}; -} - -template -struct sequence_t -{ - F f; - template - auto operator()(IntegralConstant) const - MIOPEN_RETURNS(detail::each_i_impl(f, std::make_index_sequence())); -}; - -template -sequence_t sequence(F f) -{ - return {std::move(f)}; -} - -template -void repeat_n(F f, std::integral_constant) -{ - auto fs = [&f](auto... is) { return each_args(f, is...); }; - sequence(fs)(std::integral_constant{}); -} - -template -struct cast_to -{ - template - T operator()(X&& x) const - { - return static_cast(std::forward(x)); - } -}; - -template -auto unpacker(F f) -{ - return [=](auto xs) { return miopen::unpack(f, xs); }; -}; - -template -auto prepender(F f, Xs... xs) -{ - return [=](auto... 
ys) { return f(xs..., ys...); }; -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/par_for.hpp b/projects/miopen/src/include/miopen/par_for.hpp index 1272dcf6ac9b..4685b005db77 100644 --- a/projects/miopen/src/include/miopen/par_for.hpp +++ b/projects/miopen/src/include/miopen/par_for.hpp @@ -1,149 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#ifndef MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP -#define MIOPEN_GUARD_MLOPEN_PAR_FOR_HPP - -#include -#include -#include -#include -#include -#include - -#include - -namespace miopen { - -struct joinable_thread : std::thread -{ - template - joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) // NOLINT - { - } - - joinable_thread& operator=(joinable_thread&& other) = default; - joinable_thread(joinable_thread&& other) = default; - - ~joinable_thread() - { - if(this->joinable()) - this->join(); - } -}; - -struct thread_factory -{ - template - joinable_thread operator()(std::size_t& work, std::size_t n, std::size_t grainsize, F f) const - { - auto result = joinable_thread([=] { - std::size_t start = work; - std::size_t last = std::min(n, work + grainsize); - for(std::size_t i = start; i < last; i++) - { - f(i); - } - }); - work += grainsize; - return result; - } -}; - -template -void par_for_impl(std::size_t n, std::size_t threadsize, F f) -{ - if(threadsize <= 1) - { - for(std::size_t i = 0; i < n; i++) - f(i); - } - else - { - std::vector threads(threadsize); - const std::size_t grainsize = std::ceil(static_cast(n) / threads.size()); - - std::size_t work = 0; - std::generate(threads.begin(), - threads.end(), - std::bind(thread_factory{}, std::ref(work), n, grainsize, f)); - assert(work >= n); - } -} - -template -void par_for(std::size_t n, std::size_t min_grain, F f) -{ - const auto threadsize = - std::min(std::thread::hardware_concurrency(), n / min_grain); - par_for_impl(n, threadsize, f); -} - -struct min_grain -{ - std::size_t n = 0; -}; - -template -void par_for(std::size_t n, min_grain mg, F f) -{ - const auto threadsize = std::min(std::thread::hardware_concurrency(), n / mg.n); - par_for_impl(n, threadsize, f); -} - -template -void par_for(std::size_t n, F f) -{ - par_for(n, min_grain{8}, f); -} - -struct max_threads -{ - std::size_t n = 0; -}; - -template -void 
par_for(std::size_t n, max_threads mt, F f) -{ - const auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); - par_for_impl(n, std::min(threadsize, n), f); -} - -template -void par_for_strided(std::size_t n, max_threads mt, F f) -{ - auto threadsize = std::min(std::thread::hardware_concurrency(), mt.n); - par_for_impl(threadsize, threadsize, [&](auto start) { - for(std::size_t i = start; i < n; i += threadsize) - { - f(i); - } - }); -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/rank.hpp b/projects/miopen/src/include/miopen/rank.hpp index 013ec6e7f7f4..88a4541421d4 100644 --- a/projects/miopen/src/include/miopen/rank.hpp +++ b/projects/miopen/src/include/miopen/rank.hpp @@ -1,42 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_RANK_HPP -#define GUARD_MIOPEN_RANK_HPP - -namespace miopen { - -template -struct rank : rank -{ -}; - -template <> -struct rank<0> -{ -}; -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/reduce_common.hpp b/projects/miopen/src/include/miopen/reduce_common.hpp index 37b92e727d92..8d47ee0f05b0 100644 --- a/projects/miopen/src/include/miopen/reduce_common.hpp +++ b/projects/miopen/src/include/miopen/reduce_common.hpp @@ -1,66 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_REDUCE_COMMON_HPP -#define GUARD_MIOPEN_REDUCE_COMMON_HPP - -#include -#include - -namespace reduce { - -template -static inline Tdst convert_type(Tsrc x) -{ - return static_cast(x); -} - -template <> -inline float convert_type(half_float::half x) -{ - return half_float::half_cast(x); -}; - -template <> -inline half_float::half convert_type(float x) -{ - return half_float::half_cast(x); -}; - -template <> -inline float convert_type(bfloat16 x) -{ - return float(x); -}; - -template <> -inline bfloat16 convert_type(float x) -{ - return bfloat16(x); -}; - -}; // end of namespace reduce - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/returns.hpp b/projects/miopen/src/include/miopen/returns.hpp index 4fdb1db18b87..8bd3067fdea3 100644 --- a/projects/miopen/src/include/miopen/returns.hpp +++ b/projects/miopen/src/include/miopen/returns.hpp @@ -1,38 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#ifndef GUARD_MIOPEN_RETURNS_HPP -#define GUARD_MIOPEN_RETURNS_HPP - -#define MIOPEN_RETURNS(...) \ - ->decltype(__VA_ARGS__) { return __VA_ARGS__; } - -#define MIOPEN_BODY_RETURNS(...) \ - { \ - return __VA_ARGS__; \ - } - -#endif +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/stringutils.hpp b/projects/miopen/src/include/miopen/stringutils.hpp index 5a412416d666..168eb6bee75e 100644 --- a/projects/miopen/src/include/miopen/stringutils.hpp +++ b/projects/miopen/src/include/miopen/stringutils.hpp @@ -1,165 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_STRINGUTILS_HPP -#define GUARD_MIOPEN_STRINGUTILS_HPP - -#include -#include -#include -#include -#include -#include -#include -#include - -#define MIOPEN_STRINGIZE_1(...) #__VA_ARGS__ -#define MIOPEN_STRINGIZE(...) 
MIOPEN_STRINGIZE_1(__VA_ARGS__) - -namespace miopen { - -inline std::string -ReplaceString(const std::string& in, const std::string& search, const std::string& replace) -{ - size_t pos = 0; - std::string subject(in); - while((pos = subject.find(search, pos)) != std::string::npos) - { - subject.replace(pos, search.length(), replace); - pos += replace.length(); - } - return subject; -} - -inline bool EndsWith(const std::string& value, const std::string& suffix) -{ - if(suffix.size() > value.size()) - return false; - else - return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin()); -} - -template -inline std::string JoinStrings(Strings strings, std::string delim) -{ - auto it = strings.begin(); - if(it == strings.end()) - return ""; - - auto nit = std::next(it); - return std::accumulate( - nit, strings.end(), *it, [&](std::string x, std::string y) { return x + delim + y; }); -} - -template -static inline std::string TransformString(std::string s, F f) -{ - std::transform(s.begin(), s.end(), s.begin(), f); - return s; -} - -inline std::string ToUpper(std::string s) { return TransformString(std::move(s), ::toupper); } - -inline bool StartsWith(const std::string& value, const std::string& prefix) -{ - if(prefix.size() > value.size()) - return false; - else - return std::equal(prefix.begin(), prefix.end(), value.begin()); -} - -inline std::string RemovePrefix(std::string s, std::string prefix) -{ - if(StartsWith(s, prefix)) - return s.substr(prefix.length()); - else - return s; -} - -inline std::vector SplitSpaceSeparated(const std::string& in) -{ - std::istringstream ss(in); - const std::istream_iterator begin(ss), end; - return {begin, end}; -} - -inline std::vector SplitSpaceSeparated(const std::vector& in) -{ - std::vector rv; - for(const auto& item : in) - { - if(item.find(' ') != std::string::npos) - { - const auto splitted = SplitSpaceSeparated(item); - std::copy(splitted.begin(), splitted.end(), std::back_inserter(rv)); - } - else - { - 
rv.emplace_back(item); - } - } - return rv; -} - -inline std::vector SplitSpaceSeparated(const std::string& in, - const std::vector& dontSplitAfter) -{ - std::vector rv; - std::istringstream ss(in); - std::string s; - while(ss >> s) - { - if(any_of(dontSplitAfter, [&](const auto& dont) { return dont == s; })) - { - std::string s2; - if(ss >> s2) - { - s += std::string(" ").append(s2); // Exactly one space is important. - rv.push_back(s); - continue; - } - MIOPEN_THROW("Error parsing string: '" + in + '\''); - } - rv.push_back(s); - } - return rv; -} - -inline std::vector SplitDelim(const std::string& in, const char delim) -{ - std::vector rv; - std::string token; - std::istringstream ss(in); - - while(std::getline(ss, token, delim)) - { - rv.push_back(token); - } - return rv; -} - -} // namespace miopen - -#endif // GUARD_MIOPEN_STRINGUTILS_HPP +// Forwarding header — implementation moved to common_utils. +#include diff --git a/projects/miopen/src/include/miopen/type_name.hpp b/projects/miopen/src/include/miopen/type_name.hpp index ac7fd2ff6017..4f4afd78def0 100644 --- a/projects/miopen/src/include/miopen/type_name.hpp +++ b/projects/miopen/src/include/miopen/type_name.hpp @@ -1,139 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017-2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#ifndef GUARD_TYPE_NAME_HPP -#define GUARD_TYPE_NAME_HPP - -#include -#include -#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) -#include -#endif - -namespace miopen { - -template -constexpr std::string_view type_name() -{ -#if defined(__clang__) || defined(__GNUC__) - // clang or gcc - constexpr auto full_name = std::string_view{__PRETTY_FUNCTION__}; -#elif defined(_MSC_VER) - // msvc - constexpr auto full_name = std::string_view{__FUNCSIG__}; -#endif - - // The substring with the data type name is located within the original string, between the - // prefix and the suffix, with the prefix always not at the beginning of the string and the - // suffix always at the end of the string. 
-#if defined(__clang__) - // clang - constexpr auto prefix = std::string_view{"[T = "}; - constexpr auto suffix = std::string_view{"]"}; -#elif defined(__GNUC__) - // gcc - constexpr auto prefix = std::string_view{"[with T = "}; - constexpr auto suffix = std::string_view{"; std::string_view = std::basic_string_view]"}; -#elif defined(_MSC_VER) - // msvc - constexpr auto prefix = std::string_view{"type_name<"}; - constexpr auto suffix = std::string_view{">(void)"}; -#endif - - constexpr auto prefix_pos = full_name.find(prefix); - static_assert(prefix_pos != std::string_view::npos); - - constexpr auto suffix_pos = full_name.rfind(suffix); - static_assert(suffix_pos != std::string_view::npos); - static_assert(suffix_pos == full_name.size() - suffix.size()); - - constexpr auto pos = prefix_pos + prefix.size(); - static_assert(pos < suffix_pos); - constexpr auto count = suffix_pos - pos; - - constexpr auto name = full_name.substr(pos, count); - -#if defined(__clang__) || defined(__GNUC__) - // clang or gcc - return name; -#elif defined(_MSC_VER) - // msvc - if constexpr(std::is_compound_v) - { - // For compound data types, the string contains the keyword 'class/struct/union/enum' before - // the data type name, separated by a space. 
- constexpr auto sep = std::string_view{" "}; - constexpr auto sep_pos = name.find(sep); - static_assert(sep_pos != std::string_view::npos); - static_assert(sep_pos != 0); // must not be at the 0 position - - constexpr auto name_pos = sep_pos + sep.size(); - constexpr auto tname = name.substr(name_pos); - static_assert(tname.size() > 0); - - return tname; - } - else - { - return name; - } -#endif -} - -template -constexpr std::string_view type_name_bare() -{ - constexpr auto name = type_name(); - constexpr auto pos = name.rfind(':'); - if constexpr(pos == std::string_view::npos) - { - constexpr auto result = name; - return result; - } - else - { - constexpr auto bare_name = name.substr(pos + 1); - static_assert(bare_name.size() > 0); - return bare_name; - } -} - -template -const std::string& get_type_name() -{ - static const auto ret = std::string(type_name()); - return ret; -} - -template -const std::string& get_type_name(const T&) -{ - return miopen::get_type_name(); -} - -} // namespace miopen - -#endif +// Forwarding header — implementation moved to common_utils. 
+#include diff --git a/projects/miopen/test/CMakeLists.txt b/projects/miopen/test/CMakeLists.txt index 57601d45ceaf..035f1314fc63 100755 --- a/projects/miopen/test/CMakeLists.txt +++ b/projects/miopen/test/CMakeLists.txt @@ -414,9 +414,9 @@ function(add_test_executable TEST_NAME) endif() # MIOpen_with_plugins ensures CK plugin .so's are built alongside the test if(NOT MIOPEN_EMBED_DB STREQUAL "") - target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_data miopen_common_utils miopen_utils) else() - target_link_libraries(${TEST_NAME} MIOpen_with_plugins) + target_link_libraries(${TEST_NAME} MIOpen_with_plugins miopen_common_utils miopen_utils) endif() target_include_directories(${TEST_NAME} PRIVATE ../src/kernels) if(WIN32) diff --git a/projects/miopen/test/cpu_bias.hpp b/projects/miopen/test/cpu_bias.hpp index 9b0c2578feef..2abbcccde0da 100644 --- a/projects/miopen/test/cpu_bias.hpp +++ b/projects/miopen/test/cpu_bias.hpp @@ -1,141 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_CPU_BIAS_HPP -#define GUARD_CPU_BIAS_HPP - -#include "test.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tensor_holder.hpp" -#include -#include - -template -void cpu_bias_forward_impl(tensor& out, const tensor& bias) -{ - assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); - assert( - bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[1] && - std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { - return v == 1; - })); - - out.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { - out(out_n_id, out_k_id, out_spatial_id_pack...) 
= - double(out(out_n_id, out_k_id, out_spatial_id_pack...)) + double(bias.data[out_k_id]); - }); -} - -template -void cpu_bias_backward_data_impl(const tensor& out, tensor& bias) -{ - assert(out.desc.GetNumDims() == NSpatialDim + 2 and bias.desc.GetNumDims() == NSpatialDim + 2); - assert( - bias.desc.GetLengths()[0] == 1 && bias.desc.GetLengths()[1] == out.desc.GetLengths()[0] && - std::all_of(bias.desc.GetLengths().begin() + 2, bias.desc.GetLengths().end(), [](auto v) { - return v == 1; - })); - - std::size_t out_n_len = out.desc.GetLengths()[0]; - std::size_t out_k_len = out.desc.GetLengths()[1]; - - std::array out_spatial_len{}; - std::copy_n(out.desc.GetLengths().begin() + 2, NSpatialDim, out_spatial_len.begin()); - - miopen::par_ford(out_k_len)([&](auto out_k_id) { - auto ford_out_n_spatial = - miopen::unpacker(miopen::prepender(miopen::ford, out_n_len))(out_spatial_len); - - double acc = 0; - ford_out_n_spatial([&](auto out_n_id, auto... out_spatial_id_pack) { - acc += double(out(out_n_id, out_k_id, out_spatial_id_pack...)); - }); - - bias.data[out_k_id] = acc; - }); -} - -template -void cpu_bias_forward(tensor& out, const tensor& bias) -{ - switch(out.desc.GetNumDims()) - { - case 3: { - cpu_bias_forward_impl<1>(out, bias); - break; - } - case 4: { - cpu_bias_forward_impl<2>(out, bias); - break; - } - case 5: { - cpu_bias_forward_impl<3>(out, bias); - break; - } - case 6: { - cpu_bias_forward_impl<4>(out, bias); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template -void cpu_bias_backward_data(const tensor& out, tensor& bias) -{ - switch(out.desc.GetNumDims()) - { - case 3: { - cpu_bias_backward_data_impl<1>(out, bias); - break; - } - case 4: { - cpu_bias_backward_data_impl<2>(out, bias); - break; - } - case 5: { - cpu_bias_backward_data_impl<3>(out, bias); - break; - } - case 6: { - cpu_bias_backward_data_impl<4>(out, bias); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} -#endif +// 
Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/cpu_conv.hpp b/projects/miopen/test/cpu_conv.hpp index 895262311b12..818e215c45e2 100644 --- a/projects/miopen/test/cpu_conv.hpp +++ b/projects/miopen/test/cpu_conv.hpp @@ -1,515 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2019 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_CPU_CONV_HPP -#define GUARD_CPU_CONV_HPP - -#include "test.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tensor_holder.hpp" -#include -#include -#include - -template -static constexpr auto make_array(T x, Ts... 
xs) -{ - return std::array{{x, xs...}}; -} - -template -struct PassThru -{ - T operator()(T t) { return t; } -}; - -template -struct cpu_convolution_acc_type -{ - using type = double; // default using double as accumulator -}; - -template <> -struct cpu_convolution_acc_type -{ - using type = int32_t; -}; - -template <> -struct cpu_convolution_acc_type -{ - using type = double; -}; - -template -void cpu_convolution_forward_impl(const tensor& in, - const tensor& wei, - tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FW fw = {}) -{ - static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - std::size_t out_n_len = out.desc.GetLengths()[0]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t vector_len = in.desc.GetVectorLength(); - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - if(wei.desc.GetLayout_str() == "CHWNc") - { - wei_c_len = wei.desc.GetLengths()[0]; - std::copy_n(wei.desc.GetLengths().begin() + 1, ConvDim, wei_spatial_len.begin()); - wei_k_len = wei.desc.GetLengths()[3]; - } - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - // f(x0, x1, xs...) - // f1(xs...) = f(x0, x1, xs...) - // f2(xs_array) = f1(xs...) 
- auto par_ford_out_nk_spatial = miopen::unpacker( - miopen::prepender(miopen::par_ford, out_n_len, wei_k_len))(out_spatial_len); - - par_ford_out_nk_spatial([&](std::size_t out_n_id, - std::size_t out_k_id, - auto... out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); - - std::size_t group_id = out_k_id / wei_k_len_per_group; - Tacc acc = 0; - - miopen::ford(wei_c_len)([&](std::size_t wei_c_id) { - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - - auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); - - ford_wei_spatial([&](auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::array in_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = - out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; - } - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or - (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); - } - if(!out_of_bound) - { - if(vector_len > 1) - { - std::array in_id{}; - in_id[1] = out_n_id; - in_id[2] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 3); - for(std::size_t i = 0; i < vector_len; i++) - { - in_id[0] = i; - acc += Tacc(in(in_id)) * - Tacc(wei(i, out_k_id, wei_c_id, wei_spatial_id_pack...)); - } - } - else - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - Tacc tmp1 = static_cast(fi(in(in_id))); - Tacc tmp2 = - static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); - acc += tmp1 * tmp2; - } - } - }); - }); - if(vector_len > 1) - { - out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = - static_cast(acc); - } - else - { - out(out_n_id, out_k_id, out_spatial_id_pack...) 
= static_cast(acc); - } - }); -} - -template -void cpu_convolution_backward_data_impl(tensor& in, - const tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FW fw = {}, - FO fo = {}) -{ - static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - - std::size_t in_n_len = in.desc.GetLengths()[0]; - std::size_t in_c_len = in.desc.GetLengths()[1]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - auto par_ford_in_nc_spatial = - miopen::unpacker(miopen::prepender(miopen::par_ford, in_n_len, in_c_len))(in_spatial_len); - - par_ford_in_nc_spatial( - [&](std::size_t in_n_id, std::size_t in_c_id, auto... in_spatial_id_pack) { - auto in_spatial_id = make_array(in_spatial_id_pack...); - - std::size_t group_id = in_c_id / wei_c_len; - - Tacc acc = 0; - - miopen::ford(wei_k_len_per_group)([&](std::size_t wei_k_id_inside_group) { - auto ford_wei_spatial = miopen::unpacker(miopen::ford)(wei_spatial_len); - - ford_wei_spatial([&](auto... 
wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::array out_spatial_id_{}; - std::array out_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_spatial_id_[i] = - pads[i] + in_spatial_id[i] - wei_spatial_id[i] * dilations[i]; - out_spatial_id[i] = out_spatial_id_[i] / strides[i]; - } - - bool use = true; - for(std::size_t i = 0; i < ConvDim; ++i) - { - use &= out_spatial_id_[i] % strides[i] == 0 and out_spatial_id[i] >= 0 and - out_spatial_id[i] < out_spatial_len[i]; - } - - if(use) - { - std::size_t out_k_id = - group_id * wei_k_len_per_group + wei_k_id_inside_group; - std::size_t wei_c_id = in_c_id % wei_c_len; - - std::array out_id{}; - out_id[0] = in_n_id; - out_id[1] = out_k_id; - std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); - Tacc tmp1 = fo(out(out_id)); - Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); - acc += tmp1 * tmp2; - } - }); - }); - // TODO: Why do we need a no-lint here ? - in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT - }); -} - -template -void cpu_convolution_backward_weight_impl(const tensor& in, - tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi, - FO fo) -{ - static_assert(ConvDim > 0, "wrong! 
convolution dim should be larger than 0"); - assert(in.desc.GetNumDims() == ConvDim + 2 and wei.desc.GetNumDims() == ConvDim + 2 and - out.desc.GetNumDims() == ConvDim + 2 and pads.size() == ConvDim and - strides.size() == ConvDim and dilations.size() == ConvDim); - - std::size_t out_n_len = out.desc.GetLengths()[0]; - - std::size_t wei_k_len = wei.desc.GetLengths()[0]; - std::size_t wei_c_len = wei.desc.GetLengths()[1]; - - std::size_t wei_k_len_per_group = wei_k_len / group_count; - - std::array in_spatial_len{}; - std::array wei_spatial_len{}; - std::array out_spatial_len{}; - - std::copy_n(in.desc.GetLengths().begin() + 2, ConvDim, in_spatial_len.begin()); - std::copy_n(wei.desc.GetLengths().begin() + 2, ConvDim, wei_spatial_len.begin()); - std::copy_n(out.desc.GetLengths().begin() + 2, ConvDim, out_spatial_len.begin()); - - auto par_ford_wei_kc_spatial = miopen::unpacker( - miopen::prepender(miopen::par_ford, wei_k_len, wei_c_len))(wei_spatial_len); - - par_ford_wei_kc_spatial( - [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); - - std::size_t group_id = wei_k_id / wei_k_len_per_group; - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - - Tacc acc = 0; - - miopen::ford(out_n_len)([&](std::size_t out_n_id) { - auto ford_out_spatial = miopen::unpacker(miopen::ford)(out_spatial_len); - - ford_out_spatial([&](auto... 
out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); - - std::array in_spatial_id{}; - - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = out_spatial_id[i] * strides[i] + - wei_spatial_id[i] * dilations[i] - pads[i]; - } - - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or - in_spatial_id[i] >= in_spatial_len[i]); - } - - if(!out_of_bound) - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - Tacc tmp1 = fi(in(in_id)); - Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); - acc += tmp1 * tmp2; - } - }); - - wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); - }); - }); -} - -template , - typename FW = PassThru> -void cpu_convolution_forward(std::size_t spatial_dim, - const tensor& in, - const tensor& wei, - tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FW fw = {}) -{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_forward_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 2: { - cpu_convolution_forward_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 3: { - cpu_convolution_forward_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - case 4: { - cpu_convolution_forward_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fw); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template , - typename FO = PassThru> -void cpu_convolution_backward_data(std::size_t spatial_dim, - tensor& in, - const tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FW fw = {}, - FO fo = {}) 
-{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_backward_data_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 2: { - cpu_convolution_backward_data_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 3: { - cpu_convolution_backward_data_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - case 4: { - cpu_convolution_backward_data_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fw, fo); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} - -template , - typename FO = PassThru> -void cpu_convolution_backward_weight(std::size_t spatial_dim, - const tensor& in, - tensor& wei, - const tensor& out, - const Range& pads, - const Range& strides, - const Range& dilations, - std::size_t group_count, - FI fi = {}, - FO fo = {}) -{ - switch(spatial_dim) - { - case 1: { - cpu_convolution_backward_weight_impl<1, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 2: { - cpu_convolution_backward_weight_impl<2, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 3: { - cpu_convolution_backward_weight_impl<3, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - case 4: { - cpu_convolution_backward_weight_impl<4, Tacc>( - in, wei, out, pads, strides, dilations, group_count, fi, fo); - break; - } - default: { - MIOPEN_THROW("not belong to any case"); - } - } -} -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/cpu_layernorm.hpp b/projects/miopen/test/cpu_layernorm.hpp index 8b5bf965deab..a9f7b139484c 100644 --- a/projects/miopen/test/cpu_layernorm.hpp +++ b/projects/miopen/test/cpu_layernorm.hpp @@ -1,216 +1,2 @@ -// Copyright © Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT -#ifndef GUARD_CPU_CONV_HPP -#define GUARD_CPU_CONV_HPP - -#include <../test/tensor_holder.hpp> - -template -void cpu_layernorm_forward(tensor input, - tensor weight, - tensor bias, - tensor& ref_output, - tensor& ref_mean, - tensor& ref_rstd, - float eps, - int32_t dim, - miopenNormMode_t mode, - bool use_multithread = false) -{ - auto layout = input.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && layout.has_value() && - (layout.value() == miopenTensorNHWC || layout.value() == miopenTensorNDHWC)) - { - stride = input.desc.GetLengths()[1]; // stride = C - } - - auto dims = input.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : outer_size; - miopen::par_for(outer_size, min_grain, [&](int32_t o) { - miopen::ford(stride)([&](int32_t s) { - double mean_v = 0.0; - double var_v = 0.0; - - miopen::ford(inner_size)([&](int32_t i) { - double tmp = static_cast(input[o * inner_size * stride + i * stride + s]); - mean_v += tmp; - var_v += tmp * tmp; - }); - - mean_v = mean_v / inner_size; - var_v = var_v / inner_size - mean_v * mean_v; - double rstd_v = 1.0 / sqrt(var_v + eps); - - ref_mean[o * stride + s] = static_cast(mean_v); - ref_rstd[o * stride + s] = static_cast(rstd_v); - - miopen::ford(inner_size)([&](int32_t i) { - double weight_v = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double bias_v = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 
0.0 : static_cast(bias[i]); - - ref_output[o * inner_size * stride + i * stride + s] = static_cast( - (static_cast(input[o * inner_size * stride + i * stride + s]) - - mean_v) * - rstd_v * weight_v + - bias_v); - }); - }); - }); -} - -template -void cpu_layernorm_backward(tensor dy, - tensor x, - tensor weight, - tensor mean, - tensor rstd, - tensor& ref_dx, - int32_t dim, - miopenNormMode_t mode, - bool use_multithread = false) -{ - auto layout = dy.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) - { - stride = dy.desc.GetLengths()[1]; // stride = C - } - - auto dims = dy.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : outer_size; - miopen::par_for(outer_size, min_grain, [&](int32_t o) { - miopen::ford(stride)([&](int32_t s) { - double sum_dy_weight = 0.0; - double sum_dy_weight_x = 0.0; - - miopen::ford(inner_size)([&](int32_t i) { - double pweight = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double pdy = (dy.GetSize() != 0) - ? static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0.0; - double px = static_cast(x[o * inner_size * stride + i * stride + s]); - - sum_dy_weight += pdy * pweight; - sum_dy_weight_x += pdy * px * pweight; - }); - - double scale = 1.0 / static_cast(inner_size); - double prstd = static_cast(rstd[o * stride + s]); - double pmean = static_cast(mean[o * stride + s]); - double a = prstd * prstd * prstd * scale * (sum_dy_weight_x - sum_dy_weight * pmean); - double b = prstd * sum_dy_weight * scale - a * pmean; - - miopen::ford(inner_size)([&](int32_t i) { - double pweight = - (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1.0 : static_cast(weight[i]); - double pdy = (dy.GetSize() != 0) - ? 
static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0.0; - double val = prstd * pdy * pweight - - a * static_cast(x[o * inner_size * stride + i * stride + s]) - - b; - - ref_dx[o * inner_size * stride + i * stride + s] = static_cast(val); - }); - }); - }); -} - -template -void cpu_layernorm_backward_weight_bias(tensor dy, - tensor x, - tensor mean, - tensor rstd, - tensor& ref_dw, - tensor& ref_db, - int32_t dim, - bool use_multithread = false) -{ - auto layout = dy.desc.GetLayoutEnum(); - size_t stride = 1; - if(dim > 1 && (layout == miopenTensorNHWC || layout == miopenTensorNDHWC)) - { - stride = dy.desc.GetLengths()[1]; // stride = C - } - - auto dims = dy.desc.GetLengths(); - size_t outer_size = 1; - size_t inner_size = 1; - for(size_t i = 0; i < dims.size(); ++i) - { - if(i < dim) - { - if(!(stride > 1 && i == 1)) - { - outer_size *= dims[i]; - } - } - else - { - inner_size *= dims[i]; - } - } - - size_t min_grain = use_multithread ? 8 : inner_size; - miopen::par_for(inner_size, min_grain, [&](int32_t i) { - double sum_dw = 0.0; - double sum_db = 0.0; - - miopen::ford(stride)([&](int32_t s) { - miopen::ford(outer_size)([&](int32_t o) { - double prstd = static_cast(rstd[o * stride + s]); - double pmean = static_cast(mean[o * stride + s]); - double pdy = (dy.GetSize() != 0) - ? static_cast(dy[o * inner_size * stride + i * stride + s]) - : 0; - double px = static_cast(x[o * inner_size * stride + i * stride + s]); - - sum_dw += pdy * (px - pmean) * prstd; - sum_db += pdy; - }); - }); - - ref_dw[i] = sum_dw; - ref_db[i] = sum_db; - }); -} - -#endif +// Forwarding header — implementation moved to miopen_utils. 
+#include diff --git a/projects/miopen/test/cpu_reduce_util.hpp b/projects/miopen/test/cpu_reduce_util.hpp index 88728b02faec..401dd20b994b 100644 --- a/projects/miopen/test/cpu_reduce_util.hpp +++ b/projects/miopen/test/cpu_reduce_util.hpp @@ -1,649 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_CPU_REDUCE_UTIL_HPP -#define GUARD_CPU_REDUCE_UTIL_HPP - -#include "miopen/reducetensor.hpp" -#include "tensor_holder.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace reduce { - -template -static inline bool float_equal_one(T); - -static inline bool float_equal_one(float x) { return x == 1.0f; }; - -static inline bool float_equal_one(double x) { return x == 1.0; }; - -static inline bool float_equal_one(half_float::half x) -{ - return x == convert_type(1.0f); -}; - -template -static inline bool float_equal_zero(T x); - -static inline bool float_equal_zero(float x) { return x == 0.0f; }; - -static inline bool float_equal_zero(double x) { return x == 0.0; }; - -static inline bool float_equal_zero(half_float::half x) -{ - return x == convert_type(0.0f); -}; - -template -static inline void build_radix(const std::vector& lens, std::vector& radix) -{ - const std::size_t D = lens.size(); - radix.assign(D, 1); - for(std::size_t d = D; d-- > 1;) - radix[d - 1] = radix[d] * static_cast(lens[d]); // radix[d] = Π_{k>d} lens[k] -} - -// i -> memory offset using lens-radix + actual strides -template -static inline std::size_t linear_to_offset_by_lens_strides(std::size_t i, - const std::vector& lens, - const std::vector& radix, - const std::vector& strides) -{ - std::size_t off = 0; - for(std::size_t d = 0; d < lens.size(); ++d) - { - const std::size_t idx_d = (i / radix[d]) % static_cast(lens[d]); - off += idx_d * static_cast(strides[d]); - } - return off; -} - -template -static inline std::function PreUnaryOpFn(miopenReduceTensorOp_t op_, std::size_t) -{ - using std::abs; - - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_NORM1: return ([&](compType& a_) { a_ = abs(a_); }); - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = a_ * a_; }); - case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType& a_) { 
a_ = abs(a_); }); - - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_MIN: - case MIOPEN_REDUCE_TENSOR_MAX: return ([&](compType&) {}); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function PosUnaryOpFn(miopenReduceTensorOp_t op_, - std::size_t divider) -{ - using std::sqrt; - - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_) { a_ = sqrt(a_); }); - - case MIOPEN_REDUCE_TENSOR_AVG: - return ([&, divider](compType& a_) { - a_ = a_ / convert_type(static_cast(divider)); - }); - - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_MIN: - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: return ([&](compType&) {}); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function ReduceOpFn(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); - - case MIOPEN_REDUCE_TENSOR_MUL: return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); - - case MIOPEN_REDUCE_TENSOR_MIN: - return ([&](compType& a_, compType b_) { - if(a_ > b_) - a_ = b_; - }); - - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: - return ([&](compType& a_, compType b_) { - if(a_ < b_) - a_ = b_; - }); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline std::function -ReduceOpFn2(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_MIN: - return ([&](compType& a_, compType b_, 
bool& changed) { - if(a_ > b_) - { - a_ = b_; - changed = true; - } - else - { - changed = false; - } - }); - - case MIOPEN_REDUCE_TENSOR_MAX: - case MIOPEN_REDUCE_TENSOR_AMAX: - return ([&](compType& a_, compType b_, bool& changed) { - if(a_ < b_) - { - a_ = b_; - changed = true; - } - else - { - changed = false; - } - }); - - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_MUL: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return (std::function{}); - }; - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline compType ReduceOpZeroVal(miopenReduceTensorOp_t op_) -{ - switch(op_) - { - case MIOPEN_REDUCE_TENSOR_ADD: - case MIOPEN_REDUCE_TENSOR_AVG: - case MIOPEN_REDUCE_TENSOR_NORM1: - case MIOPEN_REDUCE_TENSOR_NORM2: return (convert_type(0.0f)); - - case MIOPEN_REDUCE_TENSOR_MUL: return (convert_type(1.0f)); - - case MIOPEN_REDUCE_TENSOR_MIN: return (std::numeric_limits::max()); - - case MIOPEN_REDUCE_TENSOR_MAX: return (std::numeric_limits::lowest()); - case MIOPEN_REDUCE_TENSOR_AMAX: return (convert_type(0.0f)); - } - - throw std::runtime_error(std::string(__FUNCTION__) + - ": using undefined Reduction operation is not permitted"); -}; - -template -static inline void binop_with_nan_check(miopenNanPropagation_t nanOpt, - reduceOpT&& opReduce, - compType& accuVal, - compType currVal) -{ - using std::isnan; - - if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) - { - opReduce(accuVal, currVal); - } - else - { - if(isnan(currVal)) - accuVal = currVal; - else - opReduce(accuVal, currVal); - }; -}; - -template -static inline void binop_with_nan_check2(miopenNanPropagation_t nanOpt, - reduceOpT&& opReduce, - compType& accuVal, - compType currVal, - int& accuIndex, - int currIndex) -{ - using std::isnan; - - if(nanOpt == MIOPEN_NOT_PROPAGATE_NAN) - { - bool changed; - - opReduce(accuVal, currVal, changed); - - if(changed) - 
accuIndex = currIndex; - } - else - { - if(isnan(currVal)) - { - accuVal = currVal; - accuIndex = currIndex; - } - else - { - bool changed; - - opReduce(accuVal, currVal, changed); - - if(changed) - accuIndex = currIndex; - }; - }; -}; - -}; // end of namespace reduce - -template -std::vector> get_all_indexes(const std::vector& lens) -{ - const std::size_t D = lens.size(); - assert(D > 0); - - std::size_t N = 1; - for(const auto L : lens) - N *= static_cast(L); - - std::vector> out; - out.resize(N); - for(auto& row : out) - row.resize(D); - - std::vector stride(D, 1); - for(std::size_t d = D; d-- > 1;) - stride[d - 1] = stride[d] * static_cast(lens[d]); - - for(std::size_t r = 0; r < N; ++r) - { - for(std::size_t d = 0; d < D; ++d) - out[r][d] = static_cast((r / stride[d]) % static_cast(lens[d])); - } - - return out; -} - -template -static inline T -linear_to_offset(size_t li, const std::vector& lens, const std::vector& strides) -{ - T off = 0; - for(int d = int(lens.size()) - 1; d >= 0; --d) - { - const T idx = li % lens[d]; - li /= lens[d]; - off += idx * strides[d]; - } - return off; -} - -template -T get_offset_from_index(const std::vector& strides, const std::vector& index) -{ - T offset = 0; - - assert(strides.size() == index.size()); - - for(int i = 0; i < index.size(); i++) - offset += strides[i] * index[i]; - - return (offset); -}; - -template -T get_flatten_offset(const std::vector& lengths, const std::vector& index) -{ - T offset = 0; - - assert(lengths.size() == index.size() && !lengths.empty()); - - int len = lengths.size(); - T stride = 1; - - // for len==1, the loop is not executed - for(int i = len - 1; i > 0; i--) - { - offset += stride * index[i]; - - stride *= lengths[i]; - }; - - offset += stride * index[0]; - - return (offset); -}; - -template -struct Reducer -{ - compType acc; - bool withIdx; - int idx; // meaningful only when WithIdx==true - miopenNanPropagation_t nanOpt; - // functors for reduction - 
decltype(reduce::ReduceOpFn(MIOPEN_REDUCE_TENSOR_ADD)) opNoIdx; - decltype(reduce::ReduceOpFn2(MIOPEN_REDUCE_TENSOR_ADD)) opWithIdx; - - Reducer(miopenNanPropagation_t n, miopenReduceTensorOp_t rop, compType zero, bool useIdx) - : acc(zero), - withIdx(useIdx), - idx(0), - nanOpt(n), - opNoIdx(reduce::ReduceOpFn(rop)), - opWithIdx(reduce::ReduceOpFn2(rop)) - { - } - - inline void step(compType v, int flat_i) - { - if(withIdx) - reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, v, idx, flat_i); - else - reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, v); - } - - inline void combine(const Reducer& other) - { - if(withIdx) - reduce::binop_with_nan_check2(nanOpt, opWithIdx, acc, other.acc, idx, other.idx); - else - reduce::binop_with_nan_check(nanOpt, opNoIdx, acc, other.acc); - } -}; - -template -std::tuple, tensor> reduce_cpu_common(const miopenReduceTensorOp_t& reduceOp, - const miopenNanPropagation_t& nanOpt, - const std::vector& inLengths, - const std::vector& outLengths, - const std::vector& input, - const std::vector& inStrides, - const std::vector& output, - const std::vector& outStrides, - float alpha, - float beta, - bool parallel, - bool withIdx) -{ - using reduce::convert_type; - using reduce::ReduceOpZeroVal; - - // Partition dims - std::vector invariantDims, toReduceDims; - std::vector invLens, redLens, invStrides_v, redStrides_v; - - for(int i = 0; i < static_cast(inLengths.size()); ++i) - { - if(inLengths[i] == outLengths[i]) - { - invariantDims.push_back(i); - invLens.push_back(inLengths[i]); - invStrides_v.push_back(inStrides[i]); - } - else - { - toReduceDims.push_back(i); - redLens.push_back(inLengths[i]); - redStrides_v.push_back(inStrides[i]); - } - } - - const bool reduceAllDims = invariantDims.empty(); - - // unary ops & zero vals - const compType zeroV = ReduceOpZeroVal(reduceOp); - - // divider = Π reduced dims (or N if reduce-all) - std::size_t divider = 1; - if(reduceAllDims) - divider = std::accumulate( - inLengths.begin(), 
inLengths.end(), std::size_t{1}, std::multiplies<>()); - else - divider = - std::accumulate(redLens.begin(), redLens.end(), std::size_t{1}, std::multiplies<>()); - - auto PreUnaryOp = reduce::PreUnaryOpFn(reduceOp, divider); - auto PosUnaryOp = reduce::PosUnaryOpFn(reduceOp, divider); - - // outputs - auto res = tensor{outLengths}; - res.data = output; - auto res_indices = tensor{outLengths}; - if(withIdx) - std::fill(res_indices.begin(), res_indices.end(), 0); - - if(reduceAllDims) - { - // Flatten whole tensor - const std::size_t N = divider; // product of all dims - std::vector lens_radix; - reduce::build_radix(inLengths, lens_radix); - - // parallel chunking - std::size_t hw = - std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); - const std::size_t P = std::min(N, hw * 4ul); - const std::size_t chunk = (N + P - 1) / P; - - std::vector> partial; - partial.reserve(P); - for(std::size_t p = 0; p < P; ++p) - partial.emplace_back(nanOpt, reduceOp, zeroV, withIdx); - - auto worker = [&](int p) { - const std::size_t begin = std::size_t(p) * chunk; - const std::size_t end = std::min(begin + chunk, N); - - auto& r = partial[p]; - for(std::size_t i = begin; i < end; ++i) - { - const auto off = - reduce::linear_to_offset_by_lens_strides(i, inLengths, lens_radix, inStrides); - auto v = convert_type(input[off]); - PreUnaryOp(v); - r.step(v, static_cast(i)); // flat index across whole tensor - } - }; - - if(parallel) - { - miopen::par_for(static_cast(P), worker); - } - else - { - for(int p = 0; p < P; ++p) - { - worker(p); - } - } - - // combine - Reducer R(nanOpt, reduceOp, zeroV, withIdx); - for(std::size_t p = 0; p < P; ++p) - R.combine(partial[p]); - - // post - PosUnaryOp(R.acc); - if(alpha != 1.0f) - R.acc *= convert_type(alpha); - if(beta != 0.0f) - R.acc += convert_type(output[0]) * convert_type(beta); - - res.data[0] = convert_type(R.acc); - if(withIdx) - res_indices.data[0] = R.idx; - } - else - { - // Build radices for invariant and 
reduced subspaces - std::vector invRad, redRad; - reduce::build_radix(invLens, invRad); - reduce::build_radix(redLens, redRad); - - const std::size_t INV = - std::accumulate(invLens.begin(), invLens.end(), std::size_t{1}, std::multiplies<>()); - const std::size_t TR = divider; - - std::size_t hw = - std::max(std::size_t{1}, static_cast(std::thread::hardware_concurrency())); - const std::size_t Te = std::min(hw * 4ul, std::max(1, INV)); - const std::size_t chunk = (INV + Te - 1) / Te; - - auto worker = [&](int t) { - const std::size_t row0 = std::size_t(t) * chunk; - const std::size_t row1 = std::min(row0 + chunk, INV); - - for(std::size_t r = row0; r < row1; ++r) - { - // decode invariant multi-index; compute base offsets - std::size_t tmp = r; - std::size_t base_in_off = 0; - std::size_t base_out_off = 0; - for(std::size_t k = 0; k < invLens.size(); ++k) - { - const std::size_t idx = (tmp / invRad[k]) % invLens[k]; - base_in_off += idx * invStrides_v[k]; - base_out_off += idx * outStrides[invariantDims[k]]; - } - - Reducer R(nanOpt, reduceOp, zeroV, withIdx); - - // iterate reduced subspace - for(std::size_t i = 0; i < TR; ++i) - { - std::size_t tmp2 = i; - std::size_t red_off = 0; - for(std::size_t k = 0; k < redLens.size(); ++k) - { - const std::size_t idx = (tmp2 / redRad[k]) % redLens[k]; - red_off += idx * redStrides_v[k]; - } - - auto v = convert_type(input[base_in_off + red_off]); - PreUnaryOp(v); - R.step(v, static_cast(i)); // flat index inside reduced subspace - } - - PosUnaryOp(R.acc); - if(alpha != 1.0f) - R.acc *= convert_type(alpha); - if(beta != 0.0f) - R.acc += - convert_type(output[base_out_off]) * convert_type(beta); - - res.data[base_out_off] = convert_type(R.acc); - if(withIdx) - res_indices.data[base_out_off] = R.idx; - } - }; - - if(parallel) - { - miopen::par_for(static_cast(Te), worker); - } - else - { - for(int te = 0; te < Te; ++te) - { - worker(te); - } - } - } - - return {res, res_indices}; -} - -template -std::tuple, tensor> 
-reduce_cpu_common(const miopen::ReduceTensorDescriptor& reduceDesc, - const tensor& input, - const tensor& output, - float alpha, - float beta, - bool parallel, - bool withIdx) -{ - auto inLengths = input.desc.GetLengths(); - auto outLengths = output.desc.GetLengths(); - auto inStrides = input.desc.GetStrides(); - auto outStrides = output.desc.GetStrides(); - - const auto reduceOp = reduceDesc.reduceTensorOp_; - const auto nanOpt = reduceDesc.reduceTensorNanOpt_; - - return reduce_cpu_common(reduceOp, - nanOpt, - inLengths, - outLengths, - input.data, - inStrides, - output.data, - outStrides, - alpha, - beta, - parallel, - withIdx); -} - -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/fusionHost.hpp b/projects/miopen/test/fusionHost.hpp index 9693295959d7..11c6d54f6257 100644 --- a/projects/miopen/test/fusionHost.hpp +++ b/projects/miopen/test/fusionHost.hpp @@ -1,994 +1,3 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2018 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +// Forwarding header — implementation moved to miopen_utils. +#include #include "get_handle.hpp" -#include "tensor_holder.hpp" -#include "verify.hpp" - -template -void convHostForward(const tensor& input, - tensor& output, - const tensor& weights, - const int bias_mode, - const tensor& bias, - const miopenConvolutionDescriptor_t convDesc) -{ - - int in_n, in_c, in_h, in_w; - int in_nstride, in_cstride, in_hstride, in_wstride; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(input.desc.GetLengths()); - std::tie(in_nstride, in_cstride, in_hstride, in_wstride) = - miopen::tien<4>(input.desc.GetStrides()); - - int wei_n, wei_c, wei_h, wei_w; - int wei_nstride, wei_cstride, wei_hstride, wei_wstride; - std::tie(wei_n, wei_c, wei_h, wei_w) = miopen::tien<4>(weights.desc.GetLengths()); - std::tie(wei_nstride, wei_cstride, wei_hstride, wei_wstride) = - miopen::tien<4>(weights.desc.GetStrides()); - - int out_n, out_c, out_h, out_w; - int out_nstride, out_cstride, out_hstride, out_wstride; - std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(output.desc.GetLengths()); - std::tie(out_nstride, out_cstride, out_hstride, out_wstride) = - miopen::tien<4>(output.desc.GetStrides()); - - int stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w; - miopenConvolutionMode_t mode; - miopenPaddingMode_t pmode = miopen::deref(convDesc).paddingMode; - miopenGetConvolutionDescriptor( - convDesc, &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilation_h, &dilation_w); - - if(pmode == 
miopenPaddingSame) - { - pad_h = (in_h % stride_h == 0) ? (std::max((wei_h - stride_h), 0)) - : (std::max((wei_h - (in_h % stride_h)), 0)); - pad_w = (in_w % stride_w == 0) ? (std::max((wei_w - stride_w), 0)) - : (std::max((wei_w - (in_w % stride_w)), 0)); - pad_h /= 2; - pad_w /= 2; - } - else if(pmode == miopenPaddingValid) - { - pad_h = 0; - pad_w = 0; - } - - if(out_h <= 0 || out_w <= 0) - MIOPEN_THROW("Invalid Test Case: Check Output Dimension."); - - for(int o = 0; o < out_n; o++) - { // mini-batch size - for(int w = 0; w < out_c; w++) - { // out_channels (num filters) - for(int i = 0; i < out_h; i++) - { // output_height (from getforwardoutputdim()) - int in_off_h = i * stride_h; - for(int j = 0; j < out_w; j++) - { // output_width (from getforwardoutputdim()) - /*auto acc = static_cast(0.);*/ - auto acc = static_cast(0.); - int in_off_w = j * stride_w; - for(int k = 0; k < in_c; k++) - { // in_channels (RGB) - for(int x = 0; x < wei_h; x++) - { - int in_x = in_off_h - pad_h + x * dilation_h; - if(in_x >= 0 && in_x < in_h) - { - for(int y = 0; y < wei_w; y++) - { - int in_y = in_off_w - pad_w + y * dilation_w; - if(in_y >= 0 && in_y < in_w) - { - acc += double( - static_cast(input[o * in_nstride + k * in_cstride + - in_x * in_w + in_y]) * - static_cast(weights(w, k, x, y))); - } - } - } - } - } - acc = bias_mode != 0 ? 
acc + static_cast(bias[w]) : acc; - output[o * out_nstride + w * out_cstride + i * out_hstride + j] = - static_cast(acc); - } - } - } - } -} - -template -void batchNormSpatialHostInference(const tensor& input, - tensor& output, - const tensor& scale, - const tensor& bias, - double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance, - bool useInverseVariance = false) -{ - - int n_batches, channels, height, width; - std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - miopen::par_for(channels, 1, [&](int cidx) { // via channel - V mean = estimatedMean(0, cidx, 0, 0); - V variance = estimatedVariance(0, cidx, 0, 0); - double invertVar = - useInverseVariance ? static_cast(variance) : 1.0 / sqrt(variance + epsilon); - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batches; bidx++) - { // via mini_batch - double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; - double inhat = elemStd * invertVar; - output(bidx, cidx, row, column) = - static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - } - } - } - }); -} - -template -void batchNormPerActivHostInference(const tensor& input, - tensor& output, - const tensor& scale, - const tensor& bias, - double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance, - bool useInverseVariance = false) -{ - int n_batches, channels, height, width; - std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - miopen::par_for(channels, 1, [&](int cidx) { // via channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // apply down the n_batch dimension - double mean = estimatedMean(0, cidx, row, column); - double 
variance = estimatedVariance(0, cidx, row, column); - double elemInvVar = useInverseVariance ? variance : 1.0 / sqrt(variance + epsilon); - for(int bidx = 0; bidx < n_batches; bidx++) - { // via mini_batch - // per (x-dims) channel load a block of data into LDS - double elemStd = input(bidx, cidx, row, column) - mean; - double inhat = elemStd * elemInvVar; - output(bidx, cidx, row, column) = - scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column); - // printf("output: %f\n",output(bidx, cidx, row, column)); - } - } - } - }); -} - -template -void batchNormSpatialHostFwdTrain(const tensor& input, - tensor& out, - const tensor& scale, - const tensor& bias, - double epsilon, - double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - const auto nhw = double(height * width * n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - double variance_accum = 0.; - double mean_accum = 0.; - double invVar = 0.; - double newRunMean = 0.; - double adjust = 0.; - - // process the batch per channel - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #1 calculate the mean - // iterating through the stack of images in the mini_batch - auto inval = static_cast(input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } // end for (column) - } // end for (row) - } // end for (n) - - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - invVar = 1.0 / sqrt(variance_accum + epsilon); - - // #4 apply the normalization - // x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; 
row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #5 Gamma and Beta adjust - // y_i = gamma*x_hat + beta - elemStd = (static_cast(input(bidx, cidx, row, column)) - - mean_accum); // (x_i - mean) - out(bidx, cidx, row, column) = static_cast( - scale(0, cidx, 0, 0) * (invVar * elemStd) + bias(0, cidx, 0, 0)); - } // for (column) - } // for (row) - } // end for(n_batchs) - if(!saveMean.data.empty()) - { - saveMean(0, cidx, 0, 0) = mean_accum; - saveInvVar(0, cidx, 0, 0) = invVar; - } - if(!runMean.data.empty()) - { - newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); - runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = (n_batch * height * width == 1) ? variance_accum - : (nhw / (nhw - 1)) * variance_accum; - runVar(0, cidx, 0, 0) = - (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; - } - }); -} - -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - tensor& dy_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& bnBias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar, - miopenActivationMode_t activ_mode, - double activ_beta, - double activ_alpha) -{ - double activ_gamma = 0.; - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - auto nhw = double(height * width * n_batch); - int in_cstride = height * width; - - if(activ_mode > 0) - { - tensor input_norm = - tensor{x_input.desc.GetLayout_t(), x_input.desc.GetLengths()}; - miopen::par_for(channels, 1, [&](int cidx) { - double mean = 0.0; - double invVar = 0.0; - double elemStd = 0.; - double mean_accum = 0.0; - double variance_accum = 0.0; - if(!savedMean.data.empty()) - { - mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements - invVar = static_cast(savedInvVar(0, cidx, 0, 0)); 
// HxW elements - } - else - { - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } - } - } - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - mean = mean_accum; - invVar = 1.0 / sqrt(variance_accum); - } - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - input_norm(bidx, cidx, row, column) = static_cast( - bnScale(0, cidx, 0, 0) * (elemStd * invVar) + bnBias(0, cidx, 0, 0)); - } - } - } - }); - - activationHostBnormBwd(activ_mode, - activ_gamma, - activ_beta, - activ_alpha, - dy_input.data, - input_norm.data, - dy_input.data); - } - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.0; - double invVar = 0.0; - double dyelem = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); - // process the batch per channel - dscale(0, cidx, 0, 0) = 0.; - dbias(0, cidx, 0, 0) = 0.; - - if(!savedMean.data.empty()) - { - - mean = savedMean(0, cidx, 0, 0); // HxW elements - invVar = savedInvVar(0, cidx, 0, 0); // HxW elements - } - else - { - double variance_accum = 0.; - double mean_accum = 0.; - double inv_Var = 0.; - - // process the batch per channel - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - // #1 calculate the mean - // iterating through the stack of images in the mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); 
- mean_accum += inval; - variance_accum += inval * inval; - } // end for (column) - } // end for (row) - } // end for (n) - - mean_accum /= nhw; - variance_accum /= nhw; - variance_accum += (-mean_accum * mean_accum); - inv_Var = 1.0 / sqrt(variance_accum); - - mean = mean_accum; - invVar = inv_Var; - } - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * invVar; - dyelem = static_cast(dy_input(bidx, cidx, row, column)); - dbias(0, cidx, 0, 0) += dyelem; - dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; - } // end for(n_batch) - } // for (column) - } // for (row) - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - - double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = - static_cast(tmp3 * (tmp2 + tmp1)); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); // for (channel) -} - -template -void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, 
width) = miopen::tien<4>(x_input.desc.GetLengths()); - auto nhw = double(height * width * n_batch); - int in_cstride = height * width; - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = static_cast(savedMean(0, cidx, 0, 0)); // HxW elements - double invVar = static_cast(savedInvVar(0, cidx, 0, 0)); // HxW elements - double dyelem = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); - // process the batch per channel - dscale(0, cidx, 0, 0) = 0.; - dbias(0, cidx, 0, 0) = 0.; - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - - // recompute forward batch norm - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * invVar; - double bnrefowd = - bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - dbias(0, cidx, 0, 0) += dyelem; - dscale(0, cidx, 0, 0) += xhat[xhat_index] * dyelem; - } // end for(n_batch) - } // for (column) - } // for (row) - - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - double bnrefowd = - bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); - 
double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); // for (channel) -} - -template -void batchNormPerActHostFwdTrain(const tensor& input, - tensor& out, - const tensor& scale, - const tensor& bias, - double epsilon, - double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - const auto n = double(n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double mean_accum = 0.; - double variance_accum = 0.; - double elemStd = 0.; - double elemInvVar = 0.; - double inhat = 0.; - double newRunMean = 0.; - double adjust = 0.; - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - - mean_accum = 0.; - variance_accum = 0.; - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - // #1 calculate the mean :: iterating through the stack of images in the - // mini_batch - auto intval = static_cast(input(bidx, cidx, row, column)); - mean_accum += intval; - variance_accum += intval * intval; - } - mean_accum /= n; - variance_accum /= n; - variance_accum = variance_accum - (mean_accum * mean_accum); - elemInvVar = 1.0 / double(sqrt(variance_accum + epsilon)); - - // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum - - // epsilon) - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) - inhat = elemStd * elemInvVar; - // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out(bidx, cidx, row, column) = static_cast( - 
scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); - } // end for(n_batch) - - if(!runMean.data.empty()) - { - newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); - runMean(0, cidx, row, column) = - mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - } - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - if(!runVar.data.empty()) - { - adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; - runVar(0, cidx, row, column) = - (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - } - if(!saveMean.data.empty() || !saveInvVar.data.empty()) - { - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); - } - - } // for (column) - } // for (row) - }); -} - -template -void batchNormPerActHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - int in_cstride = height * width; - auto n = double(n_batch); - - miopen::par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double elemInvVar = 0.; - double dyelem = 0.; - double dxhat = 0.; - double dxhathat = 0.; - double tmp1 = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride); - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - dxhat = 0.; - dxhathat = 0.; - - if(!savedMean.data.empty()) - { - mean = savedMean(0, cidx, row, column); // HxW elements - elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements - } - else - { - double variance_accum = 0.; - double mean_accum = 0.; - - // process the batch per channel - for(int bidx = 
0; bidx < n_batch; bidx++) - { // via mini_batch - auto inval = static_cast(x_input(bidx, cidx, row, column)); - mean_accum += inval; - variance_accum += inval * inval; - } // end for (n) - - mean_accum /= n; - variance_accum /= n; - variance_accum += (-mean_accum * mean_accum); - - mean = mean_accum; - elemInvVar = 1.0 / sqrt(variance_accum); - } - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * elemInvVar; - dyelem = static_cast(dy_input(bidx, cidx, row, column)); - dbias(0, cidx, row, column) += dyelem; - dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; - tmp1 = scale(0, cidx, row, column) * dyelem; - dxhat += tmp1; - dxhathat += tmp1 * xhat[xhat_index]; - - } // end for(n_batchs) - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - tmp1 = xhat[xhat_index] * dxhathat + dxhat; - double tmp2 = - n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - - tmp1; - double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); -} - -template -void batchNormActivPerActHostBwdTrain(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& scale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) -{ - - int height, width, n_batch, channels; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - int in_cstride = height * width; - auto n = double(n_batch); - - miopen::par_for(channels, 1, 
[&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double elemInvVar = 0.; - double dyelem = 0.; - double dxhat = 0.; - double dxhathat = 0.; - double tmp1 = 0.; - std::vector xhat(static_cast(n_batch) * in_cstride); - - // process the batch per channel - for(int row = 0; row < height; row++) - { // via rows - for(int column = 0; column < width; column++) - { // via columns - dxhat = 0.; - dxhathat = 0.; - - mean = savedMean(0, cidx, row, column); // HxW elements - elemInvVar = savedInvVar(0, cidx, row, column); // HxW elements - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - // per (x-dims) channel load a block of data into LDS - elemStd = static_cast(x_input(bidx, cidx, row, column)) - - mean; // (x_i - mean) - xhat[xhat_index] = elemStd * elemInvVar; - double bnrefowd = - scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - /*dyelem = static_cast(dy_input(bidx, cidx, row, column));*/ - dbias(0, cidx, row, column) += dyelem; - dscale(0, cidx, row, column) += xhat[xhat_index] * dyelem; - tmp1 = scale(0, cidx, row, column) * dyelem; - dxhat += tmp1; - dxhathat += tmp1 * xhat[xhat_index]; - - } // end for(n_batchs) - - for(int bidx = 0; bidx < n_batch; bidx++) - { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - tmp1 = xhat[xhat_index] * dxhathat + dxhat; - double bnrefowd = - scale(0, cidx, row, column) * xhat[xhat_index] + bias(0, cidx, row, column); - activationHostBwdElement(activMode, - gamma, - beta, - alpha, - dy_input(bidx, cidx, row, column), - bnrefowd, - y_input(bidx, cidx, row, column), - dyelem); - double tmp2 = (n_batch * scale(0, cidx, row, column) * dyelem) - tmp1; - double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, 
cidx, row, column) = static_cast(tmp3 * tmp2); - } // end for(n_batchs) - } // for (column) - } // for (row) - }); -} - -template -void visitActivationHostInfer( - miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) -{ - switch(activMode) - { - case miopenActivationPASTHRU: // x - f([=](double x) { return x; }); - break; - case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid - f([=](double x) { return (1. / (1. + std::exp(-x))); }); - break; - case miopenActivationTANH: // beta * tanh(alpha * x) - f([=](double x) { return (beta * std::tanh(alpha * x)); }); - break; - case miopenActivationRELU: // max(0, x) - f([=](double x) { return ((x > 0.) ? x : 0.); }); - break; - case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood - f([=](double x) { - return (x > 0.) ? (x + std::log1p(std::exp(-x))) : (std::log1p(std::exp(x))); - }); - break; - case miopenActivationABS: // abs(x) - f([=](double x) { return (std::fabs(x)); }); - break; - case miopenActivationPOWER: // (alpha + beta * x) ^ gamma - f([=](double x) { - auto v = (alpha + beta * x); - return (v <= std::numeric_limits::epsilon()) ? 0. : pow(v, gamma); - }); - break; - case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) - f([=](double x) { return (std::min(alpha, std::max(double(0.), x))); }); - break; - case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 - f([=](double x) { return ((x > 0.) ? x : x * alpha); }); - break; - case miopenActivationELU: // alpha * (exp(x)-1) | x<=0; x | x>0 - f([=](double x) { return ((x > 0.) ? 
x : alpha * std::expm1(x)); }); - break; - case miopenActivationCLAMP: // max(alpha, min(beta, x)) - f([=](double x) { return (std::max(alpha, std::min(beta, x))); }); - break; - // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; - } -} - -template -inline void activationHostInfer(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector input, - std::vector& output) -{ - visitActivationHostInfer(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(input.size(), 1, [&](int index) { - output[index] = static_cast(f(static_cast(input[index]))); - }); - }); -} - -template -void visitActivationHostBwd( - miopenActivationMode_t activMode, double gamma, double beta, double alpha, F f) -{ - switch(activMode) - { - case miopenActivationPASTHRU: // x - f([=](double dy, double, double) { return dy; }); - break; - case miopenActivationLOGISTIC: // 1 / (1 + e^-x) //Sigmoid - f([=](double dy, double, double y) { return dy * y * (1 - y); }); - break; - case miopenActivationTANH: // beta * tanh(alpha * x) - f([=](double dy, double, double y) { return dy * alpha * (beta - y * y / beta); }); - break; - case miopenActivationRELU: // max(0, x) - f([=](double dy, double x, double) { return (x > 0) ? dy : 0; }); - break; - case miopenActivationSOFTRELU: // log(1 + e^x) // bonomial normal log likelihood - f([=](double dy, double x, double) { - static const double threshold = 50.; - double expval = std::exp(std::min(x, threshold)); - return dy * expval / (expval + 1.0); - }); - break; - case miopenActivationABS: // abs(x) - f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : -1); }); - break; - case miopenActivationPOWER: // (alpha + beta * x) ^ gamma - f([=](double, double x, double y) { - auto v = alpha + beta * x; - return v <= std::numeric_limits::epsilon() ? 
0 : gamma * beta * y / v; - }); - break; - case miopenActivationCLIPPEDRELU: // min(alpha, max(0, x)) - f([=](double dy, double x, double) { return (x > 0 && x <= alpha) ? dy : 0; }); - break; - case miopenActivationLEAKYRELU: // alpha * x | x<=0; x | x>0 - f([=](double dy, double x, double) { return dy * ((x > 0) ? 1 : alpha); }); - break; - case miopenActivationELU: // alpah * (exp(x)-1) | x<=0; x | x>0 - f([=](double dy, double x, double y) { return dy * ((x > 0) ? 1 : y + alpha); }); - break; - case miopenActivationCLAMP: // max(alpha, min(beta, x)) - f([=](double dy, double x, double) { return (x > alpha && x <= beta) ? dy : 0; }); - break; - // default: printf("ERROR: unknown neuron type: %d\n", activMode); break; - } -} - -template -inline void activationHostBnormBwd(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector dyinput, - const std::vector xinput, - std::vector& output) -{ - double dummy; - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(dyinput.size(), 1, [&](int index) { - output[index] = static_cast( - f(static_cast(dyinput[index]), static_cast(xinput[index]), dummy)); - }); - }); -} - -template -inline void activationHostBwd(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const std::vector dyinput, - const std::vector xinput, - const std::vector yinput, - std::vector& output) -{ - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - miopen::par_for(dyinput.size(), 1, [&](int index) { - output[index] = static_cast(f(static_cast(dyinput[index]), - static_cast(xinput[index]), - static_cast(yinput[index]))); - }); - }); -} - -inline void activationHostBwdElement(miopenActivationMode_t activMode, - double gamma, - double beta, - double alpha, - const double dyinput, - const double xinput, - const double yinput, - double& output) -{ - visitActivationHostBwd(activMode, gamma, beta, alpha, [&](auto f) { - output = 
static_cast(f(dyinput, xinput, yinput)); - }); -} - -template -tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, - const tensor& input, - const tensor& weights) -{ - return tensor{filter.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{})}; -} diff --git a/projects/miopen/test/gemm.hpp b/projects/miopen/test/gemm.hpp index 81c38db0fdf3..be0195545352 100644 --- a/projects/miopen/test/gemm.hpp +++ b/projects/miopen/test/gemm.hpp @@ -1,120 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_GEMM_HPP -#define GUARD_GEMM_HPP - -#include -#include -#include - -/* - A and B rows and cols should be passed as default values (NxM, MxK), independently of - a_transponse/b_transpose flag value - C rows and cols should have correct values based on a_transponse/b_transpose values - A, B, C strides should have corret values based on a_transponse/b_transpose values -*/ -template -void gemm_cpu(const Dtype* a_ptr, - const size_t a_cols, - const size_t a_rows, - const size_t a_stride, - const bool a_transpose, - const Dtype* b_ptr, - const size_t b_cols, - const size_t b_rows, - const size_t b_stride, - const bool b_transpose, - Dtype* c_ptr, - const size_t c_cols, - const size_t c_rows, - const size_t c_stride, - double alpha = 1.0, - double beta = 1.0) -{ - if((!a_transpose && !b_transpose && - ((a_cols != b_rows) || (a_rows != c_rows) || (b_cols != c_cols))) || - (a_transpose && b_transpose && - ((a_rows != b_cols) || (a_cols != c_rows) || (b_rows != c_cols))) || - (a_transpose && !b_transpose && - ((a_rows != b_rows) || (a_cols != c_rows) || (b_cols != c_cols))) || - (!a_transpose && b_transpose && - ((a_cols != b_cols) || (a_rows != c_rows) || (b_rows != c_cols)))) - { - MIOPEN_THROW("MM_CPU_ERROR. Incompatible matrix size:\nA: " + std::to_string(a_rows) + "x" + - std::to_string(a_cols) + " transpose: " + (a_transpose ? "true" : "false") + - "\nB: " + std::to_string(b_rows) + "x" + std::to_string(b_cols) + - " transpose: " + (b_transpose ? "true" : "false") + - "\nC: " + std::to_string(c_rows) + "x" + std::to_string(c_cols) + "\n"); - } - - size_t inner_loop_limit = a_transpose ? 
a_rows : a_cols; - auto inner_loop = [&](int m, int n) { - double el = 0.0; - if(!a_transpose && !b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[m * a_stride + k]) * - static_cast(b_ptr[k * b_stride + n]); - }); - } - else if(!a_transpose && b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[m * a_stride + k]) * - static_cast(b_ptr[n * b_stride + k]); - }); - } - else if(a_transpose && !b_transpose) - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[k * a_stride + m]) * - static_cast(b_ptr[k * b_stride + n]); - }); - } - else - { - miopen::ford(inner_loop_limit)([&](int k) { - el += static_cast(a_ptr[k * a_stride + m]) * - static_cast(b_ptr[n * b_stride + k]); - }); - } - - c_ptr[m * c_stride + n] = - static_cast(beta * static_cast(c_ptr[m * c_stride + n]) + alpha * el); - }; - - constexpr size_t iter_margin = 1'048'576; // 2^20 - if(c_rows * c_cols * inner_loop_limit > iter_margin) - { - miopen::par_ford(c_rows, c_cols)(inner_loop); - } - else - { - miopen::ford(c_rows, c_cols)(inner_loop); - } -} - -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/gtest/CMakeLists.txt b/projects/miopen/test/gtest/CMakeLists.txt index af74113fa312..dfdb6ef4630e 100644 --- a/projects/miopen/test/gtest/CMakeLists.txt +++ b/projects/miopen/test/gtest/CMakeLists.txt @@ -81,7 +81,7 @@ function(add_gtest TEST_NAME TEST_CPP) # Workaround : change in rocm-cmake was causing linking error so had to add ${CMAKE_DL_LIBS} # We can remove ${CMAKE_DL_LIBS} once root cause is identified. 
# MIOpen_with_plugins ensures CK plugin .so's are built alongside the test - target_link_libraries(${TEST_NAME} ${CMAKE_DL_LIBS} GTest::gtest GTest::gtest_main GTest::gmock MIOpen_with_plugins hip::host ) + target_link_libraries(${TEST_NAME} ${CMAKE_DL_LIBS} GTest::gtest GTest::gtest_main GTest::gmock MIOpen_with_plugins hip::host miopen_common_utils miopen_utils) if(NOT MIOPEN_EMBED_DB STREQUAL "") target_link_libraries(${TEST_NAME} $) endif() @@ -211,7 +211,7 @@ endforeach() # Otherwise, all files in ${SOURCES} are rebuilt for each test. add_library(miopen_gtest_common STATIC ${SOURCES}) target_include_directories(miopen_gtest_common PRIVATE ../ ../../src/kernels) -target_link_libraries(miopen_gtest_common PRIVATE GTest::gtest GTest::gmock MIOpen) +target_link_libraries(miopen_gtest_common PRIVATE GTest::gtest GTest::gmock MIOpen miopen_common_utils miopen_utils) if(WIN32) # Refer to https://en.cppreference.com/w/cpp/language/types for details. target_compile_options(miopen_gtest_common PRIVATE $:-U__LP64__>>) diff --git a/projects/miopen/test/gtest/adam.hpp b/projects/miopen/test/gtest/adam.hpp index 0efd9b390765..e54ddd1fc85d 100644 --- a/projects/miopen/test/gtest/adam.hpp +++ b/projects/miopen/test/gtest/adam.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_adam.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/gtest/addlayernorm.hpp b/projects/miopen/test/gtest/addlayernorm.hpp index 0eba1588058d..511882710ff8 100644 --- a/projects/miopen/test/gtest/addlayernorm.hpp +++ b/projects/miopen/test/gtest/addlayernorm.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/cat.hpp 
b/projects/miopen/test/gtest/cat.hpp index 8d5fb109e0ea..bf29ccc7bcb0 100644 --- a/projects/miopen/test/gtest/cat.hpp +++ b/projects/miopen/test/gtest/cat.hpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: MIT #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_cat.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/gtest/conv3d_test_case.hpp b/projects/miopen/test/gtest/conv3d_test_case.hpp index a10c1809cacf..d9a061941703 100644 --- a/projects/miopen/test/gtest/conv3d_test_case.hpp +++ b/projects/miopen/test/gtest/conv3d_test_case.hpp @@ -30,7 +30,6 @@ #include "get_handle.hpp" #include -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "conv_test_base.hpp" diff --git a/projects/miopen/test/gtest/find_mode_trust_verify.cpp b/projects/miopen/test/gtest/find_mode_trust_verify.cpp index 021a593f3372..178b1edff149 100644 --- a/projects/miopen/test/gtest/find_mode_trust_verify.cpp +++ b/projects/miopen/test/gtest/find_mode_trust_verify.cpp @@ -26,7 +26,7 @@ #include #include -#include "../../driver/driver.hpp" +#include namespace miopen { std::vector diff --git a/projects/miopen/test/gtest/getitem.hpp b/projects/miopen/test/gtest/getitem.hpp index 22c98ca67b99..8889b1d3d457 100644 --- a/projects/miopen/test/gtest/getitem.hpp +++ b/projects/miopen/test/gtest/getitem.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/group_conv.hpp b/projects/miopen/test/gtest/group_conv.hpp index d9ab9e080898..8acdd56548e2 100644 --- a/projects/miopen/test/gtest/group_conv.hpp +++ b/projects/miopen/test/gtest/group_conv.hpp @@ -32,7 +32,6 @@ #include #include -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "gtest_common.hpp" diff --git 
a/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp b/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp index 3e141b72057e..7f9c62901733 100644 --- a/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp +++ b/projects/miopen/test/gtest/group_conv_deterministic_split_k.cpp @@ -30,7 +30,6 @@ #include #include "../random.hpp" #include "get_handle.hpp" -#include "../driver/tensor_driver.hpp" #include "conv_common.hpp" #include "gtest_common.hpp" diff --git a/projects/miopen/test/gtest/groupnorm.hpp b/projects/miopen/test/gtest/groupnorm.hpp index 33c4ed105f59..e28c5b652605 100644 --- a/projects/miopen/test/gtest/groupnorm.hpp +++ b/projects/miopen/test/gtest/groupnorm.hpp @@ -31,7 +31,6 @@ #include "cpu_groupnorm.hpp" #include "get_handle.hpp" #include "random.hpp" -#include "../driver/tensor_driver.hpp" #include "verify.hpp" #include diff --git a/projects/miopen/test/gtest/kernel_tuning_net.cpp b/projects/miopen/test/gtest/kernel_tuning_net.cpp index 304adb9800d4..760a099b2ef4 100644 --- a/projects/miopen/test/gtest/kernel_tuning_net.cpp +++ b/projects/miopen/test/gtest/kernel_tuning_net.cpp @@ -30,7 +30,7 @@ #include #include #include -#include "../../driver/driver.hpp" +#include struct KernelTuningNetTestCase : AIModelTestCase { diff --git a/projects/miopen/test/gtest/kthvalue.hpp b/projects/miopen/test/gtest/kthvalue.hpp index 2aa7e6fd41d1..58d7db388419 100644 --- a/projects/miopen/test/gtest/kthvalue.hpp +++ b/projects/miopen/test/gtest/kthvalue.hpp @@ -23,7 +23,6 @@ * SOFTWARE. 
* *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "cpu_kthvalue.hpp" #include "get_handle.hpp" diff --git a/projects/miopen/test/gtest/layout_transpose.cpp b/projects/miopen/test/gtest/layout_transpose.cpp index f67c7a0387de..b688d17b2aa7 100644 --- a/projects/miopen/test/gtest/layout_transpose.cpp +++ b/projects/miopen/test/gtest/layout_transpose.cpp @@ -25,7 +25,6 @@ *******************************************************************************/ #include -#include "../../driver/conv_common.hpp" #include #include #include @@ -38,6 +37,8 @@ #include +using float16 = half_float::half; + namespace { template diff --git a/projects/miopen/test/gtest/reducecalculation.hpp b/projects/miopen/test/gtest/reducecalculation.hpp index 2f2867423d5f..3b2de8465c0c 100644 --- a/projects/miopen/test/gtest/reducecalculation.hpp +++ b/projects/miopen/test/gtest/reducecalculation.hpp @@ -24,14 +24,13 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" +#include #include "../src/kernels/MIOpenReduceCalculation.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" #include "verify.hpp" #include -#include #include template diff --git a/projects/miopen/test/gtest/reduceextreme.hpp b/projects/miopen/test/gtest/reduceextreme.hpp index f884bb8fc5cf..0c2cde8c7564 100644 --- a/projects/miopen/test/gtest/reduceextreme.hpp +++ b/projects/miopen/test/gtest/reduceextreme.hpp @@ -24,7 +24,7 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" +#include #include "../src/kernels/MIOpenReduceExtreme.hpp" #include "get_handle.hpp" #include "random.hpp" @@ -32,7 +32,6 @@ #include "verify.hpp" #include #include -#include template bool compare_equal(T r1, T r2) diff --git a/projects/miopen/test/gtest/rope.hpp b/projects/miopen/test/gtest/rope.hpp index 
8c8dd2ed2b3d..109ff0549978 100644 --- a/projects/miopen/test/gtest/rope.hpp +++ b/projects/miopen/test/gtest/rope.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/softmax_find20.cpp b/projects/miopen/test/gtest/softmax_find20.cpp index 094a432d4521..84dd758f4d30 100644 --- a/projects/miopen/test/gtest/softmax_find20.cpp +++ b/projects/miopen/test/gtest/softmax_find20.cpp @@ -28,7 +28,7 @@ #include "test.hpp" #include "get_handle.hpp" #include "tensor_holder.hpp" -#include "../driver/mloSoftmaxHost.hpp" +#include #include "verify.hpp" #include diff --git a/projects/miopen/test/gtest/t5layernorm.hpp b/projects/miopen/test/gtest/t5layernorm.hpp index 1ee2f2bd6ebe..e71819273683 100644 --- a/projects/miopen/test/gtest/t5layernorm.hpp +++ b/projects/miopen/test/gtest/t5layernorm.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "get_handle.hpp" #include "random.hpp" #include "tensor_holder.hpp" diff --git a/projects/miopen/test/gtest/transformers_adam_w.hpp b/projects/miopen/test/gtest/transformers_adam_w.hpp index d2a804841258..ef465fc98854 100644 --- a/projects/miopen/test/gtest/transformers_adam_w.hpp +++ b/projects/miopen/test/gtest/transformers_adam_w.hpp @@ -24,7 +24,6 @@ * *******************************************************************************/ #define MIOPEN_BETA_API 1 -#include "../driver/tensor_driver.hpp" #include "cpu_transformers_adam_w.hpp" #include "get_handle.hpp" #include "random.hpp" diff --git a/projects/miopen/test/network_data.hpp b/projects/miopen/test/network_data.hpp index 987d4dda9929..18e85973ef3f 100644 --- a/projects/miopen/test/network_data.hpp +++ b/projects/miopen/test/network_data.hpp @@ -1,438 +1,2 @@ 
-/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#ifndef GUARD_MIOPEN_TEST_NETWORK_DATA_HPP -#define GUARD_MIOPEN_TEST_NETWORK_DATA_HPP - -#include -#include -#include -#include - -#ifndef MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR -#define MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR 0 -#endif - -template -inline constexpr T pick_batch_size(T x, T y) -{ - return (y == 0 || y > x) ? 
1 : x / y; -} - -// Reduce tests execution time -#define MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS 1 - -template -inline std::set> get_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 14, 14 }, - { pick_batch_size(100, n), 1, 8, 8 }, - { pick_batch_size(256, n), 1, 27, 27 }, -#if MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS - { pick_batch_size(64, n), 19, 1024,2048}, -#endif - { pick_batch_size(100, n), 3, 32, 32 }, - { pick_batch_size(100, n), 32, 16, 16 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(128, n), 3, 231, 231 }, - { pick_batch_size(128, n), 512, 12, 12 }, - { pick_batch_size(256, n), 256, 13, 13 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(256, n), 384, 13, 13 }, - { pick_batch_size(256, n), 96, 27, 27 }, - { pick_batch_size(32, n), 128, 28, 28 }, - { pick_batch_size(32, n), 144, 14, 14 }, - { pick_batch_size(32, n), 192, 28, 28 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 32, 28, 28 }, - { pick_batch_size(32, n), 48, 7, 7 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 480, 64, 128 }, - { pick_batch_size(32, n), 512, 4, 4 }, - { pick_batch_size(32, n), 512, 64, 128 }, - { pick_batch_size(16, n), 64, 56, 56 }, - { pick_batch_size(32, n), 832, 7, 7 }, - { pick_batch_size(64, n), 128, 56, 56 }, - { pick_batch_size(64, n), 256, 28, 28 }, - { pick_batch_size(64, n), 3, 224, 224 }, - { pick_batch_size(64, n), 512, 28, 28 }, - { pick_batch_size(64, n), 64, 112, 112 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 320, 28, 28 }, - { pick_batch_size(32, n), 576, 14, 14 }, - { pick_batch_size(32, n), 576, 4, 4 }, - { pick_batch_size(32, n), 1056, 7, 7 }, - { pick_batch_size(32, n), 2048, 11, 11 }, -#if 
MIOPEN_TESTS_GET_INPUTS_ENABLE_HUGE_TENSORS - { pick_batch_size(32, n), 16, 2048, 2048 }, - { pick_batch_size(32, n), 16, 3072, 3072 }, - { pick_batch_size(32, n), 16, 4096, 4096 }, -#endif - { 1, 1, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(1024, n),1024, 3, 3 }, - { pick_batch_size(1024, n),512, 3, 3 }, - { pick_batch_size(128, n), 256, 1, 1 }, - { pick_batch_size(128, n), 528, 1, 1 }, - { pick_batch_size(128, n), 96, 3, 3 }, - { pick_batch_size(16, n), 192, 1, 1 }, - { pick_batch_size(224, n), 112, 3, 3 }, - { pick_batch_size(256, n), 96, 5, 5 }, - { pick_batch_size(288, n), 144, 3, 3 }, - { pick_batch_size(48, n), 832, 1, 1 }, - { pick_batch_size(512, n), 256, 3, 3 }, - { pick_batch_size(64, n), 1, 2, 2 }, - { pick_batch_size(64, n), 3, 3, 3 }, - { pick_batch_size(64, n), 3, 7, 7 }, - { pick_batch_size(64, n), 32, 5, 5 }, - { pick_batch_size(64, n), 480, 1, 1 }, - { pick_batch_size(64, n), 64, 1, 1 }, - { pick_batch_size(96, n), 3, 11, 11 }, - { pick_batch_size(192, n), 64, 5, 5 }, - { pick_batch_size(64, n), 64, 3, 3 }, - { pick_batch_size(224, n), 224, 3, 3 }, - { pick_batch_size(224, n), 192, 3, 3 }, - { pick_batch_size(128, n), 320, 1, 1 }, - { pick_batch_size(192, n), 576, 1, 1 }, - { pick_batch_size(128, n), 1056, 1, 1 }, - { pick_batch_size(128, n), 1024, 1, 1 }, - { pick_batch_size(512, n), 2048, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_immed_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 14, 14 }, - { pick_batch_size(256, n), 1, 27, 27 }, - { pick_batch_size(128, n), 512, 12, 12 }, - { pick_batch_size(256, n), 256, 13, 13 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 14, 14 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(64, 
n), 128, 56, 56 }, - { pick_batch_size(64, n), 3, 224, 224 }, - { pick_batch_size(64, n), 256, 14, 14 }, - { 1, 1, 1, 1 } - }; - // clang-format on -} - -template -inline std::set> get_immed_weights(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(208, n), 96, 3, 3 }, - { pick_batch_size(24, n), 512, 1, 1 }, - { pick_batch_size(256, n), 128, 3, 3 }, - { pick_batch_size(256, n), 256, 3, 3 }, - { pick_batch_size(256, n), 64, 5, 5 }, - { pick_batch_size(288, n), 144, 3, 3 }, - { pick_batch_size(96, n), 3, 11, 11 }, - { pick_batch_size(32, n), 128, 5, 5 }, - { pick_batch_size(32, n), 128, 1, 1 }, - { pick_batch_size(256, n), 256, 3, 3 }, - { pick_batch_size(512, n), 512, 3, 3 }, - { pick_batch_size(160, n), 128, 3, 3 }, - { pick_batch_size(32, n), 3, 7, 7 } - }; - // clang-format on -} - -template -inline std::set> -get_3d_conv_input_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(128, n), 1, 1, 2, 2}, - { pick_batch_size(128, n), 64, 1, 1, 1}, - { pick_batch_size(128, n), 64, 3, 4, 4}, - { pick_batch_size(352, n), 32, 4, 9, 9}, - { pick_batch_size(192, n), 512, 3, 14, 14}, - { pick_batch_size(352, n), 512, 4, 28, 28}, - { pick_batch_size(256, n), 512, 4, 56, 56}, - { pick_batch_size(192, n), 3, 4, 227, 227}, - { pick_batch_size(128, n), 4, 4, 161, 700} - }; - // clang-format on -} - -template -inline std::set> -get_3d_conv_weight_shapes(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size( 128, n), 1, 1, 1, 1}, - { pick_batch_size( 352, n), 128, 1, 1, 1}, - { pick_batch_size( 256, n), 128, 1, 1, 1}, - { pick_batch_size( 352, n), 32, 3, 3, 3}, - { pick_batch_size( 352, n), 4, 3, 3, 3}, - { pick_batch_size( 160, n), 4, 3, 5, 5}, - { pick_batch_size( 128, n), 64, 5, 7, 7}, - { pick_batch_size( 192, n), 4, 3, 11, 11}, - { pick_batch_size( 128, n), 1, 3, 1, 7}, - { pick_batch_size( 128, n), 1, 3, 7, 1}, - { 
pick_batch_size( 128, n), 1, 3, 5, 20} - }; - // clang-format on -} - -template -inline std::set> get_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller - { pick_batch_size(100, n), 3, 32, 32 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(64, n), 64, 112, 112 },//Batch-norm ResNet 152 after this line - { pick_batch_size(256, n), 1024, 14, 14 },// n is from the paper @ 256 - { pick_batch_size(256, n), 2048, 7, 7 }, - { pick_batch_size(256, n), 256, 56, 56 }, - { pick_batch_size(256, n), 256, 14, 14 }, - { pick_batch_size(256, n), 512, 28, 28 }, - { pick_batch_size(256, n), 512, 7, 7 }, - { pick_batch_size(256, n), 64, 112, 112 }, - { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this - { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 - { pick_batch_size(32, n), 128, 14, 14 }, - { pick_batch_size(32, n), 128, 28, 28 }, - { pick_batch_size(32, n), 128, 4, 4 }, - { pick_batch_size(32, n), 128, 7, 7 }, - { pick_batch_size(32, n), 160, 7, 7 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 192, 56, 56 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 224, 14, 14 }, - { pick_batch_size(32, n), 256, 7, 7 }, - { pick_batch_size(32, n), 256, 14, 14 }, - { pick_batch_size(32, n), 352, 7, 7 }, - { pick_batch_size(32, n), 64, 112, 112 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(32, n), 32, 256, 512 }, //Killing this config. 
Takes way too long on the CPU - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 528, 64, 128 } - }; - // clang-format on -} - -template -inline std::set> get_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 4, 1024,2048}, //Making this much smaller - { pick_batch_size(32, n), 192, 256, 512 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(256, n), 3, 227, 227 }, - { pick_batch_size(256, n), 64, 112, 112 }, - { pick_batch_size(512, n), 16, 32, 32 }, - { pick_batch_size(100, n), 32, 8, 8 }, - { pick_batch_size(128, n), 256, 12, 12 }, - { pick_batch_size(256, n), 128, 28, 28 }, - { pick_batch_size(256, n), 2048, 7, 7 }, - { pick_batch_size(256, n), 256, 56, 56 }, - { pick_batch_size(256, n), 256, 14, 14 }, - { pick_batch_size(256, n), 512, 28, 28 }, - { pick_batch_size(256, n), 512, 7, 7 }, - { pick_batch_size(256, n), 64, 56, 56 },//Batch-norm Inception_v3 after this - { pick_batch_size(32, n), 1024, 1, 1 },// n is from the paper @ 32 - { pick_batch_size(32, n), 128, 14, 14 }, - { pick_batch_size(32, n), 128, 4, 4 }, - { pick_batch_size(32, n), 160, 7, 7 }, - { pick_batch_size(32, n), 192, 14, 14 }, - { pick_batch_size(32, n), 192, 56, 56 }, - { pick_batch_size(32, n), 192, 7, 7 }, - { pick_batch_size(32, n), 224, 14, 14 }, - { pick_batch_size(32, n), 256, 7, 7 }, - { pick_batch_size(32, n), 352, 7, 7 }, - { pick_batch_size(32, n), 64, 14, 14 }, - { pick_batch_size(32, n), 64, 28, 28 }, - { pick_batch_size(32, n), 64, 56, 56 }, - { pick_batch_size(32, n), 96, 28, 28 }, - { pick_batch_size(32, n), 192, 256, 512 }, - { pick_batch_size(32, n), 256, 28, 28 }, - { pick_batch_size(32, n), 3, 224, 224 }, - { pick_batch_size(32, n), 480, 128, 256 }, - { pick_batch_size(32, n), 528, 64, 128 }, - { pick_batch_size(770, n), 1, 8, 8 }, - { pick_batch_size(770, n), 1024, 1, 1 
}, - { pick_batch_size(152, n), 128, 80, 80 }, - { pick_batch_size(152, n), 256, 20, 20 }, - { pick_batch_size(152, n), 32, 160, 160 }, - { pick_batch_size(152, n), 512, 20, 20 }, - { pick_batch_size(152, n), 64, 160, 160 }, - { pick_batch_size(152, n), 64, 80, 80 }, - { pick_batch_size(256, n), 256, 20, 20 }, - { pick_batch_size(256, n), 512, 20, 20 } - }; - // clang-format on -} - -template -inline std::set> get_3d_bn_peract_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(32, n), 1, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 12, 12, 12 }, - { pick_batch_size(32, n), 32, 6, 6, 6 }, - { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(256, n), 32, 14, 14, 14 }, - { pick_batch_size(256, n), 32, 12, 12, 12 }, - { pick_batch_size(256, n), 32, 6, 6, 6 }, - { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(512, n), 32, 14, 14, 14 }, - { pick_batch_size(512, n), 32, 12, 12, 12 }, - { pick_batch_size(512, n), 32, 6, 6, 6 }, - { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { pick_batch_size(32, n), 32, 14, 25, 59 }, - { pick_batch_size(32, n), 32, 6, 10, 27 }, - { pick_batch_size(32, n), 32, 4, 6, 11 }, - { pick_batch_size(32, n), 32, 2, 2, 3 }, - { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { pick_batch_size(32, n), 32, 14, 12, 29 }, - { pick_batch_size(32, n), 32, 6, 4, 12 }, - { pick_batch_size(32, n), 32, 4, 2, 2 }, - { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet - { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D 
convet on video - { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video - }; - - // clang-format on -} - -template -inline std::set> -get_3d_bn_spatial_inputs(T n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) -{ - // clang-format off - return - { - { pick_batch_size(32, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(32, n), 1, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 14, 14, 14 }, - { pick_batch_size(32, n), 32, 12, 12, 12 }, - { pick_batch_size(32, n), 32, 6, 6, 6 }, - { pick_batch_size(256, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(256, n), 32, 14, 14, 14 }, - { pick_batch_size(256, n), 32, 12, 12, 12 }, - { pick_batch_size(256, n), 32, 6, 6, 6 }, - { pick_batch_size(512, n), 1, 32, 32, 32 }, // 32x32x32 based on VoxNet arch - { pick_batch_size(512, n), 32, 14, 14, 14 }, - { pick_batch_size(512, n), 32, 12, 12, 12 }, - { pick_batch_size(512, n), 32, 6, 6, 6 }, - { pick_batch_size(32, n), 2, 32, 57, 125 }, // Hand-gesture recognition CVPR 2015 paper High Res Net Path - { pick_batch_size(32, n), 32, 14, 25, 59 }, - { pick_batch_size(32, n), 32, 6, 10, 27 }, - { pick_batch_size(32, n), 32, 4, 6, 11 }, - { pick_batch_size(32, n), 32, 2, 2, 3 }, - { pick_batch_size(32, n), 32, 32, 28, 62 }, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path - { pick_batch_size(32, n), 32, 14, 12, 29 }, - { pick_batch_size(32, n), 32, 6, 4, 12 }, - { pick_batch_size(32, n), 32, 4, 2, 2 }, - { pick_batch_size(16, n), 32, 6, 50, 50 }, // Multi-view 3D convnet - { pick_batch_size(1, n), 3, 8, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 240, 320 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 16, 128, 171 }, // 3D convet on video - { pick_batch_size(1, n), 3, 8, 112, 112 }, // 3D 
convet on video - { pick_batch_size(1, n), 3, 16, 112, 112 } // 3D convet on video - }; - // clang-format on -} - -template -inline std::vector> get_sub_tensor() -{ - return {{16, 4, 8, 1, 4}, - {2, 4, 8, 8, 4}, - {16, 4, 8, 4}, - {13, 8, 4, 8}, - {3, 8, 7}, - {16, 4, 10}, - {3, 8}, - {16, 4}, - {4}}; -} - -template -inline std::vector> get_tensor_offsets() -{ - static_assert(std::is_signed_v); - return {{0, 0}, {0, 2}, {4, 0}, {5, 7}}; -} - -template -inline std::vector get_tensor_offset() -{ - static_assert(std::is_signed_v); - return {0, 1, 2, 3, 4, 5}; -} - -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/random.hpp b/projects/miopen/test/random.hpp index 62443abb1068..3bb99a37d6c9 100644 --- a/projects/miopen/test/random.hpp +++ b/projects/miopen/test/random.hpp @@ -1,62 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef GUARD_MIOPEN_TEST_RANDOM_HPP -#define GUARD_MIOPEN_TEST_RANDOM_HPP - -#include "../driver/random.hpp" - -namespace prng { -template -inline T gen_descreet_uniform_sign(double scale, int32_t range) -{ - return static_cast(scale * prng::gen_A_to_B(-range + 1, range)); -} - -template -inline T gen_descreet_unsigned(double scale, int32_t range) -{ - return static_cast(scale * static_cast(gen_0_to_B(range))); -} - -} // namespace prng - -// lambda factory -template -auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) -{ - return [=](auto&&...) -> T { - // uniform sign give balance of both negative and positive values - return prng::gen_descreet_uniform_sign(scale_arg, range_arg); - }; -} - -template -auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) -{ - return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; -} - -#endif // GUARD_MIOPEN_TEST_RANDOM_HPP +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/rnn_util.hpp b/projects/miopen/test/rnn_util.hpp index d993d0df4c57..0e771bfdfff1 100644 --- a/projects/miopen/test/rnn_util.hpp +++ b/projects/miopen/test/rnn_util.hpp @@ -1,305 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#ifndef MIOPEN_RNN_UTIL_H_ -#define MIOPEN_RNN_UTIL_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include "gemm.hpp" -#include "random.hpp" - -#include - -// complexity O(NlogN) -inline std::vector GetReverseOrderIndex(const std::vector& base_index) -{ - std::vector reverse_index(base_index.size()); - unsigned next_rev_index = 0; - for(auto id : base_index) - reverse_index[id] = next_rev_index++; - return reverse_index; -}; - -inline std::vector GetSamplesIndexDescendingOrder(const std::vector& unsorted_seq_lens) -{ - const auto sample_count = unsorted_seq_lens.size(); - - std::vector index_v(sample_count); - std::iota(index_v.begin(), index_v.end(), 0); - - auto seq_len_cmp = [&unsorted_seq_lens](unsigned a_id, unsigned b_id) { - return unsorted_seq_lens[a_id] > unsorted_seq_lens[b_id]; - }; - - std::stable_sort(index_v.begin(), index_v.end(), seq_len_cmp); - - return index_v; -} - -template -inline void HiddenTensorReorder(const std::vector& src_array, - std::vector& dst_array, - const std::vector& batch_order, - const std::vector hid_len, - bool is_dst_direct_order) -{ - const size_t copy_size = hid_len[2]; - - const size_t batch_stride = hid_len[2]; - const size_t layer_stride = batch_stride * hid_len[1]; - - for(size_t batch_id = 0; batch_id < hid_len[1]; batch_id++) - { - const auto src_batch_off = - batch_stride * (is_dst_direct_order ? batch_order[batch_id] : batch_id); - const auto dst_batch_off = - batch_stride * (is_dst_direct_order ? 
batch_id : batch_order[batch_id]); - - for(size_t layer_id = 0; layer_id < hid_len[0]; layer_id++) - { - const auto dst_offset = dst_batch_off + layer_id * layer_stride; - const auto src_offset = src_batch_off + layer_id * layer_stride; - - std::copy(src_array.begin() + src_offset, - src_array.begin() + src_offset + copy_size, - dst_array.begin() + dst_offset); - } - } -} - -inline void createTensorDescArray(std::vector& td, - std::vector& ptd, - const std::vector bs, - const int secondDim, - miopenDataType_t dataType) -{ - - std::transform(bs.begin(), bs.end(), std::back_inserter(td), [&](int x) { - return miopen::TensorDescriptor( - dataType, {static_cast(x), static_cast(secondDim)}); - }); - std::transform(td.begin(), td.end(), std::back_inserter(ptd), [](miopen::TensorDescriptor& x) { - return &x; - }); -} - -inline std::tuple -GetTempPackedBuffersSize(std::vector batchs, int in_vec, int out_vec) -{ - size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0ULL); - - size_t in_buff_size = total_batch * in_vec; - size_t out_buff_size = total_batch * out_vec; - return {in_buff_size, out_buff_size}; -} - -inline size_t getSuperTensorSize(const std::vector& bs, - int seqLength, - int inputSize, - int hiddenSize, - int maxPaddingVal, - bool isBidirect, - bool isInput, - bool isPadded) -{ - return (isPadded // - ? static_cast(seqLength) * maxPaddingVal - : std::accumulate(bs.begin(), bs.end(), 0ULL)) // - * (isInput // - ? static_cast(inputSize) - : static_cast(hiddenSize) * (isBidirect ? 
2 : 1)); -} - -template -void ChangeDataPadding(const std::vector& src_array, - std::vector& dst_array, - const std::vector& batch_list, - int max_batch, - int sample_size, - bool is_src_packed) -{ - auto seq_len = batch_list.size(); - - auto scr_ptr = &src_array[0]; - auto dst_ptr = &dst_array[0]; - - for(int seq_id = 0; seq_id < seq_len; seq_id++) - { - auto packed_size = batch_list[seq_id] * sample_size; - - std::copy(scr_ptr, scr_ptr + packed_size, dst_ptr); - - if(is_src_packed) - { - dst_ptr += max_batch * sample_size; - scr_ptr += packed_size; - } - else - { - scr_ptr += max_batch * sample_size; - dst_ptr += packed_size; - } - } -} - -// RNN VANILLA configs -inline std::vector get_rnn_num_layers() { return {{1, 3}}; } - -inline std::vector get_rnn_batchSize() { return {{1, 17}}; } - -inline std::vector get_rnn_seq_len() { return {{1, 3, 51}}; } - -inline std::vector get_rnn_vector_len() { return {31}; } - -inline std::vector get_rnn_hidden_size() { return {127}; } - -// LSTM configs -inline std::vector get_lstm_num_layers() { return {{1, 3}}; } - -inline std::vector get_lstm_batchSize() { return {{1, 17}}; } - -inline std::vector get_lstm_seq_len() { return {{1, 25}}; } - -inline std::vector get_lstm_vector_len() { return {17}; } - -inline std::vector get_lstm_hidden_size() { return {67}; } - -// GRU configs -inline std::vector get_gru_num_layers() { return {{1, 3}}; } - -inline std::vector get_gru_batchSize() { return {{1, 17}}; } - -inline std::vector get_gru_seq_len() { return {{1, 23}}; } - -inline std::vector get_gru_vector_len() { return {13}; } - -inline std::vector get_gru_hidden_size() { return {67}; } - -inline std::vector> generate_batchSeq(const int batchSize, const int seqLength) -{ - - static constexpr int modval = 3; - - int currentval = batchSize; - std::vector batchSeq; - batchSeq.reserve(seqLength); - for(int i = 0; i < seqLength; i++) - { - if(i > 0) - { - int nvalue = currentval - prng::gen_0_to_B(modval); - currentval = (nvalue < 1) ? 
1 : nvalue; - // printf("current value: %d\n", currentval); - } - // printf("adding a value to batch sequence: %d\n", currentval); - batchSeq.push_back(currentval); - } - return {batchSeq}; -} - -inline int sumvc(const std::vector& x) { return std::accumulate(x.begin(), x.end(), 0); } - -template -inline T activfunc(T x, int actvf) -{ - T alpha = static_cast(1), beta0 = static_cast(0), beta1 = static_cast(1); - if(actvf == 0) - { - return (x > 0) ? x : x * beta0; - } - else if(actvf == 2) - { - return static_cast(1 / (1 + std::exp(-x))); - } - return static_cast(alpha * std::tanh(beta1 * x)); -} - -template -inline T dervactivfunc(T x, int actvf) -{ - if(actvf == 0) - { - return static_cast(x > 0 ? 1 : 0); - } - else if(actvf == 2) - { - return static_cast(std::exp(-x) / (1 + std::exp(-x)) / (1 + std::exp(-x))); - } - - return static_cast(1 / std::cosh(x) / std::cosh(x)); -} - -template -void RNN_mm_cpu_batched(const Dtype* a_ptr, - size_t a_cols, - size_t a_rows, - size_t lda, - size_t a_stride, - int a_flags, - const Dtype* b_ptr, - size_t b_cols, - size_t b_rows, - size_t ldb, - size_t b_stride, - int b_flags, - Dtype* c_ptr, - size_t c_cols, - size_t c_rows, - size_t ldc, - size_t c_stride, - int batchCount, - double alpha, - double beta) -{ - for(int i = 0; i < batchCount; ++i) - { - gemm_cpu(a_ptr + a_stride * i, - a_cols, - a_rows, - lda, - a_flags == 1 ? true : false, - b_ptr + b_stride * i, - b_cols, - b_rows, - ldb, - b_flags == 1 ? true : false, - c_ptr + c_stride * i, - c_cols, - c_rows, - ldc, - alpha, - beta); - } -} - -#endif +// Forwarding header — implementation moved to miopen_utils. 
+#include diff --git a/projects/miopen/test/serialize.hpp b/projects/miopen/test/serialize.hpp index 6b9b1b29632e..c3eb459c38df 100644 --- a/projects/miopen/test/serialize.hpp +++ b/projects/miopen/test/serialize.hpp @@ -1,129 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2018 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#ifndef MIOPEN_GUARD_TEST_SERIALIZE_HPP -#define MIOPEN_GUARD_TEST_SERIALIZE_HPP - -#include -#include -#include -#include -#include -#include -#include - -template -struct is_trivial_serializable : std::is_trivially_copy_constructible -{ -}; - -template <> -struct is_trivial_serializable : std::true_type -{ -}; - -template -std::enable_if_t{}> serialize(std::ostream& os, const T& x) -{ - os.write(reinterpret_cast(&x), sizeof(T)); -} - -template -auto serialize(std::ostream& os, - const T& x) -> decltype(x.begin(), x.end(), T(x.begin(), x.end()), void()) -{ - std::size_t n = std::distance(x.begin(), x.end()); - serialize(os, n); - for(auto&& y : x) - serialize(os, y); -} - -template -std::enable_if_t>{}> -serialize(std::ostream& os, const std::tuple& t) -{ - miopen::unpack( - [&](auto&&... xs) { miopen::each_args([&](auto&& x) { serialize(os, x); }, xs...); }, t); -} - -template -std::enable_if_t{}> serialize(std::istream& is, T& x) -{ - is.read(reinterpret_cast(&x), sizeof(T)); -} - -template -std::enable_if_t{}> serialize(std::istream& is, std::vector& x) -{ - std::size_t n; - serialize(is, n); - x.resize(n); - is.read(reinterpret_cast(x.data()), sizeof(T) * n); -} - -template -auto serialize(std::istream& is, - T& x) -> decltype(x.begin(), x.end(), x.assign(x.begin(), x.end()), void()) -{ - using value_type = std::decay_t; - std::size_t n; - serialize(is, n); - std::vector v; - v.reserve(n); - for(std::size_t i = 0; i < n; i++) - { - value_type y; - serialize(is, y); - v.push_back(y); - } - x.assign(v.begin(), v.end()); -} - -template -std::enable_if_t>{}> -serialize(std::istream& is, - // cppcheck-suppress constParameter - std::tuple& t) -{ - miopen::unpack( - [&](auto&&... 
xs) { miopen::each_args([&](auto&& x) { serialize(is, x); }, xs...); }, t); -} - -template -void load(std::string name, T& x) -{ - std::ifstream is{name.c_str()}; - serialize(is, x); -} - -template -void save(std::string name, const T& x) -{ - std::ofstream os{name.c_str()}; - serialize(os, x); -} - -#endif +// Forwarding header — implementation moved to miopen_utils. +#include diff --git a/projects/miopen/test/tensor_holder.hpp b/projects/miopen/test/tensor_holder.hpp index 64be2aa7c851..bc10b5a8b12d 100644 --- a/projects/miopen/test/tensor_holder.hpp +++ b/projects/miopen/test/tensor_holder.hpp @@ -1,505 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_TENSOR_HOLDER_HPP -#define GUARD_TENSOR_HOLDER_HPP - -#include "network_data.hpp" -#include -#include -#include -#include -#include -#include -#include "../driver/random.hpp" - -#include "serialize.hpp" - -#include -using half = half_float::half; -using hip_bfloat16 = bfloat16; -#include "../../src/kernels/hip_float8.hpp" -using float8_fnuz = miopen_f8::hip_f8; -using bfloat8_fnuz = miopen_f8::hip_f8; - -#include -#include - -template -void visit_tensor_size(std::size_t n, F f) -{ - switch(n) - { - case 0: { - f(std::integral_constant{}); - break; - } - case 1: { - f(std::integral_constant{}); - break; - } - case 2: { - f(std::integral_constant{}); - break; - } - case 3: { - f(std::integral_constant{}); - break; - } - case 4: { - f(std::integral_constant{}); - break; - } - case 5: { - f(std::integral_constant{}); - break; - } - default: throw std::runtime_error("Unknown tensor size"); - } -} - -template -struct miopen_type; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template <> -struct miopen_type : std::integral_constant -{ -}; - -template -struct tensor -{ - using value_type = T; - miopen::TensorDescriptor desc; - std::vector data; - -#if defined(__clang__) || defined(__GNUG__) 
-#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - - tensor() : desc(miopen_type{}) {} - -#if defined(__clang__) || defined(__GNUG__) -#pragma GCC diagnostic pop -#endif - - template - tensor(const std::vector& dims) : desc(miopen_type{}, dims), data(desc.GetElementSpace()) - { - } - - template - tensor(const std::vector& dims, const std::vector& strides) - : desc(miopen_type{}, dims, strides), data(desc.GetElementSpace()) - { - assert(dims.size() == strides.size()); - } - - template - tensor(miopenTensorLayout_t layout, const std::vector& dims) - : desc(miopen_type{}, layout, dims), data(desc.GetElementSpace()) - { - } - - template - tensor(miopenTensorLayout_t layout, const std::vector& dims, const std::vector& strides) - : desc(miopen_type{}, layout, dims, strides), data(desc.GetElementSpace()) - { - assert(dims.size() == strides.size()); - } - - tensor(std::size_t n, std::size_t c, std::size_t h, std::size_t w) - : desc(miopen_type{}, {n, c, h, w}), data(n * c * h * w) - { - } - - tensor(miopenTensorLayout_t layout, std::size_t n, std::size_t c, std::size_t h, std::size_t w) - : desc(miopen_type{}, layout, {n, c, h, w}), data(desc.GetElementSpace()) - { - } - - tensor(std::size_t n, std::size_t c, std::size_t d, std::size_t h, std::size_t w) - : desc(miopen_type{}, {n, c, d, h, w}), data(n * c * d * h * w) - { - } - - tensor(std::size_t n) : desc(miopen_type{}, {n}), data(n) {} - - tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) - { - assert(desc.GetType() == miopen_type{} - /// In the driver, T is input tensor type, but output tensor holders - /// are instantiatied with T as well. This leads to false assertion - /// failures when T is INT8 because output type is different. 
- /// \todo Get rid of this hack when the driver is improved: - || (miopen_type{} == miopenInt8 && desc.GetType() == miopenInt32)); - data.resize(desc.GetElementSpace()); - } - - size_t GetDataByteSize() const { return GetSize() * sizeof(T); } - - size_t GetSize() const { return desc.GetElementSpace(); } - - template - tensor& generate(G g) & - { - if(this->desc.GetVectorLength() > 1) - this->generate_vect_impl(g); - else - this->generate_impl(g); - return *this; - } - - template - tensor&& generate(G g) && - { - if(this->desc.GetVectorLength() > 1) - this->generate_vect_impl(g); - else - this->generate_impl(g); - return std::move(*this); - } - - template - void generate_impl(G g) - { - auto seed = std::accumulate(desc.GetLengths().begin(), - desc.GetLengths().end(), - std::size_t{521288629}, - [](auto x, auto y) { - x ^= x << 1U; - return x ^ y; - }); - seed ^= data.size(); - seed ^= desc.GetLengths().size(); - prng::reset_seed(seed); - auto iterator = data.begin(); - auto assign = [&](T x) { - *iterator = x; - ++iterator; - }; - this->for_each( - miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); - } - - template - void generate_vect_impl(G g) - { - auto seed = std::accumulate(desc.GetLengths().begin(), - desc.GetLengths().end(), - std::size_t{521288629}, - [](auto x, auto y) { - x ^= x << 1U; - return x ^ y; - }); - seed ^= data.size(); - seed ^= desc.GetLengths().size(); - prng::reset_seed(seed); - auto iterator = data.begin(); - auto vectorLength = desc.GetVectorLength(); - auto assign = [&](T x) { - assert(iterator < data.end()); - // for debugging - for(auto i = 0; i < vectorLength; i++) - { - *(iterator + i) = x; - } - iterator += vectorLength; - }; - this->for_each( - miopen::compose(miopen::compose(assign, miopen::cast_to()), std::move(g))); - } - - template - struct for_each_unpacked - { - Loop loop; - F f; - template - auto operator()(Ts... 
xs) const -> decltype(f(xs...), void()) - { - loop(xs...)(std::move(f)); - } - - struct any - { - any() {} - template - any(X) - { - } - }; - - [[noreturn]] void operator()(any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}, - any = {}) const - { - throw std::runtime_error( - "Arguments to for_each do not match tensor size or the function " + - miopen::get_type_name() + " can not be called."); - } - }; - - struct for_each_handler - { - template - void operator()(Self* self, Loop loop, F f, Size size) const - { - auto dims = miopen::tien(self->desc.GetLengths()); - miopen::unpack(for_each_unpacked{loop, std::move(f)}, dims); - } - }; - - template - void for_each(F f) const - { - visit_tensor_size( - desc.GetLengths().size(), - std::bind(for_each_handler{}, this, miopen::ford, std::move(f), std::placeholders::_1)); - } - - template - void par_for_each(F f) const - { - visit_tensor_size( - desc.GetLengths().size(), - std::bind( - for_each_handler{}, this, miopen::par_ford, std::move(f), std::placeholders::_1)); - } - - template - T& operator()(Ts... xs) - { - assert(this->desc.GetIndex(xs...) < data.size()); - return this->data[this->desc.GetIndex(xs...)]; - } - - template - const T& operator()(Ts... xs) const - { - assert(this->desc.GetIndex(xs...) < data.size()); - return this->data[this->desc.GetIndex(xs...)]; - } - - template - const T& operator()(const std::array& multi_id) const - { - auto f = [&](auto... 
is) { return this->desc.GetIndex(is...); }; - assert(miopen::unpack(f, multi_id) < data.size()); - return this->data[miopen::unpack(f, multi_id)]; - } - - T& operator[](std::size_t i) { return data.at(i); } - - const T& operator[](std::size_t i) const { return data.at(i); } - - typename std::vector::iterator begin() { return data.begin(); } - - typename std::vector::iterator end() { return data.end(); } - - typename std::vector::const_iterator begin() const { return data.begin(); } - - typename std::vector::const_iterator end() const { return data.end(); } - - friend std::ostream& operator<<(std::ostream& stream, const tensor& t) - { - return stream << t.desc; - } - - template - void dump_inner(size_t dim, std::array& coord, Stream& stream) const - { - const auto lengths = this->desc.GetLengths(); - if(lengths.size() == 0) - { - // 0D special case: Just print the one value that we have and return. - stream << (*this)(coord); - } - else if(dim + 1 == lengths.size()) - { - // 1D special case: dump everything on one line - for(size_t i = 0; i < lengths[dim]; ++i) - { - if(i != 0) - stream << ' '; - - coord[dim] = i; - stream << std::setw(4) << (*this)(coord); - } - - stream << '\n'; - } - else - { - if(dim + 2 == lengths.size()) - { - // 2D special case: Also print which 2D slice we are currently printing - // Note: this is not needed for higher dimensions, as they will also pass - // through this branch. 
- stream << "slice ["; - for(size_t i = 0; i < dim; ++i) - { - stream << coord[i] << ", "; - } - stream << ":, :]\n"; - } - - for(size_t i = 0; i < lengths[dim]; ++i) - { - coord[dim] = i; - this->dump_inner(dim + 1, coord, stream); - } - } - } - - template - void dump(const char* name, Stream& stream = std::cout) const - { - const auto n = this->desc.GetLengths().size(); - stream << "==== " << name << ": " << *this << n << '\n'; - stream.fill(' '); - - const auto flags = stream.flags(); - - visit_tensor_size(n, [&](const auto size) { - constexpr size_t N = decltype(size)::value; - std::array coord; - this->dump_inner(0, coord, stream); - }); - - stream.flags(flags); - } -}; - -template -void serialize(std::istream& s, tensor& x) -{ - std::vector lens; - serialize(s, lens); - std::vector strides; - serialize(s, strides); - x.desc = miopen::TensorDescriptor{miopen_type{}, lens, strides}; - serialize(s, x.data); -} - -template -void serialize(std::ostream& s, const tensor& x) -{ - const auto& lens = x.desc.GetLengths(); - const auto& strides = x.desc.GetStrides(); - serialize(s, lens); - serialize(s, strides); - serialize(s, x.data); -} - -struct tensor_generate -{ - template - Tensor&& operator()(Tensor&& t, G g) const - { - return std::forward(t.generate(g)); - } -}; - -struct tensor_elem_gen_integer -{ - uint64_t max_value = 17; - - template - double operator()(Ts... Xs) const - { - static_assert(sizeof...(Ts) < 6, - "Dimensions in tensor_elem_gen_integer must be less than 6."); - assert(max_value > 0); - std::array left = {{Xs...}}; - std::array right = {{613, 547, 701, 877, 1049}}; - uint64_t dot = - std::inner_product(left.begin(), left.end(), right.begin(), static_cast(173)); - return static_cast(dot % max_value); - } -}; - -#endif +// Forwarding header — implementation moved to miopen_utils. 
+#include diff --git a/projects/miopen/test/verify.hpp b/projects/miopen/test/verify.hpp index 1d7d9cf80a50..8807b5ecfe2b 100644 --- a/projects/miopen/test/verify.hpp +++ b/projects/miopen/test/verify.hpp @@ -1,245 +1,2 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_VERIFY_HPP -#define GUARD_VERIFY_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -using half = half_float::half; -using hip_bfloat16 = bfloat16; -#include -#include "tensor_holder.hpp" - -namespace miopen { - -// Compute the value of a range -template -using range_value = typename std::decay().begin())>::type; - -struct sum_fn -{ - template - auto operator()(T x, U y) const MIOPEN_RETURNS(x + y); -}; -static constexpr sum_fn sum{}; - -struct max_fn -{ - template - static T id(T x) - { - return x; - } - - template - auto operator()(T x, U y) const MIOPEN_RETURNS(max_fn::id(x > y ? x : y)); -}; -static constexpr max_fn max{}; - -namespace abs_diff_detail { -using std::fabs; -struct fn -{ - template - auto operator()(T x, U y) const MIOPEN_RETURNS(fabs(x - y)); -}; - -} // namespace abs_diff_detail - -static constexpr abs_diff_detail::fn abs_diff{}; - -struct not_finite_fn -{ - template ), bool>::type = false> - bool operator()(T x) const - { - return !std::isfinite(x); - } - - template ::type, half_float::half>), - bool>::type = false> - bool operator()(T x) const - { - return !half_float::isfinite(x); - } - - template ::type, bfloat16>), - bool>::type = false> - bool operator()(T x) const - { - return !std::isfinite(x); // bfloat16 has float() conversion operator - } - - template ), bool>::type = false> - bool operator()(T x) const - { - std::ignore = x; - return false; - } -}; -static constexpr not_finite_fn not_finite{}; - -template -T as(T, U x) -{ - return x; -} - -struct compare_mag_fn -{ - template - bool operator()(T x, U y) const - { - using std::fabs; - return fabs(x) < fabs(y); - } -}; -static constexpr compare_mag_fn compare_mag{}; - -struct square_diff_fn -{ - template - double operator()(T x, U y) const - { - double diff = static_cast(x - y); - return diff * diff; - } -}; -static constexpr square_diff_fn 
square_diff{}; - -template , bool> = true> -bool equal_values(T const& lhs, T const& rhs) -{ - return lhs == rhs; -} - -template , bool> = true> -bool equal_values(T const& lhs, T const& rhs) -{ - return miopen::float_equal_sentinel(lhs, rhs); -} - -template -bool range_empty(R1&& r1) -{ - return r1.begin() == r1.end(); -} - -template -auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); - -template -bool range_zero(const std::vector& r) -{ - return std::all_of(r.begin(), r.end(), [](T x) { return equal_values(x, T()); }); -} - -template -bool range_zero(const tensor& r) -{ - return range_zero(r.data); -} - -template -T range_product(R1&& r1, R2&& r2, T state, Reducer r, Product p) -{ - return std::inner_product(r1.begin(), r1.end(), r2.begin(), state, r, p); -} - -template -std::size_t mismatch_idx(R1&& r1, R2&& r2, Compare compare) -{ - auto p = std::mismatch(r1.begin(), r1.end(), r2.begin(), compare); - return std::distance(r1.begin(), p.first); -} - -template -int64_t find_idx(R1&& r1, Predicate p) -{ - auto it = std::find_if(r1.begin(), r1.end(), p); - if(it == r1.end()) - return -1; - else - return std::distance(r1.begin(), it); -} - -template -double max_diff(R1&& r1, R2&& r2) -{ - return range_product(r1, r2, 0.0, max, abs_diff); -} - -template -auto max_diff_v2(R1&& r1, R2&& r2) -{ - using T = decltype(r1[0] - r2[0]); - auto abs_diff_func = [](auto x, auto y) { return x > y ? 
x - y : y - x; }; - // BUG: deduced wrong datatype, half_float bug - if constexpr(std::is_same_v) - return range_product(r1, r2, half_float::half(), max, abs_diff_func); - else - return range_product(r1, r2, T(), max, abs_diff_func); -} - -template -std::size_t mismatch_diff(R1&& r1, R2&& r2, T diff) -{ - return mismatch_idx( - r1, - r2, - std::bind( - float_equal, diff, std::bind(abs_diff, std::placeholders::_1, std::placeholders::_2))); -} - -template -double rms_range(R1&& r1, R2&& r2) -{ - std::size_t n = range_distance(r1); - if(n == range_distance(r2)) - { - if(n == 0) - return 0; - double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); - double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); - double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); - double mag = - std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); - return std::sqrt(square_difference) / (std::sqrt(n) * mag); - } - else - return double(std::numeric_limits>::max()); -} -} // namespace miopen -#endif +// Forwarding header — implementation moved to miopen_utils. +#include